In [1]:
# default_exp pictures

To have `isfile` and `join` available, we import them from `os.path`.

In [25]:
#hide
from nbdev.showdoc import *
import os
from os.path import *
import pandas as pd
from urllib.parse import quote_plus


#export 

import os
import pandas as pd
from os.path import isfile, join


The first step is to get all directories. The reason for this is that the same photo might be stored in different directories. But there also might be photos with the same name but with different content. We will need to compare all photos with the same name in the different directories.  
Therefore we start by building a pandas DataFrame with all essential data.  

It will contain the directory, the file name, the size and the different time stamps. If this information is not sufficient we will also need to read the picture pixel by pixel and compare. 

We now start with the directory list. The root is the mapped MyCloud file server with the "My Pictures" folder. We recursively collect all of its subdirectories and store them in `my_Dirs`.

In [3]:
#export

# Root folder that is scanned for subdirectories.
mypath = r"/media/mycloud/My Pictures"

def fast_scandir(dirname, follow_symlinks=True):
    """Return the paths of all directories below ``dirname``, at any depth.

    The direct subdirectories of ``dirname`` come first, followed by the
    recursively collected descendants of each of them. ``dirname`` itself is
    not part of the result.

    Parameters
    ----------
    dirname : str or os.PathLike
        Directory whose subdirectory tree is listed.
    follow_symlinks : bool, default True
        When True, symbolic links that point to directories are listed and
        descended into (the behaviour of the original implementation). Pass
        False to ignore them, e.g. to avoid endless recursion on a link cycle.

    Returns
    -------
    list
        One path per subdirectory, as built by ``os.scandir`` from ``dirname``.

    Raises
    ------
    OSError
        If ``dirname`` or one of its subdirectories cannot be read.
    """
    # The context manager closes the directory handle even if inspecting an
    # entry raises.
    with os.scandir(dirname) as entries:
        subfolders = [
            entry.path
            for entry in entries
            if entry.is_dir(follow_symlinks=follow_symlinks)
        ]
    # Iterate over a snapshot: `subfolders` grows while its children are walked.
    for child in list(subfolders):
        subfolders.extend(fast_scandir(child, follow_symlinks=follow_symlinks))
    return subfolders
# All directories below `mypath`. `fast_scandir` returns only subdirectories,
# so `mypath` itself is not part of this list.
my_Dirs = fast_scandir(mypath)

In [4]:
#hide 
# Number of directories found, followed by one directory path per line.
print(len(my_Dirs), *my_Dirs, sep="\n")

76
/media/mycloud/My Pictures/Pics_20120130
/media/mycloud/My Pictures/My Scans
/media/mycloud/My Pictures/IPhone_Mama_FOTOS
/media/mycloud/My Pictures/27.09.04
/media/mycloud/My Pictures/TMP
/media/mycloud/My Pictures/Oma und Opa
/media/mycloud/My Pictures/Album Art
/media/mycloud/My Pictures/MCC Opening Mexico
/media/mycloud/My Pictures/iCloud Photos
/media/mycloud/My Pictures/Hochzeit
/media/mycloud/My Pictures/Galaxy Note II
/media/mycloud/My Pictures/Family
/media/mycloud/My Pictures/Casamento Andreia
/media/mycloud/My Pictures/BlackBerry
/media/mycloud/My Pictures/Arthur
/media/mycloud/My Pictures/Andrea Haus
/media/mycloud/My Pictures/Ana
/media/mycloud/My Pictures/Aline
/media/mycloud/My Pictures/2006_11_25
/media/mycloud/My Pictures/2006_11_20
/media/mycloud/My Pictures/2006_11_09
/media/mycloud/My Pictures/moved_from_root
/media/mycloud/My Pictures/Saved Pictures
/media/mycloud/My Pictures/ControlCenter4
/media/mycloud/My Pictures/Camera Roll
/media/mycloud/My Pictures/Angeli

This cell collects all files in the directories gathered above.  
It builds two parallel lists which contain the 
- directory name, the file name and the full path of each file
- file metadata from `os.stat`, like the file size and the time stamps.

In [5]:
#export
# Two parallel lists: item i of `stat_data` belongs to item i of `entry_data`.
stat_data = []   # one os.stat_result per file
entry_data = []  # [directory, file name, full path] per file

for dirname in my_Dirs:
    # `with` closes the directory handle as soon as the folder is processed.
    with os.scandir(dirname) as dir_content:
        for entry in dir_content:
            # DirEntry already knows its own type and full path, so there is no
            # need to join the path again and stat it through os.path.isfile.
            if entry.is_file():
                stat_data.append(entry.stat())
                entry_data.append([dirname, entry.name, entry.path])

The next step is to create pandas DataFrames out of the lists. To make them more readable we change the column names to be more recognizable. Then we concatenate them into one bigger DataFrame.  

In [6]:
#hide
# Both counts should match: one stat record per collected file.
print(len(stat_data), len(entry_data), sep="\n")

# Peek at a few of the collected [directory, file name, path] triples.
print(entry_data[50:60])

6027
6027
[['/media/mycloud/My Pictures/Oma und Opa', 'P9020041.JPG', '/media/mycloud/My Pictures/Oma und Opa/P9020041.JPG'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0426.jpg', '/media/mycloud/My Pictures/MCC Opening Mexico/IMG_0426.jpg'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0419.jpg', '/media/mycloud/My Pictures/MCC Opening Mexico/IMG_0419.jpg'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0415.jpg', '/media/mycloud/My Pictures/MCC Opening Mexico/IMG_0415.jpg'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0403.jpg', '/media/mycloud/My Pictures/MCC Opening Mexico/IMG_0403.jpg'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0398.jpg', '/media/mycloud/My Pictures/MCC Opening Mexico/IMG_0398.jpg'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0363.jpg', '/media/mycloud/My Pictures/MCC Opening Mexico/IMG_0363.jpg'], ['/media/mycloud/My Pictures/MCC Opening Mexico', 'IMG_0359.jpg', '/media/mycloud/My Pictures/MCC Op

In [7]:
#export
# One row per collected file; the columns are still numbered 0, 1 and 2 here
# and receive descriptive names in a later cell.
df1 = pd.DataFrame(entry_data)

In [8]:
#hide
df1.head()

Unnamed: 0,0,1,2
0,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0008.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...
1,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0007.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...
2,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0006.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...
3,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0005.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...
4,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0004.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...


Renaming of DataFrame 1:

> note: `rename` returns a new DataFrame and leaves the original untouched, so the result must either be assigned or, as done here, applied with the parameter `inplace=True`.

In [9]:
#export 
# Give the positional columns 0, 1 and 2 descriptive names.
df1.rename(columns=dict(enumerate(["directory", "file_name", "absolute_file_name"])), inplace=True)

Renaming of DataFrame 2:

> note: a dictionary literal may span several lines inside the parentheses of `rename`; here the renaming is simply split into two calls to keep each line short.

In [10]:
#export
# Each stat record becomes one row; its fields arrive as columns 0 to 9.
df2 = pd.DataFrame(stat_data)
# Columns 0-5, named after the corresponding stat fields.
df2.rename(columns=dict(enumerate(["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid"])), inplace=True)
# Columns 6-9: size and the three time stamps.
df2.rename(columns=dict(enumerate(["st_size", "st_atime", "st_mtime", "st_ctime"], start=6)), inplace=True)

In [11]:
#export 
# Place the path columns and the stat columns side by side, matched on the row index.
df = pd.concat([df1, df2], axis="columns")
df.head()

Unnamed: 0,directory,file_name,absolute_file_name,st_mode,st_ino,st_dev,st_nlink,st_uid,st_gid,st_size,st_atime,st_mtime,st_ctime
0,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0008.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...,33261,157615585,62,1,0,0,801274,1226915253,1226915253,1226915253
1,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0007.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...,33261,157615586,62,1,0,0,826741,1226915162,1226915162,1226915162
2,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0006.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...,33261,157615587,62,1,0,0,750390,1226915101,1226915101,1226915101
3,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0005.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...,33261,157615588,62,1,0,0,783234,1226915046,1226915046,1226915046
4,/media/mycloud/My Pictures/My Scans,Val_Pagamento_2008_0004.jpg,/media/mycloud/My Pictures/My Scans/Val_Pagame...,33261,157615589,62,1,0,0,774076,1226914986,1226914986,1226914986


Now we start processing the data. The first step is to sort the files by name. This way I hope to identify a lot of duplicates. 

In [12]:
#export
# Order the rows by file name so that equal names end up next to each other.
df.sort_values(by="file_name", inplace=True)
df.head()

Unnamed: 0,directory,file_name,absolute_file_name,st_mode,st_ino,st_dev,st_nlink,st_uid,st_gid,st_size,st_atime,st_mtime,st_ctime
1475,/media/mycloud/My Pictures/Arthur,1.jpg,/media/mycloud/My Pictures/Arthur/1.jpg,33261,157617023,62,1,0,0,152121,985070906,985070906,985070906
6026,/media/mycloud/My Pictures/Album Art/Alfred Sc...,1112D2sBmTL._SL500_AA300_.jpg,/media/mycloud/My Pictures/Album Art/Alfred Sc...,33261,157597219,62,1,0,0,5118,1357350218,1357350218,1357350218
20,/media/mycloud/My Pictures/TMP,111FA374-6FF3-450E-804F-E5D27776A583.jpg,/media/mycloud/My Pictures/TMP/111FA374-6FF3-4...,33261,157617644,62,1,0,0,1469153,1539622706,1539622706,1539622706
19,/media/mycloud/My Pictures/TMP,1134100C-37B7-4A23-A666-D55944B46F87.jpg,/media/mycloud/My Pictures/TMP/1134100C-37B7-4...,33261,157617619,62,1,0,0,1683508,1532252065,1532252065,1532252065
22,/media/mycloud/My Pictures/TMP,11664784-3132-4D1E-90E1-4E5CC4DC7B1B.jpg,/media/mycloud/My Pictures/TMP/11664784-3132-4...,33261,157607182,62,1,0,0,1924451,1540552075,1540552075,1540552075


In [13]:
#export  
# Flag rows whose file name already occurred in an earlier row; the first
# occurrence of each name stays False. The result is only displayed, not stored.
df.duplicated(subset="file_name")

1475    False
6026    False
20      False
19      False
22      False
        ...  
1808    False
1809    False
1813    False
6006    False
6015    False
Length: 6027, dtype: bool

In [14]:
#export
# keep=False marks every row that shares its file name with another row,
# not only the second and later occurrences.
df_duplicates = df.loc[df["file_name"].duplicated(keep=False)]
len(df_duplicates)


113

In [15]:
#export
df_duplicates[80:113]

Unnamed: 0,directory,file_name,absolute_file_name,st_mode,st_ino,st_dev,st_nlink,st_uid,st_gid,st_size,st_atime,st_mtime,st_ctime
1694,/media/mycloud/My Pictures/2006_11_09,DSCN0832.JPG,/media/mycloud/My Pictures/2006_11_09/DSCN0832...,33261,157617242,62,1,0,0,176845,1162765318,1162765318,1162765318
1650,/media/mycloud/My Pictures/2006_11_20,DSCN0832.JPG,/media/mycloud/My Pictures/2006_11_20/DSCN0832...,33261,157617198,62,1,0,0,176845,1162754518,1162754518,1162754518
1693,/media/mycloud/My Pictures/2006_11_09,DSCN0833.JPG,/media/mycloud/My Pictures/2006_11_09/DSCN0833...,33261,157617241,62,1,0,0,175699,1162765340,1162765340,1162765340
1649,/media/mycloud/My Pictures/2006_11_20,DSCN0833.JPG,/media/mycloud/My Pictures/2006_11_20/DSCN0833...,33261,157617197,62,1,0,0,175699,1162754540,1162754540,1162754540
1692,/media/mycloud/My Pictures/2006_11_09,DSCN0834.JPG,/media/mycloud/My Pictures/2006_11_09/DSCN0834...,33261,157617240,62,1,0,0,194938,1162765464,1162765464,1162765464
1648,/media/mycloud/My Pictures/2006_11_20,DSCN0834.JPG,/media/mycloud/My Pictures/2006_11_20/DSCN0834...,33261,157617196,62,1,0,0,194938,1162754664,1162754664,1162754664
1647,/media/mycloud/My Pictures/2006_11_20,DSCN0835.JPG,/media/mycloud/My Pictures/2006_11_20/DSCN0835...,33261,157617195,62,1,0,0,191698,1162754680,1162754680,1162754680
1691,/media/mycloud/My Pictures/2006_11_09,DSCN0835.JPG,/media/mycloud/My Pictures/2006_11_09/DSCN0835...,33261,157617239,62,1,0,0,191698,1162765480,1162765480,1162765480
6011,/media/mycloud/My Pictures/Album Art/steve Win...,Folder.jpg,/media/mycloud/My Pictures/Album Art/steve Win...,33261,157597204,62,1,0,0,13005,1357392803,1357392803,1357392803
6007,/media/mycloud/My Pictures/Album Art/Van Morri...,Folder.jpg,/media/mycloud/My Pictures/Album Art/Van Morri...,33261,157597200,62,1,0,0,71860,1357347319,1357347319,1357347319


In [17]:
from pymongo import MongoClient
MongoClient()

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [27]:
import getpass

# Server address(es) as 'name:port' strings.
# NOTE(review): an earlier draft of this cell pointed at 'alois-PCx0Dx:27027';
# confirm which server is intended.
host = ['localhost:27017']
user = os.environ.get("MONGO_USER", "alois")
# The password must not be stored in the notebook: take it from the
# environment, or ask for it interactively when the variable is not set.
password = os.environ.get("MONGO_PASSWORD") or getpass.getpass("MongoDB password: ")
# `host` is a list, so join it with commas instead of formatting the list's
# repr into the URI. User and password are percent-encoded for use in a URI.
uri = "mongodb://%s:%s@%s" % (quote_plus(user), quote_plus(password), ",".join(host))
# The following cell uses `client`, so it has to be created here.
client = MongoClient(uri)

In [28]:
database_names = client.list_database_names()