# Flatten photo library

In [42]:
# Video files: .mts, .mov, .mpg, .mp4
# Photoshop files: .psd, .psb
# Photo files: .bmp, .jpg, .jpeg, .nef, .tif, .png, .pdf
# (extensions are matched case-insensitively)

import pandas as pd
import numpy as np
import os
from exif import Image as im
import datetime
from pathlib import Path
import sys

# ---- get the name of the library that I am checking
# library = sys.argv[1]
# Every entry (files and directories) found recursively below ./library,
# stored as absolute path strings; only regular files are kept afterwards.
path_list = [str(i) for i in Path(os.path.abspath('library')).glob('**/*')]
file_list = [i for i in path_list if os.path.isfile(i)]


def file_extension(path):
    """Return the extension of ``path``, leading dot included, case preserved.

    ``os.path.splitext`` only inspects the final path component, so a file
    without an extension yields ``''`` (slicing from ``rfind('.')`` returned
    the last character of the path, or part of a dotted directory name).
    Dotfiles such as ``.DS_Store`` are treated as having no extension.
    """
    return os.path.splitext(path)[1]


ext_list = [file_extension(i) for i in file_list]

# ---- create the dataframe
df = pd.DataFrame({'filepath': file_list, 'file_ext': ext_list})

# ---- define the file type
image_file_types = ['.bmp', '.jpg', '.jpeg', '.nef', '.tif', '.png', '.pdf']
video_file_types = ['.mts', '.mov', '.mpg', '.mp4']
photoshop_file_types = ['.psd', '.psb']


def classify_extension(ext):
    """Map a file extension to 'image', 'video', 'photoshop' or 'other'.

    The comparison is case-insensitive; anything not listed in the three
    extension lists above is classified as 'other'.
    """
    ext = ext.lower()
    if ext in image_file_types:
        return 'image'
    if ext in video_file_types:
        return 'video'
    if ext in photoshop_file_types:
        return 'photoshop'
    return 'other'


df['file_type'] = df.file_ext.map(classify_extension)

# ---- identify the metadata
# NaT (rather than a float NaN) gives the column a datetime dtype, so real
# timestamps can be assigned into it and the .dt accessor works on it.
df['datetime'] = pd.NaT
# Index labels of the rows whose metadata should be read (images only).
open_idx = df[df.file_type == 'image'].index

# for i in open_idx:
#     try:
#         with open(df.loc[i,'filepath'], 'rb') as image_file:
#             exif_im = im(image_file)
#             if exif_im.has_exif:
#                 try:
#                     df.loc[i,'datetime'] = datetime.datetime.strptime(exif_im['datetime_original'], '%Y:%m:%d %H:%M:%S')
#                 except:
#                     pass
#             else:
#                 pass
#     except:
#         pass

In [45]:
# Show the raw array of collected file paths.
df['filepath'].values

array(['/Users/alex/OneDrive - Duke University/8_coding_projects/photo_organization/library/.DS_Store',
       '/Users/alex/OneDrive - Duke University/8_coding_projects/photo_organization/library/ob_photos_backup2/Nikon Transfer 2/118/_DSC3521.JPG'],
      dtype=object)

In [191]:
import pandas as pd
import numpy as np
import os
import datetime
import shutil
import sys

# ---- get database
# NOTE(review): sys.argv[1] is only a library name when this code is run as a
# script; inside a notebook kernel it is presumably a kernel launch argument.
# Confirm how this cell is meant to be executed.
library = sys.argv[1]
df = pd.read_hdf('photo_database.h5', library)
if 'datetime' not in df.columns:
    raise Exception('database does not contain image metadata')

# ---- check that the dates are valid
# Both comparisons must be parenthesised: `|` binds tighter than `<`, so the
# unparenthesised form evaluated `(mask | df.datetime) < epoch` instead of
# combining two boolean masks.
invalid_dt = df[(df.datetime > datetime.datetime.today()) | (df.datetime < datetime.datetime(1970, 1, 1))].index
df.loc[invalid_dt, 'datetime'] = pd.NaT

# ---- identify the root directory
root = os.path.abspath('ob_photo_archive')
if not os.path.isdir(root):
    raise Exception('root directory not located')

# ---- create one folder per file type
for file_type in df.file_type.unique():
    try:
        os.mkdir(os.path.join(root, file_type))
        print('created sub-folder for {}'.format(file_type))
    except FileExistsError:
        pass

# ---- create folder for images that do not have metadata
# makedirs also creates the parent 'image' folder, which does not exist yet
# when the library contains no image files.
try:
    os.makedirs(os.path.join(root, 'image', 'unknown'))
    print('created sub-folder for images without metadata')
except FileExistsError:
    pass

# ---- create folders for each unique year
for year in df.datetime.dt.year.unique():
    # rows without a timestamp surface here as a null year and are skipped
    if pd.notnull(year):
        try:
            os.makedirs(os.path.join(root, 'image', str(int(year))))
            print('created sub-folder: {}'.format(str(int(year))))
        except FileExistsError:
            pass
            
# ---- create destination codes
def create_destinations(df, root_dir=None):
    """Add ``img_id``, ``file_name`` and ``destination`` columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``filepath``, ``file_ext``, ``file_type``
        and ``datetime``. The frame is modified in place.
    root_dir : str, optional
        Archive root the destinations are built under. Defaults to the
        notebook-level ``root`` variable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three new columns.

    Notes
    -----
    * Files with a timestamp are renamed to
      ``date_<YYYYMMDD>_img_id_<10 digits><lowercased ext>`` and placed in
      ``<root>/<file_type>/<YYYY>/``.
    * Images without a timestamp keep their name and go to
      ``<root>/image/unknown/``.
    * Any other file without a timestamp keeps its name and goes to
      ``<root>/<file_type>/``.
    * The ids are random draws seeded from the current time, so they differ
      between runs and are not guaranteed to be unique.
    """
    if root_dir is None:
        root_dir = root  # archive root defined at notebook level

    seed_since_epoch = int((datetime.datetime.now() - datetime.datetime(1970, 1, 1)).total_seconds())
    rng = np.random.RandomState(seed_since_epoch)

    # An integer bound with an explicit 64-bit dtype: 10**10 does not fit the
    # 32-bit default integer used by numpy on some platforms.
    draws = rng.randint(0, 10 ** 10, len(df), dtype=np.int64)
    df['img_id'] = ['img_id_' + str(i).zfill(10) for i in draws]

    # Works even when the column holds only missing values stored as floats.
    when = pd.to_datetime(df['datetime'])

    original_name = [os.path.basename(p) for p in df['filepath']]
    dated_name = 'date_' + when.dt.strftime('%Y%m%d') + '_' + df['img_id'] + df['file_ext'].str.lower()
    df['file_name'] = np.where(when.notna(), dated_name, original_name)

    destination = []
    for file_type, stamp, file_name in zip(df['file_type'], when, df['file_name']):
        if pd.isnull(stamp):
            if file_type == 'image':
                # matches the dedicated image/unknown folder
                parts = (file_type, 'unknown', file_name)
            else:
                parts = (file_type, file_name)
        else:
            parts = (file_type, stamp.strftime('%Y'), file_name)
        destination.append(os.path.join(root_dir, *parts))
    df['destination'] = destination
    return df

def move_files(df):
    """Move every existing source file to its planned destination.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``filepath`` (current location) and ``destination``
        (target path). Rows whose source file no longer exists are skipped.

    Returns
    -------
    None

    Notes
    -----
    Nothing is overwritten. If the destination is already taken, the file is
    stored in the same folder as ``DUPLICATE<name>``; if that is taken too,
    ``DUPLICATE2_<name>``, ``DUPLICATE3_<name>``, ... are tried until a free
    name is found. The path actually used is written back to
    ``df['destination']``.
    """
    for idx in df.index:
        source = df.loc[idx, 'filepath']
        # ---- make sure that the source file exists
        if not os.path.exists(source):
            continue

        destination = df.loc[idx, 'destination']
        # ---- check to make sure that I am not overwriting anything
        if os.path.exists(destination):
            # ---- if I am, rename the file and then move it
            # The folder is taken from the planned destination itself, so the
            # duplicate always lands next to the file it collided with.
            folder, name = os.path.split(destination)
            destination = os.path.join(folder, 'DUPLICATE' + name)
            attempt = 1
            while os.path.exists(destination):
                attempt += 1
                destination = os.path.join(folder, 'DUPLICATE{}_{}'.format(attempt, name))
            df.loc[idx, 'destination'] = destination

        shutil.move(source, destination)
    return

# ---- plan where every file goes, then move the files on disk
# Order matters: move_files reads the 'destination' column that
# create_destinations adds to the frame.
df = create_destinations(df)
move_files(df)

created sub-folder: 2015
created sub-folder: 2000


In [23]:
import pandas as pd
import numpy as np
import datetime

# Load the library that is being imported and flag its rows.
df = pd.read_hdf('photo_database.h5', 'ob_photos_backup2')
df['import'] = True
# Stack the second library underneath; its rows get a missing 'import' value.
# DataFrame.append was removed in pandas 2.0, pd.concat is the replacement.
df = pd.concat([df, pd.read_hdf('photo_database.h5', 'ob_photos_backup1')], ignore_index=True, sort=False)

In [11]:
# Number of files per detected file type.
lib['file_type'].value_counts()

image        90837
other         1109
video          203
photoshop       37
Name: file_type, dtype: int64

In [12]:
# Fraction of image files that carry a timestamp.
lib.loc[lib['file_type'] == 'image', 'datetime'].notna().mean()

0.9964111540451578

In [25]:
# ---- blank out timestamps that lie in the future or before the Unix epoch
# datetime.datetime(1970, 1, 1) is the same naive value utcfromtimestamp(0)
# returned; utcfromtimestamp is deprecated in recent Python versions.
invalid_dt = df[(df.datetime > datetime.datetime.today()) | (df.datetime < datetime.datetime(1970, 1, 1))].index
df.loc[invalid_dt, 'datetime'] = pd.NaT

# ---- random ids, seeded from the current time (they differ between runs)
seed_since_epoch = int((datetime.datetime.now() - datetime.datetime(1970, 1, 1)).total_seconds())
rng = np.random.RandomState(seed_since_epoch)

# An integer bound with an explicit 64-bit dtype: 10**10 does not fit the
# 32-bit default integer used by numpy on some platforms.
df['img_id'] = ['img_id_'+str(i).zfill(10) for i in rng.randint(0, 10 ** 10, len(df), dtype=np.int64)]

# ---- file names
# Start from the text after the last '/' of each path (vectorised), then
# replace it with date_<YYYYMMDD>_<img_id><lowercased ext> wherever a
# timestamp is available.
df['file_name'] = df.filepath.str.rsplit('/', n=1).str[-1]
df['file_name'] = np.where(pd.notnull(df.datetime), 'date_'+df.datetime.dt.strftime('%Y%m%d') + '_' + df.img_id + df.file_ext.str.lower(), df.file_name)

In [30]:
import os
root = os.path.abspath('ob_photo_archive')

# Dated files go into <root>/<file_type>/<YYYY>/, undated images into
# <root>/image/unknown/, and every other undated file directly into
# <root>/<file_type>/.
destination = []
for kind, stamp, name in zip(df['file_type'], df['datetime'], df['file_name']):
    if pd.notnull(stamp):
        subdirs = [kind, stamp.strftime('%Y')]
    elif kind == 'image':
        subdirs = [kind, 'unknown']
    else:
        subdirs = [kind]
    destination.append(os.path.join(root, *subdirs, name))
df['destination'] = destination

In [41]:
# Per file type: rows of the imported library whose destination repeats an
# earlier row's destination.
df.loc[df.duplicated(subset='destination') & df['import'].eq(True), 'file_type'].value_counts()

other        1071
video          48
image          42
photoshop       1
Name: file_type, dtype: int64