# Process Subdirectories II

#### Updated: May 12, 2023

#  

Run this notebook to process subdirectories continuously while other subdirectories are still being downloaded.

In [55]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime
from random import randrange

In [56]:
import warnings
#Silence every Python warning for the rest of the session; note that this also hides
#deprecation/FutureWarning messages raised by libraries such as pandas
warnings.filterwarnings('ignore')

Set parameters:

In [57]:
#Number of this notebook instance; it is appended to the user label below
instance = 1
#Label written to the 'User' column of the process log
user = 'Rav Super{}'.format(instance)
#True: processed images, results and logs are written under rootDir_L instead of the local root
process_on_VDI = True
#Minutes to sleep when there is nothing to process
wait = 2
#Number of processed subdirectories at which the processing loop stops (guards against an infinite loop)
stop_loop_threshold = 2640

Set directories:

In [58]:
#Two roots: a per-instance tree on G: and a tree on the L: drive
rootDir_L = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
rootDir_local = 'G:/rnaidoo/Alouette_I/SuperVDI' + str(instance) + '/BATCH_II_Run1/'

#Staging folders always sit under the local root
downloadedDir, processingDir, result_localDir = (
    rootDir_local + stage
    for stage in ('02_downloaded/', '03_processing/', '05a_result_local/')
)

#Output folders sit under rootDir_L when processing on the VDI, otherwise under the local root;
#move_to_L records that choice so callers know to copy rather than rename
move_to_L = True if process_on_VDI else False
processedDir, unprocessedDir, resultDir, logDir = (
    (rootDir_L if move_to_L else rootDir_local) + leaf
    for leaf in ('04_processed/', '04a_unprocessed/', '05_result/', '06_log/')
)

#  

#### Functions:

In [59]:
def move_images(old_dir, new_dir, directory, subdir, copy_to_other_drive=False, delete_old_dir=False):
    """Transfer the files of one subdirectory from one root folder to another.

    The source is ``old_dir/directory/subdir`` and the destination is
    ``new_dir/directory/subdir``. The destination folder is always created,
    even when the source does not exist (in which case nothing else happens).

    Parameters:
        old_dir: root folder that currently holds ``directory/subdir``.
        new_dir: root folder that should receive ``directory/subdir``.
        directory: name of the first-level folder under the root.
        subdir: name of the second-level folder whose files are transferred.
        copy_to_other_drive: if True, files are copied with ``shutil.copyfile``
            (the source files stay in place unless ``delete_old_dir`` is set);
            if False, files are moved with ``os.replace``, which may fail when
            source and destination are on different drives.
        delete_old_dir: if True, remove the source subdirectory afterwards, and
            also its parent ``old_dir/directory`` when that is left empty.

    Existing destination files with the same name are overwritten.
    Only the direct entries of the source folder are transferred.
    """
    source_parent = os.path.join(old_dir, directory)
    source = os.path.join(source_parent, subdir)
    target = os.path.join(new_dir, directory, subdir)
    os.makedirs(target, exist_ok=True)

    if not os.path.exists(source):
        return

    #os.replace (unlike os.rename) overwrites an existing destination file on
    #every platform, so a subdirectory can be moved back to where it was copied from
    transfer = shutil.copyfile if copy_to_other_drive else os.replace
    for file in os.listdir(source):
        transfer(os.path.join(source, file), os.path.join(target, file))

    if delete_old_dir:
        shutil.rmtree(source)
        #Drop the first-level folder as well once its last subdirectory is gone
        if not os.listdir(source_parent):
            os.rmdir(source_parent)

#  

#### Check if any subdirectories are waiting to be processed, then process them:

In [None]:
stop_condition = False
stop_condition_counter = 0

while stop_condition == False:
    if len(os.listdir(downloadedDir)) > 0:
        for directory in os.listdir(downloadedDir):
            if 'R' in directory:
                for subdirectory in os.listdir(downloadedDir + directory):
                    start = time.time()
                    subdir_path_end = directory + '/' + subdirectory + '/'

                    #Move to '03_processing'
                    move_images(old_dir=downloadedDir, new_dir=processingDir, directory=directory, subdir=subdirectory)
                    
                    #Clear intermediate results in result_localDir
                    for file in os.listdir(result_localDir):
                        if 'df' in file:
                            os.remove(result_localDir + file)
                        else:
                            shutil.rmtree(result_localDir + file)

                    #Process
                    print('')
                    print('Processing ' + subdir_path_end + ' subdirectory...')
                    !python scan2data/user_input.py $processingDir $result_localDir

                    #Consolidate results
                    if os.path.exists(result_localDir + 'df_dot.csv'):
                        df_dot = pd.read_csv(result_localDir + 'df_dot.csv')
                        n_dot = len(df_dot)
                        df_dot['processed_image_class'] = 'dot'
                        os.remove(result_localDir + 'df_dot.csv')
                    else:
                        df_dot = pd.DataFrame()
                        n_dot = 0

                    if os.path.exists(result_localDir + 'df_num.csv'):
                        df_num = pd.read_csv(result_localDir + 'df_num.csv')
                        n_num = len(df_num)
                        df_num['processed_image_class'] = 'num'
                        os.remove(result_localDir + 'df_num.csv')
                    else:
                        df_num = pd.DataFrame()
                        n_num = 0

                    if os.path.exists(result_localDir + 'df_loss.csv'):
                        df_loss = pd.read_csv(result_localDir + 'df_loss.csv')
                        n_loss = len(df_loss)
                        df_loss['processed_image_class'] = 'loss'
                        os.remove(result_localDir + 'df_loss.csv')
                    else:
                        df_loss = pd.DataFrame()
                        n_loss = 0

                    if os.path.exists(result_localDir + 'df_outlier.csv'):
                        df_outlier = pd.read_csv(result_localDir + 'df_outlier.csv')
                        n_outlier = len(df_outlier)
                        df_outlier['processed_image_class'] = 'outlier'
                        os.remove(result_localDir + 'df_outlier.csv')
                    else:
                        df_outlier = pd.DataFrame()
                        n_outlier = 0

                    df_tot = pd.concat([df_dot, df_num, df_loss, df_outlier])
                    if len(df_tot) > 0:
                        df_tot['Directory'] = directory
                        df_tot['Subdirectory'] = subdirectory
                        if 'file_name' in df_tot.columns:
                            df_tot['filename'] = df_tot['file_name'].str.replace(processingDir + directory + '/' + subdirectory, '')
                            df_tot['filename'] = df_tot['filename'].str.replace('\\', '')
                            df_tot['filename'] = df_tot['filename'].str.replace('/', '')
                        else:
                            df_tot['filename'] = 'unknown'
                        df_tot = df_tot.drop(columns=['file_name', 'mapped_coord', 'subdir_name', 'raw', 'ionogram', 'raw_metadata', 
                                                      'trimmed_metadata', 'padded', 'dilated_metadata'], errors='ignore')
                    os.makedirs(resultDir + directory + '/', exist_ok=True)
                    df_tot.to_csv(resultDir + directory + '/' + 'result-' + directory + '_' + subdirectory + '.csv', index=False)
                    
                    #move mapped_coords to '05_result'
                    mapped_coords_localDir = result_localDir + 'mapped_coords/'
                    mapped_coordsDir = resultDir + 'mapped_coords/'
                    move_images(old_dir=mapped_coords_localDir, new_dir=mapped_coordsDir, directory=directory, subdir=subdirectory, copy_to_other_drive=move_to_L)
                    
                    end = time.time()
                    t = end - start
                    print('Processing time for subdirectory: ' + str(round(t/60, 1)) + ' min')
                    print('')
                    
                    #Record performance
                    n_processed = n_dot + n_num + n_loss + n_outlier
                    df_result_ = pd.DataFrame({
                        'Directory': directory,
                        'Subdirectory': subdirectory,
                        'Images_processed': n_processed,
                        'Images_dot': n_dot,
                        'Images_num': n_num,
                        'Images_loss': n_loss,
                        'Images_outlier': n_outlier,
                        'Process_time': t,
                        'Process_timestamp': datetime.fromtimestamp(end),
                        'User': user,
                        'subdir_id': directory + '_' + subdirectory
                    }, index=[0])
                    if os.path.exists(logDir + 'process_log.csv'):
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        df_update = pd.concat([df_log, df_result_], axis=0, ignore_index=True)
                        df_update.to_csv(logDir + 'process_log.csv', index=False)
                    else:
                        if len(df_result_) > 0:
                            df_result_.to_csv(logDir + 'process_log.csv', index=False)
                    
                    #Backup 'process_log' (10% of the time)
                    if randrange(10) == 7:
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")
                        os.makedirs(logDir + 'backups/', exist_ok=True)
                        df_log.to_csv(logDir + 'backups/' + 'process_log-' + datetime_str + '.csv', index=False)
                    
                    #Move to '04_processed' or '04a_unprocessed'
                    if n_processed > 0:
                        move_images(old_dir=processingDir, new_dir=processedDir, directory=directory, subdir=subdirectory, copy_to_other_drive=move_to_L, delete_old_dir=True)
                    else:
                        move_images(old_dir=processingDir, new_dir=unprocessedDir, directory=directory, subdir=subdirectory, copy_to_other_drive=move_to_L, delete_old_dir=True)
                    
                    stop_condition_counter += 1
    
    else:
        #Wait
        print('Wait ' + str(wait) + ' min')
        time.sleep(wait*60)

        
    #Check stop conditions
    if stop_condition_counter == stop_loop_threshold:
        print('Stop!')
        stop_condition = True
                    

#  

#### Re-process list of subdirectories:

In [7]:
#Load the inventory from the log folder and shuffle its rows; alternative input: reprocess_list.csv
df_reprocess = pd.read_csv(logDir + 'image_inventory.csv').sample(frac=1)
print(len(df_reprocess))
df_reprocess.head()

2638


Unnamed: 0,Roll,Subdirectory,images,subdir_id
319,R014207832,3627-19A,306,R014207832_3627-19A
2561,R014207979F,284,437,R014207979F_284
840,R014207930F,615,74,R014207930F_615
741,R014207909F,707,370,R014207909F_707
88,R014207816,3396-A13,303,R014207816_3396-A13


In [60]:
#Ids ('<directory>_<subdirectory>') to re-process; use df_reprocess['subdir_id'] instead for the whole inventory
reprocess_list = [
    'R014207838_4559-50',
]

In [61]:
for subdir in reprocess_list:
    
    start = time.time()
    
    subdir_id_parts = subdir.split('_')
    directory = subdir_id_parts[0]
    subdirectory = subdir_id_parts[1]
    subdir_path_end = directory + '/' + subdirectory + '/'
    
    #Clear any old subdirectories in processingDir
    for file in os.listdir(processingDir):
        if 'R' in file:
            shutil.rmtree(processingDir + file)
    
    #Clear intermediate results in result_localDir
    for file in os.listdir(result_localDir):
        if 'df' in file:
            os.remove(result_localDir + file)
        else:
            shutil.rmtree(result_localDir + file)
    
    #Retrieve subdirectory
    if os.path.exists(processedDir + subdir_path_end):
        move_images(old_dir=processedDir, new_dir=processingDir, directory=directory, subdir=subdirectory, copy_to_other_drive=True)
    elif os.path.exists(unprocessedDir + subdir_path_end):
        move_images(old_dir=unprocessedDir, new_dir=processingDir, directory=directory, subdir=subdirectory, copy_to_other_drive=True)
    else:
        print('Cannot find subdirectory ' + subdir + '!')
        continue
    
    #Process
    print('')
    print('Processing ' + subdir_path_end + ' subdirectory...')
    !python scan2data/user_input.py $processingDir $result_localDir

    #Consolidate results
    if os.path.exists(result_localDir + 'df_dot.csv'):
        df_dot = pd.read_csv(result_localDir + 'df_dot.csv')
        n_dot = len(df_dot)
        df_dot['processed_image_class'] = 'dot'
        os.remove(result_localDir + 'df_dot.csv')
    else:
        df_dot = pd.DataFrame()
        n_dot = 0

    if os.path.exists(result_localDir + 'df_num.csv'):
        df_num = pd.read_csv(result_localDir + 'df_num.csv')
        n_num = len(df_num)
        df_num['processed_image_class'] = 'num'
        os.remove(result_localDir + 'df_num.csv')
    else:
        df_num = pd.DataFrame()
        n_num = 0

    if os.path.exists(result_localDir + 'df_loss.csv'):
        df_loss = pd.read_csv(result_localDir + 'df_loss.csv')
        n_loss = len(df_loss)
        df_loss['processed_image_class'] = 'loss'
        os.remove(result_localDir + 'df_loss.csv')
    else:
        df_loss = pd.DataFrame()
        n_loss = 0

    if os.path.exists(result_localDir + 'df_outlier.csv'):
        df_outlier = pd.read_csv(result_localDir + 'df_outlier.csv')
        n_outlier = len(df_outlier)
        df_outlier['processed_image_class'] = 'outlier'
        os.remove(result_localDir + 'df_outlier.csv')
    else:
        df_outlier = pd.DataFrame()
        n_outlier = 0

    df_tot = pd.concat([df_dot, df_num, df_loss, df_outlier])
    if len(df_tot) > 0:
        df_tot['Directory'] = directory
        df_tot['Subdirectory'] = subdirectory
        if 'file_name' in df_tot.columns:
            df_tot['filename'] = df_tot['file_name'].str.replace(processingDir + directory + '/' + subdirectory, '')
            df_tot['filename'] = df_tot['filename'].str.replace('\\', '')
            df_tot['filename'] = df_tot['filename'].str.replace('/', '')
        else:
            df_tot['filename'] = 'unknown'
        df_tot = df_tot.drop(columns=['file_name', 'mapped_coord', 'subdir_name', 'raw', 'ionogram', 'raw_metadata', 
                                      'trimmed_metadata', 'padded', 'dilated_metadata'], errors='ignore')
    os.makedirs(resultDir + directory + '/', exist_ok=True)
    df_tot.to_csv(resultDir + directory + '/' + 'result-' + directory + '_' + subdirectory + '.csv', index=False)

    #move mapped_coords to '05_result'
    mapped_coords_localDir = result_localDir + 'mapped_coords/'
    mapped_coordsDir = resultDir + 'mapped_coords/'
    move_images(old_dir=mapped_coords_localDir, new_dir=mapped_coordsDir, directory=directory, subdir=subdirectory, copy_to_other_drive=move_to_L)
    
    end = time.time()
    t = end - start
    print('Processing time for subdirectory: ' + str(round(t/60, 1)) + ' min')
    print('')

    #Record performance
    n_processed = n_dot + n_num + n_loss + n_outlier
    df_result_ = pd.DataFrame({
        'Directory': directory,
        'Subdirectory': subdirectory,
        'Images_processed': n_processed,
        'Images_dot': n_dot,
        'Images_num': n_num,
        'Images_loss': n_loss,
        'Images_outlier': n_outlier,
        'Process_time': t,
        'Process_timestamp': datetime.fromtimestamp(end),
        'User': user,
        'subdir_id': directory + '_' + subdirectory
    }, index=[0])
    if os.path.exists(logDir + 'process_log.csv'):
        df_log = pd.read_csv(logDir + 'process_log.csv')
        df_update = pd.concat([df_log, df_result_], axis=0, ignore_index=True)
        df_update.to_csv(logDir + 'process_log.csv', index=False)
    else:
        if len(df_result_) > 0:
            df_result_.to_csv(logDir + 'process_log.csv', index=False)

    #Backup 'process_log' (10% of the time)
    if randrange(10) == 7:
        df_log = pd.read_csv(logDir + 'process_log.csv')
        datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")
        os.makedirs(logDir + 'backups/', exist_ok=True)
        df_log.to_csv(logDir + 'backups/' + 'process_log-' + datetime_str + '.csv', index=False)

    #Move to '04_processed' or '04a_unprocessed'
    if n_processed > 0:
        move_images(old_dir=processingDir, new_dir=processedDir, directory=directory, subdir=subdirectory, copy_to_other_drive=move_to_L, delete_old_dir=True)
    else:
        move_images(old_dir=processingDir, new_dir=unprocessedDir, directory=directory, subdir=subdirectory, copy_to_other_drive=move_to_L, delete_old_dir=True)
    


Processing R014207838/4559-50/ subdirectory...


Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\SuperVDI1\Alouette_extract\scan2data\process_directory.py", line 134, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\SuperVDI1\Alouette_extract\scan2data\process_directory.py", line 66, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_bottomside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\SuperVDI1\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 158, in get_bottomside_metadata
    df_num_subse

Processing time for subdirectory: 1.4 min

