# Process Subdirectories II

#### Updated: Dec 17, 2022

#  

Run this notebook to process subdirectories continuously while other subdirectories are still being downloaded.

In [1]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime
from random import randrange

In [2]:
import warnings
# Silence every warning (all categories, all modules) for the rest of the session.
warnings.simplefilter('ignore')

Set parameters:

In [3]:
# Name written to the 'User' column of each process_log.csv row.
user = 'Rav'
# True: write processed images, results and logs under the L: drive tree
# (and copy rather than rename when moving to '04_processed');
# False: keep everything under the local root.
process_on_VDI = True
wait = 2 #minutes to sleep when there is nothing to process
# Compared against the count of processed subdirectories (not while-loop
# passes) to decide when the processing loop stops.
stop_loop_threshold = 2640 #guard against the loop running forever

Set directories:

In [4]:
# Two possible roots: the local working tree and the shared L: drive tree.
rootDir_local = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/' #C: is not persistent on VDI
rootDir_L = 'L:/DATA/Alouette_I/'

# Staging folders for downloaded / in-progress images always live on the local root.
downloadedDir = rootDir_local + '02_downloaded/'
processingDir = rootDir_local + '03_processing/'

# Outputs (processed images, results, logs) go to L: when processing on VDI,
# otherwise they stay under the local root.
move_to_L = True if process_on_VDI else False
_outputRoot = rootDir_L if process_on_VDI else rootDir_local
processedDir = _outputRoot + '04_processed/'
resultDir = _outputRoot + '05_result/'
logDir = _outputRoot + '06_log/'

#  

#### Functions:

In [5]:
def move_images(old_dir, new_dir, roll, subdir, copy_to_other_drive=False):
    """Transfer all files of ``<roll>/<subdir>/`` from ``old_dir`` to ``new_dir``.

    The destination folder is created if needed. After the transfer the source
    subdirectory is deleted, and the source roll folder is deleted as well if
    it has become empty.

    Parameters
    ----------
    old_dir, new_dir : str
        Source and destination root folders, each ending with '/'.
    roll : str
        Name of the roll folder under both roots.
    subdir : str
        Name of the subdirectory under the roll folder.
    copy_to_other_drive : bool, default False
        If True, files are copied with ``shutil.copyfile`` instead of being
        renamed (``os.rename`` may fail when source and destination are on
        different drives).

    Raises
    ------
    OSError
        Propagated from the underlying file operations (e.g. missing source
        folder, failed copy/rename).
    """
    oldDir = old_dir + roll + '/' + subdir + '/'
    newDir = new_dir + roll + '/' + subdir + '/'
    rollDir = old_dir + roll + '/'
    os.makedirs(newDir, exist_ok=True)

    # Same loop for both modes; only the transfer primitive differs.
    transfer = shutil.copyfile if copy_to_other_drive else os.rename
    for file in os.listdir(oldDir):
        transfer(oldDir + file, newDir + file)

    # Removes the (now empty, or already copied) source subdirectory.
    shutil.rmtree(oldDir)

    # Drop the roll folder once its last subdirectory is gone. os.rmdir is
    # used instead of shutil.rmtree because it refuses to delete a non-empty
    # folder: if another process adds a subdirectory between the emptiness
    # check and the removal, that new content is kept rather than wiped.
    if not os.listdir(rollDir):
        try:
            os.rmdir(rollDir)
        except OSError:
            # Only swallow the error when the folder was repopulated (or
            # already removed) in the meantime; anything else is a real failure.
            if os.path.isdir(rollDir) and not os.listdir(rollDir):
                raise

#  

#### Check whether subdirectories are waiting to be processed, then process them:

In [None]:
stop_condition = False
stop_condition_counter = 0

while stop_condition == False:
    if len(os.listdir(downloadedDir)) > 0:
        df_result = pd.DataFrame()
        for roll in os.listdir(downloadedDir):
            if 'R' in roll:
                for subdirectory in os.listdir(downloadedDir + roll):
                    start = time.time()
                    subdir_path_end = roll + '/' + subdirectory + '/'

                    #Move to '03_processing'
                    move_images(old_dir=downloadedDir, new_dir=processingDir, roll=roll, subdir=subdirectory)

                    #Process
                    print('')
                    print('Processing ' + subdir_path_end + ' subdirectory...')
                    !python scan2data/user_input.py $processingDir $resultDir

                    #Consolidate results
                    if os.path.exists(resultDir + 'df_dot.csv'):
                        df_dot = pd.read_csv(resultDir + 'df_dot.csv')
                        n_dot = len(df_dot)
                        df_dot['processed_image_class'] = 'dot'
                        os.remove(resultDir + 'df_dot.csv')
                    else:
                        df_dot = pd.DataFrame()
                        n_dot = 0

                    if os.path.exists(resultDir + 'df_num.csv'):
                        df_num = pd.read_csv(resultDir + 'df_num.csv')
                        n_num = len(df_num)
                        df_num['processed_image_class'] = 'num'
                        os.remove(resultDir + 'df_num.csv')
                    else:
                        df_num = pd.DataFrame()
                        n_num = 0

                    if os.path.exists(resultDir + 'df_loss.csv'):
                        df_loss = pd.read_csv(resultDir + 'df_loss.csv')
                        n_loss = len(df_loss)
                        df_loss['processed_image_class'] = 'loss'
                        os.remove(resultDir + 'df_loss.csv')
                    else:
                        df_loss = pd.DataFrame()
                        n_loss = 0

                    if os.path.exists(resultDir + 'df_outlier.csv'):
                        df_outlier = pd.read_csv(resultDir + 'df_outlier.csv')
                        n_outlier = len(df_outlier)
                        df_outlier['processed_image_class'] = 'outlier'
                        os.remove(resultDir + 'df_outlier.csv')
                    else:
                        df_outlier = pd.DataFrame()
                        n_outlier = 0

                    df_tot = pd.concat([df_dot, df_num, df_loss, df_outlier])
                    if len(df_tot) > 0:
                        df_tot['Roll'] = roll
                        df_tot['Subdirectory'] = subdirectory
                        if 'file_name' in df_tot.columns:
                            df_tot['filename'] = df_tot['file_name'].str.replace(processingDir + roll + '/' + subdirectory, '')
                            df_tot['filename'] = df_tot['filename'].str.replace('\\', '')
                            df_tot['filename'] = df_tot['filename'].str.replace('/', '')
                        else:
                            df_tot['filename'] = 'unknown'
                        df_tot = df_tot.drop(columns=['file_name', 'mapped_coord', 'subdir_name', 'raw', 'ionogram', 'raw_metadata', 
                                                      'trimmed_metadata', 'padded', 'dilated_metadata'], errors='ignore')
                    os.makedirs(resultDir + roll + '/', exist_ok=True)
                    df_tot.to_csv(resultDir + roll + '/' + 'result-' + roll + '_' + subdirectory + '.csv', index=False)

                    end = time.time()
                    t = end - start
                    print('Processing time for subdirectory: ' + str(round(t/60, 1)) + ' min')
                    print('')
                    
                    #Record performance
                    df_result_ = pd.DataFrame({
                        'Roll': roll,
                        'Subdirectory': subdirectory,
                        'Images_processed': n_dot + n_num + n_loss + n_outlier,
                        'Images_dot': n_dot,
                        'Images_num': n_num,
                        'Images_loss': n_loss,
                        'Images_outlier': n_outlier,
                        'Process_time': t,
                        'Process_timestamp': datetime.fromtimestamp(end),
                        'User': user,
                        'subdir_id': roll + '_' + subdirectory
                    }, index=[0])
                    df_result = pd.concat([df_result, df_result_], axis=0, ignore_index=True)
                    if os.path.exists(logDir + 'process_log.csv'):
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        df_update = pd.concat([df_log, df_result], axis=0, ignore_index=True)
                        df_update.to_csv(logDir + 'process_log.csv', index=False)
                    else:
                        if len(df_result) > 0:
                            df_result.to_csv(logDir + 'process_log.csv', index=False)
                    
                    #Backup 'process_log' (10% of the time)
                    if randrange(10) == 7:
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")
                        os.makedirs(logDir + 'backups/', exist_ok=True)
                        df_log.to_csv(logDir + 'backups/' + 'process_log-' + datetime_str + '.csv', index=False)
                    
                    #Move to '04_processed'
                    #print("Moving images to '04_processed'")
                    move_images(old_dir=processingDir, new_dir=processedDir, roll=roll, subdir=subdirectory, copy_to_other_drive=move_to_L)
                    
                    stop_condition_counter += 1
    
    else:
        #Wait
        print('Wait ' + str(wait) + ' min')
        time.sleep(wait*60)

        
    #Check stop conditions
    if stop_condition_counter == stop_loop_threshold:
        print('Stop!')
        stop_condition = True
                    


Processing R014207953/2676-13B/ subdirectory...
Processing time for subdirectory: 8.0 min


Processing R014207966/1187-1B/ subdirectory...
Processing time for subdirectory: 17.8 min


Processing R014207821/3322-15A/ subdirectory...


Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.3 min

Wait 2 min
Wait 2 min

Processing R014207956/2402-7A/ subdirectory...
Processing time for subdirectory: 8.7 min


Processing R014207948/1715-9B/ subdirectory...
Processing time for subdirectory: 7.1 min


Processing R014207949/2129-1A/ subdirectory...
Processing time for subdirectory: 22.3 min


Processing R014207978F/310/ subdirectory...
Processing time for subdirectory: 8.4 min


Processing R014207943/2101-5B/ subdirectory...
Processing time for subdirectory: 6.7 min


Processing R014207959/2345-7A/ subdirectory...
Processing time for subdirectory: 4.5 min


Processing R014207907F/522/ subdirectory...
Processing time for subdirectory: 14.9 min


Processing R014207943/2062-1A/ subdirectory...
Processing time for subdirectory: 2.4 min

Wait 2 min
Wait 2 min

Processing R014207967/1396-13B/ subdirectory...
Processing time for subdirectory: 8.1 min


Processing R014207956/2390-7A/ subdirectory...
Processing time for subdirectory: 5.0 min

Wait 2

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.3 min

Wait 2 min

Processing R014207816/3391-A13/ subdirectory...
Processing time for subdirectory: 2.3 min

Wait 2 min
Wait 2 min

Processing R014207815/3507-19A/ subdirectory...
Processing time for subdirectory: 8.5 min


Processing R014207959/2333-1B/ subdirectory...
Processing time for subdirectory: 8.3 min


Processing R014207962/1470-5A/ subdirectory...
Processing time for subdirectory: 6.4 min


Processing R014207841/3116-13B/ subdirectory...
Processing time for subdirectory: 3.8 min

Wait 2 min

Processing R014207959/2288-3A/ subdirectory...
Processing time for subdirectory: 11.0 min


Processing R014207966/1190-1B/ subdirectory...
Processing time for subdirectory: 20.3 min


Processing R014207909F/730/ subdirectory...
Processing time for subdirectory: 7.2 min


Processing R014207909F/729/ subdirectory...
Processing time for subdirectory: 8.5 min


Processing R014207954/2198-4B/ subdirectory...
Processing time for subdirectory: 33.5 min


Pr

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.2 min

Wait 2 min
Wait 2 min
Wait 2 min

Processing R014207964/1620-4A/ subdirectory...
Processing time for subdirectory: 2.3 min

Wait 2 min

Processing R014207947/1908-18A/ subdirectory...
Processing time for subdirectory: 2.0 min

Wait 2 min
Wait 2 min

Processing R014207841/3100-13B/ subdirectory...
Processing time for subdirectory: 11.9 min


Processing R014207960/2531-5B/ subdirectory...
Processing time for subdirectory: 11.3 min


Processing R014207816/3427-50A/ subdirectory...
Processing time for subdirectory: 4.0 min


Processing R014207929F/472/ subdirectory...
Processing time for subdirectory: 7.8 min


Processing R014207942/1935-4/ subdirectory...
Processing time for subdirectory: 5.2 min


Processing R014207815/3534-5A/ subdirectory...
Processing time for subdirectory: 3.1 min


Processing R014207967/1440-13B/ subdirectory...
Processing time for subdirectory: 5.9 min


Processing R014207949/2161-4B/ subdirectory...
Processing time for su