# Process Subdirectories II

#### Updated: Dec 8, 2022

#  

Run for continuous processing of subdirectories, concurrent with the downloading of subdirectories.

In [1]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime
from random import randrange

In [2]:
import warnings
#Silence every warning category for the remainder of the session
warnings.filterwarnings('ignore')

Set parameters:

In [3]:
user = 'Rav' #recorded in the 'User' column of every process_log row
process_on_VDI = True #True: processed images/results go to the U: drive and the log to the shared log directory; False: everything stays local
wait = 2 #in minutes; sleep time when the downloaded directory is empty
stop_loop_threshold = 2640 #the loop stops once the count of processed subdirectories equals this value

Set directories:

In [4]:
#Two storage roots: the local drive (C: is not persistent on VDI) and the shared U: drive
rootDir_local = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/'
rootDir_U = 'U:/Data_Science/Projects_data/Alouette_I/'

#Incoming and in-progress subdirectories always live on the local drive
downloadedDir = rootDir_local + '02_downloaded/'
processingDir = rootDir_local + '03_processing/'

#Finished images and results go to U: when processing on the VDI, otherwise they stay local
move_to_U = True if process_on_VDI else False
_outputRoot = rootDir_U if move_to_U else rootDir_local
processedDir = _outputRoot + '04_processed/'
resultDir = _outputRoot + '05_result/'

#On the VDI the log is written to the shared location below (DO NOT CHANGE); otherwise next to the results
logDir = '//scientific/L-MP-Data/Massive files/Python/rnaidoo/Alouette_I/' if move_to_U else resultDir

#  

#### Functions:

In [5]:
def move_images(old_dir, new_dir, roll, subdir, copy_to_other_drive=False):
    """Move all files of one roll/subdirectory from ``old_dir`` to ``new_dir``.

    The source is ``old_dir + roll + '/' + subdir + '/'`` and the destination is
    built the same way under ``new_dir`` (both base paths must end with '/').
    The destination directory is created if needed. After the transfer the
    source subdirectory is deleted, and the source roll directory is removed
    as well when it is left empty.

    Parameters:
        old_dir: base directory currently holding the roll.
        new_dir: base directory the files are moved to.
        roll: name of the roll directory.
        subdir: name of the subdirectory inside the roll.
        copy_to_other_drive: when True, files are copied and the source is then
            deleted (a rename cannot cross drives); when False, files are renamed.
    """
    oldDir = old_dir + roll + '/' + subdir + '/'
    newDir = new_dir + roll + '/' + subdir + '/'
    os.makedirs(newDir, exist_ok=True)

    for file in os.listdir(oldDir):
        if copy_to_other_drive:
            shutil.copyfile(oldDir + file, newDir + file)
        else:
            #os.replace overwrites an existing destination on every platform
            #(os.rename raises on Windows), matching what copyfile does above
            os.replace(oldDir + file, newDir + file)

    shutil.rmtree(oldDir)

    rollDir = old_dir + roll + '/'
    if not os.listdir(rollDir):
        try:
            #os.rmdir refuses to delete a non-empty directory, so a subdirectory
            #that appears between the check and the removal is never destroyed
            os.rmdir(rollDir)
        except OSError:
            pass

#  

#### Check if subdirectories are waiting to be processed, then process them:

In [None]:
stop_condition = False
stop_condition_counter = 0

while stop_condition == False:
    if len(os.listdir(downloadedDir)) > 0:
        df_result = pd.DataFrame()
        for roll in os.listdir(downloadedDir):
            if 'R' in roll:
                for subdirectory in os.listdir(downloadedDir + roll):
                    start = time.time()
                    subdir_path_end = roll + '/' + subdirectory + '/'

                    #Move to '03_processing'
                    move_images(old_dir=downloadedDir, new_dir=processingDir, roll=roll, subdir=subdirectory)

                    #Process
                    print('')
                    print('Processing ' + subdir_path_end + ' subdirectory...')
                    !python scan2data/user_input.py $processingDir $resultDir

                    #Consolidate results
                    if os.path.exists(resultDir + 'df_dot.csv'):
                        df_dot = pd.read_csv(resultDir + 'df_dot.csv')
                        n_dot = len(df_dot)
                        df_dot['processed_image_class'] = 'dot'
                        os.remove(resultDir + 'df_dot.csv')
                    else:
                        df_dot = pd.DataFrame()
                        n_dot = 0

                    if os.path.exists(resultDir + 'df_num.csv'):
                        df_num = pd.read_csv(resultDir + 'df_num.csv')
                        n_num = len(df_num)
                        df_num['processed_image_class'] = 'num'
                        os.remove(resultDir + 'df_num.csv')
                    else:
                        df_num = pd.DataFrame()
                        n_num = 0

                    if os.path.exists(resultDir + 'df_loss.csv'):
                        df_loss = pd.read_csv(resultDir + 'df_loss.csv')
                        n_loss = len(df_loss)
                        df_loss['processed_image_class'] = 'loss'
                        os.remove(resultDir + 'df_loss.csv')
                    else:
                        df_loss = pd.DataFrame()
                        n_loss = 0

                    if os.path.exists(resultDir + 'df_outlier.csv'):
                        df_outlier = pd.read_csv(resultDir + 'df_outlier.csv')
                        n_outlier = len(df_outlier)
                        df_outlier['processed_image_class'] = 'outlier'
                        os.remove(resultDir + 'df_outlier.csv')
                    else:
                        df_outlier = pd.DataFrame()
                        n_outlier = 0

                    df_tot = pd.concat([df_dot, df_num, df_loss, df_outlier])
                    df_tot['Roll'] = roll
                    df_tot['Subdirectory'] = subdirectory
                    df_tot = df_tot.drop(columns=['file_name', 'mapped_coord', 'subdir_name', 'raw', 'ionogram', 'raw_metadata', 
                                                  'trimmed_metadata', 'padded', 'dilated_metadata'], errors='ignore')
                    df_tot.to_csv(resultDir + 'result-' + roll + '_' + subdirectory + '.csv', index=False)

                    end = time.time()
                    t = end - start
                    print('Processing time for subdirectory: ' + str(round(t/60, 1)) + ' min')
                    print('')
                    
                    #Record performance
                    df_result_ = pd.DataFrame({
                        'Roll': roll,
                        'Subdirectory': subdirectory,
                        'Images_processed': n_dot + n_num + n_loss + n_outlier,
                        'Images_dot': n_dot,
                        'Images_num': n_num,
                        'Images_loss': n_loss,
                        'Images_outlier': n_outlier,
                        'Process_time': t,
                        'Process_timestamp': datetime.fromtimestamp(end),
                        'User': user,
                        'subdir_id': roll + '_' + subdirectory
                    }, index=[0])
                    df_result = pd.concat([df_result, df_result_], axis=0, ignore_index=True)
                    if os.path.exists(logDir + 'process_log.csv'):
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        df_update = pd.concat([df_log, df_result], axis=0, ignore_index=True)
                        df_update.to_csv(logDir + 'process_log.csv', index=False)
                    else:
                        if len(df_result) > 0:
                            df_result.to_csv(logDir + 'process_log.csv', index=False)
                    
                    #Backup 'process_log' (10% of the time)
                    if randrange(10) == 7:
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")
                        os.makedirs(logDir + 'backups/', exist_ok=True)
                        df_log.to_csv(logDir + 'backups/' + 'process_log-' + datetime_str + '.csv', index=False)
                    
                    #Move to '04_processed'
                    #print("Moving images to '04_processed'")
                    move_images(old_dir=processingDir, new_dir=processedDir, roll=roll, subdir=subdirectory, copy_to_other_drive=move_to_U)
                    
                    stop_condition_counter += 1
    
    else:
        #Wait
        print('Wait ' + str(wait) + ' min')
        time.sleep(wait*60)

        
    #Check stop conditions
    if stop_condition_counter == stop_loop_threshold:
        print('Stop!')
        stop_condition = True
                    

Wait 2 min

Processing R014207975/1130-B/ subdirectory...
Processing time for subdirectory: 11.1 min


Processing R014207949/2119-5B/ subdirectory...
Processing time for subdirectory: 15.6 min


Processing R014207978F/352/ subdirectory...
Processing time for subdirectory: 6.4 min


Processing R014207821/3319-15A/ subdirectory...
Processing time for subdirectory: 7.5 min


Processing R014207946/1839-1B/ subdirectory...
Processing time for subdirectory: 9.6 min


Processing R014207970/1327-5A/ subdirectory...
Processing time for subdirectory: 7.8 min


Processing R014207954/2227-18B/ subdirectory...
Processing time for subdirectory: 2.7 min

Wait 2 min

Processing R014207955/2833-50B/ subdirectory...
Processing time for subdirectory: 14.5 min


Processing R014207949/2143-1A/ subdirectory...
Processing time for subdirectory: 11.6 min


Processing R014207965/1649-18B/ subdirectory...
Processing time for subdirectory: 6.0 min


Processing R014207951/2453-1B/ subdirectory...
Processing time 

Processing time for subdirectory: 23.3 min


Processing R014207960/2536-8A/ subdirectory...
Processing time for subdirectory: 15.2 min


Processing R014207907F/493/ subdirectory...
Processing time for subdirectory: 9.1 min


Processing R014207938/932-A/ subdirectory...
Processing time for subdirectory: 6.3 min


Processing R014207929F/486/ subdirectory...
Processing time for subdirectory: 10.4 min


Processing R014207953/2672-9A/ subdirectory...
Processing time for subdirectory: 6.3 min


Processing R014207965/1653-6B/ subdirectory...
Processing time for subdirectory: 2.5 min

Wait 2 min

Processing R014207948/1716-1A/ subdirectory...
Processing time for subdirectory: 10.7 min


Processing R014207842/3290-5A/ subdirectory...
Processing time for subdirectory: 6.2 min


Processing R014207959/2297-7A/ subdirectory...
Processing time for subdirectory: 1.2 min

Wait 2 min
Wait 2 min

Processing R014207930F/620/ subdirectory...
Processing time for subdirectory: 9.5 min


Processing R01420790

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.2 min

Wait 2 min
Wait 2 min
Wait 2 min

Processing R014207938/933-B/ subdirectory...
Processing time for subdirectory: 31.8 min


Processing R014207955/2848-50B/ subdirectory...
Processing time for subdirectory: 13.4 min


Processing R014207967/1435-13B/ subdirectory...
Processing time for subdirectory: 6.4 min


Processing R014207840/3046-13B/ subdirectory...
Processing time for subdirectory: 3.7 min

Wait 2 min
Wait 2 min

Processing R014207966/1205-5A/ subdirectory...
Processing time for subdirectory: 15.9 min


Processing R014207840/3025-50A/ subdirectory...
Processing time for subdirectory: 8.0 min


Processing R014207955/2847-50B/ subdirectory...
Processing time for subdirectory: 6.0 min


Processing R014207946/1852-13A/ subdirectory...
Processing time for subdirectory: 3.3 min


Processing R014207832/3620-43A/ subdirectory...
Processing time for subdirectory: 11.4 min


Processing R014207823/3593-50A/ subdirectory...
Processing time for subdi


Processing R014207958/2819-7A/ subdirectory...
Processing time for subdirectory: 3.9 min

Wait 2 min

Processing R014207957/2628-18B/ subdirectory...
Processing time for subdirectory: 6.7 min


Processing R014207975/1123-12B/ subdirectory...
Processing time for subdirectory: 12.1 min


Processing R014207975/1131-1A/ subdirectory...
Processing time for subdirectory: 4.8 min

Wait 2 min

Processing R014207930F/633/ subdirectory...
Processing time for subdirectory: 7.6 min


Processing R014207943/2051-1A/ subdirectory...
Processing time for subdirectory: 2.7 min


Processing R014207957/2613-18B/ subdirectory...
Processing time for subdirectory: 4.2 min

Wait 2 min

Processing R014207823/3590-50A/ subdirectory...
Processing time for subdirectory: 8.8 min


Processing R014207816/3372-A38/ subdirectory...
Processing time for subdirectory: 0.9 min

Wait 2 min
Wait 2 min

Processing R014207964/1626-18B/ subdirectory...
Processing time for subdirectory: 4.6 min


Processing R014207907F/518/ su

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.4 min

Wait 2 min
Wait 2 min

Processing R014207970/1346-5A/ subdirectory...
Processing time for subdirectory: 8.4 min


Processing R014207962/1484-1B/ subdirectory...
Processing time for subdirectory: 7.7 min


Processing R014207908F/596/ subdirectory...
Processing time for subdirectory: 11.2 min


Processing R014207832/3656-8A/ subdirectory...
Processing time for subdirectory: 9.5 min


Processing R014207824/3153-14A/ subdirectory...
Processing time for subdirectory: 3.9 min


Processing R014207844/2915-43B/ subdirectory...
Processing time for subdirectory: 8.7 min


Processing R014207909F/694/ subdirectory...
Processing time for subdirectory: 6.0 min


Processing R014207947/1897-3A/ subdirectory...
Processing time for subdirectory: 3.0 min


Processing R014207842/3261-50A/ subdirectory...
Processing time for subdirectory: 9.2 min


Processing R014207908F/594/ subdirectory...
Processing time for subdirectory: 10.4 min


Processing R014207938/915-A/

libpng error: Read Error


Processing time for subdirectory: 9.6 min


Processing R014207955/2840-50B/ subdirectory...
Processing time for subdirectory: 6.8 min


Processing R014207823/3603-8A/ subdirectory...
Processing time for subdirectory: 4.9 min


Processing R014207955/2846-50B/ subdirectory...
Processing time for subdirectory: 7.9 min


Processing R014207968/1264-5A/ subdirectory...
Processing time for subdirectory: 11.3 min


Processing R014207930F/678/ subdirectory...
Processing time for subdirectory: 4.4 min

Wait 2 min

Processing R014207816/3424-A50/ subdirectory...
Processing time for subdirectory: 5.3 min

Wait 2 min

Processing R014207958/2798-6A/ subdirectory...
Processing time for subdirectory: 5.6 min


Processing R014207965/1666-18B/ subdirectory...
Processing time for subdirectory: 2.9 min

Wait 2 min

Processing R014207841/3117-8A/ subdirectory...
Processing time for subdirectory: 7.9 min


Processing R014207938/907-A/ subdirectory...
Processing time for subdirectory: 6.1 min


Processing R0


Processing R014207953/2651-15B/ subdirectory...
Processing time for subdirectory: 7.0 min


Processing R014207978F/363/ subdirectory...
Processing time for subdirectory: 14.9 min


Processing R014207943/2062-1A/ subdirectory...
Processing time for subdirectory: 2.7 min


Processing R014207815/3504-19A/ subdirectory...
Processing time for subdirectory: 8.5 min


Processing R014207955/2886-43B/ subdirectory...
Processing time for subdirectory: 7.9 min


Processing R014207949/2163-4B/ subdirectory...
Processing time for subdirectory: 7.6 min


Processing R014207930F/667/ subdirectory...
Processing time for subdirectory: 5.7 min


Processing R014207975/1104-B/ subdirectory...
Processing time for subdirectory: 10.3 min


Processing R014207909F/729/ subdirectory...
Processing time for subdirectory: 8.6 min


Processing R014207978F/357/ subdirectory...
Processing time for subdirectory: 6.2 min


Processing R014207943/2069-3A/ subdirectory...


libpng error: Read Error


Processing time for subdirectory: 10.9 min


Processing R014207951/2427-12A/ subdirectory...
Processing time for subdirectory: 8.7 min


Processing R014207844/2936-43B/ subdirectory...
Processing time for subdirectory: 16.5 min


Processing R014207943/2049-1B/ subdirectory...
Processing time for subdirectory: 5.7 min

Wait 2 min

Processing R014207980/1003-A/ subdirectory...
Processing time for subdirectory: 5.1 min


Processing R014207958/2805-7A/ subdirectory...
Processing time for subdirectory: 2.6 min

Wait 2 min
Wait 2 min

Processing R014207938/944-B/ subdirectory...
Processing time for subdirectory: 10.6 min


Processing R014207940F/401/ subdirectory...
Processing time for subdirectory: 10.3 min


Processing R014207978F/354/ subdirectory...
Processing time for subdirectory: 3.8 min


Processing R014207840/3055-53A-2/ subdirectory...
Processing time for subdirectory: 18.0 min


Processing R014207946/1850-13A/ subdirectory...
Processing time for subdirectory: 4.5 min


Processing 

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.3 min

Wait 2 min
Wait 2 min
Wait 2 min

Processing R014207978F/343/ subdirectory...
Processing time for subdirectory: 4.1 min

Wait 2 min

Processing R014207942/1968-5B/ subdirectory...
Processing time for subdirectory: 6.0 min


Processing R014207948/1743-9/ subdirectory...
Processing time for subdirectory: 5.1 min


Processing R014207909F/727/ subdirectory...
Processing time for subdirectory: 7.3 min


Processing R014207960/2541-8A/ subdirectory...
Processing time for subdirectory: 15.7 min


Processing R014207908F/548/ subdirectory...
Processing time for subdirectory: 10.9 min


Processing R014207951/2441-12A/ subdirectory...
Processing time for subdirectory: 26.5 min


Processing R014207842/3283-5A/ subdirectory...
Processing time for subdirectory: 5.7 min


Processing R014207949/2138-5B/ subdirectory...
Processing time for subdirectory: 10.9 min


Processing R014207815/3541-12A/ subdirectory...
Processing time for subdirectory: 6.4 min


Proces