# Process Subdirectories II

#### Updated: Dec 17, 2022

#  

Run for continuous processing of subdirectories, concurrent with the downloading of subdirectories.

In [1]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime
from random import randrange

In [2]:
import warnings
warnings.filterwarnings('ignore')

Set parameters:

In [3]:
user = 'Rav' #written to the 'User' column of the process log
process_on_VDI = True #True: processed images, results and logs go under rootDir_L; False: under rootDir_local
wait = 2 #in minutes; sleep time before '02_downloaded' is checked again
stop_loop_threshold = 2640 #count of processed subdirectories that triggers the stop condition (guards against an infinite loop)

Set directories:

In [4]:
#Root folders: local drive (C: is not persistent on VDI) and the L: drive
rootDir_local = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/'
rootDir_L = 'L:/DATA/Alouette_I/'

#Downloaded and in-progress images always live under the local root
downloadedDir = rootDir_local + '02_downloaded/'
processingDir = rootDir_local + '03_processing/'

#Outputs go under the L: root when processing on the VDI, otherwise under the local root
move_to_L = True if process_on_VDI else False
processedDir, resultDir, logDir = (
    (rootDir_L if move_to_L else rootDir_local) + folder
    for folder in ('04_processed/', '05_result/', '06_log/')
)

#  

#### Functions:

In [5]:
def move_images(old_dir, new_dir, roll, subdir, copy_to_other_drive=False):
    """Move all files of one subdirectory from ``old_dir`` to ``new_dir``.

    The files in ``old_dir/roll/subdir/`` are transferred to
    ``new_dir/roll/subdir/`` (created if missing). The source subdirectory is
    then deleted, and the source roll directory is removed as well if nothing
    is left in it.

    Parameters
    ----------
    old_dir, new_dir : str
        Source and destination root folders, each ending with '/'.
    roll, subdir : str
        Names of the roll directory and of the subdirectory inside it.
    copy_to_other_drive : bool, default False
        If True the files are copied with ``shutil.copyfile`` (for a
        destination on another drive, where a rename may fail); otherwise
        they are renamed into place with ``os.rename``.
    """
    rollDir = old_dir + roll + '/'
    oldDir = rollDir + subdir + '/'
    newDir = new_dir + roll + '/' + subdir + '/'
    os.makedirs(newDir, exist_ok=True)

    transfer = shutil.copyfile if copy_to_other_drive else os.rename
    for file in os.listdir(oldDir):
        transfer(oldDir + file, newDir + file)

    shutil.rmtree(oldDir)

    # Remove the roll directory only if it is empty. os.rmdir refuses to delete
    # a non-empty directory, so a subdirectory that appears in the roll between
    # a check and the delete can never be wiped (rmtree would have removed it).
    try:
        os.rmdir(rollDir)
    except OSError:
        # A non-empty roll is the expected reason for failure; anything else
        # (directory still empty but not removable) is a real error.
        if len(os.listdir(rollDir)) == 0:
            raise

#  

#### Check if subdirectories are waiting to be processed, then process them:

In [None]:
stop_condition = False
stop_condition_counter = 0

while stop_condition == False:
    if len(os.listdir(downloadedDir)) > 0:
        df_result = pd.DataFrame()
        for roll in os.listdir(downloadedDir):
            if 'R' in roll:
                for subdirectory in os.listdir(downloadedDir + roll):
                    start = time.time()
                    subdir_path_end = roll + '/' + subdirectory + '/'

                    #Move to '03_processing'
                    move_images(old_dir=downloadedDir, new_dir=processingDir, roll=roll, subdir=subdirectory)

                    #Process
                    print('')
                    print('Processing ' + subdir_path_end + ' subdirectory...')
                    !python scan2data/user_input.py $processingDir $resultDir

                    #Consolidate results
                    if os.path.exists(resultDir + 'df_dot.csv'):
                        df_dot = pd.read_csv(resultDir + 'df_dot.csv')
                        n_dot = len(df_dot)
                        df_dot['processed_image_class'] = 'dot'
                        os.remove(resultDir + 'df_dot.csv')
                    else:
                        df_dot = pd.DataFrame()
                        n_dot = 0

                    if os.path.exists(resultDir + 'df_num.csv'):
                        df_num = pd.read_csv(resultDir + 'df_num.csv')
                        n_num = len(df_num)
                        df_num['processed_image_class'] = 'num'
                        os.remove(resultDir + 'df_num.csv')
                    else:
                        df_num = pd.DataFrame()
                        n_num = 0

                    if os.path.exists(resultDir + 'df_loss.csv'):
                        df_loss = pd.read_csv(resultDir + 'df_loss.csv')
                        n_loss = len(df_loss)
                        df_loss['processed_image_class'] = 'loss'
                        os.remove(resultDir + 'df_loss.csv')
                    else:
                        df_loss = pd.DataFrame()
                        n_loss = 0

                    if os.path.exists(resultDir + 'df_outlier.csv'):
                        df_outlier = pd.read_csv(resultDir + 'df_outlier.csv')
                        n_outlier = len(df_outlier)
                        df_outlier['processed_image_class'] = 'outlier'
                        os.remove(resultDir + 'df_outlier.csv')
                    else:
                        df_outlier = pd.DataFrame()
                        n_outlier = 0

                    df_tot = pd.concat([df_dot, df_num, df_loss, df_outlier])
                    if len(df_tot) > 0:
                        df_tot['Roll'] = roll
                        df_tot['Subdirectory'] = subdirectory
                        if 'file_name' in df_tot.columns:
                            df_tot['filename'] = df_tot['file_name'].str.replace(processingDir + roll + '/' + subdirectory, '')
                            df_tot['filename'] = df_tot['filename'].str.replace('\\', '')
                            df_tot['filename'] = df_tot['filename'].str.replace('/', '')
                        else:
                            df_tot['filename'] = 'unknown'
                        df_tot = df_tot.drop(columns=['file_name', 'mapped_coord', 'subdir_name', 'raw', 'ionogram', 'raw_metadata', 
                                                      'trimmed_metadata', 'padded', 'dilated_metadata'], errors='ignore')
                    os.makedirs(resultDir + roll + '/', exist_ok=True)
                    df_tot.to_csv(resultDir + roll + '/' + 'result-' + roll + '_' + subdirectory + '.csv', index=False)

                    end = time.time()
                    t = end - start
                    print('Processing time for subdirectory: ' + str(round(t/60, 1)) + ' min')
                    print('')
                    
                    #Record performance
                    df_result_ = pd.DataFrame({
                        'Roll': roll,
                        'Subdirectory': subdirectory,
                        'Images_processed': n_dot + n_num + n_loss + n_outlier,
                        'Images_dot': n_dot,
                        'Images_num': n_num,
                        'Images_loss': n_loss,
                        'Images_outlier': n_outlier,
                        'Process_time': t,
                        'Process_timestamp': datetime.fromtimestamp(end),
                        'User': user,
                        'subdir_id': roll + '_' + subdirectory
                    }, index=[0])
                    df_result = pd.concat([df_result, df_result_], axis=0, ignore_index=True)
                    if os.path.exists(logDir + 'process_log.csv'):
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        df_update = pd.concat([df_log, df_result], axis=0, ignore_index=True)
                        df_update.to_csv(logDir + 'process_log.csv', index=False)
                    else:
                        if len(df_result) > 0:
                            df_result.to_csv(logDir + 'process_log.csv', index=False)
                    
                    #Backup 'process_log' (10% of the time)
                    if randrange(10) == 7:
                        df_log = pd.read_csv(logDir + 'process_log.csv')
                        datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")
                        os.makedirs(logDir + 'backups/', exist_ok=True)
                        df_log.to_csv(logDir + 'backups/' + 'process_log-' + datetime_str + '.csv', index=False)
                    
                    #Move to '04_processed'
                    #print("Moving images to '04_processed'")
                    move_images(old_dir=processingDir, new_dir=processedDir, roll=roll, subdir=subdirectory, copy_to_other_drive=move_to_L)
                    
                    stop_condition_counter += 1
    
    else:
        #Wait
        print('Wait ' + str(wait) + ' min')
        time.sleep(wait*60)

        
    #Check stop conditions
    if stop_condition_counter == stop_loop_threshold:
        print('Stop!')
        stop_condition = True
                    


Processing R014207823/3592-50A/ subdirectory...
Processing time for subdirectory: 9.9 min


Processing R014207907F/527/ subdirectory...


Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 51, in process_subdirectory
    col_peaks, row_peaks, mapping_Hz, mapping_km = get_grid_mappings(stack) #from grid_mapping
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\ionogram_grid_determination\grid_mapping.py", line 153, in get_grid_mappings
    col_peaks = indices_highest_peaks(weighed_sum, 0)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\ionogram_grid_determination\g

Processing time for subdirectory: 0.3 min

Wait 2 min
Wait 2 min
Wait 2 min
Wait 2 min

Processing R014207978F/313/ subdirectory...
Processing time for subdirectory: 6.5 min


Processing R014207954/2171-1A/ subdirectory...
Processing time for subdirectory: 14.9 min


Processing R014207909F/736/ subdirectory...
Processing time for subdirectory: 3.7 min

Wait 2 min
Wait 2 min

Processing R014207930F/647/ subdirectory...
Processing time for subdirectory: 14.1 min


Processing R014207975/1143-5-A/ subdirectory...
Processing time for subdirectory: 7.7 min


Processing R014207815/3489-15A/ subdirectory...
Processing time for subdirectory: 15.1 min


Processing R014207974/773-B/ subdirectory...
Processing time for subdirectory: 2.8 min

Wait 2 min
Wait 2 min

Processing R014207948/1736-18B/ subdirectory...
Processing time for subdirectory: 5.9 min


Processing R014207953/2674-13B/ subdirectory...
Processing time for subdirectory: 9.0 min


Processing R014207974/780-B/ subdirectory...
Processi


Processing R014207967/1432-13B/ subdirectory...
Processing time for subdirectory: 4.4 min


Processing R014207954/2178-1A/ subdirectory...
Processing time for subdirectory: 13.5 min


Processing R014207841/3068-8A/ subdirectory...
Processing time for subdirectory: 1.2 min

Wait 2 min

Processing R014207840/3056-13B/ subdirectory...
Processing time for subdirectory: 0.8 min

Wait 2 min
Wait 2 min

Processing R014207956/2377-7A/ subdirectory...
Processing time for subdirectory: 5.1 min

Wait 2 min

Processing R014207842/3250-38A/ subdirectory...
Processing time for subdirectory: 4.4 min

Wait 2 min

Processing R014207824/3153-14A/ subdirectory...
Processing time for subdirectory: 4.0 min

Wait 2 min

Processing R014207946/1835-3A/ subdirectory...
Processing time for subdirectory: 3.5 min

Wait 2 min

Processing R014207907F/498/ subdirectory...
Processing time for subdirectory: 5.7 min


Processing R014207815/3519-43A/ subdirectory...
Processing time for subdirectory: 13.1 min


Processi

libpng error: Read Error


Processing time for subdirectory: 9.5 min


Processing R014207960/2542-5B/ subdirectory...
Processing time for subdirectory: 13.4 min


Processing R014207840/3009-43B/ subdirectory...
Processing time for subdirectory: 9.6 min


Processing R014207979F/287/ subdirectory...
Processing time for subdirectory: 3.0 min

Wait 2 min

Processing R014207930F/661/ subdirectory...
Processing time for subdirectory: 4.9 min

Wait 2 min

Processing R014207840/3058-53A-2/ subdirectory...
Processing time for subdirectory: 10.2 min


Processing R014207951/2451-1B/ subdirectory...
Processing time for subdirectory: 8.0 min


Processing R014207959/2317-3A/ subdirectory...
Processing time for subdirectory: 17.4 min


Processing R014207948/1734-18B/ subdirectory...
Processing time for subdirectory: 7.5 min


Processing R014207929F/480/ subdirectory...
Processing time for subdirectory: 12.9 min


Processing R014207824/3179-14A/ subdirectory...
Processing time for subdirectory: 3.4 min

Wait 2 min

Processing R


Processing R014207955/2832-50B/ subdirectory...
Processing time for subdirectory: 9.7 min


Processing R014207960/2573-8A/ subdirectory...
Processing time for subdirectory: 2.1 min


Processing R014207970/1332-18B/ subdirectory...
Processing time for subdirectory: 4.9 min

Wait 2 min

Processing R014207844/2925-43B/ subdirectory...
Processing time for subdirectory: 12.4 min


Processing R014207930F/648/ subdirectory...
Processing time for subdirectory: 13.2 min


Processing R014207979F/285/ subdirectory...
Processing time for subdirectory: 5.8 min


Processing R014207815/3514-14A/ subdirectory...
Processing time for subdirectory: 11.0 min


Processing R014207908F/548/ subdirectory...
Processing time for subdirectory: 10.9 min


Processing R014207956/2394-1B/ subdirectory...
Processing time for subdirectory: 7.9 min


Processing R014207947/1918-4/ subdirectory...
Processing time for subdirectory: 1.6 min

Wait 2 min
Wait 2 min

Processing R014207958/2786-2RR/ subdirectory...
Processing

Traceback (most recent call last):
  File "c:\DevSoftware\Anaconda38\lib\site-packages\pandas\core\indexes\base.py", line 3080, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'trimmed_metadata'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\DevSoftware\Anaconda38\lib\site-packages\pandas\core\generic.py", line 3826, in _set_item
    loc = self._info_axis.get_loc(key)
  File "c:\DevSoftware\Anaconda38\lib\site-packages\pandas\core\indexes\base.py", line 3082, in get_loc
    raise KeyError(key) from err
KeyError

Processing time for subdirectory: 0.2 min

Wait 2 min
Wait 2 min

Processing R014207974/741-B/ subdirectory...
Processing time for subdirectory: 9.3 min


Processing R014207964/1567-1B/ subdirectory...
Processing time for subdirectory: 8.6 min


Processing R014207823/3604-8A/ subdirectory...
Processing time for subdirectory: 7.3 min


Processing R014207956/2358-7A/ subdirectory...
Processing time for subdirectory: 5.4 min


Processing R014207975/1122-12B/ subdirectory...
Processing time for subdirectory: 6.6 min


Processing R014207844/2926-43B/ subdirectory...
Processing time for subdirectory: 16.3 min


Processing R014207940F/400/ subdirectory...
Processing time for subdirectory: 6.4 min


Processing R014207908F/582/ subdirectory...
Processing time for subdirectory: 14.3 min


Processing R014207947/1896-9B/ subdirectory...
Processing time for subdirectory: 5.0 min


Processing R014207844/2911-43B/ subdirectory...
Processing time for subdirectory: 12.4 min


Processing R014207978F/311

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 7.9 min


Processing R014207951/2416-9A/ subdirectory...
Processing time for subdirectory: 12.7 min


Processing R014207953/2648-1A/ subdirectory...
Processing time for subdirectory: 9.5 min


Processing R014207960/2541-8A/ subdirectory...
Processing time for subdirectory: 15.4 min


Processing R014207938/952-A/ subdirectory...
Processing time for subdirectory: 5.6 min


Processing R014207975/1118-12B/ subdirectory...
Processing time for subdirectory: 7.1 min


Processing R014207942/1954-5A/ subdirectory...
Processing time for subdirectory: 3.6 min


Processing R014207958/2768-4A/ subdirectory...
Processing time for subdirectory: 3.0 min

Wait 2 min

Processing R014207840/3061-08A/ subdirectory...
Processing time for subdirectory: 5.4 min


Processing R014207953/2704-13B/ subdirectory...
Processing time for subdirectory: 6.7 min


Processing R014207958/2793-6A/ subdirectory...
Processing time for subdirectory: 2.5 min

Wait 2 min
Wait 2 min

Processing

Traceback (most recent call last):
  File "scan2data/user_input.py", line 35, in <module>
    main()
  File "scan2data/user_input.py", line 25, in main
    process_directory.process_extract_management(dir_csv_output, master_dir, regex_raw, sample_subdir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 323, in process_extract_management
    df_processed, df_loss, df_outlier = process_subdirectory(sample_subdir, regex_raw)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\process_directory.py", line 69, in process_subdirectory
    df_img_bottom, df_loss_meta_bottom, dict_mapping_bottom, dict_hist_bottom = get_bottomside_metadata(df_img_bottom, subdir_path) #from metadata_translation.translate_leftside_metadata
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_extract\scan2data\metadata_translation\translate_leftside_metadata.py", line 157, in get_bottomside_metadata
    df_dot_subset = df_img[np.array(df_img['is_d

Processing time for subdirectory: 0.2 min

Wait 2 min
Wait 2 min
Wait 2 min

Processing R014207943/2071-5B/ subdirectory...
Processing time for subdirectory: 6.6 min


Processing R014207947/1905-6A/ subdirectory...
Processing time for subdirectory: 1.5 min

Wait 2 min
Wait 2 min

Processing R014207929F/440/ subdirectory...
Processing time for subdirectory: 13.6 min


Processing R014207964/1609-4A/ subdirectory...
Processing time for subdirectory: 8.4 min


Processing R014207908F/596/ subdirectory...
Processing time for subdirectory: 10.9 min


Processing R014207929F/434/ subdirectory...
Processing time for subdirectory: 12.8 min


Processing R014207842/3263-43A/ subdirectory...
Processing time for subdirectory: 6.2 min


Processing R014207968/1223-5A/ subdirectory...
Processing time for subdirectory: 14.8 min


Processing R014207975/1101-B/ subdirectory...
Processing time for subdirectory: 9.6 min


Processing R014207953/2669-15B/ subdirectory...
Processing time for subdirectory: 11.5 