# Process Subdirectories

#### Updated: Dec 5, 2022

#  

Inspect '02_downloaded' folder for fully downloaded subdirectories, move these subdirectories to '03_processing'. Process subdirectories. Record processing performance in a 'process_log'. Then move processed subdirectories to '04_processed' folder. 

In [1]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
rootDir_local = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/' #C: is not persistent on VDI
rootDir_U = rootDir_local #'U:/Data_Science/Projects_data/Alouette_I/'
downloadedDir = rootDir_local + '02_downloaded/'
processingDir = rootDir_local + '03_processing/'
processedDir = rootDir_U + '04_processed/'
resultDir = rootDir_U + '05_result/'

#  

#### Functions:

In [4]:
def move_images(old_dir, new_dir, roll, subdir):
    oldDir = old_dir + roll + '/' + subdir + '/'
    newDir = new_dir + roll + '/' + subdir + '/'
    os.makedirs(newDir, exist_ok=True)
    for file in os.listdir(oldDir):
        os.rename(oldDir+file, newDir+file)
    shutil.rmtree(old_dir + roll + '/')

#  

#### Process new fully downloaded subdirectories:

Move one fully downloaded subdirectory from '02_downloaded' to '03_processing', process, then move images to '04_processing' - do this one subdirectory at a time. 

In [5]:
for roll in os.listdir(downloadedDir):
    if 'R' in roll:
        for subdirectory in os.listdir(downloadedDir + roll):
            start = time.time()
            subdir_path_end = roll + '/' + subdirectory + '/'
            
            #Move to '03_processing'
            move_images(old_dir=downloadedDir, new_dir=processingDir, roll=roll, subdir=subdirectory)
            
            #Process
            print('Processing ' + subdir_path_end + ' subdirectory...')
            !python scan2data/user_input.py $processingDir $resultDir
            
            #Move to '04_processed'
            print("Moving images to '04_processed'")
            move_images(old_dir=processingDir, new_dir=processedDir, roll=roll, subdir=subdirectory)

            end = time.time()
            t = end - start
            print('Processing time for subdirectory: ' + str(round(t/60, 1)) + ' min')

Processing R014207815/3488-15A/ subdirectory...
Moving images to '04_processed'
Processing time for subdirectory: 18.4 min


Traceback (most recent call last):
  File "scan2data/user_input.py", line 45, in <module>
    main()
  File "scan2data/user_input.py", line 29, in main
    process_directory.process_extract_management(dir_csv_output,
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_I\code\Alouette_extract\scan2data\process_directory.py", line 296, in process_extract_management
    df_num_subset = process_df_bottomside_metadata(processed, subdir_name, master_dir)
  File "C:\Users\rnaidoo\Documents\Projects\Alouette_I\code\Alouette_extract\scan2data\process_directory.py", line 176, in process_df_bottomside_metadata
    df_final_data = df_processed[['file_name', 'fmin', 'max_depth', 'dict_metadata', 'mapped_coord']]
  File "c:\DevSoftware\Anaconda38\lib\site-packages\pandas\core\frame.py", line 3030, in __getitem__
    indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
  File "c:\DevSoftware\Anaconda38\lib\site-packages\pandas\core\indexing.py", line 1266, in _get_listlike_in

Processing on CSA laptop --> 18.4 min

#  

#### Development:

In [None]:
rollnames = []
for file in os.listdir(dataDir):
    if 'R' in file:
        rollnames.append(file)

In [None]:
if len(rollnames) == 1:
    start = time.time()
    !python scan2data/user_input.py $dataDir $outputDir
    end = time.time()
    t = end - start
    print('Runtime: ' + str(round(t/60, 1)) + ' min')

Record performance:

In [None]:
df_num = pd.read_csv(outputDir + 'num_data.csv')
df_loss = pd.read_csv(outputDir + 'loss.csv')
df_outlier = pd.read_csv(outputDir + 'outlier.csv')
n_num = len(df_num)
n_loss = len(df_loss)
n_outlier = len(df_outlier)
images_processed = n_num + n_loss + n_outlier
print('Process Rate: ' + str(round(images_processed/t, 1)) + ' images/s')

In [None]:
df_result = pd.DataFrame({
    'Roll': rollnames[0],
    'Processing_Time': t,
    'Images_processed': images_processed,
    'Process_timestamp': datetime.fromtimestamp(end)
}, index=[0])

In [None]:
if os.path.exists(outputDir + 'process_log.csv'):
    df_log = pd.read_csv(outputDir + 'process_log.csv')
    df_update = pd.concat([df_log, df_result], axis=0, ignore_index=True)
    df_update.to_csv(outputDir + 'process_log.csv', index=False)
else:
    df_result.to_csv(outputDir + 'process_log.csv', index=False)

#  

Organize numpy arrays into folders by roll and subdirectory:

In [None]:
for file in os.listdir(outputDir):
    if 'mapped_coords' in file:
        fn = file.replace('mapped_coords-', '')
        fn = fn.replace('.npy', '')
        fn_parts = fn.split('_')
        newDir = outputDir + fn_parts[0] + '/' + fn_parts[1] + '/'
        os.makedirs(newDir, exist_ok=True)
        os.rename(outputDir+file, newDir+file)