# Check for Outstanding Subdirectories to Process

#### Updated: May 17, 2023

#  

After the completion of the base phase (stage 1) and OCR phase (stage 2) of Alouette processing, check for any outstanding subdirectories to process. 

In [1]:
import os
import pandas as pd

In [2]:
# Root of the batch being checked, and the pipeline folders beneath it.
# Every path keeps its trailing '/' so file names can be appended directly.
rootDir = 'L:/DATA/Alouette_I/BATCH_I_Run2/'
processedDir = f'{rootDir}04_processed/'
resultDir = f'{rootDir}05_result/'
logDir = f'{rootDir}06_log/'

#  

#### After stage 1 processing:

Check from process_log:

In [3]:
# Load the stage 1 process log, report how many entries it holds, and
# preview the first few rows.
process_log_path = logDir + 'process_log.csv'
df_process_log = pd.read_csv(process_log_path)
print(df_process_log.shape[0])
df_process_log.head()

2728


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207821,3343-38A,320.0,0.0,314.0,0.0,6.0,195.038564,2023-06-02 21:09:03.663018,Rav_Super2,R014207821_3343-38A
1,R014207942,1985-1A,296.0,0.0,272.0,9.0,15.0,356.434266,2023-06-02 21:15:21.548098,Rav_Super2,R014207942_1985-1A
2,R014207978F,349,437.0,425.0,1.0,10.0,1.0,705.768378,2023-06-02 21:16:49.776073,Rav_Super1,R014207978F_349
3,R014207948,1689-9B,347.0,0.0,313.0,5.0,29.0,433.711368,2023-06-02 21:18:01.641518,Rav_Super5,R014207948_1689-9B
4,R014207823,3598-50A,220.0,0.0,150.0,65.0,5.0,424.822451,2023-06-02 21:18:24.384342,Rav_Super6,R014207823_3598-50A


Look at subdirectories that appear more than once in the process log (duplicate `subdir_id` entries):

In [4]:
# Keep every log entry whose subdir_id occurs more than once (keep=False
# flags all occurrences, not just the repeats), ordered so that repeated
# runs of the same subdirectory sit next to each other by timestamp.
is_repeated = df_process_log.duplicated(subset=['subdir_id'], keep=False)
df_p1_ = df_process_log[is_repeated].sort_values(['subdir_id', 'Process_timestamp'])
print(df_p1_.shape[0])
df_p1_.head(10)

149


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
1967,R014207815,3514-14A,333.0,0.0,290.0,34.0,9.0,886.695919,2023-06-06 21:46:33.730804,Rav_Super12,R014207815_3514-14A
1969,R014207815,3514-14A,333.0,0.0,290.0,34.0,9.0,893.089049,2023-06-06 21:47:20.034317,Rav_Super8,R014207815_3514-14A
1897,R014207821,3365-38A,317.0,0.0,311.0,0.0,6.0,741.998461,2023-06-06 20:57:00.714088,Rav_HP1,R014207821_3365-38A
1904,R014207821,3365-38A,317.0,0.0,311.0,0.0,6.0,800.167283,2023-06-06 21:02:09.606330,Rav_Super15,R014207821_3365-38A
2121,R014207823,3577-19A,332.0,0.0,322.0,1.0,9.0,1227.748486,2023-06-06 23:44:17.650200,Rav_HP2,R014207823_3577-19A
2135,R014207823,3577-19A,332.0,0.0,322.0,1.0,9.0,1252.620257,2023-06-06 23:53:31.879150,Rav_Super15,R014207823_3577-19A
2510,R014207824,3154-13B,335.0,0.0,296.0,35.0,4.0,260.058473,2023-06-07 04:43:29.823125,Rav_HP3,R014207824_3154-13B
2515,R014207824,3154-13B,335.0,0.0,296.0,35.0,4.0,287.72744,2023-06-07 04:45:32.105304,Rav_Super10,R014207824_3154-13B
2698,R014207832,3618-43-2,298.0,0.0,270.0,3.0,25.0,1263.808501,2023-06-07 07:11:08.925006,Rav_Super16,R014207832_3618-43-2
2701,R014207832,3618-43-2,298.0,0.0,270.0,3.0,25.0,1269.098681,2023-06-07 07:12:26.765113,Rav_Super14,R014207832_3618-43-2


Find any subdirectories that have not been processed:

In [5]:
# Load the image inventory, report its row count, and preview the first
# few rows.
inventory_path = logDir + 'image_inventory.csv'
df_inventory = pd.read_csv(inventory_path)
print(df_inventory.shape[0])
df_inventory.head()

2638


Unnamed: 0,Directory,Subdirectory,images,subdir_id
0,R014207815,3488-15A,273,R014207815_3488-15A
1,R014207815,3489-15A,281,R014207815_3489-15A
2,R014207815,3490-15A,198,R014207815_3490-15A
3,R014207815,3491-8A,289,R014207815_3491-8A
4,R014207815,3492-8A,334,R014207815_3492-8A


In [6]:
# Every subdir_id listed in the inventory.
subdir_ids_tot = df_inventory['subdir_id'].unique()

# Collapse the process log to one row per subdir_id, keeping the entry with
# the latest Process_timestamp.
df_p1 = (
    df_process_log
    .sort_values('Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
subdir_ids_proc = df_p1['subdir_id'].unique()

# Inventory entries that never show up in the process log.
subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_proc))
subdir_ids_rem

[]

#  

Check against the result files saved in `resultDir`:

In [7]:
# Build df_result: one row per 'result-*' file found under each directory of
# resultDir, recording how many rows that file holds (0 for an empty file).
result_columns = ['Directory', 'Subdirectory', 'images_result', 'subdir_id']
result_rows = []
i = 0
for directory in os.listdir(resultDir):
    directory_path = resultDir + directory + '/'
    # Only descend into real directories: a stray file whose name happens to
    # contain 'R' would otherwise make the inner os.listdir() raise.
    if 'R' not in directory or not os.path.isdir(directory_path):
        continue
    for file2 in os.listdir(directory_path):
        if 'result-' not in file2:
            continue
        # The subdirectory name is the second '_'-separated part of the file
        # name with '.csv' removed.
        # NOTE(review): a subdirectory name containing '_' would be cut
        # short here -- confirm that none exist.
        subdirectory = file2.split('_')[1].replace('.csv', '')
        try:
            n = len(pd.read_csv(directory_path + file2, sep=','))
        except pd.errors.EmptyDataError:
            # A completely empty result file counts as zero images.
            n = 0
        result_rows.append({
            'Directory': directory,
            'Subdirectory': subdirectory,
            'images_result': n,
            'subdir_id': directory + '_' + subdirectory,
        })
        i += 1
        if i % 100 == 0:
            print(str(i) + ': Now inspecting...' + directory + '/' + subdirectory)

# Assemble the frame once at the end instead of concatenating inside the
# loop (which re-copies the growing frame on every iteration). Passing the
# column names keeps the expected columns even when nothing was found.
df_result = pd.DataFrame(result_rows, columns=result_columns)

100: Now inspecting...R014207816/3407-143
200: Now inspecting...R014207823/3567-43A
300: Now inspecting...R014207824/3187-14A
400: Now inspecting...R014207840/3047-53A-2-RR
500: Now inspecting...R014207842/3267-A3
600: Now inspecting...R014207907F/289
700: Now inspecting...R014207908F/586
800: Now inspecting...R014207929F/452
900: Now inspecting...R014207930F/677
1000: Now inspecting...R014207939/882-A
1100: Now inspecting...R014207942/1945-5A
1200: Now inspecting...R014207943/2107-5B
1300: Now inspecting...R014207947/1906-5B
1400: Now inspecting...R014207949/2126-5B
1500: Now inspecting...R014207951/2466-8A
1600: Now inspecting...R014207954/2206-18B
1700: Now inspecting...R014207956/2368-1B
1800: Now inspecting...R014207958/2770-12A
1900: Now inspecting...R014207959/2332-7A
2000: Now inspecting...R014207962/1471-1B
2100: Now inspecting...R014207965/1634-18B
2200: Now inspecting...R014207966/1193-5-A
2300: Now inspecting...R014207968/1235-3A
2400: Now inspecting...R014207974/749-A
2500

In [12]:
# Report how many result files were found and show a random sample of 10.
print(df_result.shape[0])
df_result.sample(n=10)

2638


Unnamed: 0,Directory,Subdirectory,images_result,subdir_id
633,R014207907F,519,398,R014207907F_519
2485,R014207975,1135-B,377,R014207975_1135-B
1092,R014207942,1938-5B,316,R014207942_1938-5B
1168,R014207943,2075-5B,332,R014207943_2075-5B
1788,R014207957,2640-18B,324,R014207957_2640-18B
2071,R014207964,1606-1B,334,R014207964_1606-1B
827,R014207929F,482,403,R014207929F_482
1762,R014207957,2613-18B,324,R014207957_2613-18B
1037,R014207940F,382,437,R014207940F_382
2420,R014207974,770-A,371,R014207974_770-A


In [13]:
# Compare the inventory against the result files: anything listed in the
# inventory without a matching result file is still outstanding.
subdir_ids_tot = df_inventory['subdir_id'].unique()
subdir_ids_res = df_result['subdir_id'].unique()
subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_res))
subdir_ids_rem

[]

In [14]:
# Attach the result-file row counts to the de-duplicated process log. The
# left join keeps every log row; rows without a matching result file get
# NaN in images_result.
df_p1_res = pd.merge(
    df_p1,
    df_result,
    how='left',
    on=['Directory', 'Subdirectory', 'subdir_id'],
)
print(df_p1_res.shape[0])
df_p1_res.head()

2639


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,images_result
0,R014207821,3343-38A,320.0,0.0,314.0,0.0,6.0,195.038564,2023-06-02 21:09:03.663018,Rav_Super2,R014207821_3343-38A,320.0
1,R014207942,1985-1A,296.0,0.0,272.0,9.0,15.0,356.434266,2023-06-02 21:15:21.548098,Rav_Super2,R014207942_1985-1A,296.0
2,R014207978F,349,437.0,425.0,1.0,10.0,1.0,705.768378,2023-06-02 21:16:49.776073,Rav_Super1,R014207978F_349,437.0
3,R014207948,1689-9B,347.0,0.0,313.0,5.0,29.0,433.711368,2023-06-02 21:18:01.641518,Rav_Super5,R014207948_1689-9B,347.0
4,R014207823,3598-50A,220.0,0.0,150.0,65.0,5.0,424.822451,2023-06-02 21:18:24.384342,Rav_Super6,R014207823_3598-50A,220.0


In [15]:
# Flag subdirectories where the log reports no processed images, where the
# result file has no rows, and where either of the two holds.
log_is_zero = df_p1_res['Images_processed'] == 0
result_is_zero = df_p1_res['images_result'] == 0

df1 = df_p1_res.loc[log_is_zero]
df2 = df_p1_res.loc[result_is_zero]
df3 = df_p1_res.loc[log_is_zero | result_is_zero]
for frame in (df1, df2, df3):
    print(len(frame))
df3.head()

104
104
104


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,images_result
5,R014207824,3171-14A,0.0,0.0,0.0,0.0,0.0,594.142107,2023-06-02 21:19:37.236785,Rav_Super3,R014207824_3171-14A,0.0
6,R014207940F,413,0.0,0.0,0.0,0.0,0.0,167.884712,2023-06-02 21:19:48.954696,Rav_Super1,R014207940F_413,0.0
7,R014207960,2570-18B,0.0,0.0,0.0,0.0,0.0,164.801753,2023-06-02 21:20:59.767172,Rav_Super5,R014207960_2570-18B,0.0
10,R014207970,1335-5B,0.0,0.0,0.0,0.0,0.0,31.692768,2023-06-02 21:22:39.822443,Rav_Super8,R014207970_1335-5B,0.0
11,R014207841,3124-8A,0.0,0.0,0.0,0.0,0.0,425.005448,2023-06-02 21:22:40.182444,Rav_Super2,R014207841_3124-8A,0.0


#  

Check against the `mapped_coords` files saved under `resultDir`:

In [16]:
# Build df_mc: one row per subdirectory found under
# resultDir/mapped_coords/<directory>/, recording how many entries in that
# subdirectory have 'mapped_coords' in their name.
mc_columns = ['Directory', 'Subdirectory', 'mapped_coords_result', 'subdir_id']
mc_rows = []
i = 0
mapped_coords_root = resultDir + 'mapped_coords/'
for directory in os.listdir(mapped_coords_root):
    directory_path = mapped_coords_root + directory + '/'
    # Only descend into real directories: a stray file whose name happens to
    # contain 'R' would otherwise make the inner os.listdir() raise.
    if 'R' not in directory or not os.path.isdir(directory_path):
        continue
    for subdirectory in os.listdir(directory_path):
        subdirectory_path = directory_path + subdirectory + '/'
        # Same guard one level down: skip anything that is not a directory.
        if not os.path.isdir(subdirectory_path):
            continue
        n = sum('mapped_coords' in file3 for file3 in os.listdir(subdirectory_path))
        mc_rows.append({
            'Directory': directory,
            'Subdirectory': subdirectory,
            'mapped_coords_result': n,
            'subdir_id': directory + '_' + subdirectory,
        })
        i += 1
        if i % 100 == 0:
            print(str(i) + ': Now inspecting...' + directory + '/' + subdirectory)

# Assemble the frame once at the end instead of concatenating inside the
# loop (which re-copies the growing frame on every iteration). Passing the
# column names keeps the expected columns even when nothing was found.
df_mc = pd.DataFrame(mc_rows, columns=mc_columns)

100: Now inspecting...R014207816/3407-143
200: Now inspecting...R014207823/3567-43A
300: Now inspecting...R014207824/3187-14A
400: Now inspecting...R014207840/3047-53A-2-RR
500: Now inspecting...R014207842/3267-A3
600: Now inspecting...R014207907F/289
700: Now inspecting...R014207908F/586
800: Now inspecting...R014207929F/452
900: Now inspecting...R014207930F/677
1000: Now inspecting...R014207939/882-A
1100: Now inspecting...R014207942/1945-5A
1200: Now inspecting...R014207943/2107-5B
1300: Now inspecting...R014207947/1906-5B
1400: Now inspecting...R014207949/2126-5B
1500: Now inspecting...R014207951/2466-8A
1600: Now inspecting...R014207954/2206-18B
1700: Now inspecting...R014207956/2368-1B
1800: Now inspecting...R014207958/2770-12A
1900: Now inspecting...R014207959/2332-7A
2000: Now inspecting...R014207962/1471-1B
2100: Now inspecting...R014207965/1634-18B
2200: Now inspecting...R014207966/1193-5-A
2300: Now inspecting...R014207968/1235-3A
2400: Now inspecting...R014207974/749-A
2500

In [17]:
# Report how many mapped_coords subdirectories were found and show a random
# sample of 10.
print(df_mc.shape[0])
df_mc.sample(n=10)

2638


Unnamed: 0,Directory,Subdirectory,mapped_coords_result,subdir_id
2454,R014207975,1104-B,348,R014207975_1104-B
813,R014207929F,467,352,R014207929F_467
514,R014207842,3282-5A,267,R014207842_3282-5A
846,R014207930F,621,354,R014207930F_621
2142,R014207965,1677-13B,301,R014207965_1677-13B
295,R014207824,3183-14A,290,R014207824_3183-14A
551,R014207844,2899-43B,309,R014207844_2899-43B
1371,R014207948,1738-6,303,R014207948_1738-6
1265,R014207947,1872-1B,274,R014207947_1872-1B
667,R014207908F,554,412,R014207908F_554


In [18]:
# Attach the mapped_coords file counts to the de-duplicated process log. The
# left join keeps every log row; rows without a matching mapped_coords
# subdirectory get NaN in mapped_coords_result.
df_p1_mc = pd.merge(
    df_p1,
    df_mc,
    how='left',
    on=['Directory', 'Subdirectory', 'subdir_id'],
)
print(df_p1_mc.shape[0])
df_p1_mc.head()

2639


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,mapped_coords_result
0,R014207821,3343-38A,320.0,0.0,314.0,0.0,6.0,195.038564,2023-06-02 21:09:03.663018,Rav_Super2,R014207821_3343-38A,314.0
1,R014207942,1985-1A,296.0,0.0,272.0,9.0,15.0,356.434266,2023-06-02 21:15:21.548098,Rav_Super2,R014207942_1985-1A,272.0
2,R014207978F,349,437.0,425.0,1.0,10.0,1.0,705.768378,2023-06-02 21:16:49.776073,Rav_Super1,R014207978F_349,426.0
3,R014207948,1689-9B,347.0,0.0,313.0,5.0,29.0,433.711368,2023-06-02 21:18:01.641518,Rav_Super5,R014207948_1689-9B,313.0
4,R014207823,3598-50A,220.0,0.0,150.0,65.0,5.0,424.822451,2023-06-02 21:18:24.384342,Rav_Super6,R014207823_3598-50A,150.0


In [19]:
# Flag subdirectories where the log reports no processed images, where no
# mapped_coords files were counted, where the count is missing altogether
# (no match in the merge), and where either of the first two holds.
log_is_zero = df_p1_mc['Images_processed'] == 0
mc_is_zero = df_p1_mc['mapped_coords_result'] == 0
mc_is_missing = df_p1_mc['mapped_coords_result'].isna()

df1 = df_p1_mc.loc[log_is_zero]
df2 = df_p1_mc.loc[mc_is_zero]
df2a = df_p1_mc.loc[mc_is_missing]
df3 = df_p1_mc.loc[log_is_zero | mc_is_zero]
for frame in (df1, df2, df2a, df3):
    print(len(frame))
df3

104
105
1
105


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,mapped_coords_result
5,R014207824,3171-14A,0.0,0.0,0.0,0.0,0.0,594.142107,2023-06-02 21:19:37.236785,Rav_Super3,R014207824_3171-14A,0.0
6,R014207940F,413,0.0,0.0,0.0,0.0,0.0,167.884712,2023-06-02 21:19:48.954696,Rav_Super1,R014207940F_413,0.0
7,R014207960,2570-18B,0.0,0.0,0.0,0.0,0.0,164.801753,2023-06-02 21:20:59.767172,Rav_Super5,R014207960_2570-18B,0.0
10,R014207970,1335-5B,0.0,0.0,0.0,0.0,0.0,31.692768,2023-06-02 21:22:39.822443,Rav_Super8,R014207970_1335-5B,0.0
11,R014207841,3124-8A,0.0,0.0,0.0,0.0,0.0,425.005448,2023-06-02 21:22:40.182444,Rav_Super2,R014207841_3124-8A,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1022,R014207907F,301,0.0,0.0,0.0,0.0,0.0,637.266533,2023-06-06 01:02:22.826809,Rav_HP1,R014207907F_301,0.0
1510,R014207979F,302,0.0,0.0,0.0,0.0,0.0,197.839377,2023-06-06 16:22:37.158074,Rav_Super9,R014207979F_302,0.0
1970,R014207907F,527,0.0,0.0,0.0,0.0,0.0,122.154753,2023-06-06 21:59:59.938344,Rav_HP4,R014207907F_527,0.0
1983,R014207842,3259-50A,0.0,0.0,0.0,0.0,0.0,221.647546,2023-06-06 22:09:10.679185,Rav_HP3,R014207842_3259-50A,0.0


In [20]:
# Log rows whose Images_processed is not 0 (NaN counts as "not 0") but that
# have no mapped_coords count at all.
images_nonzero = df_p1_mc['Images_processed'].ne(0)
mc_is_missing = df_p1_mc['mapped_coords_result'].isna()
df4 = df_p1_mc.loc[images_nonzero & mc_is_missing]
print(df4.shape[0])
df4

1


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,mapped_coords_result
2638,B,,,,,,,,,,,


#  

#### After stage 2 processing:

Check from process_log_OCR:

In [22]:
# Load the stage 2 (OCR) process log, report how many entries it holds, and
# preview the first few rows.
process_log_ocr_path = logDir + 'process_log_OCR.csv'
df_process_log_OCR = pd.read_csv(process_log_ocr_path)
print(df_process_log_OCR.shape[0])
df_process_log_OCR.head()

135


Unnamed: 0,Directory,Subdirectory,Process_time,Process_timestamp,User,subdir_id
0,R014207827,3822-43A,990.21604,2023-05-15 18:06:19.824654,Rav Super1,R014207827_3822-43A
1,R014207820,3000-43A,1062.28344,2023-05-15 18:24:02.150093,Rav Super1,R014207820_3000-43A
2,R014207835,4060-19,940.294587,2023-05-15 18:39:42.472678,Rav Super1,R014207835_4060-19
3,R014207963,1544-1B,814.29774,2023-05-15 18:53:16.795418,Rav Super1,R014207963_1544-1B
4,R014207711,39,384.984926,2023-05-15 18:59:41.800343,Rav Super1,R014207711_39


In [23]:
# Load the inventory of processed images, report its row count, and preview
# the first few rows.
inventory_proc_path = logDir + 'image_inventory_processed.csv'
df_inventory_proc = pd.read_csv(inventory_proc_path)
print(df_inventory_proc.shape[0])
df_inventory_proc.head()

1977


Unnamed: 0,Directory,Subdirectory,images,subdir_id
0,R014207711,24,256,R014207711_24
1,R014207711,25,499,R014207711_25
2,R014207711,27,96,R014207711_27
3,R014207711,28,103,R014207711_28
4,R014207711,29,341,R014207711_29


In [24]:
# Every subdir_id listed in the processed-image inventory.
subdir_ids_tot = df_inventory_proc['subdir_id'].unique()

# Collapse the OCR process log to one row per subdir_id, keeping the entry
# with the latest Process_timestamp.
# NOTE: this rebinds df_p1, which earlier held the stage 1 log.
df_p1 = (
    df_process_log_OCR
    .sort_values('Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
subdir_ids_proc = df_p1['subdir_id'].unique()

# Inventory entries that never show up in the OCR process log.
subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_proc))
subdir_ids_rem

['R014207819_4764-50',
 'R014207837_4011-50A',
 'R014207838_4566-37',
 'R014207818_4215-69-2',
 'R014207963_1540-1B',
 'R014207837_4020-53A',
 'R014207817_4194-12A',
 'R014207811_5075',
 'R014207831_3935-43A',
 'R014207952_2728-13B',
 'R028224481_4884',
 'R014207822_3432-50A',
 'R014207961_1070-B',
 'R014207945_2252-3A',
 'R014207818_4220-69',
 'R014207819_4751-08',
 'R014207952_2766-4A',
 'R014207819_4784-69',
 'R014207828_4427-69A-2',
 'R014207963_1551-5A',
 'R014207834_4496-43A',
 'R014207818_4216-69A-2',
 'R014207819_4763-50',
 'R014207838_4529-43',
 'R014207835_4037-53',
 'R014207831_3947-5A',
 'R014207952_2749-4A',
 'R014207827_3794-69A-2',
 'R014207833_4354-50A',
 'R014207822_3445-15A',
 'R014207961_1031-A',
 'R014207813_5047',
 'R014207822_3467-19A-4',
 'R014207835_4084-38',
 'R028224481_4867-50',
 'R014207825_3737-15A',
 'R014207829_4318-8',
 'R014207944_2029-13B',
 'R014207819_4791-69',
 'R014207813_4987',
 'R014207827_3825-43A',
 'R014207827_3829-50A',
 'R014207820_2998-43A'