# Check for Outstanding Subdirectories to Process

#### Updated: May 17, 2023

#  

After the completion of the base phase (stage 1) and OCR phase (stage 2) of Alouette processing, check for any outstanding subdirectories to process. 

In [1]:
import os
import pandas as pd

In [2]:
# Folder paths for this batch; each stage folder is derived from rootDir.
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run2/'
processedDir = f'{rootDir}04_processed/'
resultDir = f'{rootDir}05_result/'
logDir = f'{rootDir}06_log/'

#  

#### After stage 1 processing:

Check against the process log (`process_log.csv`):

In [3]:
# Read the process log from logDir, then show its row count and first rows.
process_log_csv = logDir + 'process_log.csv'
df_process_log = pd.read_csv(process_log_csv)
print(df_process_log.shape[0])
df_process_log.head()

2487


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207837,4013-50,162.0,0.0,150.0,4.0,8.0,277.122729,2023-05-13 11:13:35.969795,Rav Super3,R014207837_4013-50
1,R014207769,JUNE-28-TO-JULY-6-1965,0.0,0.0,0.0,0.0,0.0,214.129275,2023-05-13 11:13:50.695690,Rav Super5,R014207769_JUNE-28-TO-JULY-6-1965
2,R014207820,2993-43A,16.0,0.0,12.0,0.0,4.0,232.333798,2023-05-13 11:15:58.948288,Rav Super8,R014207820_2993-43A
3,R014207813,5037,339.0,0.0,0.0,278.0,61.0,267.199561,2023-05-13 11:17:12.488731,Rav Super9,R014207813_5037
4,R014207822,3436-50-A,267.0,0.0,257.0,0.0,10.0,541.358854,2023-05-13 11:23:17.637033,Rav Super5,R014207822_3436-50-A


Look at duplicates:

In [4]:
# Select every log row whose subdir_id occurs more than once (keep=False keeps
# all copies), ordered by subdir_id and then Process_timestamp.
is_repeated = df_process_log.duplicated(subset=['subdir_id'], keep=False)
df_p1_ = df_process_log[is_repeated].sort_values(['subdir_id', 'Process_timestamp'])
print(df_p1_.shape[0])
df_p1_.head(10)

131


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
1821,R014207711,33,0.0,0.0,0.0,0.0,0.0,2458.348005,2023-05-14 23:30:26.223723,Rav Super5,R014207711_33
1852,R014207711,33,0.0,0.0,0.0,0.0,0.0,2586.298205,2023-05-15 00:12:28.562922,Rav Super3,R014207711_33
2458,R014207711,67,0.0,0.0,0.0,0.0,0.0,186.983064,2023-05-15 14:01:10.103033,Rav Super5,R014207711_67
2459,R014207711,67,0.0,0.0,0.0,0.0,0.0,159.323204,2023-05-15 14:02:42.043575,Rav Super12,R014207711_67
2445,R014207711,71,0.0,0.0,0.0,0.0,0.0,200.472937,2023-05-15 13:31:20.200963,Rav Super6,R014207711_71
2447,R014207711,71,0.0,0.0,0.0,0.0,0.0,175.142264,2023-05-15 13:34:04.423286,Rav Super12,R014207711_71
2397,R014207711,74,70.0,0.0,4.0,16.0,50.0,802.48196,2023-05-15 12:21:25.651621,Rav Super2,R014207711_74
2402,R014207711,74,70.0,0.0,4.0,16.0,50.0,774.804267,2023-05-15 12:31:26.937965,Rav Super1,R014207711_74
715,R014207769,SEPT-21-TO-SEPT-19-1963-64,0.0,0.0,0.0,0.0,0.0,190.395738,2023-05-14 01:07:08.463160,Rav Super11,R014207769_SEPT-21-TO-SEPT-19-1963-64
717,R014207769,SEPT-21-TO-SEPT-19-1963-64,0.0,0.0,0.0,0.0,0.0,187.095774,2023-05-14 01:08:48.895507,Rav Super12,R014207769_SEPT-21-TO-SEPT-19-1963-64


Find any subdirectories that have not been processed:

In [5]:
# Read the image inventory from logDir, then show its row count and first rows.
inventory_csv = logDir + 'image_inventory.csv'
df_inventory = pd.read_csv(inventory_csv)
print(df_inventory.shape[0])
df_inventory.head()

2416


Unnamed: 0,Directory,Subdirectory,images,subdir_id
0,R014207709,145,53,R014207709_145
1,R014207709,146,63,R014207709_146
2,R014207709,147,50,R014207709_147
3,R014207709,148,16,R014207709_148
4,R014207709,149,28,R014207709_149


In [7]:
# Inventory ids that never appear in the process log.
# df_p1 keeps one row per subdir_id: the one with the greatest Process_timestamp.
subdir_ids_tot = df_inventory['subdir_id'].unique()
df_p1 = (
    df_process_log
    .sort_values('Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
subdir_ids_proc = df_p1['subdir_id'].unique()
subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_proc))
subdir_ids_rem

[]

#  

Check against the result files in `resultDir`:

In [10]:
# Count the rows of every 'result-' CSV found in each directory under resultDir.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: calling pd.concat inside the loop copies the whole growing frame on
# every iteration. Passing the column names explicitly also means df_result
# has the expected columns even when no result file is found.
result_columns = ['Directory', 'Subdirectory', 'images_result', 'subdir_id']
result_rows = []
for directory in os.listdir(resultDir):
    dir_path = os.path.join(resultDir, directory)
    # Same name filter as before ('R' anywhere in the name), plus a directory
    # check so a stray file in resultDir cannot make os.listdir() fail below.
    if 'R' not in directory or not os.path.isdir(dir_path):
        continue
    for file2 in os.listdir(dir_path):
        if 'result-' not in file2:
            continue
        # The subdirectory name is the second '_'-separated part of the
        # file name, with the '.csv' extension removed.
        subdirectory = file2.split('_')[1].replace('.csv', '')
        try:
            n = len(pd.read_csv(os.path.join(dir_path, file2)))
        except pd.errors.EmptyDataError:
            # A completely empty result file counts as zero images.
            n = 0
        result_rows.append({
            'Directory': directory,
            'Subdirectory': subdirectory,
            'images_result': n,
            'subdir_id': directory + '_' + subdirectory,
        })
        # Progress message every 100 result files.
        if len(result_rows) % 100 == 0:
            print(str(len(result_rows)) + ': Now inspecting...' + directory + '/' + subdirectory)

df_result = pd.DataFrame(result_rows, columns=result_columns)

100: Now inspecting...R014207711/63
200: Now inspecting...R014207766/MAR-16-TO-MAR-24-1965
300: Now inspecting...R014207807/4926
400: Now inspecting...R014207808/4607-15
500: Now inspecting...R014207811/5097
600: Now inspecting...R014207817/4176-03
700: Now inspecting...R014207819/4755-15
800: Now inspecting...R014207820/2997-43A
900: Now inspecting...R014207825/3760-38A
1000: Now inspecting...R014207827/3800-50A
1100: Now inspecting...R014207828/4441-50A
1200: Now inspecting...R014207830/3701-43A
1300: Now inspecting...R014207833/4341-03A
1400: Now inspecting...R014207834/4501-43-2
1500: Now inspecting...R014207836/4720-70-2
1600: Now inspecting...R014207838/4521-69A
1700: Now inspecting...R014207839/4142-50
1800: Now inspecting...R014207886/B-2499
1900: Now inspecting...R014207941/1763-1
2000: Now inspecting...R014207944/2046-13A
2100: Now inspecting...R014207950/2507-8-A
2200: Now inspecting...R014207961/1046
2300: Now inspecting...R014207969/1268-13B
2400: Now inspecting...R0282244

In [12]:
# Number of result files found, then a random sample of 10 of them.
print(df_result.shape[0])
df_result.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,images_result,subdir_id
2335,R014207969,1306-18B,344,R014207969_1306-18B
2206,R014207961,1053,317,R014207961_1053
224,R014207766,SEPT-13-TO-OCT-25-1965,0,R014207766_SEPT-13-TO-OCT-25-1965
14,R014207709,C-107-16,0,R014207709_C-107-16
408,R014207808,4616-50,281,R014207808_4616-50
2413,R028224481,4924,508,R028224481_4924
1356,R014207834,4458-15A,255,R014207834_4458-15A
19,R014207709,C-113-50,0,R014207709_C-113-50
1438,R014207835,4061-19A,299,R014207835_4061-19A
1921,R014207941,1786-3B,306,R014207941_1786-3B


In [13]:
# Inventory ids that have no result file.
subdir_ids_tot, subdir_ids_res = (
    df_inventory['subdir_id'].unique(),
    df_result['subdir_id'].unique(),
)
subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_res))
subdir_ids_rem

[]

In [14]:
# Attach the per-subdirectory result row counts to the de-duplicated log.
# Left join: log rows without a matching result file get NaN in images_result.
merge_keys = ['Directory', 'Subdirectory', 'subdir_id']
df_p1_res = pd.merge(df_p1, df_result, how='left', on=merge_keys)
print(df_p1_res.shape[0])
df_p1_res.head()

2417


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,images_result
0,R014207837,4013-50,162.0,0.0,150.0,4.0,8.0,277.122729,2023-05-13 11:13:35.969795,Rav Super3,R014207837_4013-50,162.0
1,R014207769,JUNE-28-TO-JULY-6-1965,0.0,0.0,0.0,0.0,0.0,214.129275,2023-05-13 11:13:50.695690,Rav Super5,R014207769_JUNE-28-TO-JULY-6-1965,0.0
2,R014207820,2993-43A,16.0,0.0,12.0,0.0,4.0,232.333798,2023-05-13 11:15:58.948288,Rav Super8,R014207820_2993-43A,16.0
3,R014207813,5037,339.0,0.0,0.0,278.0,61.0,267.199561,2023-05-13 11:17:12.488731,Rav Super9,R014207813_5037,339.0
4,R014207822,3436-50-A,267.0,0.0,257.0,0.0,10.0,541.358854,2023-05-13 11:23:17.637033,Rav Super5,R014207822_3436-50-A,267.0


In [15]:
# Subdirectories with zero images according to the log (df1), according to the
# result files (df2), or according to either source (df3).
log_is_zero = df_p1_res['Images_processed'].eq(0)
result_is_zero = df_p1_res['images_result'].eq(0)
df1 = df_p1_res.loc[log_is_zero]
df2 = df_p1_res.loc[result_is_zero]
df3 = df_p1_res.loc[log_is_zero | result_is_zero]
for subset in (df1, df2, df3):
    print(len(subset))
df3.head()

439
439
439


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,images_result
1,R014207769,JUNE-28-TO-JULY-6-1965,0.0,0.0,0.0,0.0,0.0,214.129275,2023-05-13 11:13:50.695690,Rav Super5,R014207769_JUNE-28-TO-JULY-6-1965,0.0
12,R014207769,MAR-31-TO-APR-17-1965,0.0,0.0,0.0,0.0,0.0,154.184504,2023-05-13 11:32:34.685801,Rav Super11,R014207769_MAR-31-TO-APR-17-1965,0.0
17,R014207822,3440-50A,0.0,0.0,0.0,0.0,0.0,217.161499,2023-05-13 11:36:34.537185,Rav Super11,R014207822_3440-50A,0.0
23,R014207766,OCT-3-TO-OCT-21-1964,0.0,0.0,0.0,0.0,0.0,170.518515,2023-05-13 11:43:49.565733,Rav Super8,R014207766_OCT-3-TO-OCT-21-1964,0.0
27,R014207843,3197-14A,0.0,0.0,0.0,0.0,0.0,147.763625,2023-05-13 11:48:30.426955,Rav Super3,R014207843_3197-14A,0.0


#  

Check the `mapped_coords` output (number of `mapped_coords` files per subdirectory):

In [16]:
# Count the files whose name contains 'mapped_coords' in every subdirectory
# under resultDir/mapped_coords/<directory>/.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: calling pd.concat inside the loop copies the whole growing frame on
# every iteration. Passing the column names explicitly also means df_mc has
# the expected columns even when nothing is found.
mc_root = resultDir + 'mapped_coords/'
mc_columns = ['Directory', 'Subdirectory', 'mapped_coords_result', 'subdir_id']
mc_rows = []
for directory in os.listdir(mc_root):
    dir_path = os.path.join(mc_root, directory)
    # Same name filter as before ('R' anywhere in the name), plus a directory
    # check so a stray file cannot make os.listdir() fail below.
    if 'R' not in directory or not os.path.isdir(dir_path):
        continue
    for subdirectory in os.listdir(dir_path):
        subdir_path = os.path.join(dir_path, subdirectory)
        if not os.path.isdir(subdir_path):
            continue
        n = sum('mapped_coords' in file3 for file3 in os.listdir(subdir_path))
        mc_rows.append({
            'Directory': directory,
            'Subdirectory': subdirectory,
            'mapped_coords_result': n,
            'subdir_id': directory + '_' + subdirectory,
        })
        # Progress message every 100 subdirectories.
        if len(mc_rows) % 100 == 0:
            print(str(len(mc_rows)) + ': Now inspecting...' + directory + '/' + subdirectory)

df_mc = pd.DataFrame(mc_rows, columns=mc_columns)

100: Now inspecting...R014207711/63
200: Now inspecting...R014207766/MAR-16-TO-MAR-24-1965
300: Now inspecting...R014207807/4926
400: Now inspecting...R014207808/4607-15
500: Now inspecting...R014207811/5097
600: Now inspecting...R014207817/4176-03
700: Now inspecting...R014207819/4755-15
800: Now inspecting...R014207820/2997-43A
900: Now inspecting...R014207825/3760-38A
1000: Now inspecting...R014207827/3800-50A
1100: Now inspecting...R014207828/4441-50A
1200: Now inspecting...R014207830/3701-43A
1300: Now inspecting...R014207833/4341-03A
1400: Now inspecting...R014207834/4501-43-2
1500: Now inspecting...R014207836/4720-70-2
1600: Now inspecting...R014207838/4521-69A
1700: Now inspecting...R014207839/4142-50
1800: Now inspecting...R014207886/B-2499
1900: Now inspecting...R014207941/1763-1
2000: Now inspecting...R014207944/2046-13A
2100: Now inspecting...R014207950/2507-8-A
2200: Now inspecting...R014207961/1046
2300: Now inspecting...R014207969/1268-13B
2400: Now inspecting...R0282244

In [18]:
# Number of mapped_coords subdirectories found, then a random sample of 10.
print(df_mc.shape[0])
df_mc.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,mapped_coords_result,subdir_id
1322,R014207833,4364-50,257,R014207833_4364-50
1533,R014207837,3975-50A,288,R014207837_3975-50A
1623,R014207838,4546-50,259,R014207838_4546-50
1567,R014207837,4009-50A,223,R014207837_4009-50A
933,R014207826,3854-43A,284,R014207826_3854-43A
1693,R014207839,4136-50,306,R014207839_4136-50
629,R014207817,4206-69,239,R014207817_4206-69
1928,R014207941,1794-3B,0,R014207941_1794-3B
270,R014207769,JUNE-8-to-JUNE-17-1965,0,R014207769_JUNE-8-to-JUNE-17-1965
420,R014207810,5110,198,R014207810_5110


In [19]:
# Attach the per-subdirectory mapped_coords file counts to the de-duplicated log.
# Left join: log rows without a matching folder get NaN in mapped_coords_result.
merge_keys = ['Directory', 'Subdirectory', 'subdir_id']
df_p1_mc = pd.merge(df_p1, df_mc, how='left', on=merge_keys)
print(df_p1_mc.shape[0])
df_p1_mc.head()

2417


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,mapped_coords_result
0,R014207837,4013-50,162.0,0.0,150.0,4.0,8.0,277.122729,2023-05-13 11:13:35.969795,Rav Super3,R014207837_4013-50,152.0
1,R014207769,JUNE-28-TO-JULY-6-1965,0.0,0.0,0.0,0.0,0.0,214.129275,2023-05-13 11:13:50.695690,Rav Super5,R014207769_JUNE-28-TO-JULY-6-1965,0.0
2,R014207820,2993-43A,16.0,0.0,12.0,0.0,4.0,232.333798,2023-05-13 11:15:58.948288,Rav Super8,R014207820_2993-43A,12.0
3,R014207813,5037,339.0,0.0,0.0,278.0,61.0,267.199561,2023-05-13 11:17:12.488731,Rav Super9,R014207813_5037,0.0
4,R014207822,3436-50-A,267.0,0.0,257.0,0.0,10.0,541.358854,2023-05-13 11:23:17.637033,Rav Super5,R014207822_3436-50-A,257.0


In [20]:
# Subsets of the merged log:
#   df1  - zero images processed according to the log
#   df2  - zero mapped_coords files
#   df2a - no mapped_coords count at all (NaN after the left join)
#   df3  - zero images processed or zero mapped_coords files
log_is_zero = df_p1_mc['Images_processed'].eq(0)
coords_is_zero = df_p1_mc['mapped_coords_result'].eq(0)
coords_is_missing = df_p1_mc['mapped_coords_result'].isna()
df1 = df_p1_mc.loc[log_is_zero]
df2 = df_p1_mc.loc[coords_is_zero]
df2a = df_p1_mc.loc[coords_is_missing]
df3 = df_p1_mc.loc[log_is_zero | coords_is_zero]
for subset in (df1, df2, df2a, df3):
    print(len(subset))
df3

439
443
1
443


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,mapped_coords_result
1,R014207769,JUNE-28-TO-JULY-6-1965,0.0,0.0,0.0,0.0,0.0,214.129275,2023-05-13 11:13:50.695690,Rav Super5,R014207769_JUNE-28-TO-JULY-6-1965,0.0
3,R014207813,5037,339.0,0.0,0.0,278.0,61.0,267.199561,2023-05-13 11:17:12.488731,Rav Super9,R014207813_5037,0.0
12,R014207769,MAR-31-TO-APR-17-1965,0.0,0.0,0.0,0.0,0.0,154.184504,2023-05-13 11:32:34.685801,Rav Super11,R014207769_MAR-31-TO-APR-17-1965,0.0
17,R014207822,3440-50A,0.0,0.0,0.0,0.0,0.0,217.161499,2023-05-13 11:36:34.537185,Rav Super11,R014207822_3440-50A,0.0
23,R014207766,OCT-3-TO-OCT-21-1964,0.0,0.0,0.0,0.0,0.0,170.518515,2023-05-13 11:43:49.565733,Rav Super8,R014207766_OCT-3-TO-OCT-21-1964,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2377,R014207763,OCT-24-TO-OCT-31-1965,0.0,0.0,0.0,0.0,0.0,187.000449,2023-05-15 13:05:38.157401,Rav Super12,R014207763_OCT-24-TO-OCT-31-1965,0.0
2379,R014207766,FEB-28-TO-MAR-7-1965,0.0,0.0,0.0,0.0,0.0,172.117441,2023-05-15 13:10:50.834574,Rav Super7,R014207766_FEB-28-TO-MAR-7-1965,0.0
2380,R014207711,63,0.0,0.0,0.0,0.0,0.0,151.077174,2023-05-15 13:12:55.531884,Rav Super11,R014207711_63,0.0
2393,R014207711,71,0.0,0.0,0.0,0.0,0.0,175.142264,2023-05-15 13:34:04.423286,Rav Super12,R014207711_71,0.0


In [21]:
# Log rows whose Images_processed is not 0 (NaN counts as "not 0") but which
# have no mapped_coords count after the left join.
has_images = df_p1_mc['Images_processed'].ne(0)
coords_is_missing = df_p1_mc['mapped_coords_result'].isna()
df4 = df_p1_mc.loc[has_images & coords_is_missing]
print(df4.shape[0])
df4

1


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,mapped_coords_result
2416,-2493,,,,,,,,,,,


#  

#### After stage 2 processing:

Check from process_log_OCR:

In [22]:
# Read the OCR process log from logDir, then show its row count and first rows.
process_log_ocr_csv = logDir + 'process_log_OCR.csv'
df_process_log_OCR = pd.read_csv(process_log_ocr_csv)
print(df_process_log_OCR.shape[0])
df_process_log_OCR.head()

135


Unnamed: 0,Directory,Subdirectory,Process_time,Process_timestamp,User,subdir_id
0,R014207827,3822-43A,990.21604,2023-05-15 18:06:19.824654,Rav Super1,R014207827_3822-43A
1,R014207820,3000-43A,1062.28344,2023-05-15 18:24:02.150093,Rav Super1,R014207820_3000-43A
2,R014207835,4060-19,940.294587,2023-05-15 18:39:42.472678,Rav Super1,R014207835_4060-19
3,R014207963,1544-1B,814.29774,2023-05-15 18:53:16.795418,Rav Super1,R014207963_1544-1B
4,R014207711,39,384.984926,2023-05-15 18:59:41.800343,Rav Super1,R014207711_39


In [23]:
# Read the processed-image inventory from logDir, then show its row count and
# first rows.
inventory_processed_csv = logDir + 'image_inventory_processed.csv'
df_inventory_proc = pd.read_csv(inventory_processed_csv)
print(df_inventory_proc.shape[0])
df_inventory_proc.head()

1977


Unnamed: 0,Directory,Subdirectory,images,subdir_id
0,R014207711,24,256,R014207711_24
1,R014207711,25,499,R014207711_25
2,R014207711,27,96,R014207711_27
3,R014207711,28,103,R014207711_28
4,R014207711,29,341,R014207711_29


In [24]:
# Processed-inventory ids that never appear in the OCR process log.
# NOTE: this rebinds df_p1 and the subdir_ids_* names to the OCR log; any cell
# re-run after this one sees these values, not the ones from process_log.csv.
subdir_ids_tot = df_inventory_proc['subdir_id'].unique()
df_p1 = (
    df_process_log_OCR
    .sort_values('Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
subdir_ids_proc = df_p1['subdir_id'].unique()
subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_proc))
subdir_ids_rem

['R014207819_4764-50',
 'R014207837_4011-50A',
 'R014207838_4566-37',
 'R014207818_4215-69-2',
 'R014207963_1540-1B',
 'R014207837_4020-53A',
 'R014207817_4194-12A',
 'R014207811_5075',
 'R014207831_3935-43A',
 'R014207952_2728-13B',
 'R028224481_4884',
 'R014207822_3432-50A',
 'R014207961_1070-B',
 'R014207945_2252-3A',
 'R014207818_4220-69',
 'R014207819_4751-08',
 'R014207952_2766-4A',
 'R014207819_4784-69',
 'R014207828_4427-69A-2',
 'R014207963_1551-5A',
 'R014207834_4496-43A',
 'R014207818_4216-69A-2',
 'R014207819_4763-50',
 'R014207838_4529-43',
 'R014207835_4037-53',
 'R014207831_3947-5A',
 'R014207952_2749-4A',
 'R014207827_3794-69A-2',
 'R014207833_4354-50A',
 'R014207822_3445-15A',
 'R014207961_1031-A',
 'R014207813_5047',
 'R014207822_3467-19A-4',
 'R014207835_4084-38',
 'R028224481_4867-50',
 'R014207825_3737-15A',
 'R014207829_4318-8',
 'R014207944_2029-13B',
 'R014207819_4791-69',
 'R014207813_4987',
 'R014207827_3825-43A',
 'R014207827_3829-50A',
 'R014207820_2998-43A'