# Analyze Processing Quality

#### Updated: Jan 11, 2023

#  

Analyze the overall quality of the processing of the first batch of Alouette images. Determine that all subdirectories have been processed. Determine which subdirectories have the highest loss - are there any patterns? For instance, do certain rolls have high loss, or images from certain ground stations?

In [1]:
import pandas as pd
import numpy as np
import os

In [5]:
# Root of the Alouette I data tree. Defaults to the original location
# ('L:/DATA/Alouette_I/'); set the ALOUETTE_ROOT_DIR environment variable to
# point the notebook at a copy of the data elsewhere without editing this cell.
rootDir = os.environ.get('ALOUETTE_ROOT_DIR') or 'L:/DATA/Alouette_I/'
# Paths in the rest of the notebook are built by plain string concatenation, so
# the root has to end with a path separator.
if not rootDir.endswith(('/', '\\')):
    rootDir += '/'
resultDir = rootDir + '05_result/'  # per-roll result CSVs and mapped_coords/
logDir = rootDir + '06_log/'        # process_log.csv, download_log.csv, image_inventory.csv

#  

#### Identify outstanding subdirectories to process:

Check from process_log:

Remove duplicates from process_log:

In [29]:
# Read process_log.csv from the log directory, report its row count and
# preview the first rows.
process_log_path = f'{logDir}process_log.csv'
df_process_log = pd.read_csv(process_log_path)
print(df_process_log.shape[0])
df_process_log.head(5)

3077


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207978F,319,239,206,5,11,17,259.625649,2022-12-15 23:28:15.951151,Rav,R014207978F_319
1,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B
2,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B
3,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A
4,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B


In [30]:
# Keep a single row per subdir_id: after ordering by Process_timestamp, the
# last row of each subdir_id is the one retained.
df_p1 = (
    df_process_log
    .sort_values(by='Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
print(df_p1.shape[0])
df_p1.head(5)

2638


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
1,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B
2,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B
3,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A
4,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B
5,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288


In [31]:
# Preview the final rows of the de-duplicated process log.
df_p1.tail(5)

Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
3072,R014207840,3054-13B,324,0,16,308,0,155.655366,2023-01-11 01:31:40.437574,Rav,R014207840_3054-13B
3073,R014207960,2537-5B,328,0,311,0,17,750.512748,2023-01-11 01:44:32.378911,Rav,R014207960_2537-5B
3074,R014207824,3167-14A,344,0,333,0,11,531.198269,2023-01-11 01:53:33.263641,Rav,R014207824_3167-14A
3075,R014207940F,367,435,401,0,23,11,732.43906,2023-01-11 02:05:55.478210,Rav,R014207940F_367
3076,R014207964,1580-1B,332,0,311,0,21,795.999348,2023-01-11 02:19:20.940736,Rav,R014207964_1580-1B


Look at duplicates from process_log:

In [39]:
# Select every row whose subdir_id occurs more than once in the raw log
# (keep=False flags all occurrences), grouped by subdir_id and ordered by
# timestamp within each group.
is_repeated_subdir = df_process_log.duplicated(subset=['subdir_id'], keep=False)
df_p1_ = (
    df_process_log
    .loc[is_repeated_subdir]
    .sort_values(by=['subdir_id', 'Process_timestamp'])
)
print(df_p1_.shape[0])
df_p1_.head(10)

490


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
948,R014207816,3413-43A,0,0,0,0,0,2.043954,2022-12-21 13:19:23.255841,Rav Super,R014207816_3413-43A
1069,R014207816,3413-43A,290,0,284,1,5,183.085114,2022-12-21 21:57:07.659384,Rav,R014207816_3413-43A
1524,R014207816,3413-43A,290,0,284,1,5,198.636497,2022-12-27 16:28:55.551267,Rav,R014207816_3413-43A
2712,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2713,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2715,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2718,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2722,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2727,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2733,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A


Find any subdirectories that have not been processed:

In [19]:
# Read image_inventory.csv from the log directory, report its row count and
# preview the first rows.
inventory_path = f'{logDir}image_inventory.csv'
df_inventory = pd.read_csv(inventory_path)
print(df_inventory.shape[0])
df_inventory.head(5)

2638


Unnamed: 0,Roll,Subdirectory,images,subdir_id
0,R014207815,3488-15A,273,R014207815_3488-15A
1,R014207815,3489-15A,281,R014207815_3489-15A
2,R014207815,3490-15A,198,R014207815_3490-15A
3,R014207815,3491-8A,289,R014207815_3491-8A
4,R014207815,3492-8A,334,R014207815_3492-8A


In [41]:
# subdir_ids present in the inventory but absent from the de-duplicated
# process log; an empty list means every inventoried subdirectory was logged.
subdir_ids_tot = df_inventory['subdir_id'].unique()
subdir_ids_proc = df_p1['subdir_id'].unique()
inventory_ids = set(subdir_ids_tot)
processed_ids = set(subdir_ids_proc)
subdir_ids_rem = list(inventory_ids - processed_ids)
subdir_ids_rem

[]

Check from resultDir:

In [24]:
# Count the rows of every result CSV under resultDir/<roll>/ and collect one
# record per file. Records are accumulated in a list and turned into a
# DataFrame once at the end: concatenating inside the loop copies the whole
# frame on every iteration.
result_columns = ['Roll', 'Subdirectory', 'images_result', 'subdir_id']
result_records = []
i = 0
for entry in os.listdir(resultDir):
    roll_dir = resultDir + entry + '/'
    # Roll folders are recognised by an 'R' in their name (this also skips the
    # 'mapped_coords' folder); the isdir check avoids crashing on a stray file
    # whose name happens to contain 'R'.
    if 'R' not in entry or not os.path.isdir(roll_dir):
        continue
    roll = entry
    for result_file in os.listdir(roll_dir):
        if 'result' not in result_file:
            continue
        # The subdirectory name is the second '_'-separated token of the file
        # name, with the '.csv' extension removed.
        fn_parts = result_file.split('_')
        subdirectory = fn_parts[1].replace('.csv', '')
        try:
            n = len(pd.read_csv(roll_dir + result_file, sep=','))
        except pd.errors.EmptyDataError:
            # A completely empty file has no header to parse; count it as 0 rows.
            n = 0
        result_records.append({
            'Roll': roll,
            'Subdirectory': subdirectory,
            'images_result': n,
            'subdir_id': roll + '_' + subdirectory
        })
        i += 1
        if i % 100 == 0:
            print(str(i) + ': Now inspecting...' + roll + '/' + subdirectory)

# Passing the column list keeps the expected columns even when nothing was found,
# so later lookups such as df_result['subdir_id'] do not fail on an empty scan.
df_result = pd.DataFrame(result_records, columns=result_columns)

100: Now inspecting...R014207816/3407-143
200: Now inspecting...R014207823/3567-43A
300: Now inspecting...R014207824/3187-14A
400: Now inspecting...R014207840/3047-53A-2-RR
500: Now inspecting...R014207842/3267-A3
600: Now inspecting...R014207907F/289
700: Now inspecting...R014207908F/586
800: Now inspecting...R014207929F/452
900: Now inspecting...R014207930F/677
1000: Now inspecting...R014207939/882-A
1100: Now inspecting...R014207942/1945-5A
1200: Now inspecting...R014207943/2107-5B
1300: Now inspecting...R014207947/1906-5B
1400: Now inspecting...R014207949/2126-5B
1500: Now inspecting...R014207951/2466-8A
1600: Now inspecting...R014207954/2206-18B
1700: Now inspecting...R014207956/2368-1B
1800: Now inspecting...R014207958/2770-12A
1900: Now inspecting...R014207959/2332-7A
2000: Now inspecting...R014207962/1471-1B
2100: Now inspecting...R014207965/1634-18B
2200: Now inspecting...R014207966/1193-5-A
2300: Now inspecting...R014207968/1235-3A
2400: Now inspecting...R014207974/749-A
2500

In [25]:
# Number of result files found, followed by a preview of the first rows.
print(df_result.shape[0])
df_result.head(5)

2638


Unnamed: 0,Roll,Subdirectory,images_result,subdir_id
0,R014207815,3488-15A,273,R014207815_3488-15A
1,R014207815,3489-15A,276,R014207815_3489-15A
2,R014207815,3490-15A,198,R014207815_3490-15A
3,R014207815,3491-8A,289,R014207815_3491-8A
4,R014207815,3492-8A,334,R014207815_3492-8A


In [26]:
# Number of inventoried subdir_ids that have no result file in resultDir.
subdir_ids_tot = df_inventory['subdir_id'].unique()
subdir_ids_res = df_result['subdir_id'].unique()
inventory_ids = set(subdir_ids_tot)
result_ids = set(subdir_ids_res)
subdir_ids_rem = list(inventory_ids - result_ids)
len(subdir_ids_rem)

0

In [33]:
# Attach the row count of each result file (images_result) to the
# de-duplicated process log; a left join keeps every process-log row.
merge_keys = ['Roll', 'Subdirectory', 'subdir_id']
df_p1_res = pd.merge(df_p1, df_result, how='left', on=merge_keys)
print(df_p1_res.shape[0])
df_p1_res.head(5)

2638


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,images_result
0,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B,386
1,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B,363
2,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A,333
3,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B,369
4,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288,421


In [41]:
# Subdirectories with zero images according to the process log (df1), to the
# result files (df2), or to either source (df3).
zero_in_log = df_p1_res['Images_processed'].eq(0)
zero_in_result = df_p1_res['images_result'].eq(0)
df1 = df_p1_res.loc[zero_in_log]
df2 = df_p1_res.loc[zero_in_result]
df3 = df_p1_res.loc[zero_in_log | zero_in_result]
for zero_subset in (df1, df2, df3):
    print(len(zero_subset))
df3.head(5)

36
36
36


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,images_result
2558,R014207821,3346-38A,0,0,0,0,0,38.095704,2023-01-10 17:53:11.805510,Rav,R014207821_3346-38A,0
2559,R014207824,3143-14A,0,0,0,0,0,32.883987,2023-01-10 17:53:51.828994,Rav,R014207824_3143-14A,0
2560,R014207842,3259-50A,0,0,0,0,0,34.829748,2023-01-10 17:54:35.556059,Rav,R014207842_3259-50A,0
2561,R014207907F,301,0,0,0,0,0,504.873584,2023-01-10 18:03:06.670546,Rav,R014207907F_301,0
2562,R014207844,2914-43B,0,0,0,0,0,36.6783,2023-01-10 18:03:54.596781,Rav,R014207844_2914-43B,0


Check mapped_coords:

In [27]:
# Count the 'mapped_coords' files in every
# resultDir/mapped_coords/<roll>/<subdirectory>/ folder, one record per
# subdirectory. Records are accumulated in a list and turned into a DataFrame
# once at the end: concatenating inside the loop copies the whole frame on
# every iteration.
mapped_coords_dir = resultDir + 'mapped_coords/'
mc_columns = ['Roll', 'Subdirectory', 'mapped_coords_result', 'subdir_id']
mc_records = []
i = 0
for entry in os.listdir(mapped_coords_dir):
    roll_dir = mapped_coords_dir + entry + '/'
    # Roll folders are recognised by an 'R' in their name; the isdir check
    # avoids crashing on a stray file whose name happens to contain 'R'.
    if 'R' not in entry or not os.path.isdir(roll_dir):
        continue
    roll = entry
    for subdirectory in os.listdir(roll_dir):
        subdir_path = roll_dir + subdirectory + '/'
        # Only folders can hold mapped_coords files; skip anything else.
        if not os.path.isdir(subdir_path):
            continue
        n = sum('mapped_coords' in file_name for file_name in os.listdir(subdir_path))
        mc_records.append({
            'Roll': roll,
            'Subdirectory': subdirectory,
            'mapped_coords_result': n,
            'subdir_id': roll + '_' + subdirectory
        })
        i += 1
        if i % 100 == 0:
            print(str(i) + ': Now inspecting...' + roll + '/' + subdirectory)

# Passing the column list keeps the expected columns even when nothing was found,
# so the later merge on Roll/Subdirectory/subdir_id still has its key columns.
df_mc = pd.DataFrame(mc_records, columns=mc_columns)

NameError: name 'Roll' is not defined

In [None]:
# Number of subdirectories scanned under mapped_coords, then a preview.
print(df_mc.shape[0])
df_mc.head(5)

In [None]:
# Attach the mapped_coords file count to the de-duplicated process log;
# a left join keeps every process-log row.
merge_keys = ['Roll', 'Subdirectory', 'subdir_id']
df_merge = pd.merge(df_p1, df_mc, how='left', on=merge_keys)
print(df_merge.shape[0])
df_merge.head(5)

#  

#### Identify 0 Images_processed subdirectories, and try to process again:

In [42]:
# Order the de-duplicated log by Images_processed, renumber the rows, and keep
# only the subdirectories for which no image was processed.
df_p2 = (
    df_p1
    .sort_values(by='Images_processed')
    .reset_index(drop=True)
    .loc[lambda log: log['Images_processed'] == 0]
)
print(df_p2.shape[0])
df_p2.head(5)

35


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207907F,527,0,0,0,0,0,23.702462,2023-01-10 18:25:14.099628,Rav,R014207907F_527
1,R014207821,3322-15A,0,0,0,0,0,32.693111,2023-01-10 18:18:31.888616,Rav,R014207821_3322-15A
2,R014207832,3624-43A,0,0,0,0,0,35.22135,2023-01-10 18:17:49.315592,Rav,R014207832_3624-43A
3,R014207832,3611-38A,0,0,0,0,0,34.86728,2023-01-10 18:17:04.757866,Rav,R014207832_3611-38A
4,R014207821,3346-38A,0,0,0,0,0,38.095704,2023-01-10 17:53:11.805510,Rav,R014207821_3346-38A


In [10]:
#df_p2.to_csv(logDir + 'reprocess_list.csv', index=False)

#  

#### Order subdirectories by proportion of 'total loss' (unprocessed + loss + outlier):

Compare process_log with download_log:

In [43]:
# Read download_log.csv from the log directory, report its row count and
# preview the first rows.
download_log_path = f'{logDir}download_log.csv'
df_download_log = pd.read_csv(download_log_path)
print(df_download_log.shape[0])
df_download_log.head(5)

2639


Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp,User,subdir_id
0,R014207966,1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav,R014207966_1185-1B
1,R014207975,1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav,R014207975_1108-B
2,R014207957,2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav,R014207957_2631-1A
3,R014207966,1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav,R014207966_1150-B
4,R014207979F,288,421,75.788788,2022-12-16 00:11:51.540555,Rav,R014207979F_288


In [44]:
# Keep a single row per subdir_id (the last one after ordering by
# Download_timestamp) and rename User to User_dl so it does not collide with
# the process log's User column when the two logs are merged.
df_d1 = (
    df_download_log
    .sort_values(by='Download_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
    .rename(columns={'User': 'User_dl'})
)
print(df_d1.shape[0])
df_d1.head(5)

2638


Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp,User_dl,subdir_id
0,R014207966,1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav,R014207966_1185-1B
1,R014207975,1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav,R014207975_1108-B
2,R014207957,2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav,R014207957_2631-1A
3,R014207966,1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav,R014207966_1150-B
4,R014207979F,288,421,75.788788,2022-12-16 00:11:51.540555,Rav,R014207979F_288


In [45]:
# Select every download-log row whose subdir_id occurs more than once
# (keep=False flags all occurrences), grouped by subdir_id and ordered by
# timestamp within each group.
is_repeated_download = df_download_log.duplicated(subset=['subdir_id'], keep=False)
df_d1_ = (
    df_download_log
    .loc[is_repeated_download]
    .sort_values(by=['subdir_id', 'Download_timestamp'])
)
print(df_d1_.shape[0])
df_d1_.head(10)

2


Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp,User,subdir_id
978,R014207949,2114-1A,341,86.235073,2022-12-21 16:22:32.687731,Roksana,R014207949_2114-1A
2638,R014207949,2114-1A,341,84.204622,2023-01-06 18:10:28.619273,Rav,R014207949_2114-1A


In [46]:
# Combine the de-duplicated process and download logs; the inner join keeps
# only subdirectories present in both.
merge_keys = ['Roll', 'Subdirectory', 'subdir_id']
df1 = pd.merge(df_p1, df_d1, how='inner', on=merge_keys)
print(df1.shape[0])
df1.head(5)

2638


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,Images_downloaded,Download_time,Download_timestamp,User_dl
0,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav
1,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav
2,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav
3,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav
4,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288,421,75.788788,2022-12-16 00:11:51.540555,Rav


In [47]:
# Images that were downloaded but do not appear as processed in the log.
df1['Images_unprocessed'] = df1['Images_downloaded'].sub(df1['Images_processed'])
df1.head(5)

Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,Images_downloaded,Download_time,Download_timestamp,User_dl,Images_unprocessed
0,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav,2
1,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav,1
2,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav,2
3,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav,0
4,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288,421,75.788788,2022-12-16 00:11:51.540555,Rav,0


#  

Negative Images_unprocessed issue (more images logged as processed than were downloaded) - reprocess these subdirectories:

In [48]:
# Rows where Images_processed exceeds Images_downloaded, i.e. the difference
# computed above is negative.
has_negative_unprocessed = df1['Images_unprocessed'].lt(0)
df2 = df1.loc[has_negative_unprocessed]
print(df2.shape[0])
df2.head(5)

46


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,Images_downloaded,Download_time,Download_timestamp,User_dl,Images_unprocessed
483,R014207907F,524,712,353,306,23,30,671.095206,2022-12-19 20:35:24.248602,Roksana,R014207907F_524,412,79.659038,2022-12-19 20:21:24.132593,Roksana,-300
641,R014207840,3047-53A-2-RR,314,1,259,43,11,478.103771,2022-12-20 16:31:45.168516,Rav,R014207840_3047-53A-2-RR,313,76.046123,2022-12-20 16:22:28.320731,Rav,-1
670,R014207841,3116-13B,363,0,353,1,9,230.36739,2022-12-20 18:44:10.745672,Rav,R014207841_3116-13B,302,81.163815,2022-12-20 18:35:22.252768,Rav,-61
688,R014207844,2945-43B,326,0,309,4,13,849.052986,2022-12-20 20:35:39.269207,Rav,R014207844_2945-43B,287,69.989887,2022-12-20 20:18:39.378767,Rav,-39
741,R014207960,2582-18B,400,368,5,10,17,910.137281,2022-12-21 00:51:32.401796,Roksana,R014207960_2582-18B,328,86.695219,2022-12-21 00:34:34.681819,Roksana,-72


In [49]:
# Write the subdirectories selected in df2 to reprocess_list.csv in the log
# directory, without the DataFrame index.
reprocess_list_path = f'{logDir}reprocess_list.csv'
df2.to_csv(reprocess_list_path, index=False)