# Analyze Processing Quality

#### Updated: Jan 10, 2023

#  

Analyze the overall quality of the processing of the first batch of Alouette images. Determine that all subdirectories have been processed. Determine which subdirectories have the highest loss - are there any patterns? For instance, do certain rolls have high loss, or images from certain ground stations?

In [34]:
import pandas as pd
import numpy as np
import os

In [35]:
# Directory that holds the CSV logs used throughout this notebook. The trailing
# slash is required: later cells build paths with `logDir + '<file>.csv'`.
logDir = 'L:/DATA/Alouette_I/06_log/'

#  

#### Identify outstanding subdirectories to process:

Remove duplicates from process_log:

In [36]:
# Load the complete processing log, print how many rows it has, and preview
# the first few entries.
df_process_log = pd.read_csv(filepath_or_buffer=logDir + 'process_log.csv')
print(len(df_process_log.index))
df_process_log.head(n=5)

3031


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207978F,319,239,206,5,11,17,259.625649,2022-12-15 23:28:15.951151,Rav,R014207978F_319
1,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B
2,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B
3,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A
4,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B


In [37]:
# Collapse the process log to one row per subdirectory, keeping the entry that
# sorts last by Process_timestamp.
# The log is loaded without date parsing, so the timestamps are ordered as
# text; the fixed-width 'YYYY-MM-DD HH:MM:SS.ffffff' layout seen in the log
# should make that ordering chronological - confirm if the format changes.
# NOTE(review): keep='last' means the newest entry wins even when it logged
# 0 images and an older entry for the same subdirectory did not - confirm
# that this is the intended rule.
df_p1 = (
    df_process_log
    .sort_values(by='Process_timestamp')
    .drop_duplicates(subset='subdir_id', keep='last')
)
print(len(df_p1.index))
df_p1.head(n=5)

2638


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
1,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B
2,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B
3,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A
4,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B
5,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288


In [38]:
# Preview the end of the de-duplicated log (the latest timestamps, given the
# sort applied when df_p1 was built).
df_p1.tail(n=5)

Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
3026,R014207832,3636-43A,0,0,0,0,0,37.51192,2023-01-10 18:22:26.001592,Rav,R014207832_3636-43A
3027,R014207844,2932-43B,0,0,0,0,0,31.943596,2023-01-10 18:23:06.984721,Rav,R014207844_2932-43B
3028,R014207955,2835-50B,0,0,0,0,0,31.801804,2023-01-10 18:23:46.929928,Rav,R014207955_2835-50B
3029,R014207823,3556-43,0,0,0,0,0,45.146546,2023-01-10 18:24:40.519141,Rav,R014207823_3556-43
3030,R014207907F,527,0,0,0,0,0,23.702462,2023-01-10 18:25:14.099628,Rav,R014207907F_527


Look at duplicates from process_log:

In [39]:
# Collect every log row whose subdir_id occurs more than once (keep=False marks
# all occurrences, not only the repeats) and group them by subdirectory in
# timestamp order for inspection.
df_p1_ = (
    df_process_log
    .loc[df_process_log.duplicated(subset='subdir_id', keep=False)]
    .sort_values(by=['subdir_id', 'Process_timestamp'])
)
print(len(df_p1_.index))
df_p1_.head(n=10)

490


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
948,R014207816,3413-43A,0,0,0,0,0,2.043954,2022-12-21 13:19:23.255841,Rav Super,R014207816_3413-43A
1069,R014207816,3413-43A,290,0,284,1,5,183.085114,2022-12-21 21:57:07.659384,Rav,R014207816_3413-43A
1524,R014207816,3413-43A,290,0,284,1,5,198.636497,2022-12-27 16:28:55.551267,Rav,R014207816_3413-43A
2712,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2713,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2715,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2718,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2722,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2727,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2733,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A


Find any subdirectories that have not been processed:

In [40]:
# Load the image inventory, print how many rows it has, and preview it.
df_inventory = pd.read_csv(filepath_or_buffer=logDir + 'image_inventory.csv')
print(len(df_inventory.index))
df_inventory.head(n=5)

2638


Unnamed: 0,Roll,Subdirectory,images,subdir_id
0,R014207815,3488-15A,273,R014207815_3488-15A
1,R014207815,3489-15A,281,R014207815_3489-15A
2,R014207815,3490-15A,198,R014207815_3490-15A
3,R014207815,3491-8A,289,R014207815_3491-8A
4,R014207815,3492-8A,334,R014207815_3492-8A


In [41]:
# Subdirectories listed in the inventory that have no entry in the
# de-duplicated process log. An empty list means every inventoried
# subdirectory appears in the log at least once.
subdir_ids_tot = pd.unique(df_inventory['subdir_id'])
subdir_ids_proc = pd.unique(df_p1['subdir_id'])
subdir_ids_rem = list(set(subdir_ids_tot).difference(set(subdir_ids_proc)))
subdir_ids_rem

[]

#  

#### Identify subdirectories with 0 Images_processed, and try to process them again:

In [42]:
# Select the subdirectories whose retained log entry reports 0 processed images.
# The sort and index reset run before the filter, so the row order and the
# 0-based index of the result come from the sorted frame.
df_p2 = df_p1.sort_values(by='Images_processed').reset_index(drop=True)
df_p2 = df_p2[df_p2['Images_processed'].eq(0)]
print(len(df_p2.index))
df_p2.head(n=5)

35


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207907F,527,0,0,0,0,0,23.702462,2023-01-10 18:25:14.099628,Rav,R014207907F_527
1,R014207821,3322-15A,0,0,0,0,0,32.693111,2023-01-10 18:18:31.888616,Rav,R014207821_3322-15A
2,R014207832,3624-43A,0,0,0,0,0,35.22135,2023-01-10 18:17:49.315592,Rav,R014207832_3624-43A
3,R014207832,3611-38A,0,0,0,0,0,34.86728,2023-01-10 18:17:04.757866,Rav,R014207832_3611-38A
4,R014207821,3346-38A,0,0,0,0,0,38.095704,2023-01-10 17:53:11.805510,Rav,R014207821_3346-38A


In [10]:
# Write the zero-image subdirectories to the reprocess list (index omitted).
df_p2.to_csv(path_or_buf=logDir + 'reprocess_list.csv', index=False)

#  

#### Order subdirectories by proportion of 'total loss' (unprocessed + loss + outlier):

Compare process_log with download_log:

In [13]:
# Load the download log, print how many rows it has, and preview it.
df_download_log = pd.read_csv(filepath_or_buffer=logDir + 'download_log.csv')
print(len(df_download_log.index))
df_download_log.head(n=5)

2639


Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp,User,subdir_id
0,R014207966,1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav,R014207966_1185-1B
1,R014207975,1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav,R014207975_1108-B
2,R014207957,2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav,R014207957_2631-1A
3,R014207966,1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav,R014207966_1150-B
4,R014207979F,288,421,75.788788,2022-12-16 00:11:51.540555,Rav,R014207979F_288


In [14]:
# Collapse the download log to one row per subdirectory, keeping the entry that
# sorts last by Download_timestamp (ordered as text, as loaded from the CSV).
# 'User' is renamed to 'User_dl', presumably so it stays distinct from the
# process log's own 'User' column when the two logs are merged.
df_d1 = (
    df_download_log
    .sort_values(by='Download_timestamp')
    .drop_duplicates(subset='subdir_id', keep='last')
    .rename(columns={'User': 'User_dl'})
)
print(len(df_d1.index))
df_d1.head(n=5)

2638


Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp,User_dl,subdir_id
0,R014207966,1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav,R014207966_1185-1B
1,R014207975,1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav,R014207975_1108-B
2,R014207957,2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav,R014207957_2631-1A
3,R014207966,1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav,R014207966_1150-B
4,R014207979F,288,421,75.788788,2022-12-16 00:11:51.540555,Rav,R014207979F_288


In [15]:
# Collect every download-log row whose subdir_id occurs more than once and
# order them by subdirectory, then by timestamp, for inspection.
df_d1_ = (
    df_download_log
    .loc[df_download_log.duplicated(subset='subdir_id', keep=False)]
    .sort_values(by=['subdir_id', 'Download_timestamp'])
)
print(len(df_d1_.index))
df_d1_.head(n=10)

2


Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp,User,subdir_id
978,R014207949,2114-1A,341,86.235073,2022-12-21 16:22:32.687731,Roksana,R014207949_2114-1A
2638,R014207949,2114-1A,341,84.204622,2023-01-06 18:10:28.619273,Rav,R014207949_2114-1A


In [17]:
# Join the de-duplicated process and download logs on the subdirectory keys.
# This is an inner join (the pandas default), so a subdirectory missing from
# either log is dropped; the printed row count shows whether any were lost.
df1 = pd.merge(
    df_p1,
    df_d1,
    how='inner',
    on=['Roll', 'Subdirectory', 'subdir_id'],
)
print(len(df1.index))
df1.head(n=5)

2638


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,Images_downloaded,Download_time,Download_timestamp,User_dl
0,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav
1,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav
2,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav
3,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav
4,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288,421,75.788788,2022-12-16 00:11:51.540555,Rav


In [18]:
# Images that were downloaded but not counted as processed. A negative value
# means the process log reports more images than the download log.
df1['Images_unprocessed'] = df1['Images_downloaded'].sub(df1['Images_processed'])
df1.head(n=5)

Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,Images_downloaded,Download_time,Download_timestamp,User_dl,Images_unprocessed
0,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B,388,103.911195,2022-12-15 23:26:13.703970,Rav,2
1,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B,364,100.474518,2022-12-15 23:32:54.868807,Rav,1
2,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A,335,63.127634,2022-12-15 23:53:58.671362,Rav,2
3,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B,369,95.732583,2022-12-16 00:00:35.088810,Rav,0
4,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288,421,75.788788,2022-12-16 00:11:51.540555,Rav,0


#  

Subdirectories with negative Images_unprocessed (more images processed than downloaded) - reprocess these:

In [33]:
# Subdirectories where more images were logged as processed than downloaded.
df2 = df1[df1['Images_unprocessed'].lt(0)]
print(len(df2.index))
df2.head(n=5)

46


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id,Images_downloaded,Download_time,Download_timestamp,User_dl,Images_unprocessed
483,R014207907F,524,712,353,306,23,30,671.095206,2022-12-19 20:35:24.248602,Roksana,R014207907F_524,412,79.659038,2022-12-19 20:21:24.132593,Roksana,-300
641,R014207840,3047-53A-2-RR,314,1,259,43,11,478.103771,2022-12-20 16:31:45.168516,Rav,R014207840_3047-53A-2-RR,313,76.046123,2022-12-20 16:22:28.320731,Rav,-1
670,R014207841,3116-13B,363,0,353,1,9,230.36739,2022-12-20 18:44:10.745672,Rav,R014207841_3116-13B,302,81.163815,2022-12-20 18:35:22.252768,Rav,-61
688,R014207844,2945-43B,326,0,309,4,13,849.052986,2022-12-20 20:35:39.269207,Rav,R014207844_2945-43B,287,69.989887,2022-12-20 20:18:39.378767,Rav,-39
741,R014207960,2582-18B,400,368,5,10,17,910.137281,2022-12-21 00:51:32.401796,Roksana,R014207960_2582-18B,328,86.695219,2022-12-21 00:34:34.681819,Roksana,-72


In [None]:
# Write the negative-Images_unprocessed subdirectories to the reprocess list.
# NOTE(review): this is the same file name the zero-image list was written to
# earlier in the notebook, so running this cell replaces that file's contents
# (and its columns differ, since df2 comes from the merged frame) - confirm
# the earlier list has already been consumed.
df2.to_csv(path_or_buf=logDir + 'reprocess_list.csv', index=False)