# Analyze Processing Quality

#### Updated: Jan 9, 2023

#  

Analyze the overall quality of the processing of the first batch of Alouette images. Verify that all subdirectories have been processed. Determine which subdirectories have the highest loss and whether any patterns emerge: for instance, do certain rolls have high loss, or images from certain ground stations?

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds the processing logs. Defaults to the original
# hard-coded location; set the ALOUETTE_LOG_DIR environment variable to run
# the notebook against a different copy of the logs.
logDir = os.environ.get('ALOUETTE_LOG_DIR') or 'L:/DATA/Alouette_I/06_log/'
# File names are appended directly to logDir elsewhere in the notebook, so
# guarantee that the directory string ends with a path separator.
if not logDir.endswith(('/', '\\')):
    logDir += '/'

#  

#### Identify outstanding subdirectories to process:

Remove duplicates from process_log, keeping only the most recent entry for each subdirectory:

In [3]:
# Load the complete processing log; it can hold several rows for the same
# subdirectory, so report the raw row count before any clean-up.
process_log_path = f'{logDir}process_log.csv'
df_process_log = pd.read_csv(process_log_path)
print(df_process_log.shape[0])
df_process_log.head()

2923


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207978F,319,239,206,5,11,17,259.625649,2022-12-15 23:28:15.951151,Rav,R014207978F_319
1,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B
2,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B
3,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A
4,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B


In [4]:
# Order the log by Process_timestamp and keep the final row seen for every
# subdir_id, i.e. one row per subdirectory taken from its latest log entry.
# NOTE(review): the latest entry can be a run with 0 images processed even
# when an earlier run succeeded (see R014207816_3413-43A in the duplicate
# listing) - confirm that preferring the newest entry is intended.
df_p1 = (
    df_process_log
    .sort_values(by='Process_timestamp')
    .drop_duplicates(subset='subdir_id', keep='last')
)
print(df_p1.shape[0])
df_p1.head()

2638


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
1,R014207966,1185-1B,386,0,383,0,3,1415.449691,2022-12-15 23:51:57.608703,Rav,R014207966_1185-1B
2,R014207975,1108-B,363,0,347,4,12,367.330205,2022-12-15 23:58:19.342118,Rav,R014207975_1108-B
3,R014207957,2631-1A,333,0,320,4,9,607.819416,2022-12-16 00:08:43.220755,Rav,R014207957_2631-1A
4,R014207966,1150-B,369,0,362,1,6,775.810147,2022-12-16 00:21:52.993215,Rav,R014207966_1150-B
5,R014207979F,288,421,345,3,23,50,157.020877,2022-12-16 00:24:43.050306,Rav,R014207979F_288


In [5]:
# Show the last five rows of the de-duplicated log (latest timestamps).
df_p1.tail(5)

Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
2918,R014207909F,716,0,0,0,0,0,9.342925,2023-01-06 17:23:00.272846,Rav,R014207909F_716
2882,R014207929F,458,0,0,0,0,0,10.369817,2023-01-06 17:23:18.552994,Rav,R014207929F_458
2901,R014207955,2835-50B,0,0,0,0,0,16.12988,2023-01-06 17:23:40.365107,Rav,R014207955_2835-50B
2921,R014207962,1505-1B,0,0,0,0,0,7.618111,2023-01-06 17:23:56.385944,Rav,R014207962_1505-1B
2922,R014207949,2114-1A,340,0,324,5,11,519.394348,2023-01-06 18:19:20.246517,Rav,R014207949_2114-1A


Look at duplicates from process_log:

In [6]:
# Mark every row whose subdir_id occurs more than once (all occurrences, not
# only the repeats), then list them grouped by subdirectory in timestamp order.
is_repeated = df_process_log['subdir_id'].duplicated(keep=False)
df_p1_ = (
    df_process_log
    .loc[is_repeated]
    .sort_values(by=['subdir_id', 'Process_timestamp'])
)
print(df_p1_.shape[0])
df_p1_.head(n=10)

331


Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
948,R014207816,3413-43A,0,0,0,0,0,2.043954,2022-12-21 13:19:23.255841,Rav Super,R014207816_3413-43A
1069,R014207816,3413-43A,290,0,284,1,5,183.085114,2022-12-21 21:57:07.659384,Rav,R014207816_3413-43A
1524,R014207816,3413-43A,290,0,284,1,5,198.636497,2022-12-27 16:28:55.551267,Rav,R014207816_3413-43A
2712,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2713,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2715,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2718,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2722,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2727,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A
2733,R014207816,3413-43A,0,0,0,0,0,7.184033,2023-01-06 17:16:15.210606,Rav,R014207816_3413-43A


Find any subdirectories that have not been processed:

In [7]:
# Load the image inventory, which lists the subdirectories (with a subdir_id
# column) to compare against the processing log.
inventory_path = f'{logDir}image_inventory.csv'
df_inventory = pd.read_csv(inventory_path)
print(df_inventory.shape[0])
df_inventory.head()

2638


Unnamed: 0,Roll,Subdirectory,images,subdir_id
0,R014207815,3488-15A,273,R014207815_3488-15A
1,R014207815,3489-15A,281,R014207815_3489-15A
2,R014207815,3490-15A,198,R014207815_3490-15A
3,R014207815,3491-8A,289,R014207815_3491-8A
4,R014207815,3492-8A,334,R014207815_3492-8A


In [8]:
# Subdirectories present in the inventory but absent from the de-duplicated
# process log, i.e. those that still have to be processed.
subdir_ids_tot = df_inventory['subdir_id'].unique()
subdir_ids_proc = df_p1['subdir_id'].unique()
# Sort the set difference: iteration order of a set of strings varies between
# kernel sessions (hash randomization), so an unsorted list would not be
# reproducible. key=str keeps the sort safe if a non-string id slips in.
subdir_ids_rem = sorted(set(subdir_ids_tot) - set(subdir_ids_proc), key=str)
subdir_ids_rem

[]

#  

#### Identify subdirectories with 0 Images_processed, and save them to a list for reprocessing:

In [11]:
# Order the de-duplicated log by Images_processed, renumber the rows, and
# keep only the subdirectories for which no image was processed.
df_p2 = (
    df_p1
    .sort_values(by='Images_processed')
    .reset_index(drop=True)
    .loc[lambda frame: frame['Images_processed'] == 0]
)
df_p2

Unnamed: 0,Roll,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207942,1933-3A,0,0,0,0,0,0.731000,2022-12-23 13:24:02.660561,Roksana,R014207942_1933-3A
1,R014207821,3365-38A,0,0,0,0,0,0.719985,2022-12-23 14:23:16.095674,Roksana,R014207821_3365-38A
2,R014207940F,375,0,0,0,0,0,653.678183,2022-12-21 16:05:08.798327,Roksana,R014207940F_375
3,R014207840,3032-50A,0,0,0,0,0,664.504666,2022-12-21 07:29:40.568063,Rav,R014207840_3032-50A
4,R014207940F,404,0,0,0,0,0,675.182859,2022-12-21 00:51:44.388028,Rav,R014207940F_404
...,...,...,...,...,...,...,...,...,...,...,...
67,R014207844,2932-43B,0,0,0,0,0,16.060883,2023-01-06 17:21:31.104486,Rav,R014207844_2932-43B
68,R014207907F,527,0,0,0,0,0,8.034927,2023-01-06 17:22:43.826910,Rav,R014207907F_527
69,R014207823,3556-43,0,0,0,0,0,25.225622,2023-01-06 17:17:09.192231,Rav,R014207823_3556-43
70,R014207824,3133-8A,0,0,0,0,0,11.606884,2023-01-06 17:17:31.067429,Rav,R014207824_3133-8A


In [10]:
# Save the zero-image subdirectories as the reprocess list (row index omitted).
reprocess_list_path = f'{logDir}reprocess_list.csv'
df_p2.to_csv(reprocess_list_path, index=False)