# Quality Analysis (General)

#### Updated: May 19, 2023

#  

Quality analysis pipeline after phase 2 of Alouette processing (OCR processing phase):

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Base data directory; the result and log folders are addressed relative to it.
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run2/'
resultDir = f'{rootDir}05_result/'
logDir = f'{rootDir}06_log/'

#  

#### Combine BATCH I and II:

#  

#### Stage 1 - Overall Statistics:

In [3]:
# Read the image inventory from the log folder and total its 'images' column.
df_inventory = pd.read_csv(f'{logDir}image_inventory.csv')
n_images = df_inventory.loc[:, 'images'].sum()

In [4]:
# Raw stage-1 results; low_memory=False makes pandas read the file in one pass.
df_s1 = pd.read_csv(f'{resultDir}result_stage1_raw.csv', low_memory=False)

In [5]:
# Tally stage-1 outcomes from the 'processed_image_class' column.
_s1_class = df_s1['processed_image_class']
n_proc = int(_s1_class.isin(['num', 'dot']).sum())  # 'num' and 'dot' both count as processed
n_loss = int((_s1_class == 'loss').sum())
n_outlier = int((_s1_class == 'outlier').sum())
# Inventory images that fall in none of the classes counted above.
n_unproc = n_images - n_proc - n_outlier - n_loss

In [6]:
def _pct_of_images(count):
    """Return `count` as a percentage of n_images, rounded to 2 decimals, as text."""
    return str(round(((count / n_images) * 100), 2))


# Stage-1 summary: processed share first, then the loss components.
print(f'# of images: {n_images}')
print(f'# of images processed after stage 1 processing: {n_proc}')
print(f'% of images processed after stage 1 processing: {_pct_of_images(n_proc)} %')
print()
print(f"% total loss after stage 1 processing: {_pct_of_images(n_unproc + n_loss + n_outlier)} %")
print(f'% of images unprocessed after stage 1 processing: {_pct_of_images(n_unproc)} %')
print(f"% of images classified as 'loss' after stage 1 processing: {_pct_of_images(n_loss)} %")
print(f"% of images classified as 'outlier' after stage 1 processing: {_pct_of_images(n_outlier)} %")

# of images: 726577
# of images processed after stage 1 processing: 449732
% of images processed after stage 1 processing: 61.9 %

% total loss after stage 1 processing: 38.1 %
% of images unprocessed after stage 1 processing: 17.34 %
% of images classified as 'loss' after stage 1 processing: 13.07 %
% of images classified as 'outlier' after stage 1 processing: 7.7 %


Breakdown 'loss' type:

In [7]:
# Count stage-1 'loss' images per originating function, largest group first.
_s1_loss = df_s1[df_s1['processed_image_class'] == 'loss']
df_loss_bd = _s1_loss.groupby(['func_name'])[['filename']].count()
df_loss_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
func_name,Unnamed: 1_level_1
metadata_translation.determine_leftside_metadata_grid_mapping.extract_centroids_and_determine_type,37872
ionogram_content_extraction.extract_select_parameters.extract_fmin_and_max_depth,9521
image_segmentation.trim_raw_metadata.trimming_metadata,3968
image_segmentation.extract_ionogram_from_scan.extract_ionogram,1


Breakdown 'outlier' type:

In [8]:
# Count stage-1 'outlier' images per originating function, largest group first.
_s1_outlier = df_s1[df_s1['processed_image_class'] == 'outlier']
df_outlier_bd = _s1_outlier.groupby(['func_name'])[['filename']].count()
df_outlier_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
func_name,Unnamed: 1_level_1
image_segmentation.segment_images_in_subdir.segment_images: iono size outlier,54428
image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier,1511


#  

#### Stage 2 - Overall Statistics:

In [9]:
# Raw stage-2 results; low_memory=False makes pandas read the file in one pass.
df_s2 = pd.read_csv(f'{resultDir}result_stage2_raw.csv', low_memory=False)

In [10]:
# Tally stage-2 outcomes per class; classes absent from the data count as 0.
_s2_counts = df_s2['processed_image_class'].value_counts()
n_num2 = int(_s2_counts.get('num2', 0))
n_num = int(_s2_counts.get('num', 0))
n_dot = int(_s2_counts.get('dot', 0))
n_proc2 = n_num2 + n_num + n_dot
# NOTE(review): n_loss and n_outlier rebind the names used for the stage-1
# tallies, so re-running the stage-1 report after this cell shows stage-2 values.
n_loss = int(_s2_counts.get('loss', 0))
n_outlier = int(_s2_counts.get('outlier', 0))

In [11]:
def _pct_of_images(count):
    """Return `count` as a percentage of n_images, rounded to 2 decimals, as text."""
    return str(round(((count / n_images) * 100), 2))


# Stage-2 summary: processed share, its per-class split, then the loss components.
print(f'# of images processed after stage 2 processing: {n_proc2}')
print(f'% of images processed after stage 2 processing: {_pct_of_images(n_proc2)} %')
print()
print(f"% of images classified as 'num2' after stage 2 processing: {_pct_of_images(n_num2)} %")
print(f"% of images classified as 'num' after stage 2 processing: {_pct_of_images(n_num)} %")
print(f"% of images classified as 'dot' after stage 2 processing: {_pct_of_images(n_dot)} %")
print()
# NOTE(review): n_unproc is not recomputed for stage 2 in this notebook, so the
# total below mixes the stage-1 unprocessed count with stage-2 loss/outlier
# counts -- confirm this is the intended definition of "total loss".
print(f"% total loss after stage 2 processing: {_pct_of_images(n_unproc + n_loss + n_outlier)} %")
print(f"% of images classified as 'loss' after stage 2 processing: {_pct_of_images(n_loss)} %")
print(f"% of images classified as 'outlier' after stage 2 processing: {_pct_of_images(n_outlier)} %")

# of images processed after stage 2 processing: 79146
% of images processed after stage 2 processing: 10.89 %

% of images classified as 'num2' after stage 2 processing: 4.16 %
% of images classified as 'num' after stage 2 processing: 6.39 %
% of images classified as 'dot' after stage 2 processing: 0.34 %

% total loss after stage 2 processing: 31.74 %
% of images classified as 'loss' after stage 2 processing: 11.86 %
% of images classified as 'outlier' after stage 2 processing: 2.55 %


Breakdown 'loss' type:

In [12]:
# Work on an independent copy of the stage-2 'loss' rows so df_s2 is not modified.
_is_s2_loss = df_s2['processed_image_class'] == 'loss'
df_s2_loss = df_s2[_is_s2_loss].copy(deep=True)

# Combine the originating function and the detail text into one grouping key.
# NOTE(review): the concatenation is NaN wherever func_name or details is
# missing, and groupby drops NaN keys by default, so those rows are not counted.
df_s2_loss['loss_summary'] = df_s2_loss['func_name'] + ', ' + df_s2_loss['details']
df_loss_bd = df_s2_loss.groupby(['loss_summary'])[['filename']].count()
df_loss_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
loss_summary,Unnamed: 1_level_1
"image_segmentation.segment_images_in_subdir.segment_images: iono size outlier, OCR read metadata contains letters",374
"ionogram_content_extraction.extract_select_parameters.extract_fmin_and_max_depth, OCR read metadata contains letters",80
"metadata_translation.determine_leftside_metadata_grid_mapping.extract_centroids_and_determine_type, OCR read metadata contains letters",15
"image_segmentation.trim_raw_metadata.trimming_metadata, OCR read metadata contains letters",9
"image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier, OCR read metadata contains letters",2


In [13]:
# Count stage-2 'loss' images per 'details' message, largest group first.
df_s2_loss = df_s2[df_s2['processed_image_class'].eq('loss')]
df_loss_bd = df_s2_loss.groupby(['details'])[['filename']].count()
df_loss_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
details,Unnamed: 1_level_1
metadata could not be read by OCR,45139
OCR read metadata contains letters,5173
metadata was interpreted to be dot type,1052
metadata was interpreted to be num type,878


Breakdown 'outlier' type:

In [14]:
# Count stage-2 'outlier' images per originating function, largest group first.
df_s2_outlier = df_s2[df_s2['processed_image_class'].eq('outlier')]
df_outlier_bd = df_s2_outlier.groupby(['func_name'])[['filename']].count()
df_outlier_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
func_name,Unnamed: 1_level_1
image_segmentation.segment_images_in_subdir.segment_images: iono size outlier,17947
image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier,567


#  

#### Master - Overall Statistics:

In [15]:
# Load the master result table.
df_master = pd.read_csv(f'{resultDir}result_master.csv', low_memory=False)

# Count rows that have a station code and a time_quality of exactly 1.
_has_station_code = df_master['Station_Code'].notna()
_is_time_quality_1 = df_master['time_quality'] == 1
n_master = int((_has_station_code & _is_time_quality_1).sum())

In [16]:
# Number of master rows at each time_quality level 1 through 4.
_time_quality = df_master['time_quality']
n_tq1, n_tq2, n_tq3, n_tq4 = (
    int((_time_quality == _level).sum()) for _level in (1, 2, 3, 4)
)

In [17]:
def _pct_of_images(count):
    """Return `count` as a percentage of n_images, rounded to 2 decimals, as text."""
    return str(round(((count / n_images) * 100), 2))


# Overall yield, followed by cumulative yield per time-quality level.
print(f'# of images fully read after stage 2 processing: {n_master}')
print(f'% of images fully read after stage 2 processing (yield): {_pct_of_images(n_master)} %')
print()
print(f'% time quality level 1 yield: {_pct_of_images(n_tq1)} %')
_cumulative = n_tq1
for _level, _count in ((2, n_tq2), (3, n_tq3), (4, n_tq4)):
    # Each line adds the next level's count to the running total.
    _cumulative = _cumulative + _count
    print(f'% at least time quality level {_level}: {_pct_of_images(_cumulative)} %')

# of images fully read after stage 2 processing: 58606
% of images fully read after stage 2 processing (yield): 8.07 %

% time quality level 1 yield: 8.07 %
% at least time quality level 2: 8.09 %
% at least time quality level 3: 8.15 %
% at least time quality level 4: 8.47 %


#  

#### Data distribution by ground station:

In [18]:
# Independent copy of the master table, so the extra key column stays off df_master.
df2 = df_master.copy(deep=True)

# Composite key "<station number>_<station code>_<image class>".
# The int cast removes the ".0" before the number is turned into text.
_stn_number_txt = df2['Station_Number'].astype(int).astype(str)
_stn_code = df2['Station_Code']
_image_class = df2['processed_image_class']
df2['Station_Number_Station_Code_processed_image_class'] = (
    _stn_number_txt + '_' + _stn_code + '_' + _image_class
)

In [19]:
# Master rows per station number, busiest station first; show the top 10.
_per_stn = df2.groupby(['Station_Number'])[['filename']].count()
df_master_by_stn = _per_stn.sort_values(by='filename', ascending=False)
df_master_by_stn.head(n=10)

Unnamed: 0_level_0,filename
Station_Number,Unnamed: 1_level_1
5.0,10583
13.0,6424
1.0,6240
50.0,5887
43.0,4952
8.0,2680
12.0,2576
3.0,2541
53.0,2506
69.0,2292


In [22]:
# Master rows per station name, busiest station first; show up to 50.
_per_stn_name = df2.groupby(['Station_Name'])[['filename']].count()
df_master_by_stnname = _per_stn_name.sort_values(by='filename', ascending=False)
df_master_by_stnname.head(n=50)

Unnamed: 0_level_0,filename
Station_Name,Unnamed: 1_level_1
"Quito, Ecuador",11211
"College, Fairbanks, AK, USA",6098
"Ottawa, Canada",6049
"Resolute Bay, NT, Canada",5620
"Blossom Point, MD, USA",5608
Falkland Islands (Islas Malvinas),3333
"Winkfield, England, UK",2848
"Kano, Nigeria",2506
"Santiago, Chile",2503
"Tromso, Norway",2501


In [21]:
# Master rows per (station number, station code, image class) key; show the top 20.
_per_stn_key = df2.groupby(['Station_Number_Station_Code_processed_image_class'])[['filename']].count()
df_master_by_stn_code = _per_stn_key.sort_values(by='filename', ascending=False)
df_master_by_stn_code.head(n=20)

Unnamed: 0_level_0,filename
Station_Number_Station_Code_processed_image_class,Unnamed: 1_level_1
5_QUI_num,10064
13_COL_num,5806
50_OTT_num2,5751
1_BPT_num,5572
43_RES_num2,4919
53_KNO_num2,2479
69_TRO_num2,2291
15_WNK_num2,1767
9_SOL_num,1760
3_FTM_num,1583


- show geographic distribution by metadata type
- show geographic distribution by year

- show how each ground station used the same metadata type over time (metadata composition by ground station over time)


- Matplotlib - basemap
- cartopy
- ipyleaflet (interactive)
- plotly 

#  

#### Construct process_summary:

Load data:

In [23]:
# Row count of the inventory, then a preview of its first rows.
print(df_inventory.shape[0])
df_inventory.head(n=5)

2416


Unnamed: 0,Directory,Subdirectory,images,subdir_id
0,R014207709,145,53,R014207709_145
1,R014207709,146,63,R014207709_146
2,R014207709,147,50,R014207709_147
3,R014207709,148,16,R014207709_148
4,R014207709,149,28,R014207709_149


In [24]:
# Stage-1 process log: a subdirectory can appear more than once.
df_process_log = pd.read_csv(f'{logDir}process_log.csv')
print(df_process_log.shape[0])

# Keep only the latest entry per subdirectory: sort by timestamp, keep the last.
df_p1 = (
    df_process_log
    .sort_values(by='Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
print(df_p1.shape[0])
df_p1.head(n=5)

2487
2417


Unnamed: 0,Directory,Subdirectory,Images_processed,Images_dot,Images_num,Images_loss,Images_outlier,Process_time,Process_timestamp,User,subdir_id
0,R014207837,4013-50,162.0,0.0,150.0,4.0,8.0,277.122729,2023-05-13 11:13:35.969795,Rav Super3,R014207837_4013-50
1,R014207769,JUNE-28-TO-JULY-6-1965,0.0,0.0,0.0,0.0,0.0,214.129275,2023-05-13 11:13:50.695690,Rav Super5,R014207769_JUNE-28-TO-JULY-6-1965
2,R014207820,2993-43A,16.0,0.0,12.0,0.0,4.0,232.333798,2023-05-13 11:15:58.948288,Rav Super8,R014207820_2993-43A
3,R014207813,5037,339.0,0.0,0.0,278.0,61.0,267.199561,2023-05-13 11:17:12.488731,Rav Super9,R014207813_5037
4,R014207822,3436-50-A,267.0,0.0,257.0,0.0,10.0,541.358854,2023-05-13 11:23:17.637033,Rav Super5,R014207822_3436-50-A


In [25]:
# OCR process log, reduced to the latest entry per subdirectory.
df_process_log2 = pd.read_csv(f'{logDir}process_log_OCR.csv')
print(df_process_log2.shape[0])

df_p2 = (
    df_process_log2
    .sort_values(by='Process_timestamp', ascending=True)
    .drop_duplicates(subset=['subdir_id'], keep='last')
)
print(df_p2.shape[0])
df_p2.head(n=5)

626
626


Unnamed: 0,Directory,Subdirectory,Process_time,Process_timestamp,User,subdir_id
0,R014207827,3822-43A,990.21604,2023-05-15 18:06:19.824654,Rav Super1,R014207827_3822-43A
1,R014207820,3000-43A,1062.28344,2023-05-15 18:24:02.150093,Rav Super1,R014207820_3000-43A
2,R014207835,4060-19,940.294587,2023-05-15 18:39:42.472678,Rav Super1,R014207835_4060-19
3,R014207963,1544-1B,814.29774,2023-05-15 18:53:16.795418,Rav Super1,R014207963_1544-1B
4,R014207711,39,384.984926,2023-05-15 18:59:41.800343,Rav Super1,R014207711_39


In [26]:
# Raw post-processing results; low_memory=False makes pandas read the file in one pass.
df_pp = pd.read_csv(f'{resultDir}result_postprocess_raw.csv', low_memory=False)

Construct process_summary:

In [27]:
# Attach the stage-1 log counts and timing to every inventory row.
# Left join: inventory rows without a log entry are kept, with NaN values.
_p1_cols = ['Images_processed', 'Images_loss', 'Images_outlier', 'Process_time', 'subdir_id']
df_sum1 = pd.merge(df_inventory, df_p1[_p1_cols], how='left', on='subdir_id').rename(
    columns={
        'images': 'n_images',
        'Images_loss': 'n_stage1_loss',
        'Images_outlier': 'n_stage1_outlier',
        'Process_time': 'process_time_stage1',
    }
)

# Processed = logged images minus those logged as loss or outlier.
df_sum1['n_stage1_processed'] = (
    df_sum1['Images_processed'] - df_sum1['n_stage1_loss'] - df_sum1['n_stage1_outlier']
)
# Unprocessed = inventory images that the log does not account for.
df_sum1['n_stage1_unprocessed'] = df_sum1['n_images'] - df_sum1['Images_processed']

_sum1_cols = [
    'Directory', 'Subdirectory', 'n_images', 'n_stage1_processed', 'n_stage1_unprocessed',
    'n_stage1_loss', 'n_stage1_outlier', 'process_time_stage1', 'subdir_id',
]
df_sum1 = df_sum1[_sum1_cols]
print(df_sum1.shape[0])
df_sum1.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,n_images,n_stage1_processed,n_stage1_unprocessed,n_stage1_loss,n_stage1_outlier,process_time_stage1,subdir_id
1067,R014207828,4409-69A,296,258.0,0.0,5.0,33.0,1081.18384,R014207828_4409-69A
2096,R014207950,2504-5-B,337,315.0,1.0,7.0,14.0,1588.778542,R014207950_2504-5-B
896,R014207825,3757-38A,262,207.0,0.0,47.0,8.0,637.641968,R014207825_3757-38A
1528,R014207837,3970-50,337,51.0,0.0,259.0,27.0,240.495369,R014207837_3970-50
263,R014207769,JULY-30-TO-AUG-8-1965,233,0.0,233.0,0.0,0.0,147.650146,R014207769_JULY-30-TO-AUG-8-1965
960,R014207826,3881-69A,292,283.0,0.0,0.0,9.0,880.294278,R014207826_3881-69A
181,R014207766,APR-24-TO-MAY-1-1965,220,0.0,220.0,0.0,0.0,734.164482,R014207766_APR-24-TO-MAY-1-1965
2005,R014207945,2233-18B,320,300.0,0.0,6.0,14.0,678.772549,R014207945_2233-18B
146,R014207763,JULY-29-TO-AUG-6-1965,427,0.0,427.0,0.0,0.0,217.824506,R014207763_JULY-29-TO-AUG-6-1965
600,R014207817,4177-03A,266,187.0,61.0,0.0,18.0,481.455736,R014207817_4177-03A


In [28]:
# Add the OCR (stage-2) processing time; subdirectories absent from the OCR log get NaN.
_p2_cols = ['Process_time', 'subdir_id']
df_sum2 = pd.merge(df_sum1, df_p2[_p2_cols], how='left', on='subdir_id').rename(
    columns={'Process_time': 'process_time_stage2'}
)

_sum2_cols = [
    'Directory', 'Subdirectory', 'n_images', 'n_stage1_processed', 'n_stage1_unprocessed',
    'n_stage1_loss', 'n_stage1_outlier', 'process_time_stage1', 'process_time_stage2',
    'subdir_id',
]
df_sum2 = df_sum2[_sum2_cols]
print(df_sum2.shape[0])
df_sum2.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,n_images,n_stage1_processed,n_stage1_unprocessed,n_stage1_loss,n_stage1_outlier,process_time_stage1,process_time_stage2,subdir_id
900,R014207825,3761-69A-2,315,298.0,0.0,4.0,13.0,847.905597,,R014207825_3761-69A-2
1266,R014207831,3948-3A,332,293.0,0.0,4.0,35.0,447.434135,,R014207831_3948-3A
1340,R014207833,4382-50,243,230.0,0.0,2.0,11.0,1409.015295,,R014207833_4382-50
1835,R014207892,109,116,0.0,116.0,0.0,0.0,195.888027,,R014207892_109
474,R014207811,5071,377,1.0,0.0,326.0,50.0,468.059904,,R014207811_5071
1792,R014207886,B-2492,256,0.0,256.0,0.0,0.0,163.713271,,R014207886_B-2492
328,R014207807,4956,256,0.0,256.0,0.0,0.0,249.534003,,R014207807_4956
481,R014207811,5078,388,2.0,0.0,354.0,32.0,289.832513,,R014207811_5078
1025,R014207827,3826-43A,287,281.0,1.0,0.0,5.0,877.006061,,R014207827_3826-43A
370,R014207808,4578-19,288,249.0,2.0,4.0,33.0,2059.647795,,R014207808_4578-19


In [29]:
# Build the same subdirectory key on the stage-2 results that the summary table uses.
df_s2['subdir_id'] = df_s2['Directory'] + '_' + df_s2['Subdirectory']

# Count stage-2 outcomes for every subdirectory in one grouped pass.
# The earlier per-subdirectory loop rescanned all of df_s2 and grew a frame with
# pd.concat on each iteration. It also failed on the merge below when no
# subdirectory matched, because the accumulator then had no 'subdir_id' column.
# Only subdirectories present in df_s2 get a row; the left merge leaves the rest NaN.
_s2_class = df_s2['processed_image_class']
df_sum3_ = (
    pd.DataFrame({
        'subdir_id': df_s2['subdir_id'],
        # 'num2', 'num' and 'dot' all count as processed.
        'n_stage2_processed': _s2_class.isin(['num2', 'num', 'dot']).astype(int),
        'n_stage2_loss': (_s2_class == 'loss').astype(int),
        'n_stage2_outlier': (_s2_class == 'outlier').astype(int),
    })
    .groupby('subdir_id', as_index=False)
    .sum()
)

df_sum3 = df_sum2.merge(df_sum3_, how='left', on='subdir_id')
df_sum3 = df_sum3[['Directory', 'Subdirectory', 'n_images', 'n_stage1_processed', 'n_stage1_unprocessed', 'n_stage1_loss',
                  'n_stage1_outlier', 'process_time_stage1', 'process_time_stage2', 'n_stage2_processed', 'n_stage2_loss',
                  'n_stage2_outlier', 'subdir_id']]

In [30]:
# Row count of the stage-2 summary, then 10 randomly sampled rows.
print(df_sum3.shape[0])
df_sum3.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,n_images,n_stage1_processed,n_stage1_unprocessed,n_stage1_loss,n_stage1_outlier,process_time_stage1,process_time_stage2,n_stage2_processed,n_stage2_loss,n_stage2_outlier,subdir_id
1914,R014207941,1778-1B,336,293.0,3.0,12.0,28.0,1030.206842,7069.185497,293.0,12.0,28.0,R014207941_1778-1B
2171,R014207952,2759-4A,366,338.0,16.0,0.0,12.0,831.169161,,,,,R014207952_2759-4A
522,R014207813,4998,321,0.0,0.0,276.0,45.0,207.490443,,,,,R014207813_4998
1921,R014207941,1786-3B,314,291.0,8.0,0.0,15.0,710.814716,355.606185,291.0,0.0,15.0,R014207941_1786-3B
982,R014207826,3903-12,353,256.0,1.0,0.0,96.0,372.495427,,,,,R014207826_3903-12
400,R014207808,4608-72,270,205.0,1.0,20.0,44.0,1318.249937,,,,,R014207808_4608-72
1022,R014207827,3823-43A,363,352.0,0.0,0.0,11.0,1265.709869,,,,,R014207827_3823-43A
424,R014207810,5114,213,0.0,0.0,195.0,18.0,252.762091,,,,,R014207810_5114
1537,R014207837,3979-50A,323,299.0,0.0,4.0,20.0,741.568211,2108.513434,89.0,215.0,19.0,R014207837_3979-50A
560,R014207813,5038,377,76.0,223.0,20.0,58.0,318.863429,2129.635514,76.0,20.0,58.0,R014207813_5038


In [None]:
# Build the same subdirectory key on the post-process results that the summary table uses.
df_pp['subdir_id'] = df_pp['Directory'] + '_' + df_pp['Subdirectory']

# Count post-process outcomes for every subdirectory in one grouped pass.
# The earlier per-subdirectory loop rescanned all of df_pp and grew a frame with
# pd.concat on each iteration. It also failed on the merge below when no
# subdirectory matched, because the accumulator then had no 'subdir_id' column.
# Only subdirectories present in df_pp get a row; the left merge leaves the rest NaN.
_pp_class = df_pp['processed_image_class']
df_sum4_ = (
    pd.DataFrame({
        'subdir_id': df_pp['subdir_id'],
        # 'num2', 'num' and 'dot' all count as processed.
        'n_postprocess_processed': _pp_class.isin(['num2', 'num', 'dot']).astype(int),
        'n_postprocess_loss': (_pp_class == 'loss').astype(int),
        'n_postprocess_outlier': (_pp_class == 'outlier').astype(int),
    })
    .groupby('subdir_id', as_index=False)
    .sum()
)

df_sum4 = df_sum3.merge(df_sum4_, how='left', on='subdir_id')
df_sum4 = df_sum4[['Directory', 'Subdirectory', 'n_images', 'n_stage1_processed', 'n_stage1_unprocessed', 'n_stage1_loss',
                  'n_stage1_outlier', 'process_time_stage1', 'process_time_stage2', 'n_stage2_processed', 'n_stage2_loss',
                  'n_stage2_outlier', 'n_postprocess_processed', 'n_postprocess_loss', 'n_postprocess_outlier', 'subdir_id']]

In [None]:
# Row count of the post-process summary, then 10 randomly sampled rows.
print(df_sum4.shape[0])
df_sum4.sample(n=10)

In [None]:
# Build the same subdirectory key on the master table that the summary table uses.
df_master['subdir_id'] = df_master['Directory'] + '_' + df_master['Subdirectory']

# Count master rows (total and per image class) for every subdirectory in one
# grouped pass.
# The earlier per-subdirectory loop rescanned all of df_master and grew a frame
# with pd.concat on each iteration. It also failed on the merge below when no
# subdirectory matched, because the accumulator then had no 'subdir_id' column.
# Only subdirectories present in df_master get a row; the left merge leaves the rest NaN.
_master_class = df_master['processed_image_class']
df_sum5_ = (
    pd.DataFrame({
        'subdir_id': df_master['subdir_id'],
        # Every master row contributes 1 to the subdirectory total.
        'n_master': 1,
        'n_master_num2': (_master_class == 'num2').astype(int),
        'n_master_num': (_master_class == 'num').astype(int),
        'n_master_dot': (_master_class == 'dot').astype(int),
    })
    .groupby('subdir_id', as_index=False)
    .sum()
)

df_sum5 = df_sum4.merge(df_sum5_, how='left', on='subdir_id')
df_sum5 = df_sum5[['Directory', 'Subdirectory', 'n_images', 'n_stage1_processed', 'n_stage1_unprocessed', 'n_stage1_loss',
                  'n_stage1_outlier', 'process_time_stage1', 'process_time_stage2', 'n_stage2_processed', 'n_stage2_loss',
                  'n_stage2_outlier', 'n_postprocess_processed', 'n_postprocess_loss', 'n_postprocess_outlier',
                  'n_master', 'n_master_num2', 'n_master_num', 'n_master_dot', 'subdir_id']]

In [None]:
# Row count of the full process summary, then 10 randomly sampled rows.
print(df_sum5.shape[0])
df_sum5.sample(n=10)

In [None]:
# Save the summary table to the log folder without the row index.
df_sum5.to_csv(f'{logDir}process_summary.csv', index=False)

#  

#### Plot histograms to characterize quality of processed subdirectories:

In [None]:
# Reload the saved process summary from the log folder and preview it.
df_summary = pd.read_csv(f'{logDir}process_summary.csv')
df_summary.head(n=5)

In [None]:
# Ratio of n_stage1_processed to n_images per subdirectory, plotted as a histogram.
# The stored value is a fraction; stat='percent' only scales the bar heights.
df_summary['pc_stage1_processed'] = df_summary['n_stage1_processed'].div(df_summary['n_images'])
sns.histplot(df_summary, x='pc_stage1_processed', stat='percent')

In [None]:
# Ratio of n_stage2_processed to n_images per subdirectory, plotted as a histogram.
# The stored value is a fraction; stat='percent' only scales the bar heights.
df_summary['pc_stage2_processed'] = df_summary['n_stage2_processed'].div(df_summary['n_images'])
sns.histplot(df_summary, x='pc_stage2_processed', stat='percent')

In [None]:
# Ratio of n_postprocess_processed to n_images per subdirectory, plotted as a histogram.
# The stored value is a fraction; stat='percent' only scales the bar heights.
df_summary['pc_postprocess_processed'] = df_summary['n_postprocess_processed'].div(df_summary['n_images'])
sns.histplot(df_summary, x='pc_postprocess_processed', stat='percent')

In [None]:
# Ratio of n_master to n_images per subdirectory, plotted as a histogram.
# The stored value is a fraction; stat='percent' only scales the bar heights.
df_summary['yield'] = df_summary['n_master'].div(df_summary['n_images'])
sns.histplot(df_summary, x='yield', stat='percent')