# Quality Analysis - General

#### Updated: May 17, 2023

#  

Quality analysis pipeline after phase 2 of Alouette processing (OCR processing phase):

In [73]:
import os
import numpy as np
import pandas as pd
import seaborn as sns

In [74]:
# Root folder of this processing run; the result and log folders live directly beneath it.
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run2/'
resultDir = f'{rootDir}05_result/'
logDir = f'{rootDir}06_log/'

#  

#### Combine BATCH I and II:

#  

#### Stage 1 - Overall Statistics:

In [75]:
# Load the image inventory log and total its 'images' column; this total is the
# denominator for every percentage reported below.
inventory_csv = logDir + 'image_inventory.csv'
df_inventory = pd.read_csv(inventory_csv)
n_images = df_inventory.loc[:, 'images'].sum()

In [76]:
# Raw stage 1 results table (one 'processed_image_class' label per row).
stage1_csv = resultDir + 'result_stage1_raw.csv'
df_s1 = pd.read_csv(stage1_csv, low_memory=False)

In [77]:
# Tally stage 1 outcomes from the class label assigned to each row.
image_class_s1 = df_s1['processed_image_class']
n_proc = int(image_class_s1.isin(['num', 'dot']).sum())  # rows labelled 'num' or 'dot'
n_loss = int((image_class_s1 == 'loss').sum())
n_outlier = int((image_class_s1 == 'outlier').sum())
# Whatever remains of the inventory total after removing the three tallies
# above is counted as unprocessed.
n_unproc = n_images - n_proc - n_outlier - n_loss

In [78]:
# Stage 1 report. Every percentage is taken relative to n_images and rounded to two decimals.
print(f'# of images: {n_images}')
print(f'# of images processed after stage 1 processing: {n_proc}')
print(f'% of images processed after stage 1 processing: {round((n_proc / n_images) * 100, 2)} %')
print()
# Total loss = unprocessed + 'loss' + 'outlier'.
print(f'% total loss after stage 1 processing: {round(((n_unproc + n_loss + n_outlier) / n_images) * 100, 2)} %')
print(f'% of images unprocessed after stage 1 processing: {round((n_unproc / n_images) * 100, 2)} %')
print(f"% of images classified as 'loss' after stage 1 processing: {round((n_loss / n_images) * 100, 2)} %")
print(f"% of images classified as 'outlier' after stage 1 processing: {round((n_outlier / n_images) * 100, 2)} %")

# of images: 726577
# of images processed after stage 1 processing: 449732
% of images processed after stage 1 processing: 61.9 %

% total loss after stage 1 processing: 38.1 %
% of images unprocessed after stage 1 processing: 17.34 %
% of images classified as 'loss' after stage 1 processing: 13.07 %
% of images classified as 'outlier' after stage 1 processing: 7.7 %


Breakdown of 'loss' classifications by originating function:

In [79]:
# Count stage 1 'loss' rows per function name recorded in 'func_name'
# (non-null 'filename' entries), then display the counts largest first.
loss_rows_s1 = df_s1[df_s1['processed_image_class'] == 'loss']
df_loss_bd = loss_rows_s1.groupby('func_name')[['filename']].count()
df_loss_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
func_name,Unnamed: 1_level_1
metadata_translation.determine_leftside_metadata_grid_mapping.extract_centroids_and_determine_type,37872
ionogram_content_extraction.extract_select_parameters.extract_fmin_and_max_depth,9521
image_segmentation.trim_raw_metadata.trimming_metadata,3968
image_segmentation.extract_ionogram_from_scan.extract_ionogram,1


Breakdown of 'outlier' classifications by originating function:

In [80]:
# Count stage 1 'outlier' rows per function name recorded in 'func_name'
# (non-null 'filename' entries), then display the counts largest first.
outlier_rows_s1 = df_s1[df_s1['processed_image_class'] == 'outlier']
df_outlier_bd = outlier_rows_s1.groupby('func_name')[['filename']].count()
df_outlier_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
func_name,Unnamed: 1_level_1
image_segmentation.segment_images_in_subdir.segment_images: iono size outlier,54428
image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier,1511


#  

#### Stage 2 - Overall Statistics:

In [81]:
# Raw stage 2 results table (one 'processed_image_class' label per row).
stage2_csv = resultDir + 'result_stage2_raw.csv'
df_s2 = pd.read_csv(stage2_csv, low_memory=False)

In [82]:
# Tally stage 2 outcomes from the class label assigned to each row.
image_class_s2 = df_s2['processed_image_class']
n_num2 = int((image_class_s2 == 'num2').sum())
n_num = int((image_class_s2 == 'num').sum())
n_dot = int((image_class_s2 == 'dot').sum())
n_proc2 = n_num2 + n_num + n_dot
# NOTE: n_loss and n_outlier are rebound here, replacing the stage 1 values of the same names.
n_loss = int((image_class_s2 == 'loss').sum())
n_outlier = int((image_class_s2 == 'outlier').sum())

In [83]:
# Stage 2 report. Percentages remain relative to the full inventory (n_images), rounded to two decimals.
print(f'# of images processed after stage 2 processing: {n_proc2}')
print(f'% of images processed after stage 2 processing: {round((n_proc2 / n_images) * 100, 2)} %')
print()
print(f"% of images classified as 'num2' after stage 2 processing: {round((n_num2 / n_images) * 100, 2)} %")
print(f"% of images classified as 'num' after stage 2 processing: {round((n_num / n_images) * 100, 2)} %")
print(f"% of images classified as 'dot' after stage 2 processing: {round((n_dot / n_images) * 100, 2)} %")
print()
# NOTE(review): n_unproc is the stage 1 "unprocessed" tally, combined here with the
# stage 2 'loss' and 'outlier' tallies - confirm this is the intended definition of total loss.
print(f'% total loss after stage 2 processing: {round(((n_unproc + n_loss + n_outlier) / n_images) * 100, 2)} %')
print(f"% of images classified as 'loss' after stage 2 processing: {round((n_loss / n_images) * 100, 2)} %")
print(f"% of images classified as 'outlier' after stage 2 processing: {round((n_outlier / n_images) * 100, 2)} %")

# of images processed after stage 2 processing: 25771
% of images processed after stage 2 processing: 3.55 %

% of images classified as 'num2' after stage 2 processing: 1.29 %
% of images classified as 'num' after stage 2 processing: 2.12 %
% of images classified as 'dot' after stage 2 processing: 0.14 %

% total loss after stage 2 processing: 22.14 %
% of images classified as 'loss' after stage 2 processing: 3.86 %
% of images classified as 'outlier' after stage 2 processing: 0.95 %


Breakdown of 'loss' classifications by originating function and reported detail:

In [84]:
# Break down stage 2 'loss' rows by the combination of function name and detail message.
# Filter first and copy afterwards: only the 'loss' subset is duplicated (instead of
# deep-copying the whole stage 2 table), and the new column is added to an independent
# frame so df_s2 is left untouched.
df_s2_loss = df_s2.loc[df_s2['processed_image_class'] == 'loss'].copy()
df_s2_loss['loss_summary'] = df_s2_loss['func_name'] + ', ' + df_s2_loss['details']
# NOTE(review): the concatenation above yields NaN whenever 'func_name' or 'details' is
# missing, and groupby drops NaN keys by default, so such rows do not appear in this table.
df_loss_bd = df_s2_loss.groupby(['loss_summary']).count()[['filename']]
df_loss_bd.sort_values('filename', ascending=False)

Unnamed: 0_level_0,filename
loss_summary,Unnamed: 1_level_1
"image_segmentation.segment_images_in_subdir.segment_images: iono size outlier, OCR read metadata contains letters",171
"ionogram_content_extraction.extract_select_parameters.extract_fmin_and_max_depth, OCR read metadata contains letters",47
"image_segmentation.trim_raw_metadata.trimming_metadata, OCR read metadata contains letters",6
"metadata_translation.determine_leftside_metadata_grid_mapping.extract_centroids_and_determine_type, OCR read metadata contains letters",3
"image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier, OCR read metadata contains letters",1


In [85]:
# Count stage 2 'loss' rows per 'details' message (non-null 'filename' entries),
# then display the counts largest first.
is_loss_s2 = df_s2['processed_image_class'] == 'loss'
df_s2_loss = df_s2.loc[is_loss_s2]
df_loss_bd = df_s2_loss.groupby('details')[['filename']].count()
df_loss_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
details,Unnamed: 1_level_1
metadata could not be read by OCR,13252
OCR read metadata contains letters,1625
metadata was interpreted to be num type,294
metadata was interpreted to be dot type,258


Breakdown of 'outlier' classifications by originating function:

In [86]:
# Count stage 2 'outlier' rows per function name recorded in 'func_name'
# (non-null 'filename' entries), then display the counts largest first.
is_outlier_s2 = df_s2['processed_image_class'] == 'outlier'
df_s2_outlier = df_s2.loc[is_outlier_s2]
df_outlier_bd = df_s2_outlier.groupby('func_name')[['filename']].count()
df_outlier_bd.sort_values(by='filename', ascending=False)

Unnamed: 0_level_0,filename
func_name,Unnamed: 1_level_1
image_segmentation.segment_images_in_subdir.segment_images: iono size outlier,6494
image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier,391


#  

#### Master - Overall Statistics:

In [87]:
# Load the master results table.
master_csv = resultDir + 'result_master.csv'
df_master = pd.read_csv(master_csv, low_memory=False)
# "Fully read" rows: a station code is present and time_quality equals 1.
has_station_code = df_master['Station_Code'].notna()
is_time_quality_1 = df_master['time_quality'] == 1
n_master = int((has_station_code & is_time_quality_1).sum())

In [88]:
# Number of master rows at each time_quality level, 1 through 4.
n_tq1, n_tq2, n_tq3, n_tq4 = (
    int((df_master['time_quality'] == level).sum()) for level in (1, 2, 3, 4)
)

In [89]:
# Master-table report. Percentages are relative to n_images, rounded to two decimals.
print(f'# of images fully read after stage 2 processing: {n_master}')
print(f'% of images fully read after stage 2 processing (yield): {round((n_master / n_images) * 100, 2)} %')
print()
# The "at least level k" figures are cumulative sums over levels 1..k.
print(f'% time quality level 1 yield: {round((n_tq1 / n_images) * 100, 2)} %')
print(f'% at least time quality level 2: {round(((n_tq1 + n_tq2) / n_images) * 100, 2)} %')
print(f'% at least time quality level 3: {round(((n_tq1 + n_tq2 + n_tq3) / n_images) * 100, 2)} %')
print(f'% at least time quality level 4: {round(((n_tq1 + n_tq2 + n_tq3 + n_tq4) / n_images) * 100, 2)} %')

# of images fully read after stage 2 processing: 18741
% of images fully read after stage 2 processing (yield): 2.58 %

% time quality level 1 yield: 2.58 %
% at least time quality level 2: 2.59 %
% at least time quality level 3: 2.61 %
% at least time quality level 4: 2.72 %


#  

#### Data distribution by ground station:

In [99]:
# Work on an independent copy so the added column does not touch df_master.
df2 = df_master.copy()
# Composite key "<station number>_<station code>_<image class>", with the
# station number cast to int before being rendered as text.
df2['Station_Number_Station_Code_processed_image_class'] = (
    df2['Station_Number'].astype(int).astype(str)
    + '_'
    + df2['Station_Code']
    + '_'
    + df2['processed_image_class']
)

In [100]:
# Rows per station number (non-null 'filename' entries), largest first; show the top 10.
stn_counts = df2.groupby('Station_Number')[['filename']].count()
df_master_by_stn = stn_counts.sort_values(by='filename', ascending=False)
df_master_by_stn.head(10)

Unnamed: 0_level_0,filename
Station_Number,Unnamed: 1_level_1
5.0,3491
50.0,2059
13.0,1994
1.0,1880
43.0,1367
8.0,1053
3.0,949
69.0,824
12.0,654
15.0,524


In [101]:
# Rows per station name (non-null 'filename' entries), largest first; show the top 10.
stnname_counts = df2.groupby('Station_Name')[['filename']].count()
df_master_by_stnname = stnname_counts.sort_values(by='filename', ascending=False)
df_master_by_stnname.head(10)

Unnamed: 0_level_0,filename
Station_Name,Unnamed: 1_level_1
"Quito, Ecuador",3829
"Ottawa, Canada",2078
"College, Fairbanks, AK, USA",1981
"Blossom Point, MD, USA",1769
"Resolute Bay, NT, Canada",1487
"Santiago, Chile",998
"St. John's, NL, Canada",972
"Fort Myers, FL, USA",931
"Tromso, Norway",876
Falkland Islands (Islas Malvinas),875


In [102]:
# Rows per composite station/class key (non-null 'filename' entries), largest first; show the top 20.
stn_code_counts = df2.groupby('Station_Number_Station_Code_processed_image_class')[['filename']].count()
df_master_by_stn_code = stn_code_counts.sort_values(by='filename', ascending=False)
df_master_by_stn_code.head(20)

Unnamed: 0_level_0,filename
Station_Number_Station_Code_processed_image_class,Unnamed: 1_level_1
5_QUI_num,3465
50_OTT_num2,1963
1_BPT_num,1760
13_COL_num,1734
43_RES_num2,1358
69_TRO_num2,823
3_FTM_num,786
8_SNT_num,736
15_WNK_num2,521
19_ULA_num2,456
