# Quality Analysis - General

#### Updated: May 17, 2023

#  

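This notebook is the quality analysis pipeline run after phase 2 of Alouette processing (the OCR processing phase). It compares the number of images in the inventory with the number of images classified in the stage 1 and stage 2 results.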

In [11]:
import os
import numpy as np
import pandas as pd
import seaborn as sns

In [12]:
# Root directory of the batch being analysed.
# Set the ALOUETTE_ROOT_DIR environment variable to run this notebook against
# another location; when it is unset or empty, the original path is used.
rootDir = os.environ.get('ALOUETTE_ROOT_DIR') or 'L:/DATA/Alouette_I/BATCH_II_Run2/'
# The directories below (and the file paths built from them in later cells) are
# formed by plain string concatenation, so guarantee exactly one trailing '/'.
rootDir = rootDir.rstrip('/\\') + '/'
resultDir = rootDir + '05_result/'  # result CSVs are read from here
logDir = rootDir + '06_log/'  # image_inventory.csv is read from here

#  

#### Combine BATCH I and II:

#  

#### Stage 1 - Overall Statistics:

In [18]:
# Load the image inventory CSV from the log directory.
df_inventory = pd.read_csv(f'{logDir}image_inventory.csv')
# Print the row count, then show a random 10-row preview as the cell output.
print(df_inventory.shape[0])
df_inventory.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,images,subdir_id
2331,R014207969,1302-18B,343,R014207969_1302-18B
1909,R014207941,1773-1,369,R014207941_1773-1
2131,R014207952,2719-13B,320,R014207952_2719-13B
1637,R014207838,4560-50,131,R014207838_4560-50
281,R014207769,MAY-18-TO-MAY-22-1965,228,R014207769_MAY-18-TO-MAY-22-1965
2232,R014207961,1079-B,368,R014207961_1079-B
927,R014207826,3848-15A,282,R014207826_3848-15A
2071,R014207950,2479-5-B,330,R014207950_2479-5-B
1124,R014207829,4285-43A-2,299,R014207829_4285-43A-2
991,R014207827,3792-69A,321,R014207827_3792-69A


In [20]:
# Total of the inventory's 'images' column; used as the denominator for the
# percentages reported in the Stage 1 statistics.
n_images = df_inventory.loc[:, 'images'].sum()
n_images

726577

In [35]:
# Load the raw stage 1 results from the result directory.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
df_s1 = pd.read_csv(f'{resultDir}result_stage1_raw.csv', low_memory=False)
# Print the row count, then show a random 10-row preview as the cell output.
print(df_s1.shape[0])
df_s1.sample(n=10)

600621


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,raw_coord,window_coord,mapping_Hz,mapping_km,details,Directory,Subdirectory,filename,station_code,station_number
183412,1.604839,1310.638298,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,,,,,,R014207825,3781-50A,49.png,,
570430,1.71371,886.0,1.0,3.0,1.0,1.0,8.0,1.0,7.0,5.0,...,,,,,,R014207969,1298-18B,326.png,,
567024,1.677419,1402.083333,1.0,3.0,2.0,9.0,9.0,0.0,3.0,4.0,...,,,,,,R014207969,1288-5A,137.png,,
193583,1.540323,1400.0,2.0,3.0,5.0,0.0,4.0,1.0,4.0,2.0,...,,,,,,R014207826,3876-69A-3,27.png,,
482790,1.516129,6722.222222,1.0,5.0,1.0,8.0,0.0,0.0,4.0,2.0,...,,,,,,R014207950,2471-3-A,65.png,,
8648,,,,,,,,,,,...,,,,,,R014207807,4976,179.png,,
425302,,,,,,,,,,,...,,,,,"height: 1207,width: 3096",R014207886,22,81.png,,
595347,1.641129,1056.521739,,,,,,,,,...,[[ 541. 15.]\n [ 542. 15.]\n [ 543. 15.]...,[[ 541. 15.]\n [ 542. 15.]\n [ 543. 15.]...,"{1.5: 149, 2.0: 273, 2.5: 335, 3.5: 390, 4.5: ...","{100: 59, 200: 105}",,R028224481,4915,299.png,,
300105,1.5,1022.44898,1.0,2.0,1.0,2.0,0.0,2.0,2.0,1.0,...,,,,,,R014207833,4372-69,264.png,,
575466,,,,,,,,,,,...,,,,,"height: 721,width: 928",R014207969,1313-18B,115.png,,


In [36]:
# List the distinct labels in 'processed_image_class' for the stage 1 results;
# the counts in the next cell filter on these labels.
df_s1['processed_image_class'].unique()

array(['num', 'loss', 'outlier', 'dot'], dtype=object)

In [37]:
# Count stage 1 rows per class by summing boolean masks over the class column.
# 'num' and 'dot' together count as processed.
n_proc = int(df_s1['processed_image_class'].isin(['num', 'dot']).sum())
n_loss = int((df_s1['processed_image_class'] == 'loss').sum())
n_outlier = int((df_s1['processed_image_class'] == 'outlier').sum())
# Inventory images not accounted for by any of the three counts above.
n_unproc = n_images - (n_proc + n_loss + n_outlier)

In [38]:
def _as_percent(count):
    """Return `count` as a percentage of `n_images`, rounded to 2 decimals, with a ' %' suffix."""
    return str(round(((count/n_images)*100), 2)) + ' %'


# Absolute numbers first.
print('# of images: ' + str(n_images))
print('# of images processed after stage 1 processing: ' + str(n_proc))
print('% of images processed after stage 1 processing: ' + _as_percent(n_proc))
print('')
# Total loss is everything that is not processed: unprocessed + 'loss' + 'outlier'.
print("% total loss after stage 1 processing: " + _as_percent(n_unproc + n_loss + n_outlier))
print('% of images unprocessed after stage 1 processing: ' + _as_percent(n_unproc))
print("% of images classified as 'loss' after stage 1 processing: " + _as_percent(n_loss))
print("% of images classified as 'outlier' after stage 1 processing: " + _as_percent(n_outlier))

# of images: 726577
# of images processed after stage 1 processing: 449732
% of images processed after stage 1 processing: 61.9 %

% total loss after stage 1 processing: 38.1 %
% of images unprocessed after stage 1 processing: 17.34 %
% of images classified as 'loss' after stage 1 processing: 13.07 %
% of images classified as 'outlier' after stage 1 processing: 7.7 %


#  

#### Stage 2 - Overall Statistics:

In [34]:
# Load the raw stage 2 results from the result directory.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
df_s2 = pd.read_csv(f'{resultDir}result_stage2_raw.csv', low_memory=False)
# Print the row count, then show a random 10-row preview as the cell output.
print(df_s2.shape[0])
df_s2.sample(n=10)

42466


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,filename,station_code,station_number,rotated_metadata,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
41674,1.919355,1219.565217,,,,,,,,,...,237.png,,,,,,,,,
17095,1.806452,1960.0,,,,,,,,,...,154.png,,,,8.0,67.0,289.0,2.0,5.0,54.0
4645,,,,,,,,,,,...,321.png,,,,,,,,,
2235,1.524194,962.0,,,,,,,,,...,224.png,,,,37.0,68.0,201.0,7.0,22.0,33.0
17496,1.608871,1086.666667,,,,,,,,,...,28.png,,,,,,,,,
14542,1.629032,1347.916667,,,,,,,,,...,114.png,,,,,,,,,
29085,,,,,,,,,,,...,100.png,,,,,,,,,
34555,1.71371,1328.0,1.0,4.0,2.0,5.0,1.0,1.0,7.0,2.0,...,97.png,,,,,,,,,
21230,1.5,1655.882353,,,,,,,,,...,239.png,,,,,,,,,
1493,,,,,,,,,,,...,250.png,,,,,,,,,


In [41]:
# Show every column label of the stage 2 results (the sampled preview above
# elides the middle columns with '...').
df_s2.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'limits', 'height', 'width',
       'metadata_type', 'meta_height', 'meta_width', 'x_centroids',
       'y_centroids', 'is_dot', 'dict_metadata', 'raw_coord', 'window_coord',
       'mapping_Hz', 'mapping_km', 'details', 'Directory', 'Subdirectory',
       'filename', 'station_code', 'station_number', 'rotated_metadata',
       'station_number_OCR', 'year_OCR', 'day_of_year_OCR', 'hour_OCR',
       'minute_OCR', 'second_OCR'],
      dtype='object')

In [40]:
# List the distinct labels in 'processed_image_class' for the stage 2 results;
# the counts in the next cell filter on these labels.
df_s2['processed_image_class'].unique()

array(['num', 'loss', 'outlier', 'num2', 'dot'], dtype=object)

In [42]:
# Count stage 2 rows per class by summing boolean masks over the class column.
# NOTE(review): n_proc, n_loss and n_outlier are also assigned by the Stage 1
# cell; running this cell overwrites those values.
n_num2 = int((df_s2['processed_image_class'] == 'num2').sum())
n_num = int((df_s2['processed_image_class'] == 'num').sum())
n_dot = int((df_s2['processed_image_class'] == 'dot').sum())
n_loss = int((df_s2['processed_image_class'] == 'loss').sum())
n_outlier = int((df_s2['processed_image_class'] == 'outlier').sum())
# 'num2', 'num' and 'dot' together count as processed.
n_proc = n_num2 + n_num + n_dot


6111