# Quality Analysis - General

#### Updated: May 17, 2023

#  

This notebook runs the quality analysis of the Alouette processing pipeline after phase 2 (the OCR processing phase).

In [11]:
import os
import numpy as np
import pandas as pd
import seaborn as sns

In [12]:
# Root folder of the batch under analysis. The default is the original
# machine-specific absolute path; set the ALOUETTE_ROOT_DIR environment
# variable to run the notebook against a copy of the data stored elsewhere.
rootDir = os.environ.get('ALOUETTE_ROOT_DIR') or 'L:/DATA/Alouette_I/BATCH_II_Run2/'
if not rootDir.endswith(('/', '\\')):
    # Paths are built by plain string concatenation, so the root must end
    # with a separator.
    rootDir += '/'
resultDir = rootDir + '05_result/'  # result CSVs are read from here
logDir = rootDir + '06_log/'  # image_inventory.csv is read from here

#  

#### Combine BATCH I and II:

#  

#### Stage 1 - Overall Statistics:

In [18]:
# Image inventory log; its `images` column is summed in a later cell.
inventory_csv = logDir + 'image_inventory.csv'
df_inventory = pd.read_csv(inventory_csv)
print(df_inventory.shape[0])  # row count
df_inventory.sample(n=10)

2416


Unnamed: 0,Directory,Subdirectory,images,subdir_id
2331,R014207969,1302-18B,343,R014207969_1302-18B
1909,R014207941,1773-1,369,R014207941_1773-1
2131,R014207952,2719-13B,320,R014207952_2719-13B
1637,R014207838,4560-50,131,R014207838_4560-50
281,R014207769,MAY-18-TO-MAY-22-1965,228,R014207769_MAY-18-TO-MAY-22-1965
2232,R014207961,1079-B,368,R014207961_1079-B
927,R014207826,3848-15A,282,R014207826_3848-15A
2071,R014207950,2479-5-B,330,R014207950_2479-5-B
1124,R014207829,4285-43A-2,299,R014207829_4285-43A-2
991,R014207827,3792-69A,321,R014207827_3792-69A


In [20]:
# Sum of the inventory's `images` column; later cells divide by this value
# to express their counts as percentages.
n_images = df_inventory.loc[:, 'images'].sum()
n_images

726577

In [35]:
# Raw stage 1 results.
stage1_csv = resultDir + 'result_stage1_raw.csv'
df_s1 = pd.read_csv(stage1_csv, low_memory=False)
print(df_s1.shape[0])  # row count
df_s1.sample(n=10)

600621


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,raw_coord,window_coord,mapping_Hz,mapping_km,details,Directory,Subdirectory,filename,station_code,station_number
183412,1.604839,1310.638298,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,,,,,,R014207825,3781-50A,49.png,,
570430,1.71371,886.0,1.0,3.0,1.0,1.0,8.0,1.0,7.0,5.0,...,,,,,,R014207969,1298-18B,326.png,,
567024,1.677419,1402.083333,1.0,3.0,2.0,9.0,9.0,0.0,3.0,4.0,...,,,,,,R014207969,1288-5A,137.png,,
193583,1.540323,1400.0,2.0,3.0,5.0,0.0,4.0,1.0,4.0,2.0,...,,,,,,R014207826,3876-69A-3,27.png,,
482790,1.516129,6722.222222,1.0,5.0,1.0,8.0,0.0,0.0,4.0,2.0,...,,,,,,R014207950,2471-3-A,65.png,,
8648,,,,,,,,,,,...,,,,,,R014207807,4976,179.png,,
425302,,,,,,,,,,,...,,,,,"height: 1207,width: 3096",R014207886,22,81.png,,
595347,1.641129,1056.521739,,,,,,,,,...,[[ 541. 15.]\n [ 542. 15.]\n [ 543. 15.]...,[[ 541. 15.]\n [ 542. 15.]\n [ 543. 15.]...,"{1.5: 149, 2.0: 273, 2.5: 335, 3.5: 390, 4.5: ...","{100: 59, 200: 105}",,R028224481,4915,299.png,,
300105,1.5,1022.44898,1.0,2.0,1.0,2.0,0.0,2.0,2.0,1.0,...,,,,,,R014207833,4372-69,264.png,,
575466,,,,,,,,,,,...,,,,,"height: 721,width: 928",R014207969,1313-18B,115.png,,


In [36]:
# Distinct outcome labels present in the stage 1 results.
df_s1.loc[:, 'processed_image_class'].unique()

array(['num', 'loss', 'outlier', 'dot'], dtype=object)

In [37]:
# Tally stage 1 outcomes by `processed_image_class`.
s1_class = df_s1['processed_image_class']
n_proc = int(s1_class.isin(['num', 'dot']).sum())  # 'num' and 'dot' both count as processed
n_loss = int(s1_class.eq('loss').sum())
n_outlier = int(s1_class.eq('outlier').sum())
# Inventory images that are not counted in any of the classes above.
n_unproc = n_images - (n_proc + n_outlier + n_loss)

In [38]:
# Headline counts first, then each share as a percentage of all inventory images.
print('# of images: ' + str(n_images))
print('# of images processed after stage 1 processing: ' + str(n_proc))

stage1_shares = [
    ('% of images processed after stage 1 processing: ', n_proc),
    None,  # blank separator line
    ("% total loss after stage 1 processing: ", n_unproc + n_loss + n_outlier),
    ('% of images unprocessed after stage 1 processing: ', n_unproc),
    ("% of images classified as 'loss' after stage 1 processing: ", n_loss),
    ("% of images classified as 'outlier' after stage 1 processing: ", n_outlier),
]
for entry in stage1_shares:
    if entry is None:
        print('')
        continue
    label, count = entry
    print(label + str(round(((count / n_images) * 100), 2)) + ' %')

# of images: 726577
# of images processed after stage 1 processing: 449732
% of images processed after stage 1 processing: 61.9 %

% total loss after stage 1 processing: 38.1 %
% of images unprocessed after stage 1 processing: 17.34 %
% of images classified as 'loss' after stage 1 processing: 13.07 %
% of images classified as 'outlier' after stage 1 processing: 7.7 %


#  

#### Stage 2 - Overall Statistics:

In [72]:
# Raw stage 2 results.
stage2_csv = resultDir + 'result_stage2_raw.csv'
df_s2 = pd.read_csv(stage2_csv, low_memory=False)
print(df_s2.shape[0])  # row count
df_s2.sample(n=10)

43524


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,filename,station_code,station_number,rotated_metadata,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
38371,1.705645,1392.0,4.0,0.0,2.0,6.0,1.0,1.0,4.0,3.0,...,110.png,,,,,,,,,
26979,1.66129,1257.142857,,,,,,,,,...,224.png,,,,50.0,67.0,311.0,14.0,23.0,3.0
8578,1.544355,645.454545,,,,,,,,,...,320.png,,,,,,,,,
18073,1.576613,1112.765957,,,,,,,,,...,215.png,,,,,,,,,
22852,1.733871,1472.093023,,,,,,,,,...,144.png,,,,,,,,,
20442,1.903226,1742.105263,,,,,,,,,...,204.png,,,,,,,,,
11748,1.604839,1093.75,,,,,,,,,...,102.png,,,,,,,,,
21963,1.556452,1434.042553,,,,,,,,,...,133.png,,,,19.0,67.0,79.0,1.0,11.0,47.0
2407,,,,,,,,,,,...,36.png,,,,,,,,,
31077,1.5,1430.769231,1.0,4.0,1.0,4.0,0.0,0.0,6.0,5.0,...,216.png,,,,,,,,,


In [73]:
# List every column of the stage 2 results; the sample preview elides the middle ones.
df_s2.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'limits', 'height', 'width',
       'metadata_type', 'meta_height', 'meta_width', 'x_centroids',
       'y_centroids', 'is_dot', 'dict_metadata', 'raw_coord', 'window_coord',
       'mapping_Hz', 'mapping_km', 'details', 'Directory', 'Subdirectory',
       'filename', 'station_code', 'station_number', 'rotated_metadata',
       'station_number_OCR', 'year_OCR', 'day_of_year_OCR', 'hour_OCR',
       'minute_OCR', 'second_OCR'],
      dtype='object')

In [74]:
# Distinct outcome labels present in the stage 2 results.
df_s2.loc[:, 'processed_image_class'].unique()

array(['num', 'loss', 'outlier', 'num2', 'dot'], dtype=object)

In [75]:
# Tally stage 2 outcomes by `processed_image_class`.
# NOTE(review): `n_loss` and `n_outlier` reuse the names of the stage 1 tallies,
# so the stage 1 summary cell prints different numbers if re-run after this one.
s2_class = df_s2['processed_image_class']
n_num2, n_num, n_dot = (int(s2_class.eq(label).sum()) for label in ('num2', 'num', 'dot'))
n_proc2 = n_num2 + n_num + n_dot  # all three classes count as processed
n_loss = int(s2_class.eq('loss').sum())
n_outlier = int(s2_class.eq('outlier').sum())

In [77]:
# Stage 2 summary; every share is a percentage of all inventory images.
# NOTE(review): the "total loss" row adds `n_unproc`, which is only assigned in
# the stage 1 tally cell, to the stage 2 loss/outlier counts -- confirm this is
# the intended definition.
print('# of images processed after stage 2 processing: ' + str(n_proc2))

stage2_shares = [
    ('% of images processed after stage 2 processing: ', n_proc2),
    None,  # blank separator line
    ("% of images classified as 'num2' after stage 2 processing: ", n_num2),
    ("% of images classified as 'num' after stage 2 processing: ", n_num),
    ("% of images classified as 'dot' after stage 2 processing: ", n_dot),
    None,
    ("% total loss after stage 2 processing: ", n_unproc + n_loss + n_outlier),
    ("% of images classified as 'loss' after stage 2 processing: ", n_loss),
    ("% of images classified as 'outlier' after stage 2 processing: ", n_outlier),
]
for entry in stage2_shares:
    if entry is None:
        print('')
        continue
    label, count = entry
    print(label + str(round(((count / n_images) * 100), 2)) + ' %')

# of images processed after stage 2 processing: 18118
% of images processed after stage 2 processing: 2.49 %

% of images classified as 'num2' after stage 2 processing: 0.87 %
% of images classified as 'num' after stage 2 processing: 1.49 %
% of images classified as 'dot' after stage 2 processing: 0.14 %

% total loss after stage 2 processing: 20.83 %
% of images classified as 'loss' after stage 2 processing: 2.82 %
% of images classified as 'outlier' after stage 2 processing: 0.67 %


In [78]:
# Master result table.
master_csv = resultDir + 'result_master.csv'
df_master = pd.read_csv(master_csv, low_memory=False)
print(df_master.shape[0])  # row count
df_master.sample(n=10)

13826


Unnamed: 0,Directory,Subdirectory,filename,processed_image_class,fmin,max_depth,Timestamp,time_quality,Station_Number,Station_Code,Station_Name,Lat,Lng
6486,R014207941,1796-5B,309.png,num,1.673387,1240.0,1966-06-16 11:24:30,1.0,5.0,QUI,"Quito, Ecuador",0.6S,78.6W
2484,R014207969,1270-5A,377.png,num,1.689516,1441.666667,1965-10-16 20:14:31,1.0,5.0,QUI,"Quito, Ecuador",0.6S,78.6W
1988,R014207969,1270-5A,204.png,num,1.508065,1383.333333,1965-10-15 00:00:00,4.0,5.0,QUI,"Quito, Ecuador",0.6S,78.6W
5529,R014207963,1547-5A,210.png,num,1.782258,1338.461538,1966-03-26 06:17:10,1.0,5.0,QUI,"Quito, Ecuador",0.6S,78.6W
12609,R014207833,4378-15A,225.png,num2,1.71371,1004.0,1967-11-26 14:43:56,1.0,15.0,WNK,"Winkfield, England, UK",51.4N,0.4W
2799,R014207969,1276-5A,185.png,num,1.633065,1350.0,1965-10-17 19:11:58,1.0,5.0,QUI,"Quito, Ecuador",0.6S,78.6W
1486,R014207969,1269-13B,354.png,num,1.5,1153.846154,1965-08-03 13:09:27,1.0,13.0,COL,"College, Fairbanks, AK, USA",64.9N,147.8W
1937,R014207961,1047,115.png,num,1.552419,1121.568627,1965-10-13 12:29:48,1.0,9.0,SOL,Falkland Islands (Islas Malvinas),51.8S,57.9W
2302,R014207969,1270-5A,224.png,num,1.608871,1450.0,1965-10-16 02:47:32,1.0,5.0,QUI,"Quito, Ecuador",0.6S,78.6W
4632,R014207963,1548-1B,27.png,num,1.71371,1292.0,1966-02-23 06:21:15,1.0,1.0,BPT,"Blossom Point, MD, USA",38.4N,77.1W


In [69]:
# Rows that have a station code and a time quality of 1; reported as
# "fully read" in the yield summary.
has_station = df_master['Station_Code'].notna()
is_tq1 = df_master['time_quality'].eq(1)
n_master = int((has_station & is_tq1).sum())
n_master

12968

In [70]:
# Number of master rows at each time quality level (1 through 4).
tq = df_master['time_quality']
n_tq1, n_tq2, n_tq3, n_tq4 = (int(tq.eq(level).sum()) for level in (1, 2, 3, 4))

In [83]:
# Overall yield, then the cumulative yield as each further time quality level
# is included. All percentages are relative to the total inventory image count.
yield_pct = round(((n_master / n_images) * 100), 2)
print('# of images fully read after stage 2 processing: ' + str(n_master))
print('% of images fully read after stage 2 processing (yield): ' + str(yield_pct) + ' %')
print('')

cumulative = 0
for label, level_count in [
    ('% time quality level 1 yield: ', n_tq1),
    ('% up to time quality level 2 yield: ', n_tq2),
    ('% up to time quality level 3 yield: ', n_tq3),
    ('% up to time quality level 4 yield: ', n_tq4),
]:
    cumulative += level_count
    print(label + str(round(((cumulative / n_images) * 100), 2)) + ' %')

# of images fully read after stage 2 processing: 12968
% of images fully read after stage 2 processing (yield): 1.78 %

% time quality level 1 yield: 1.78 %
% up to time quality level 2 yield: 1.79 %
% up to time quality level 3 yield: 1.81 %
% up to time quality level 4 yield: 1.9 %
