# Post-Processing (General)

#### Updated: May 15, 2023

#  

Post-processing pipeline after subdirectories are OCR processed:

In [1]:
import os
import pandas as pd

In [17]:
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run2/'
processedDir = rootDir + '04_processed/'
resultDir = rootDir + '05_result/'
logDir = rootDir + '06_log/'

#  

#### Generate processed_inventory - RUN BEFORE starting OCR process step:

In [3]:
df_inventory = pd.DataFrame()
for directory in os.listdir(processedDir):
    print(directory)
    for subdirectory in os.listdir(processedDir + directory):
        n_images = len(os.listdir(processedDir + directory + '/' + subdirectory))
        df_inventory_ = pd.DataFrame({
            'Directory': directory, 
            'Subdirectory': subdirectory,
            'images': n_images,
            'subdir_id': directory + '_' + subdirectory
        }, index=[0])
        df_inventory = pd.concat([df_inventory, df_inventory_], axis=0, ignore_index=True)

R014207711
R014207807
R014207808
R014207810
R014207811
R014207813
R014207817
R014207818
R014207819
R014207820
R014207822
R014207825
R014207826
R014207827
R014207828
R014207829
R014207830
R014207831
R014207833
R014207834
R014207835
R014207836
R014207837
R014207838
R014207839
R014207843
R014207886
R014207892
R014207941
R014207944
R014207945
R014207950
R014207952
R014207961
R014207963
R014207969
R028224481


In [4]:
print(len(df_inventory))
df_inventory.head()

1977


Unnamed: 0,Directory,Subdirectory,images,subdir_id
0,R014207711,24,256,R014207711_24
1,R014207711,25,499,R014207711_25
2,R014207711,27,96,R014207711_27
3,R014207711,28,103,R014207711_28
4,R014207711,29,341,R014207711_29


In [5]:
df_inventory.to_csv(logDir + 'image_inventory_processed.csv', index=False)

#  

#### Concatenate 'OCR pass' results:

In [20]:
df_result = pd.DataFrame()
i = 0
for file in os.listdir(resultDir):
    if 'R' in file:
        directory = file
        for file2 in os.listdir(resultDir + directory + '/'):
            if 'result_OCRpass' in file2:
                fn_parts = file2.split('_')
                subdirectory = fn_parts[2].replace('.csv', '')
                if i > 0:
                    if i % 100 == 0:
                        df_result = pd.read_csv(resultDir + 'result_total.csv')
                        print(len(df_result))
                try:
                    df_load = pd.read_csv(resultDir + directory + '/' + file2, sep=',')
                    n = len(df_load)
                except pd.errors.EmptyDataError:
                    n = 0
                df_result = pd.concat([df_result, df_load])
                i += 1
                if i % 100 == 0:
                    print('Now saving the ' + str(i) + 'th result...')
                    df_result.to_csv(resultDir + 'result_total.csv', index=False)
df_result.to_csv(resultDir + 'result_total.csv', index=False)

#  

#### Reduce columns:

In [21]:
df_result = pd.read_csv(resultDir + 'result_total.csv')
print(len(df_result))
df_result.head()

623


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
0,1.508065,1293.75,,,,,,,,,...,metadata could not be read by OCR,R014207820,3000-43A,10.png,,,,,,
1,1.5,1235.416667,,,,,,,,,...,metadata could not be read by OCR,R014207820,3000-43A,100.png,,,,,,
2,1.5,1235.416667,,,,,,,,,...,metadata could not be read by OCR,R014207820,3000-43A,101.png,,,,,,
3,1.5,1235.416667,,,,,,,,,...,metadata could not be read by OCR,R014207820,3000-43A,102.png,,,,,,
4,1.5,1189.583333,,,,,,,,,...,metadata could not be read by OCR,R014207820,3000-43A,103.png,,,,,,


In [22]:
df_result.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'limits', 'height', 'width',
       'metadata_type', 'meta_height', 'meta_width', 'x_centroids',
       'y_centroids', 'is_dot', 'dict_metadata', 'raw_coord', 'window_coord',
       'mapping_Hz', 'mapping_km', 'details', 'Directory', 'Subdirectory',
       'filename', 'station_number_OCR', 'year_OCR', 'day_of_year_OCR',
       'hour_OCR', 'minute_OCR', 'second_OCR'],
      dtype='object')

In [23]:
df_result = df_result[['Directory', 'Subdirectory', 'filename', 'processed_image_class', 'fmin', 'max_depth', 
                       'station_number_1', 'station_number_2', 'year', 'day_1', 'day_2', 'day_3', 'hour_1', 'hour_2', 
                       'minute_1', 'minute_2', 'second_1', 'second_2', 'satellite_number', 'station_number_OCR', 
                       'year_OCR', 'day_of_year_OCR', 'hour_OCR', 'minute_OCR', 'second_OCR', 'metadata_type', 'is_dot', 
                       'func_name', 'details']] 
df_result.head()

Unnamed: 0,Directory,Subdirectory,filename,processed_image_class,fmin,max_depth,station_number_1,station_number_2,year,day_1,...,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,metadata_type,is_dot,func_name,details
0,R014207820,3000-43A,10.png,loss,1.508065,1293.75,,,,,...,,,,,,,,,,metadata could not be read by OCR
1,R014207820,3000-43A,100.png,loss,1.5,1235.416667,,,,,...,,,,,,,,,,metadata could not be read by OCR
2,R014207820,3000-43A,101.png,loss,1.5,1235.416667,,,,,...,,,,,,,,,,metadata could not be read by OCR
3,R014207820,3000-43A,102.png,loss,1.5,1235.416667,,,,,...,,,,,,,,,,metadata could not be read by OCR
4,R014207820,3000-43A,103.png,loss,1.5,1189.583333,,,,,...,,,,,,,,,,metadata could not be read by OCR


#  

#### Split results into different metadata types (dot, num, num2):

In [24]:
df_result['processed_image_class'].unique()

array(['loss', 'num2', 'outlier'], dtype=object)

In [25]:
df_num2 = df_result.loc[df_result['processed_image_class'] == 'num2']
print(len(df_num2))

17


In [26]:
df_num = df_result.loc[df_result['processed_image_class'] == 'num']
print(len(df_num))

0


In [27]:
df_dot = df_result.loc[df_result['processed_image_class'] == 'dot']
print(len(df_dot))

0


In [28]:
df_outlier = df_result.loc[df_result['processed_image_class'] == 'outlier']
print(len(df_outlier))

17


In [29]:
df_loss = df_result.loc[df_result['processed_image_class'] == 'loss']
print(len(df_loss))

589


In [30]:
len(df_num2) + len(df_num) + len(df_dot) + len(df_outlier) + len(df_loss)

623

#### num2 - check timestamp components, construct timestamp:

In [32]:
df_num2.sort_values('day_of_year_OCR', ascending=False)

Unnamed: 0,Directory,Subdirectory,filename,processed_image_class,fmin,max_depth,station_number_1,station_number_2,year,day_1,...,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,metadata_type,is_dot,func_name,details
185,R014207820,3000-43A,272.png,num2,1.5,1085.416667,,,,,...,43,36.0,532.0,18.0,2,51,,,,
509,R014207827,3822-43A,302.png,num2,1.5,1137.5,,,,,...,43,66.0,389.0,12.0,5,10,,,,
508,R014207827,3822-43A,301.png,num2,1.540323,1400.0,,,,,...,43,66.0,369.0,12.0,5,23,,,,
477,R014207827,3822-43A,274.png,num2,1.71371,1295.833333,,,,,...,43,66.0,369.0,11.0,77,35,,,,
285,R014207827,3822-43A,10.png,num2,1.697581,1400.0,,,,,...,43,66.0,339.0,6.0,2,21,,,,
528,R014207827,3822-43A,32.png,num2,1.697581,1347.916667,,,,,...,43,66.0,244.0,0.0,5,50,,,,
332,R014207827,3822-43A,142.png,num2,1.725806,1400.0,,,,,...,43,66.0,244.0,0.0,2,45,,,,
343,R014207827,3822-43A,152.png,num2,1.512097,1191.666667,,,,,...,43,66.0,244.0,0.0,2,50,,,,
376,R014207827,3822-43A,182.png,num2,1.5,1087.5,,,,,...,43,66.0,244.0,0.0,3,15,,,,
559,R014207827,3822-43A,46.png,num2,1.508065,1231.25,,,,,...,43,66.0,12.0,4.0,46,25,,,,


In [35]:
df_num2['day_of_year_OCR'] = df_num2['day_of_year_OCR'].astype('float')
df_num2['day_of_year_OCR']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_num2['day_of_year_OCR'] = df_num2['day_of_year_OCR'].astype('float')


185    532.0
285    339.0
289      0.0
303      0.0
310      0.0
332    244.0
343    244.0
376    244.0
400      0.0
460      0.0
477    369.0
508    369.0
509    389.0
528    244.0
559     12.0
578      0.0
613      0.0
Name: day_of_year_OCR, dtype: float64

In [36]:
df_num2.loc[df_num2['day_of_year_OCR'] > 366]

Unnamed: 0,Directory,Subdirectory,filename,processed_image_class,fmin,max_depth,station_number_1,station_number_2,year,day_1,...,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,metadata_type,is_dot,func_name,details
185,R014207820,3000-43A,272.png,num2,1.5,1085.416667,,,,,...,43,36.0,532.0,18.0,2,51,,,,
477,R014207827,3822-43A,274.png,num2,1.71371,1295.833333,,,,,...,43,66.0,369.0,11.0,77,35,,,,
508,R014207827,3822-43A,301.png,num2,1.540323,1400.0,,,,,...,43,66.0,369.0,12.0,5,23,,,,
509,R014207827,3822-43A,302.png,num2,1.5,1137.5,,,,,...,43,66.0,389.0,12.0,5,10,,,,


num:

dot:

#  

#### Retrieve station details:

dot, num:

num2:

#  

#### Save: