# OCR Read 'num2' Metadata

#### Updated: Apr 4, 2023

#  

Test post-processing a subdirectory: first detect whether each image is of the 'num2' metadata type, then read and record that metadata.

In [None]:
import pandas as pd
import numpy as np
import os
import time
import keras_ocr

In [None]:
# Build the keras-ocr pipeline once; the batch-processing cell calls
# pipeline.recognize() on it for each batch of images.
pipeline = keras_ocr.pipeline.Pipeline()

In [None]:
# Root folder of the batch run, and the folder the result CSV files are
# read from and written to.
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
resultDir = '{}05_result/'.format(rootDir)

#  

#### Functions:

In [None]:
def read_num2_metadata(prediction_groups, subdir_path, filenames=None, offset=None):
    """Extract 'num2' metadata fields from one batch of OCR predictions.

    An image is accepted only when exactly six words were recognised in it
    and, after ordering the words by increasing x, the first word is
    ``'10'``. The remaining five words are recorded, in order, as station
    number, year, day of year, hour + minute (split after the first two
    characters) and second.

    Parameters
    ----------
    prediction_groups : sequence
        One entry per image; each entry is a sequence of ``(text, box)``
        pairs. ``box[1][0]`` is used as the x position of a word.
    subdir_path : str
        Substring removed from the image path to build the ``filename``
        column.
    filenames : sequence of str, optional
        Image paths the predictions refer to. When omitted, the
        module-level ``img_fns`` list is used (original behaviour).
    offset : int, optional
        Position within ``filenames`` of the first image of this batch.
        When omitted, the module-level ``batch_i`` is used (original
        behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per accepted image, indexed by the image's position inside
        ``prediction_groups``. The ``*_OCR`` columns hold the recognised
        text as strings. When no image is accepted the frame is empty but
        still carries the expected columns.
    """
    columns = [
        'station_number_OCR',
        'year_OCR',
        'day_of_year_OCR',
        'hour_OCR',
        'minute_OCR',
        'second_OCR',
        'filename',
    ]

    records = []
    positions = []
    for i, predicted_image in enumerate(prediction_groups):
        if len(predicted_image) != 6:
            continue

        # Order the words by the x coordinate of box[1].
        ordered = sorted(predicted_image, key=lambda pair: pair[1][1][0])
        words = [text for text, _ in ordered]
        if words[0] != '10':
            continue

        # Fall back to the notebook globals only once they are really needed,
        # so a batch without any accepted image never touches them.
        if filenames is None:
            filenames = img_fns
        if offset is None:
            offset = batch_i

        records.append({
            'station_number_OCR': words[1],
            'year_OCR': words[2],
            'day_of_year_OCR': words[3],
            'hour_OCR': words[4][0:2],
            'minute_OCR': words[4][2:],
            'second_OCR': words[5],
            'filename': filenames[offset + i].replace(subdir_path, ''),
        })
        positions.append(i)

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(records, index=positions, columns=columns)

# 

#### Process subdirectory:

In [None]:
# Directory / subdirectory under test; the trailing comments hold an
# alternative pair of values.
directory = 'R014207817' #'R014207950'
subdirectory = '4163-03A' #'2487-5-B'
testDir = '{}04_processed/{}/{}/'.format(rootDir, directory, subdirectory)

In [None]:
# Number of images handed to pipeline.recognize() per call.
batch_size = 9 # > 9 seems to lead to a 'dead kernel'

In [None]:
# Path of every entry in the test subdirectory, in os.listdir() order.
img_fns = [testDir + entry for entry in os.listdir(testDir)]

In [None]:
# Number of entries found in the test subdirectory.
len(img_fns)

In [None]:
# Number of complete batches of batch_size images.
n_batches = len(img_fns) // batch_size
n_batches

In [None]:
# Number of images left over once img_fns is split into full batches of
# batch_size images.
batch_remainder = len(img_fns)%batch_size
batch_remainder

In [None]:
# OCR every image of the subdirectory in batches of batch_size and collect
# the 'num2' metadata that could be read.
start = time.time()

# Stepping through img_fns in strides of batch_size also covers the final,
# shorter batch, and never calls pipeline.recognize() on an empty list when
# the number of images is an exact multiple of batch_size.
# batch_i stays a module-level name because read_num2_metadata reads it to
# map each prediction back to its entry in img_fns.
batch_frames = []
for batch_i in range(0, len(img_fns), batch_size):
    print('Starting batch... ' + str(batch_i // batch_size))
    batch_f = batch_i + batch_size
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    batch_frames.append(
        read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir)
    )

print('Finishing up...')
# Concatenate once at the end; pd.concat() rejects an empty list, so fall
# back to an empty frame when there was nothing to process.
df_read = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

end = time.time()
t = end - start
print('Time to OCR read all images in subdirectory: ' + str(round(t/60, 1)) + ' min')

In [None]:
# Number of rows read by OCR, followed by a preview of the first 30 rows.
print(len(df_read))
df_read.head(30)

Remove rows in which any OCR-read field contains letters:

In [None]:
# Drop every row in which any OCR-read field contains a letter.
# The column names must match those produced by read_num2_metadata, which
# carry the '_OCR' suffix.
ocr_columns = [
    'station_number_OCR',
    'year_OCR',
    'day_of_year_OCR',
    'hour_OCR',
    'minute_OCR',
    'second_OCR',
]

df_read2 = df_read
if len(df_read) > 0:
    for ocr_column in ocr_columns:
        df_read2 = df_read2[~df_read2[ocr_column].str.contains("[a-zA-Z]")]

In [None]:
# Number of rows left after filtering, followed by a preview of the first rows.
print(len(df_read2))
df_read2.head()

In [None]:
# Order the rows by filename (plain string ordering) and renumber the index.
# When nothing was read, df_read2 can be an empty frame without a 'filename'
# column; sorting it would raise KeyError, so it is left untouched.
if 'filename' in df_read2.columns:
    df_read2 = df_read2.sort_values('filename').reset_index(drop=True)

In [None]:
#df_read2.to_csv(resultDir + 'OCR_subdir_test_' + directory + '_' + subdirectory + '.csv', index=False)

In [None]:
# Share of the subdirectory's images whose metadata survived the filter.
ocr_yield_percent = round((len(df_read2)/len(img_fns))*100, 1)
print('Percent OCR read yield for test subdirectory: ' + str(ocr_yield_percent) + ' %')

#  

#### Integrate OCR read metadata into existing results data for subdirectory:

In [None]:
# Load the existing results table for this subdirectory and preview it.
result_csv_path = resultDir + directory + '/' + 'result-' + directory + '_' + subdirectory + '.csv'
df_result = pd.read_csv(result_csv_path)
print(len(df_result))
df_result.head()

In [None]:
# Show the column names of the existing results table.
df_result.columns

In [None]:
# Row count and preview of the OCR-read table about to be merged.
# The commented-out line would instead reload a previously saved copy.
#df_read2 = pd.read_csv(resultDir + 'OCR_subdir_test_' + directory + '_' + subdirectory + '.csv')
print(len(df_read2))
df_read2.head()

In [None]:
# Attach the OCR-read metadata to the existing per-image results.
if len(df_read2) > 0:
    df_merge = df_result.merge(df_read2, how='left', on='filename')
else:
    # Nothing was read by OCR: carry the existing results through unchanged
    # so that df_merge is always defined for the lines below.
    df_merge = df_result.copy()

# NOTE(review): this reads 'day_of_year', whereas read_num2_metadata emits
# 'day_of_year_OCR'. Presumably 'day_of_year' comes from the results CSV;
# confirm which column is meant to drive the 'num2' classification.
df_merge['day_of_year'] = df_merge['day_of_year'].astype('int')
df_merge.loc[df_merge['day_of_year'] > 0, 'processed_image_class'] = 'num2'

print(len(df_merge))
# sample() raises when asked for more rows than exist, so cap the request.
df_merge.sample(min(10, len(df_merge)))

In [None]:
# Write the merged table next to the original results file.
merged_csv_path = resultDir + directory + '/' + 'result_OCR-' + directory + '_' + subdirectory + '.csv'
df_merge.to_csv(merged_csv_path)

#  

#### Development:

In [None]:
#df_read = df_read.iloc[:, 1:]
#df_read.head()

In [None]:
#df_read.to_csv(resultDir + 'OCR_subdir_test_' + directory + '_' + subdirectory + '.csv', index=False)