# OCR Read 'num2' Metadata

#### Updated: Apr 4, 2023

#  

Test post-processing of a subdirectory: first detect whether each image is of the 'num2' metadata type, and then read and record that metadata.

In [1]:
import pandas as pd
import numpy as np
import os
import time
import keras_ocr

In [2]:
# Build the keras-ocr pipeline once; the same object is reused for every
# recognize() call further down in the notebook.
pipeline = keras_ocr.pipeline.Pipeline()

Looking for C:\Users\rnaidoo\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\rnaidoo\.keras-ocr\crnn_kurapan.h5


In [3]:
# Directory layout for this test run: one processed subdirectory is read,
# and results are written under the batch's result folder.
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
testDir = f'{rootDir}04_processed/R014207817/4163-03A/'
resultDir = f'{rootDir}05_result/'

#  

#### Functions:

In [4]:
def read_num2_metadata(prediction_groups, subdir_path, filenames=None, batch_start=None):
    """Extract 'num2' metadata from one batch of OCR predictions.

    An image is treated as 'num2' when exactly six text tokens were read
    and the left-most token is the string '10'. The remaining five tokens,
    taken left to right, are recorded as station number, year, day of year,
    a combined hour/minute token (first two characters = hour, the rest =
    minute) and second.

    Parameters
    ----------
    prediction_groups : sequence
        One entry per image; each entry is an iterable of ``(text, box)``
        pairs, where ``box[1][0]`` is the x coordinate used for ordering.
    subdir_path : str
        Prefix removed from each image path to obtain the 'filename' value.
    filenames : sequence of str, optional
        Paths of all images being processed. When omitted, the notebook-level
        ``img_fns`` is used (the original behaviour).
    batch_start : int, optional
        Offset of this batch's first image within ``filenames``. When omitted,
        the notebook-level ``batch_i`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per recognised 'num2' image, indexed by the image's position
        within the batch, with columns station_number, year, day_of_year,
        hour, minute, second and filename (all strings). The frame has these
        columns even when no image matched.
    """
    columns = ['station_number', 'year', 'day_of_year',
               'hour', 'minute', 'second', 'filename']
    records = []
    row_labels = []

    for i, predicted_image in enumerate(prediction_groups):
        # Order the recognised tokens left to right by box[1][0].
        ordered = sorted(predicted_image, key=lambda item: item[1][1][0])
        tokens = [text for text, _ in ordered]

        if len(tokens) != 6 or tokens[0] != '10':
            continue

        # Resolve the legacy globals only when a row is actually emitted, so
        # callers that pass nothing behave exactly as they did before.
        names = img_fns if filenames is None else filenames
        offset = batch_i if batch_start is None else batch_start

        records.append({
            'station_number': tokens[1],
            'year': tokens[2],
            'day_of_year': tokens[3],
            'hour': tokens[4][0:2],
            'minute': tokens[4][2:],
            'second': tokens[5],
            'filename': names[offset + i].replace(subdir_path, ''),
        })
        row_labels.append(i)

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(records, index=row_labels, columns=columns)

# 

#### Process subdirectory:

In [5]:
# Number of images handed to pipeline.recognize() per call.
batch_size = 9 # > 9 seems to lead to a 'dead kernel'

In [6]:
# Full path of every entry in the test subdirectory. os.listdir() returns
# names in arbitrary order, so sort them to make batch composition reproducible.
img_fns = [testDir + file for file in sorted(os.listdir(testDir))]

In [7]:
# Number of entries found in the test subdirectory.
len(img_fns)

304

In [8]:
# Count of complete batches; floor division drops any partial trailing batch.
n_batches = len(img_fns) // batch_size
n_batches

33

In [9]:
# Number of images left over once all complete batches have been taken.
batch_remainder = len(img_fns)%batch_size
batch_remainder

7

In [10]:
start = time.time()

# Run OCR over the images in slices of `batch_size`. A single loop over the
# batch start offsets also covers the trailing partial batch, and never calls
# pipeline.recognize() with an empty list when the file count is an exact
# multiple of batch_size.
#
# NOTE: read_num2_metadata looks up `img_fns` and `batch_i` in the notebook
# namespace, so the loop variable must keep the name `batch_i`.
batch_frames = []
for batch_i in range(0, len(img_fns), batch_size):
    batch_f = min(batch_i + batch_size, len(img_fns))
    if batch_f - batch_i == batch_size:
        print('Starting batch... ' + str(batch_i // batch_size))
    else:
        # Trailing partial batch.
        print('Finishing up...')
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    batch_frames.append(
        read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir)
    )

# Concatenate once rather than growing the frame on every batch.
df_results = pd.concat(batch_frames) if batch_frames else pd.DataFrame()
# Nothing to sort when no image yielded metadata (the frame may lack the column).
if 'filename' in df_results.columns:
    df_results = df_results.sort_values('filename').reset_index(drop=True)

end = time.time()
t = end - start
print('Time to OCR read all images in subdirectory: ' + str(round(t/60, 1)) + ' min')

Starting batch... 0
Starting batch... 1
Starting batch... 2
Starting batch... 3
Starting batch... 4
Starting batch... 5
Starting batch... 6
Starting batch... 7
Starting batch... 8
Starting batch... 9
Starting batch... 10
Starting batch... 11
Starting batch... 12
Starting batch... 13
Starting batch... 14
Starting batch... 15
Starting batch... 16
Starting batch... 17
Starting batch... 18
Starting batch... 19
Starting batch... 20
Starting batch... 21
Starting batch... 22
Starting batch... 23
Starting batch... 24
Starting batch... 25
Starting batch... 26
Starting batch... 27
Starting batch... 28
Starting batch... 29
Starting batch... 30
Starting batch... 31
Starting batch... 32
Finishing up...
Time to OCR read all images in subdirectory: 14.8 min


In [15]:
# Derive a numeric file number so rows sort in numeric order (5, 6, ... 10)
# rather than as strings. regex=False strips the literal '.png'; without it,
# pandas versions whose default is regex=True treat '.' as a wildcard.
df_results['filenum'] = (
    df_results['filename'].str.replace('.png', '', regex=False).astype('int')
)
df_results = df_results.sort_values('filenum').reset_index(drop=True)
print(len(df_results))
df_results.head(30)

262


  df_results['filenum'] = df_results['filename'].str.replace('.png', '')


Unnamed: 0,station_number,year,day_of_year,hour,minute,second,filename,filenum
0,3,66,339,19,05,26,5.png,5
1,3,66,339,19,05,43,6.png,6
2,3,66,339,19,06,oo,7.png,7
3,3,66,339,19,06,35,9.png,9
4,3,66,339,19,06,52,10.png,10
5,3,66,339,19,08,19,12.png,12
6,3,66,339,ig,os,36,13.png,13
7,3,66,339,19,06,54,14.png,14
8,3,66,339,1g,oo,11,15.png,15
9,3,66,339,19,09,28,16.png,16


In [17]:
# Write the OCR results to the result directory. to_csv is called with its
# defaults, so the DataFrame index is written as a leading column.
df_results.to_csv(resultDir + 'OCR_subdir_test.csv')

Remove rows in which the OCR read letters instead of digits:

In [20]:
# Keep only the rows in which none of the OCR-read fields contains a letter;
# each pass drops the rows whose value in that field matches [a-zA-Z].
df1 = df_results
for field in ('station_number', 'year', 'day_of_year', 'hour', 'minute', 'second'):
    df1 = df1[~df1[field].str.contains("[a-zA-Z]")]

In [21]:
# Number of rows that survived the letter filter, followed by a preview.
print(len(df1))
df1.head()

178


Unnamed: 0,station_number,year,day_of_year,hour,minute,second,filename,filenum
0,3,66,339,19,5,26,5.png,5
1,3,66,339,19,5,43,6.png,6
3,3,66,339,19,6,35,9.png,9
4,3,66,339,19,6,52,10.png,10
5,3,66,339,19,8,19,12.png,12


In [28]:
# Share of all files in the subdirectory that produced a letter-free read.
yield_pct = round((len(df1)/len(img_fns))*100, 1)
print(f'Percent OCR read yield for test subdirectory: {yield_pct} %')

Percent OCR read yield for test subdirectory: 58.6 %
