# OCR Read 'num2' Metadata

#### Updated: Apr 17, 2023

#  

Test post-processing a subdirectory: first detect whether each image is of the 'num2' metadata type, and then read and record that metadata.

In [1]:
import pandas as pd
import numpy as np
import os
from random import randrange
import time
import keras_ocr

In [2]:
# Root of the batch run, with the processed-image and result folders beneath it
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
processedDir = f'{rootDir}04_processed/'
resultDir = f'{rootDir}05_result/'

In [3]:
# Build the keras-ocr pipeline with its default arguments. The output below shows
# it looking for the craft_mlt_25k.h5 and crnn_kurapan.h5 weight files.
pipeline = keras_ocr.pipeline.Pipeline()

Looking for C:\Users\rnaidoo\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\rnaidoo\.keras-ocr\crnn_kurapan.h5


#  

#### Functions:

In [4]:
def read_num2_metadata(prediction_groups, subdir_path, img_paths=None, batch_start=None):
    """Extract 'num2' metadata from one batch of OCR predictions.

    For each image, the recognised tokens are ordered left to right by the x
    coordinate of the second corner of their bounding box (``box[1][0]``).
    An image is kept only when exactly six tokens were read and the left-most
    one is ``'10'``. The remaining five tokens are recorded, in order, as
    station number, year, day of year, hour + minute (first two characters /
    the rest of the same token) and second.

    Parameters
    ----------
    prediction_groups : sequence
        One entry per image in the batch; each entry is an iterable of
        ``(text, box)`` pairs (the notebook passes the result of
        ``pipeline.recognize``).
    subdir_path : str
        Text removed from the image path to obtain the bare filename.
    img_paths : sequence of str, optional
        Paths of all images being processed. Defaults to the notebook-level
        ``img_fns`` so that existing two-argument calls keep working.
    batch_start : int, optional
        Position in ``img_paths`` of the first image of this batch. Defaults
        to the notebook-level ``batch_i``.

    Returns
    -------
    pandas.DataFrame
        One row per accepted image, indexed by the image's position within
        the batch. An empty DataFrame (no columns) when nothing was accepted.
    """
    # Fall back to the notebook globals the original implementation relied on.
    if img_paths is None:
        img_paths = img_fns
    if batch_start is None:
        batch_start = batch_i

    records = []
    positions = []
    for i, predicted_image in enumerate(prediction_groups):
        # Order tokens left to right; sorted() is stable, so ties keep OCR order.
        ordered = sorted(predicted_image, key=lambda pair: pair[1][1][0])
        tokens = [text for text, _ in ordered]

        # 'num2' images are recognised by six tokens with a leading '10'.
        if len(tokens) != 6 or tokens[0] != '10':
            continue

        records.append({
            'station_number_OCR': tokens[1],
            'year_OCR': tokens[2],
            'day_of_year_OCR': tokens[3],
            # Hour and minute are read as a single token and split here.
            'hour_OCR': tokens[4][0:2],
            'minute_OCR': tokens[4][2:],
            'second_OCR': tokens[5],
            'filename': img_paths[batch_start + i].replace(subdir_path, '')
        })
        positions.append(i)

    # Build the frame once instead of concatenating row by row.
    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records, index=positions)

# 

#### Process subdirectory:

In [5]:
# Pick a random top-level directory, then a random subdirectory inside it
directory_list = os.listdir(processedDir)
directory_idx = randrange(len(directory_list))
directory = directory_list[directory_idx]

subdirectory_list = os.listdir(f'{processedDir}{directory}/')
subdirectory_idx = randrange(len(subdirectory_list))
subdirectory = subdirectory_list[subdirectory_idx]

print(f'{directory}/{subdirectory}/')

R014207819/4793-15/


In [6]:
# Full path of the subdirectory drawn above (with trailing slash)
testDir = f'{processedDir}{directory}/{subdirectory}/'

In [7]:
# Number of images handed to the OCR pipeline per call.
# NOTE(review): 9 appears to be a previously tried value; > 9 seems to lead to a 'dead kernel'.
batch_size = 2

In [8]:
# Full path of every entry found in the test subdirectory
img_fns = [testDir + file for file in os.listdir(testDir)]
len(img_fns)

233

In [9]:
# Number of complete batches; leftover images are counted separately
n_batches = len(img_fns) // batch_size
n_batches

116

In [10]:
# Images left over after the complete batches; they are processed in a final
# step after the batch loop.
batch_remainder = len(img_fns)%batch_size
batch_remainder

1

In [11]:
start = time.time()

# Collect the per-batch results and concatenate them once at the end instead
# of growing df_read inside the loop.
batch_frames = []
for i in range(0, n_batches):
    print('Starting batch... ' + str(i))
    # batch_i must be set at notebook level: read_num2_metadata uses it (together
    # with img_fns) to map each prediction back to its filename.
    batch_i = i*batch_size
    batch_f = batch_i + batch_size
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir)
    batch_frames.append(df_read_)

#Remainder
print('Finishing up...')
# Only run the final partial batch when there are leftover images; otherwise an
# empty list would be handed to the OCR pipeline.
if batch_remainder > 0:
    batch_i = n_batches*batch_size
    batch_f = batch_i + batch_remainder
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir)
    batch_frames.append(df_read_)

# Batches with no 'num2' reads return empty frames; drop them before concatenating.
batch_frames = [frame for frame in batch_frames if len(frame) > 0]
df_read = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

end = time.time()
t = end - start
print('Time to OCR read all images in subdirectory: ' + str(round(t/60, 1)) + ' min')

Starting batch... 0
Starting batch... 1
Starting batch... 2
Starting batch... 3
Starting batch... 4
Starting batch... 5
Starting batch... 6
Starting batch... 7
Starting batch... 8
Starting batch... 9
Starting batch... 10
Starting batch... 11
Starting batch... 12
Starting batch... 13
Starting batch... 14
Starting batch... 15
Starting batch... 16
Starting batch... 17
Starting batch... 18
Starting batch... 19
Starting batch... 20
Starting batch... 21
Starting batch... 22
Starting batch... 23
Starting batch... 24
Starting batch... 25
Starting batch... 26
Starting batch... 27
Starting batch... 28
Starting batch... 29
Starting batch... 30
Starting batch... 31
Starting batch... 32
Starting batch... 33
Starting batch... 34
Starting batch... 35
Starting batch... 36
Starting batch... 37
Starting batch... 38
Starting batch... 39
Starting batch... 40
Starting batch... 41
Starting batch... 42
Starting batch... 43
Starting batch... 44
Starting batch... 45
Starting batch... 46
Starting batch... 47
St

Starting batch... 67
Starting batch... 68
Starting batch... 69
Starting batch... 70
Starting batch... 71
Starting batch... 72
Starting batch... 73
Starting batch... 74
Starting batch... 75
Starting batch... 76
Starting batch... 77
Starting batch... 78
Starting batch... 79
Starting batch... 80
Starting batch... 81
Starting batch... 82
Starting batch... 83
Starting batch... 84
Starting batch... 85
Starting batch... 86
Starting batch... 87
Starting batch... 88
Starting batch... 89
Starting batch... 90
Starting batch... 91
Starting batch... 92
Starting batch... 93
Starting batch... 94
Starting batch... 95
Starting batch... 96
Starting batch... 97
Starting batch... 98
Starting batch... 99
Starting batch... 100
Starting batch... 101
Starting batch... 102
Starting batch... 103
Starting batch... 104
Starting batch... 105
Starting batch... 106
Starting batch... 107
Starting batch... 108
Starting batch... 109
Starting batch... 110
Starting batch... 111
Starting batch... 112
Starting batch... 113

In [12]:
# Number of images read as 'num2', followed by a preview of up to 30 rows
print(len(df_read))
df_read.head(30)

85


Unnamed: 0,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,filename
1,15,69,206,06,42,57,10.png
0,15,69,223,50,2,06,100.png
1,15,69,229,16,59,29,103.png
0,15,69,229,16,59,46,104.png
1,15,69,229,17,0o,oa,105.png
0,15,69,220,1t,o0,21,106.png
0,15,69,229,dt,oo,56,108.png
1,15,69,229,17,01,11,109.png
0,15,60,206,06,43,14,11.png
1,15,69,229,17,01,31,110.png


Remove rows where any OCR-read field contains letters:

In [13]:
# OCR fields that must be free of letters for a row to be kept
ocr_columns = [
    'station_number_OCR',
    'year_OCR',
    'day_of_year_OCR',
    'hour_OCR',
    'minute_OCR',
    'second_OCR',
]

df_read2 = df_read
if len(df_read) > 0:
    # Apply the same filter to each field in turn, narrowing the frame each time
    for col in ocr_columns:
        has_letters = df_read2[col].str.contains("[a-zA-Z]")
        df_read2 = df_read2[~has_letters]

In [14]:
# Row count and preview after dropping the rows that contained letters
print(len(df_read2))
df_read2.head()

49


Unnamed: 0,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,filename
1,15,69,206,6,42,57,10.png
0,15,69,223,50,2,6,100.png
1,15,69,229,16,59,29,103.png
0,15,69,229,16,59,46,104.png
1,15,69,229,17,1,11,109.png


In [15]:
# Order rows by filename and renumber the index from 0. Filenames are compared
# as strings, so '100.png' sorts before '11.png'.
df_read2 = df_read2.sort_values('filename').reset_index(drop=True)

In [16]:
# Share of images in the subdirectory that produced a letter-free 'num2' read
yield_pct = round((len(df_read2)/len(img_fns))*100, 1)
print(f'Percent OCR read yield for test subdirectory: {yield_pct} %')

Percent OCR read yield for test subdirectory: 21.0 %


#  

#### Integrate OCR read metadata into existing results data for subdirectory:

In [17]:
# Load the existing results file for the drawn subdirectory
result_path = f'{resultDir}{directory}/result-{directory}_{subdirectory}.csv'
df_result = pd.read_csv(result_path)
print(len(df_result))
df_result.head()

233


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,second_1,second_2,station_number_1,station_number_2,processed_image_class,func_name,details,Roll,Subdirectory,filename
0,1.5,673.913043,0.0,3.0,1.0,2.0,0.0,1.0,2.0,1.0,...,2.0,1.0,1.0,1.0,num,,,R014207819,4793-15,10.png
1,1.5,673.913043,1.0,4.0,1.0,4.0,1.0,1.0,2.0,1.0,...,3.0,3.0,1.0,1.0,num,,,R014207819,4793-15,100.png
2,4.235849,1000.0,2.0,3.0,1.0,1.0,1.0,2.0,2.0,0.0,...,2.0,1.0,1.0,1.0,num,,,R014207819,4793-15,103.png
3,4.122642,1489.130435,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,1.0,3.0,0.0,num,,,R014207819,4793-15,104.png
4,4.207547,1489.130435,2.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,...,3.0,1.0,2.0,2.0,num,,,R014207819,4793-15,105.png


In [18]:
# List the columns of the existing results table
df_result.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'details', 'Roll', 'Subdirectory',
       'filename'],
      dtype='object')

In [19]:
# Show df_read2 again, now sorted by filename and re-indexed, before merging
print(len(df_read2))
df_read2.head()

49


Unnamed: 0,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,filename
0,15,69,206,6,42,57,10.png
1,15,69,223,50,2,6,100.png
2,15,69,229,16,59,29,103.png
3,15,69,229,16,59,46,104.png
4,15,69,229,17,1,11,109.png


In [20]:
# Merge only when both tables have rows; otherwise pass the results through unchanged
if len(df_result) > 0 and len(df_read2) > 0:
    df_read2['day_of_year_OCR'] = df_read2['day_of_year_OCR'].astype('int')
    df_merge = df_result.merge(df_read2, how='left', on='filename')
    # Rows with a positive OCR day of year matched an OCR read: relabel them
    ocr_matched = df_merge['day_of_year_OCR'] > 0
    df_merge.loc[ocr_matched, 'processed_image_class'] = 'num2'
else:
    df_merge = df_result

print(len(df_merge))
df_merge.sample(10)

233


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Roll,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
201,1.5,1108.695652,1.0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,...,,R014207819,4793-15,88.png,,,,,,
25,4.150943,1163.043478,1.0,1.0,0.0,2.0,0.0,1.0,3.0,0.0,...,,R014207819,4793-15,124.png,15.0,69.0,229.0,17.0,5.0,35.0
15,1.548387,1271.73913,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,...,,R014207819,4793-15,115.png,15.0,69.0,229.0,17.0,2.0,58.0
121,1.5,1000.0,2.0,3.0,3.0,1.0,4.0,1.0,2.0,3.0,...,,R014207819,4793-15,218.png,,,,,,
192,1.5,1108.695652,2.0,2.0,0.0,3.0,0.0,2.0,2.0,2.0,...,,R014207819,4793-15,8.png,,,,,,
79,1.5,1000.0,1.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,...,,R014207819,4793-15,178.png,,,,,,
87,1.5,1000.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,2.0,...,,R014207819,4793-15,185.png,,,,,,
6,4.898148,1489.130435,2.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,...,,R014207819,4793-15,107.png,,,,,,
133,1.5,1047.826087,1.0,2.0,2.0,2.0,1.0,2.0,2.0,3.0,...,,R014207819,4793-15,229.png,,,,,,
91,1.5,1000.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,2.0,...,,R014207819,4793-15,189.png,,,,,,


In [21]:
# Write the merged table next to the original results file, without the index column
ocr_result_path = f'{resultDir}{directory}/result_OCRpass-{directory}_{subdirectory}.csv'
df_merge.to_csv(ocr_result_path, index=False)