# OCR Read 'num2' Metadata

#### Updated: Apr 24, 2023

#  

Test post-processing a subdirectory, to first detect if the image is of 'num2' metadata type, and then to read and record "that metadata. 

In [1]:
import pandas as pd
import numpy as np
import os
from random import randrange
import time
import keras_ocr

In [2]:
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
processedDir = rootDir + '04_processed/'
resultDir = rootDir + '05_result/'

In [3]:
pipeline = keras_ocr.pipeline.Pipeline()

Looking for C:\Users\rnaidoo\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\rnaidoo\.keras-ocr\crnn_kurapan.h5


#  

#### Functions:

In [42]:
def read_num2_metadata(prediction_groups, subdir_path):
    
    df_read = pd.DataFrame()
    df_notread = pd.DataFrame()
    for i in range(0, len(prediction_groups)):
        df_ocr = pd.DataFrame()
        predicted_image = prediction_groups[i]
        if len(predicted_image) > 0:
            for text, box in predicted_image:
                row = pd.DataFrame({
                    'number': text,
                    'x': box[1][0],
                    'y': box[1][1]
                }, index=[0])
                df_ocr = pd.concat([df_ocr, row])
            df_ocr = df_ocr.sort_values('x').reset_index(drop=True)
        if len(df_ocr) == 6:
            if df_ocr['number'].iloc[0] == '10':
                row2 = pd.DataFrame({
                    'station_number_OCR': df_ocr['number'].iloc[1],
                    'year_OCR': df_ocr['number'].iloc[2],
                    'day_of_year_OCR': df_ocr['number'].iloc[3],
                    'hour_OCR': df_ocr['number'].iloc[4][0:2],
                    'minute_OCR': df_ocr['number'].iloc[4][2:],
                    'second_OCR': df_ocr['number'].iloc[5],
                    'filename': img_fns[batch_i + i].replace(subdir_path, '')
                }, index=[i])
                df_read = pd.concat([df_read, row2])
            else:
                df_ocr['filename'] = img_fns[batch_i + i].replace(subdir_path, '')
                df_notread = pd.concat([df_notread, df_ocr])
        else:
            df_ocr['filename'] = img_fns[batch_i + i].replace(subdir_path, '')
            df_notread = pd.concat([df_notread, df_ocr])
    
    return df_read, df_notread

# 

#### Process subdirectory:

In [43]:
#Draw random subdirectory
directory_list = os.listdir(processedDir)
directory = directory_list[randrange(len(directory_list))]
subdirectory_list = os.listdir(processedDir + directory + '/')
subdirectory = subdirectory_list[randrange(len(subdirectory_list))]
print(directory + '/' + subdirectory + '/')

R014207944/2029-13B/


In [44]:
#Manual subdirectory:
directory = 'R014207830'
subdirectory = '3727-19A'

'''#Manual subdirectory (dot, BATCH I):
directory = ''
subdirectory = '' '''

"#Manual subdirectory (dot, BATCH I):\ndirectory = ''\nsubdirectory = '' "

In [45]:
testDir = processedDir + directory + '/' + subdirectory + '/'

In [46]:
batch_size = 8 # > 9 seems to lead to a 'dead kernel'

In [47]:
img_fns = []
for file in os.listdir(testDir):
    img_fns.append(testDir + file)
len(img_fns)

345

In [48]:
n_batches = int(np.floor(len(img_fns)/batch_size))
n_batches

43

In [49]:
batch_remainder = len(img_fns)%batch_size
batch_remainder

1

In [None]:
start = time.time()

df_read = pd.DataFrame()
df_notread = pd.DataFrame()
for i in range(0, n_batches):
    print('Starting batch... ' + str(i))
    batch_i = i*batch_size
    batch_f = batch_i + batch_size
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_, df_notread_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir)
    df_read = pd.concat([df_read, df_read_])
    df_notread = pd.concat([df_notread, df_notread_])
    
#Remainder
print('Finishing up...')
if batch_remainder > 0:
    batch_i = n_batches*batch_size
    batch_f = batch_i + batch_remainder
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_, df_notread_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir)
    df_read = pd.concat([df_read, df_read_])
    df_notread = pd.concat([df_notread, df_notread_])
     
end = time.time()
t = end - start
print('Time to OCR read all images in subdirectory: ' + str(round(t/60, 1)) + ' min')

Starting batch... 0
Starting batch... 1
Starting batch... 2
Starting batch... 3
Starting batch... 4
Starting batch... 5
Starting batch... 6
Starting batch... 7
Starting batch... 8
Starting batch... 9
Starting batch... 10
Starting batch... 11
Starting batch... 12
Starting batch... 13
Starting batch... 14
Starting batch... 15
Starting batch... 16
Starting batch... 17
Starting batch... 18
Starting batch... 19
Starting batch... 20
Starting batch... 21
Starting batch... 22
Starting batch... 23
Starting batch... 24
Starting batch... 25
Starting batch... 26
Starting batch... 27
Starting batch... 28
Starting batch... 29
Starting batch... 30
Starting batch... 31
Starting batch... 32
Starting batch... 33
Starting batch... 34
Starting batch... 35
Starting batch... 36


In [None]:
print(len(df_read))
df_read.head(30)

In [None]:
print(len(df_notread))
df_notread.head(30)

#  

#### Integrate OCR read metadata into existing results data for subdirectory:

In [15]:
df_result = pd.read_csv(resultDir + directory + '/' + 'result-' + directory + '_' + subdirectory + '.csv')
#Change 'Roll' to 'Directory':
df_result.rename(columns={
    'Roll': 'Directory'
})
print(len(df_result))
df_result.head()

345


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,is_dot,dict_metadata,raw_coord,window_coord,mapping_Hz,mapping_km,details,Directory,Subdirectory,filename
0,1.576613,1410.638298,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,,,,,,,,R014207830,3727-19A,10.png
1,1.612903,1425.531915,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,...,,,,,,,,R014207830,3727-19A,100.png
2,1.584677,1106.382979,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,,,,,,,R014207830,3727-19A,101.png
3,1.604839,1278.723404,3.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,,,,,,,,R014207830,3727-19A,102.png
4,1.576613,1391.489362,1.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,...,,,,,,,,R014207830,3727-19A,103.png


In [16]:
df_result.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'limits', 'height', 'width',
       'metadata_type', 'meta_height', 'meta_width', 'x_centroids',
       'y_centroids', 'is_dot', 'dict_metadata', 'raw_coord', 'window_coord',
       'mapping_Hz', 'mapping_km', 'details', 'Directory', 'Subdirectory',
       'filename'],
      dtype='object')

In [17]:
if len(df_result) > 0:
    if len(df_read) > 0:
        df_read['day_of_year_OCR'] = df_read['day_of_year_OCR'].astype('int')
        df_merge = df_result.merge(df_read, how='left', on='filename')
        df_merge.loc[df_merge['day_of_year_OCR'] > 0, 'processed_image_class'] = 'num2'
    else:
        df_merge = df_result
else:
    df_merge = df_result
    
print(len(df_merge))
df_merge.sample(10)

345


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
120,1.625,1425.531915,2.0,2.0,1.0,3.0,2.0,1.0,1.0,1.0,...,,R014207830,3727-19A,21.png,,,,,,
238,1.520161,1425.531915,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,R014207830,3727-19A,319.png,,,,,,
101,1.625,1425.531915,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,R014207830,3727-19A,193.png,,,,,,
257,1.544355,1421.276596,3.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,...,,R014207830,3727-19A,336.png,,,,,,
333,1.592742,1425.531915,3.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,...,,R014207830,3727-19A,96.png,,,,,,
220,1.568548,1425.531915,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,...,,R014207830,3727-19A,302.png,,,,,,
134,1.572581,1425.531915,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,R014207830,3727-19A,223.png,,,,,,
3,1.604839,1278.723404,3.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,,R014207830,3727-19A,102.png,,,,,,
280,1.564516,1425.531915,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,...,,R014207830,3727-19A,48.png,,,,,,
162,1.637097,1425.531915,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,,R014207830,3727-19A,25.png,,,,,,


Classify rows with OCR letters read as 'loss' and clear metadata:

In [18]:
OCR_cols = ['station_number_OCR', 'year_OCR', 'day_of_year_OCR', 'hour_OCR', 'minute_OCR', 'second_OCR']
md_cols = ['satellite_number', 'year', 'day_1', 'day_2', 'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1', 
           'second_2', 'station_number_1', 'station_number_2']

In [None]:
if len(df_read) > 0:
    for col in OCR_cols:
        df_merge[col].str.replace('o', '0')

In [21]:
if len(df_read) > 0:
    for col in OCR_cols:
        df_merge.loc[df_merge[col].str.contains("[a-zA-Z]"), 'processed_image_class'] = 'loss'
        df_merge.loc[df_merge[col].str.contains("[a-zA-Z]"), 'details'] = 'OCR read metadata contains letters'        

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
n_OCR_read = 0
for i in range(0, len(df_merge)):
    if df_merge['processed_image_class'].iloc[i] == 'loss':
        if df_merge['details'].iloc[i] == 'OCR read metadata contains letters':
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan
    elif df_merge['processed_image_class'].iloc[i] == 'num2':
        for col in md_cols:
            df_merge[col].iloc[i] = np.nan
        n_OCR_read += 1

In [20]:
df_merge.loc[df_merge['processed_image_class'] == 'num']

Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
0,1.576613,1410.638298,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,,R014207830,3727-19A,10.png,,,,,,
1,1.612903,1425.531915,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,...,,R014207830,3727-19A,100.png,,,,,,
2,1.584677,1106.382979,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,R014207830,3727-19A,101.png,,,,,,
3,1.604839,1278.723404,3.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,,R014207830,3727-19A,102.png,,,,,,
4,1.576613,1391.489362,1.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,...,,R014207830,3727-19A,103.png,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,1.584677,1425.531915,2.0,2.0,2.0,2.0,1.0,1.0,2.0,3.0,...,,R014207830,3727-19A,95.png,,,,,,
333,1.592742,1425.531915,3.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,...,,R014207830,3727-19A,96.png,,,,,,
334,1.600806,1425.531915,2.0,2.0,2.0,2.0,2.0,0.0,3.0,1.0,...,,R014207830,3727-19A,97.png,,,,,,
335,1.608871,1408.510638,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,...,,R014207830,3727-19A,98.png,,,,,,


In [None]:
if n_OCR_read > 0:
    print('Percent of OCR read images that read only numbers, for test subdirectory: ' + str(round((n_OCR_read/len(df_read))*100, 1)) + ' %')

#### Construct timestamp:

#  

Save:

In [None]:
df_merge.to_csv(resultDir + directory + '/' + 'result_OCRpass-' + directory + '_' + subdirectory + '.csv', index=False)