# OCR Read 'num2' Metadata

#### Updated: May 12, 2023

#  

Test post-processing a subdirectory, to first detect if the image is of 'num2' metadata type, and then to read and record that metadata. 

In [1]:
import pandas as pd
import numpy as np
import os
from random import randrange
import time
import keras_ocr

In [2]:
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
processedDir = rootDir + '04_processed/'
resultDir = rootDir + '05_result/'

In [3]:
pipeline = keras_ocr.pipeline.Pipeline()

Looking for C:\Users\rnaidoo\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\rnaidoo\.keras-ocr\crnn_kurapan.h5


#  

#### Functions:

In [4]:
def read_num2_metadata(prediction_groups, subdir_path, batch_i, img_fns):
    
    df_read = pd.DataFrame()
    df_notread = pd.DataFrame()
    for i in range(0, len(prediction_groups)):
        df_ocr = pd.DataFrame()
        predicted_image = prediction_groups[i]
        if len(predicted_image) > 0:
            for text, box in predicted_image:
                row = pd.DataFrame({
                    'number': text,
                    'x': box[1][0],
                    'y': box[1][1]
                }, index=[0])
                df_ocr = pd.concat([df_ocr, row])
            df_ocr = df_ocr.sort_values('x').reset_index(drop=True)
        
            #String concatenate, fix string
            read_str = ''
            for j in range(0, len(df_ocr)):
                read_str_ = df_ocr['number'].iloc[j]
                read_str += read_str_
            read_str = read_str.replace('o', '0')

            #Test for num2
            if len(read_str) == 15:
                if read_str[0:2] == '10':
                    row2 = pd.DataFrame({
                        'station_number_OCR': read_str[2:4],
                        'year_OCR': read_str[4:6],
                        'day_of_year_OCR': read_str[6:9],
                        'hour_OCR': read_str[9:11],
                        'minute_OCR': read_str[11:13],
                        'second_OCR': read_str[13:15],
                        'filename': img_fns[batch_i + i].replace(subdir_path, '')
                    }, index=[i])
                    df_read = pd.concat([df_read, row2])
                else:
                    df_ocr['filename'] = img_fns[batch_i + i].replace(subdir_path, '')
                    df_notread = pd.concat([df_notread, df_ocr])
            else:
                df_ocr['filename'] = img_fns[batch_i + i].replace(subdir_path, '')
                df_notread = pd.concat([df_notread, df_ocr])
    
    return df_read, df_notread

# 

#### Process subdirectory:

In [76]:
#Draw random subdirectory
directory_list = os.listdir(processedDir)
directory = directory_list[randrange(len(directory_list))]
subdirectory_list = os.listdir(processedDir + directory + '/')
subdirectory = subdirectory_list[randrange(len(subdirectory_list))]
print(directory + '/' + subdirectory + '/')

R014207945/2266-13B/


In [5]:
#Manual subdirectory:
directory = 'R028224481'
subdirectory = '4875-21'

#Manual subdirectory (num):
#directory = 'R014207941'
#subdirectory = '1765-5'

In [6]:
testDir = processedDir + directory + '/' + subdirectory + '/'

In [7]:
batch_size = 8 # > 9 seems to lead to a 'dead kernel'

In [8]:
img_fns = []
for file in os.listdir(testDir):
    img_fns.append(testDir + file)
len(img_fns)

266

In [9]:
n_batches = int(np.floor(len(img_fns)/batch_size))
n_batches

33

In [10]:
batch_remainder = len(img_fns)%batch_size
batch_remainder

2

In [11]:
start = time.time()

df_read = pd.DataFrame()
df_notread = pd.DataFrame()
for i in range(0, n_batches):
    print('Starting batch... ' + str(i))
    batch_i = i*batch_size
    batch_f = batch_i + batch_size
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_, df_notread_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir, batch_i=batch_i, 
                                               img_fns=img_fns)
    df_read = pd.concat([df_read, df_read_])
    df_notread = pd.concat([df_notread, df_notread_])
    
#Remainder
print('Finishing up...')
if batch_remainder > 0:
    batch_i = n_batches*batch_size
    batch_f = batch_i + batch_remainder
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_, df_notread_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir, batch_i=batch_i, 
                                              img_fns=img_fns)
    df_read = pd.concat([df_read, df_read_])
    df_notread = pd.concat([df_notread, df_notread_])
     
end = time.time()
t = end - start
print('Time to OCR read all images in subdirectory: ' + str(round(t/60, 1)) + ' min')

Starting batch... 0
Starting batch... 1
Starting batch... 2
Starting batch... 3
Starting batch... 4
Starting batch... 5
Starting batch... 6
Starting batch... 7
Starting batch... 8
Starting batch... 9
Starting batch... 10
Starting batch... 11
Starting batch... 12
Starting batch... 13
Starting batch... 14
Starting batch... 15
Starting batch... 16
Starting batch... 17
Starting batch... 18
Starting batch... 19
Starting batch... 20
Starting batch... 21
Starting batch... 22
Starting batch... 23
Starting batch... 24
Starting batch... 25
Starting batch... 26
Starting batch... 27
Starting batch... 28
Starting batch... 29
Starting batch... 30
Starting batch... 31
Starting batch... 32
Finishing up...
Time to OCR read all images in subdirectory: 9.6 min


In [12]:
print(len(df_read))
df_read.head(30)

66


Unnamed: 0,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR,filename
4,21,69,332,05,46,03,102.png
5,21,69,332,05,16,21,103.png
7,21,69,322,05,16,55,105.png
4,21,69,327,17,17,34,11.png
5,21,69,332,05,50,39,118.png
0,21,6a,332,05,51,13,120.png
3,21,69,332,05,52,05,123.png
5,21,69,332,05,52,39,125.png
6,21,69,332,05,52,57,126.png
4,21,69,332,05,54,25,131.png


In [13]:
print(len(df_notread))
df_notread.head(30)

1034


Unnamed: 0,number,x,y,filename
0,sa,373.295258,336.431183,1.png
1,s,774.440796,402.358826,1.png
2,es,814.21875,414.84375,1.png
0,1o,889.65625,718.15625,10.png
1,21,999.523438,718.15625,10.png
2,69,1127.418579,713.785461,10.png
3,327,1280.566406,714.435547,10.png
4,1711,1476.507812,712.796875,10.png
5,116,1599.012451,716.696533,10.png
0,tote,660.257751,740.289062,100.png


#  

#### Integrate OCR read metadata into existing results data for subdirectory:

In [46]:
df_result = pd.read_csv(resultDir + directory + '/' + 'result-' + directory + '_' + subdirectory + '.csv')
#Change 'Roll' to 'Directory':
df_result = df_result.rename(columns={
    'Roll': 'Directory'
})
print(len(df_result))
df_result.head()

266


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,second_1,second_2,station_number_1,station_number_2,processed_image_class,func_name,details,Directory,Subdirectory,filename
0,1.5,1204.651163,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,...,2.0,1.0,1.0,1.0,num,,,R028224481,4875-21,10.png
1,1.568548,1030.232558,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,num,,,R028224481,4875-21,102.png
2,1.5,1072.093023,1.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,...,0.0,2.0,2.0,1.0,num,,,R028224481,4875-21,103.png
3,4.712963,1365.116279,2.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,...,0.0,2.0,1.0,1.0,num,,,R028224481,4875-21,104.png
4,4.564815,1365.116279,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,num,,,R028224481,4875-21,105.png


In [47]:
df_result.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'details', 'Directory',
       'Subdirectory', 'filename'],
      dtype='object')

In [48]:
if len(df_result) > 0:
    if len(df_read) > 0:
        df_merge = df_result.merge(df_read, how='left', on='filename')
        for i in range(0, len(df_merge)):
            if df_merge['processed_image_class'].iloc[i] != 'loss':
                if df_merge['processed_image_class'].iloc[i] != 'outlier':
                    #print(~pd.isna(df_merge['day_of_year_OCR'].iloc[i]))
                    if pd.isna(df_merge['day_of_year_OCR'].iloc[i]) == False:
                        #print(df_merge['day_of_year_OCR'].iloc[i])
                        #if int(df_merge['day_of_year_OCR'].iloc[i]) > 0:
                        df_merge['processed_image_class'].iloc[i] = 'num2'
        #df_merge.loc[~pd.isna(df_merge['day_of_year_OCR']), 'processed_image_class'] = 'num2'
        #df_merge.loc[df_merge['day_of_year_OCR'] > 0, 'processed_image_class'] = 'num2'
    else:
        df_merge = df_result
else:
    df_merge = df_result
    
print(len(df_merge))
df_merge.sample(10)

266


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['processed_image_class'].iloc[i] = 'num2'


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
68,1.971774,972.093023,1.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,...,,R028224481,4875-21,169.png,,,,,,
177,1.5,1437.209302,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,R028224481,4875-21,38.png,,,,,,
244,,,,,,,,,,,...,"height: 66,width: 787",R028224481,4875-21,133.png,,,,,,
45,1.5,1365.116279,1.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,...,,R028224481,4875-21,146.png,,,,,,
191,4.537037,972.093023,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,R028224481,4875-21,53.png,,,,,,
183,4.188679,739.534884,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,...,,R028224481,4875-21,46.png,,,,,,
172,4.198113,1365.116279,1.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,...,,R028224481,4875-21,30.png,21.0,69.0,527.0,11.0,23.0,4.0
190,4.443396,1437.209302,1.0,2.0,1.0,1.0,1.0,0.0,2.0,1.0,...,,R028224481,4875-21,52.png,21.0,69.0,330.0,6.0,22.0,24.0
147,1.53629,913.953488,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,...,,R028224481,4875-21,244.png,,,,,,
136,1.641129,913.953488,1.0,1.0,1.0,2.0,0.0,2.0,1.0,0.0,...,,R028224481,4875-21,234.png,,,,,,


In [107]:
#df_merge[['day_of_year_OCR', 'processed_image_class']].sample(10)

Classify rows with OCR letters read as 'loss' and clear metadata:

In [49]:
OCR_cols = ['station_number_OCR', 'year_OCR', 'day_of_year_OCR', 'hour_OCR', 'minute_OCR', 'second_OCR']
md_cols = ['satellite_number', 'year', 'day_1', 'day_2', 'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1', 
           'second_2', 'station_number_1', 'station_number_2']

In [50]:
if len(df_read) > 0:
    for col in OCR_cols:
        df_merge[col] = df_merge[col].astype('string')
        df_merge.loc[df_merge[col].str.contains("[a-zA-Z]"), 'processed_image_class'] = 'loss'
        df_merge.loc[df_merge[col].str.contains("[a-zA-Z]"), 'details'] = 'OCR read metadata contains letters'   

In [51]:
n_OCR_read = 0
for i in range(0, len(df_merge)):
    if df_merge['processed_image_class'].iloc[i] == 'loss':
        if df_merge['details'].iloc[i] == 'OCR read metadata contains letters':
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan
    elif df_merge['processed_image_class'].iloc[i] == 'num2':
        for col in md_cols:
            df_merge[col].iloc[i] = np.nan
        n_OCR_read += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge[col].iloc[i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge[col].iloc[i] = np.nan


In [52]:
df_merge.loc[df_merge['processed_image_class'] == 'num']

Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
0,1.500000,1204.651163,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,...,,R028224481,4875-21,10.png,,,,,,
3,4.712963,1365.116279,2.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,...,,R028224481,4875-21,104.png,,,,,,
5,4.273585,1365.116279,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,...,,R028224481,4875-21,106.png,,,,,,
6,4.666667,1365.116279,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,R028224481,4875-21,107.png,,,,,,
7,4.574074,1365.116279,1.0,1.0,1.0,1.0,1.0,0.0,2.0,2.0,...,,R028224481,4875-21,108.png,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,4.851852,1146.511628,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,R028224481,4875-21,93.png,,,,,,
234,4.796296,1148.837209,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,...,,R028224481,4875-21,95.png,,,,,,
235,4.555556,1204.651163,1.0,1.0,1.0,1.0,0.0,2.0,0.0,2.0,...,,R028224481,4875-21,96.png,,,,,,
236,1.758065,1262.790698,2.0,0.0,3.0,0.0,2.0,1.0,2.0,2.0,...,,R028224481,4875-21,97.png,,,,,,


If num2 metadata type is detected, classify images with all other metadata types as loss:

#### Assumption: film rolls are unlikely to have mixed metadata - assume that if num2 metadata type is detected, that there is no other images on the film roll with other types of metadata.

In [53]:
if len(df_read) > 0:
    for i in range(0, len(df_merge)):
        if df_merge['processed_image_class'].iloc[i] == 'num':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata could not be read by OCR'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan
        if df_merge['processed_image_class'].iloc[i] == 'dot':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata could not be read by OCR'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['processed_image_class'].iloc[i] = 'loss'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['details'].iloc[i] = 'metadata could not be read by OCR'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge[col].iloc[i] = np.nan


In [54]:
df_merge.loc[df_merge['processed_image_class'] == 'num']

Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR


In [55]:
if n_OCR_read > 0:
    print('Percent of OCR read images that read only numbers, for test subdirectory: ' + str(round((n_OCR_read/len(df_read))*100, 1)) + ' %')

Percent of OCR read images that read only numbers, for test subdirectory: 69.7 %


#### If num2 metadata type is not detected:

In [56]:
if len(df_read) == 0:
    n_num = len(df_merge.loc[df_merge['processed_image_class'] == 'num'])
    n_dot = len(df_merge.loc[df_merge['processed_image_class'] == 'dot'])

In [41]:
n_num

NameError: name 'n_num' is not defined

In [42]:
n_dot

NameError: name 'n_dot' is not defined

If num type metadata is the majority, classify dot type images as loss:

In [57]:
if n_num > n_dot:
    for i in range(0, len(df_merge)):
        if df_merge['processed_image_class'].iloc[i] == 'dot':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata was interpreted to be dot type'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan

NameError: name 'n_num' is not defined

If dot type metadata is the majority, classify num type images as loss:

In [58]:
if n_dot > n_num:
    for i in range(0, len(df_merge)):
        if df_merge['processed_image_class'].iloc[i] == 'num':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata was interpreted to be num type'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan   

NameError: name 'n_dot' is not defined

#  

#### Save:

In [59]:
df_merge.to_csv(resultDir + directory + '/' + 'result_OCRpass-' + directory + '_' + subdirectory + '.csv', index=False)

#  

#### Development:

In [4]:
a = np.nan

In [5]:
b = 265

In [6]:
~np.isnan(a)

False

In [11]:
~pd.isna(a)

-2

In [15]:
pd.isna(a)

True

In [7]:
~np.isnan(b)

True

In [12]:
~pd.isna(b)

-1

In [16]:
pd.isna(b)

False

In [8]:
a + b

nan

In [17]:
if ~pd.isna(b):
    print('b is not NaN!')

b is not NaN!


In [18]:
if pd.isna(a):
    print('a is NaN!')

a is NaN!
