# OCR Read 'num2' Metadata

#### Updated: May 12, 2023

#  

Test post-processing a subdirectory, to first detect if the image is of 'num2' metadata type, and then to read and record that metadata. 

In [92]:
import pandas as pd
import numpy as np
import os
from random import randrange
import time
import keras_ocr

In [93]:
rootDir = 'L:/DATA/Alouette_I/BATCH_II_Run1/'
processedDir = rootDir + '04_processed/'
resultDir = rootDir + '05_result/'

In [94]:
pipeline = keras_ocr.pipeline.Pipeline()

Looking for C:\Users\rnaidoo\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\rnaidoo\.keras-ocr\crnn_kurapan.h5


#  

#### Functions:

In [95]:
def read_num2_metadata(prediction_groups, subdir_path, batch_i, img_fns):
    
    df_read = pd.DataFrame()
    df_notread = pd.DataFrame()
    for i in range(0, len(prediction_groups)):
        df_ocr = pd.DataFrame()
        predicted_image = prediction_groups[i]
        if len(predicted_image) > 0:
            for text, box in predicted_image:
                row = pd.DataFrame({
                    'number': text,
                    'x': box[1][0],
                    'y': box[1][1]
                }, index=[0])
                df_ocr = pd.concat([df_ocr, row])
            df_ocr = df_ocr.sort_values('x').reset_index(drop=True)
        
            #String concatenate, fix string
            read_str = ''
            for j in range(0, len(df_ocr)):
                read_str_ = df_ocr['number'].iloc[j]
                read_str += read_str_
            read_str = read_str.replace('o', '0')

            #Test for num2
            if len(read_str) == 15:
                if read_str[0:2] == '10':
                    row2 = pd.DataFrame({
                        'station_number_OCR': read_str[2:4],
                        'year_OCR': read_str[4:6],
                        'day_of_year_OCR': read_str[6:9],
                        'hour_OCR': read_str[9:11],
                        'minute_OCR': read_str[11:13],
                        'second_OCR': read_str[13:15],
                        'filename': img_fns[batch_i + i].replace(subdir_path, '')
                    }, index=[i])
                    df_read = pd.concat([df_read, row2])
                else:
                    df_ocr['filename'] = img_fns[batch_i + i].replace(subdir_path, '')
                    df_notread = pd.concat([df_notread, df_ocr])
            else:
                df_ocr['filename'] = img_fns[batch_i + i].replace(subdir_path, '')
                df_notread = pd.concat([df_notread, df_ocr])
    
    return df_read, df_notread

# 

#### Process subdirectory:

In [76]:
#Draw random subdirectory
directory_list = os.listdir(processedDir)
directory = directory_list[randrange(len(directory_list))]
subdirectory_list = os.listdir(processedDir + directory + '/')
subdirectory = subdirectory_list[randrange(len(subdirectory_list))]
print(directory + '/' + subdirectory + '/')

R014207945/2266-13B/


In [122]:
#Manual subdirectory:
directory = 'R014207833'
subdirectory = '4330-12'

#Manual subdirectory (num):
#directory = 'R014207941'
#subdirectory = '1765-5'

In [123]:
testDir = processedDir + directory + '/' + subdirectory + '/'

In [124]:
batch_size = 8 # > 9 seems to lead to a 'dead kernel'

In [125]:
img_fns = []
for file in os.listdir(testDir):
    img_fns.append(testDir + file)
len(img_fns)

278

In [126]:
n_batches = int(np.floor(len(img_fns)/batch_size))
n_batches

34

In [127]:
batch_remainder = len(img_fns)%batch_size
batch_remainder

6

In [None]:
start = time.time()

df_read = pd.DataFrame()
df_notread = pd.DataFrame()
for i in range(0, n_batches):
    print('Starting batch... ' + str(i))
    batch_i = i*batch_size
    batch_f = batch_i + batch_size
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_, df_notread_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir, batch_i=batch_i, 
                                               img_fns=img_fns)
    df_read = pd.concat([df_read, df_read_])
    df_notread = pd.concat([df_notread, df_notread_])
    
#Remainder
print('Finishing up...')
if batch_remainder > 0:
    batch_i = n_batches*batch_size
    batch_f = batch_i + batch_remainder
    prediction_groups = pipeline.recognize(img_fns[batch_i:batch_f])
    df_read_, df_notread_ = read_num2_metadata(prediction_groups=prediction_groups, subdir_path=testDir, batch_i=batch_i, 
                                              img_fns=img_fns)
    df_read = pd.concat([df_read, df_read_])
    df_notread = pd.concat([df_notread, df_notread_])
     
end = time.time()
t = end - start
print('Time to OCR read all images in subdirectory: ' + str(round(t/60, 1)) + ' min')

Starting batch... 0
Starting batch... 1
Starting batch... 2
Starting batch... 3
Starting batch... 4
Starting batch... 5


In [None]:
print(len(df_read))
df_read.head(30)

In [None]:
print(len(df_notread))
df_notread.head(30)

#  

#### Integrate OCR read metadata into existing results data for subdirectory:

In [105]:
df_result = pd.read_csv(resultDir + directory + '/' + 'result-' + directory + '_' + subdirectory + '.csv')
#Change 'Roll' to 'Directory':
df_result = df_result.rename(columns={
    'Roll': 'Directory'
})
print(len(df_result))
df_result.head()

201


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,second_1,second_2,station_number_1,station_number_2,processed_image_class,func_name,details,Directory,Subdirectory,filename
0,1.5,1743.75,2.0,2.0,2.0,2.0,0.0,4.0,3.0,2.0,...,2.0,2.0,2.0,2.0,num,,,R014207833,4361-50A,1.png
1,1.5,1528.125,2.0,2.0,2.0,2.0,0.0,4.0,2.0,0.0,...,3.0,2.0,2.0,2.0,num,,,R014207833,4361-50A,10.png
2,1.5,1196.875,2.0,2.0,2.0,2.0,0.0,4.0,1.0,2.0,...,0.0,2.0,4.0,2.0,num,,,R014207833,4361-50A,100.png
3,1.5,1196.875,2.0,2.0,2.0,2.0,0.0,4.0,2.0,0.0,...,3.0,2.0,2.0,2.0,num,,,R014207833,4361-50A,101.png
4,1.5,800.0,2.0,2.0,2.0,3.0,0.0,4.0,2.0,0.0,...,2.0,2.0,2.0,2.0,num,,,R014207833,4361-50A,102.png


In [106]:
df_result.columns

Index(['fmin', 'max_depth', 'satellite_number', 'year', 'day_1', 'day_2',
       'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1',
       'second_2', 'station_number_1', 'station_number_2',
       'processed_image_class', 'func_name', 'details', 'Directory',
       'Subdirectory', 'filename'],
      dtype='object')

In [107]:
if len(df_result) > 0:
    if len(df_read) > 0:
        df_merge = df_result.merge(df_read, how='left', on='filename')
        for i in range(0, len(df_merge)):
            if df_merge['processed_image_class'].iloc[i] != 'loss':
                if df_merge['processed_image_class'].iloc[i] != 'outlier':
                    if pd.isna(df_merge['day_of_year_OCR'].iloc[i]) == False:
                        df_merge['processed_image_class'].iloc[i] = 'num2'
        #df_merge.loc[~pd.isna(df_merge['day_of_year_OCR']), 'processed_image_class'] = 'num2'
        #df_merge.loc[df_merge['day_of_year_OCR'] > 0, 'processed_image_class'] = 'num2'
    else:
        df_merge = df_result
else:
    df_merge = df_result
    
print(len(df_merge))
df_merge.sample(10)

201


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['processed_image_class'].iloc[i] = 'num2'


Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
147,1.907258,884.375,2.0,2.0,2.0,2.0,2.0,2.0,0.0,4.0,...,,R014207833,4361-50A,49.png,50.0,67.0,358.0,5.0,16.0,52
175,1.544355,2046.875,2.0,2.0,2.0,2.0,0.0,4.0,3.0,0.0,...,,R014207833,4361-50A,74.png,50.0,67.0,358.0,16.0,20.0,07
111,1.592742,1509.375,2.0,2.0,2.0,2.0,0.0,4.0,3.0,2.0,...,,R014207833,4361-50A,216.png,50.0,67.0,363.0,15.0,54.0,27
61,1.705645,884.375,4.0,0.0,4.0,0.0,2.0,4.0,0.0,3.0,...,,R014207833,4361-50A,170.png,50.0,67.0,363.0,4.0,50.0,a0
128,1.5,1334.375,2.0,2.0,2.0,2.0,0.0,4.0,3.0,0.0,...,,R014207833,4361-50A,31.png,,,,,,
138,1.5,884.375,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,...,,R014207833,4361-50A,40.png,50.0,67.0,358.0,5.0,14.0,18
131,2.459677,884.375,2.0,2.0,2.0,2.0,0.0,4.0,2.0,0.0,...,,R014207833,4361-50A,34.png,50.0,67.0,358.0,5.0,12.0,36
116,1.5,1509.375,2.0,2.0,2.0,2.0,0.0,4.0,2.0,2.0,...,,R014207833,4361-50A,220.png,50.0,67.0,363.0,15.0,55.0,36
76,1.604839,1431.25,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,...,,R014207833,4361-50A,184.png,50.0,61.0,363.0,4.0,54.0,38
113,1.5,1431.25,2.0,2.0,2.0,2.0,0.0,4.0,3.0,2.0,...,,R014207833,4361-50A,218.png,50.0,67.0,363.0,15.0,55.0,02


In [108]:
#df_merge[['day_of_year_OCR', 'processed_image_class']].sample(10)

Classify rows with OCR letters read as 'loss' and clear metadata:

In [109]:
OCR_cols = ['station_number_OCR', 'year_OCR', 'day_of_year_OCR', 'hour_OCR', 'minute_OCR', 'second_OCR']
md_cols = ['satellite_number', 'year', 'day_1', 'day_2', 'day_3', 'hour_1', 'hour_2', 'minute_1', 'minute_2', 'second_1', 
           'second_2', 'station_number_1', 'station_number_2']

In [110]:
if len(df_read) > 0:
    for col in OCR_cols:
        df_merge[col] = df_merge[col].astype('string')
        df_merge.loc[df_merge[col].str.contains("[a-zA-Z]"), 'processed_image_class'] = 'loss'
        df_merge.loc[df_merge[col].str.contains("[a-zA-Z]"), 'details'] = 'OCR read metadata contains letters'   

In [111]:
n_OCR_read = 0
for i in range(0, len(df_merge)):
    if df_merge['processed_image_class'].iloc[i] == 'loss':
        if df_merge['details'].iloc[i] == 'OCR read metadata contains letters':
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan
    elif df_merge['processed_image_class'].iloc[i] == 'num2':
        for col in md_cols:
            df_merge[col].iloc[i] = np.nan
        n_OCR_read += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge[col].iloc[i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge[col].iloc[i] = np.nan


In [112]:
df_merge.loc[df_merge['processed_image_class'] == 'num']

Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR
0,1.500000,1743.750,2.0,2.0,2.0,2.0,0.0,4.0,3.0,2.0,...,,R014207833,4361-50A,1.png,,,,,,
2,1.500000,1196.875,2.0,2.0,2.0,2.0,0.0,4.0,1.0,2.0,...,,R014207833,4361-50A,100.png,,,,,,
3,1.500000,1196.875,2.0,2.0,2.0,2.0,0.0,4.0,2.0,0.0,...,,R014207833,4361-50A,101.png,,,,,,
4,1.500000,800.000,2.0,2.0,2.0,3.0,0.0,4.0,2.0,0.0,...,,R014207833,4361-50A,102.png,,,,,,
5,1.500000,1096.875,3.0,2.0,2.0,2.0,0.0,4.0,0.0,2.0,...,,R014207833,4361-50A,104.png,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,1.552419,1431.250,2.0,2.0,2.0,3.0,0.0,4.0,2.0,0.0,...,,R014207833,4361-50A,70.png,,,,,,
182,1.532258,1334.375,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,,R014207833,4361-50A,80.png,,,,,,
184,1.524194,1096.875,3.0,2.0,2.0,2.0,0.0,4.0,2.0,0.0,...,,R014207833,4361-50A,82.png,,,,,,
186,1.500000,1587.500,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,,R014207833,4361-50A,9.png,,,,,,


If num2 metadata type is detected, classify images with all other metadata types as loss:

#### Assumption: film rolls are unlikely to have mixed metadata - assume that if num2 metadata type is detected, that there is no other images on the film roll with other types of metadata.

In [113]:
if len(df_read) > 0:
    for i in range(0, len(df_merge)):
        if df_merge['processed_image_class'].iloc[i] == 'num':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata could not be read by OCR'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan
        if df_merge['processed_image_class'].iloc[i] == 'dot':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata could not be read by OCR'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['processed_image_class'].iloc[i] = 'loss'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['details'].iloc[i] = 'metadata could not be read by OCR'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge[col].iloc[i] = np.nan


In [114]:
df_merge.loc[df_merge['processed_image_class'] == 'num']

Unnamed: 0,fmin,max_depth,satellite_number,year,day_1,day_2,day_3,hour_1,hour_2,minute_1,...,details,Directory,Subdirectory,filename,station_number_OCR,year_OCR,day_of_year_OCR,hour_OCR,minute_OCR,second_OCR


In [115]:
if n_OCR_read > 0:
    print('Percent of OCR read images that read only numbers, for test subdirectory: ' + str(round((n_OCR_read/len(df_read))*100, 1)) + ' %')

Percent of OCR read images that read only numbers, for test subdirectory: 83.0 %


#### If num2 metadata type is not detected:

In [116]:
if len(df_read) == 0:
    n_num = len(df_merge.loc[df_merge['processed_image_class'] == 'num'])
    n_dot = len(df_merge.loc[df_merge['processed_image_class'] == 'dot'])

In [117]:
n_num

NameError: name 'n_num' is not defined

In [118]:
n_dot

NameError: name 'n_dot' is not defined

If num type metadata is the majority, classify dot type images as loss:

In [119]:
if n_num > n_dot:
    for i in range(0, len(df_merge)):
        if df_merge['processed_image_class'].iloc[i] == 'dot':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata was interpreted to be dot type'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan

NameError: name 'n_num' is not defined

If dot type metadata is the majority, classify num type images as loss:

In [120]:
if n_dot > n_num:
    for i in range(0, len(df_merge)):
        if df_merge['processed_image_class'].iloc[i] == 'num':
            df_merge['processed_image_class'].iloc[i] = 'loss'
            df_merge['details'].iloc[i] = 'metadata was interpreted to be num type'
            for col in md_cols:
                df_merge[col].iloc[i] = np.nan   

NameError: name 'n_dot' is not defined

#  

#### Save:

In [121]:
df_merge.to_csv(resultDir + directory + '/' + 'result_OCRpass-' + directory + '_' + subdirectory + '.csv', index=False)

#  

#### Development:

In [4]:
a = np.nan

In [5]:
b = 265

In [6]:
~np.isnan(a)

False

In [11]:
~pd.isna(a)

-2

In [15]:
pd.isna(a)

True

In [7]:
~np.isnan(b)

True

In [12]:
~pd.isna(b)

-1

In [16]:
pd.isna(b)

False

In [8]:
a + b

nan

In [17]:
if ~pd.isna(b):
    print('b is not NaN!')

b is not NaN!


In [18]:
if pd.isna(a):
    print('a is NaN!')

a is NaN!
