In [0]:
import os
import cv2
import numpy as np

# **CODE for Normalizing Images**
{Before uploading Data to AWS S3 bucket Normalization has been applied}

In [0]:
img_path_output = r"/content/drive/My Drive/RC2/"  #Output of Normalized images{this folder images to be uploaded to S3 bucket}
img_path = r"/content/drive/My Drive/RC/"          #Input of image_norm function

def imagenorm(img_path):
        for filename in os.listdir(img_path):
                img = cv2.imread(img_path + filename, -1)
                rgb_planes = cv2.split(img)
                result_planes = []
                result_norm_planes = []
                for plane in rgb_planes:
                    dilated_img = cv2.dilate(plane, np.ones((12,12), np.uint8))
                    bg_img = cv2.medianBlur(dilated_img, 15)
                    diff_img = 255 - cv2.absdiff(plane, bg_img)
                    norm_img = cv2.normalize(diff_img,None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)
                    result_planes.append(diff_img)
                    result_norm_planes.append(norm_img)
                result = cv2.merge(result_planes)
                result_norm = cv2.merge(np.array(result_planes),np.array(result_norm_planes))
                img_name = filename
                cv2.imwrite(img_path_output + img_name, result)

In [0]:
imagenorm(img_path)

# **Custom Classes**

In [0]:
class RawRegistrationCard(object):
    def __init__(self, file_name, raw_text):
        self.file_name = file_name
        self.text = raw_text
    

In [0]:

class RegistrationCard(object):
    def __init__(self, file_name, reg_num, chassis_num, name, reg_date, mfg_date, eng_name):
        self.file_name = file_name
        self.reg_num = reg_num
        self.chassis_num = chassis_num
        self.name = name
        self.reg_date = reg_date
        self.mfg_date = mfg_date
        self.eng_name = eng_name
    
    def __repr__(self):
        from pprint import pformat
        return pformat(vars(self), indent=4, width=1)

# **Use your Bucket name , access key and secrect key to process test data.**

In [0]:
# Detects text in a document stored in an S3 bucket. 
import boto3
import sys
from time import sleep
import math
import pandas as pd



bucket=''   #bucket name here
ACCESS_KEY=''   #access key here
SECRET_KEY='' #secret key here
client = boto3.client('textract', region_name='us-east-1', aws_access_key_id=ACCESS_KEY,aws_secret_access_key=SECRET_KEY)
s3 = boto3.resource('s3',aws_access_key_id=ACCESS_KEY,aws_secret_access_key=SECRET_KEY)
your_bucket = s3.Bucket(bucket)
# extracted_data = pd.DataFrame()

In [0]:

raw_cards = []

for s3_file in your_bucket.objects.all():
    response = client.detect_document_text(Document={'S3Object': {'Bucket': bucket, 'Name': s3_file.key}})
    blocks=response['Blocks']
    raw_text = []
    for block in blocks:
        if block['BlockType'] == 'LINE':
            raw_text.append(block["Text"])
    raw_cards.append(RawRegistrationCard(s3_file.key, " | ".join(raw_text)))
    sleep(2)


# **Using Regular Expression**

In [0]:
import re

In [0]:
def extractRegistrationNumber(text):
    try:
        delhi_reg_regex = r"DL[A-Z0-9\s]*"
        haryana_reg_regex = r"HR[A-Z0-9\s]*"
        reg_num_delhi = re.search(delhi_reg_regex,text)
        reg_num_har = re.search(haryana_reg_regex,text)
        if reg_num_delhi:
            return reg_num_delhi.group(0)
        elif reg_num_har:
            return reg_num_har.group(0)
        else:
            return ""
    except:
        return ""
 

In [0]:
   
def extractName(text):
    try:
        tokens = text.split(" | ")
        for i, token in enumerate(tokens):
            token = token.strip()
            if "NAME" in token or "Name" in token:
                # check if name is present in the same token ; eg:  | Owner's Name SUBE SINGH | or whether is it in the next
                if "NAME" in token and len(token) - (token.find("NAME") + 5) > 3:
                    return token[token.find("NAME") + 5:]
                elif "Name" in token and len(token) - (token.find("Name") +5) > 3:
                    return token[token.find("Name") + 5:]
                else:
                    return tokens[i+1]
        return ""
    except:
        return ""
    

In [0]:
def extractChassisNumber(text):
    try:
        delhi_ch_regex = r"(CH\.*\s*NO\s*\:*\s*\|*\s*)([A-Z0-9\s\.]*)\s*\|*"
        haryana_ch_regex = r"(Chassis\sNo\.*\s*\|*\s*)\s*([A-Z0-9\.\s]*)"
        ch_num_delhi = re.search(delhi_ch_regex,text)
        ch_num_har = re.search(haryana_ch_regex,text)
        if ch_num_delhi:
            return ch_num_delhi.group(2)
        elif ch_num_har:
            return ch_num_har.group(2)
        else:
            return ""
    except:
        return ""

In [0]:
def extractMfgDate(text):
    try:
        delhi_ch_regex = r"(MFG\s*\.*\s*DT\s*\.*\s*\|*\s*)([0-9]+\/20[0-9]+)"
        haryana_ch_regex = r"(Month.*Mfg.*)([0-9]+\/20[0-9]+)"
        ch_num_delhi = re.search(delhi_ch_regex,text)
        ch_num_har = re.search(haryana_ch_regex,text)
        if ch_num_delhi:
            return ch_num_delhi.group(2)
        elif ch_num_har:
            return ch_num_har.group(2)
        else:
            return ""
    except:
        return ""

In [0]:
def extractRegDate(text):
    try:
        delhi_ch_regex = r"(REG.*DT.*\s)([0-9]+\/[0-9]+\/[0-9]+)"
        haryana_ch_regex = r"([0-9]+\.[0-9]+\.[0-9]+)"
        ch_num_delhi = re.search(delhi_ch_regex,text)
        ch_num_har = re.search(haryana_ch_regex,text)
        if ch_num_delhi:
            return ch_num_delhi.group(2)
        elif ch_num_har:
            return ch_num_har.group(2)
        else:
            return ""
    except:
        return ""

In [0]:
def extractEngNum(text):
    try:
        delhi_ch_regex = r"(E[E\s\.]*NO[\.\|\s]*\s*)([A-Z0-9]*)"
        haryana_ch_regex = r"(Engine\s*)([A-Z0-9]*)"
        ch_num_delhi = re.search(delhi_ch_regex,text)
        ch_num_har = re.search(haryana_ch_regex,text)
        if ch_num_delhi:
            return ch_num_delhi.group(2)
        elif ch_num_har:
            return ch_num_har.group(2)
        else:
            return ""
    except:
        return ""

In [12]:
sample = raw_cards[1].text
print(extractName(sample))
print(extractChassisNumber(sample))
print(extractRegistrationNumber(sample))
print(extractMfgDate(sample))
print(extractRegDate(sample))
print(extractEngNum(sample))

SRISHTI NAYAR
MA3FHEB1S00358580 
DL9CAC6215 
12/2012
23/12/2027
D13A0338461


In [0]:
cards = []

for raw_card in raw_cards:
    cards.append(RegistrationCard(raw_card.file_name, 
                                  extractRegistrationNumber(raw_card.text), 
                                  extractChassisNumber(raw_card.text), 
                                  extractName(raw_card.text), 
                                  extractRegDate(raw_card.text), 
                                  extractMfgDate(raw_card.text), 
                                  extractEngNum(raw_card.text)))
    

In [0]:
df = pd.DataFrame([card.__dict__ for card in cards ])

In [15]:
df

Unnamed: 0,file_name,reg_num,chassis_num,name,reg_date,mfg_date,eng_name
0,txt_mudit_b11_11597.jpg,DL9CAC6215,,SRISHTI NAYAR,23/07/2027,12/2012,D13A0338461
1,txt_mudit_b11_11599.jpg,DL9CAC6215,MA3FHEB1S00358580,SRISHTI NAYAR,23/12/2027,12/2012,D13A0338461
2,txt_mudit_b11_12.jpg,DL2CAT9109 NEW,MA3ETDE1S00218363,ANOOP SURESH DHAWALE,26/12/2015,07/2015,7567094
3,txt_mudit_b11_1328.jpg,HRO6P 5988,MA3EYD81S00765439,SUBE SINGH,,9/2006,F8DN3321864
4,txt_mudit_b11_1330.jpg,HRO6S 8814,MA3EYD81S01277497,RANBEER,,1/2008,F8DN1266647
5,txt_mudit_b11_1332.jpg,HR49D 0002,MA3FJEB1S00404062,AMAR NATH,,9/2013,D13A2235550
6,txt_mudit_b11_1334.jpg,HR06AH1768,MA3ELMG1800384268,MR HARISH KUMAR,,4/2016,K14BN
7,txt_mudit_b11_1337.jpg,HR14K 6035,MA.SF FJEB1560872866,PAWAN KUMAAR,,7/2013,D13A
8,txt_mudit_b11_1339.jpg,HRO6AE8243,MA3EJKD1800683213,"and Address HARISONS AUTOMOBILES,G.T.ROAD,PANIPAT",,1/2015,K12MN
9,txt_mudit_b11_1343.jpg,HRO6AK2102,MA3FHEB1S00B52684,MR. JITENDER,,9/2016,D13A2899227


In [0]:
from google.colab import drive
drive.mount('/content/drive')

# **Saving the Dataframe as .csv file**

In [0]:
df.to_csv("/content/drive/My Drive/result.csv")