<a href="https://colab.research.google.com/github/anuj-glitch/rc-info-extractor/blob/master/rc_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [171]:
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.


In [172]:
!pip install pytesseract



In [0]:
# import the necessary packages

import pytesseract

import pandas as pd
import numpy as np
from PIL import Image,ImageEnhance
import os
import cv2
from skimage.filters import threshold_local
import imutils
import re

from google.colab.patches import cv2_imshow




In [0]:


def order_points(pts):
	# initialzie a list of coordinates that will be ordered
	# such that the first entry in the list is the top-left,
	# the second entry is the top-right, the third is the
	# bottom-right, and the fourth is the bottom-left
	rect = np.zeros((4, 2), dtype = "float32")
	# the top-left point will have the smallest sum, whereas
	# the bottom-right point will have the largest sum
	s = pts.sum(axis = 1)
	rect[0] = pts[np.argmin(s)]
	rect[2] = pts[np.argmax(s)]
	# now, compute the difference between the points, the
	# top-right point will have the smallest difference,
	# whereas the bottom-left will have the largest difference
	diff = np.diff(pts, axis = 1)
	rect[1] = pts[np.argmin(diff)]
	rect[3] = pts[np.argmax(diff)]
	# return the ordered coordinates
	return rect

def four_point_transform(image, pts):
	# obtain a consistent order of the points and unpack them
	# individually
	rect = order_points(pts)
	(tl, tr, br, bl) = rect
	# compute the width of the new image, which will be the
	# maximum distance between bottom-right and bottom-left
	# x-coordiates or the top-right and top-left x-coordinates
	widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
	widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
	maxWidth = max(int(widthA), int(widthB))
	# compute the height of the new image, which will be the
	# maximum distance between the top-right and bottom-right
	# y-coordinates or the top-left and bottom-left y-coordinates
	heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
	heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
	maxHeight = max(int(heightA), int(heightB))
	# now that we have the dimensions of the new image, construct
	# the set of destination points to obtain a "birds eye view",
	# (i.e. top-down view) of the image, again specifying points
	# in the top-left, top-right, bottom-right, and bottom-left
	# order
	dst = np.array([
		[0, 0],
		[maxWidth - 1, 0],
		[maxWidth - 1, maxHeight - 1],
		[0, maxHeight - 1]], dtype = "float32")
	# compute the perspective transform matrix and then apply it
	M = cv2.getPerspectiveTransform(rect, dst)
	warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
	# return the warped image
	return warped


def de_noise(image):
    img = cv2.fastNlMeansDenoising(image,None,10,10,7,21)
    return img


def sharpen_image(image):
    #sharpen image
    kernel_sharpening = np.array([[-1,-1,-1],
                                  [-1, 10,-1],
                                  [-1,-1,-1]])
    # applying the sharpening kernel to the input image & displaying it.
    sharpen = cv2.filter2D(image, -1, kernel_sharpening)
    #cv2.imshow('Image Sharpening', sharpen)
    return sharpen

def blur_check(image):
    laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
    return  laplacian_var


def transform(image):
      ratio = image.shape[0] / 500.0
      orig = image.copy()
      image = imutils.resize(image, height = 500)
      
      # convert the image to grayscale, blur it, and find edges in the image
      #gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
      gray = cv2.GaussianBlur(image, (5, 5), 0)
      edged = cv2.Canny(gray, 75, 200)
      # show the original image and the edge detected image
      #cv2_imshow(image)
      #cv2_imshow(edged)


      # find the contours in the edged image, keeping only the
      # largest ones, and initialize the screen contour
      cnts = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
      cnts = imutils.grab_contours(cnts)
      cnts = sorted(cnts, key = cv2.contourArea, reverse = True)[:5]
      # loop over the contours

      
      for c in cnts:
          # approximate the contour
          peri = cv2.arcLength(c, True)
          approx = cv2.approxPolyDP(c, 0.02 * peri, True)
          # if our approximated contour has four points, then we
          # can assume that we have found our screen
          if len(approx) == 4:
            screenCnt = approx
            break

      # show the contour (outline) of the rc image
      #cv2.drawContours(image, [screenCnt], -1, (0, 255, 0), 2)
      #cv2_imshow(image)

      # apply the four point transform to obtain a top-dow view of the original image
      warped = four_point_transform(orig, screenCnt.reshape(4, 2) * ratio)
     
      # show the Applied perspective transformed images
      #print("STEP 3: Apply perspective transform")
      #cv2_imshow(imutils.resize(warped, height = 500))

      return warped




def extractdata(Rc_list):
  
  data=[]
  j=0

  for rc_image in Rc_list: 
    
    
    # 'RC/'+pictures is the directory of test images. 
    # Pictures are taken one at a time
    #img = Image.open('/content/RC/'+ rc_image)   
    
    Ttext=""
    #Converting Image to string
    try:  
        Timg = cv2.imread('/content/RC/'+ rc_image,0)  
        Timg = transform(Timg)
        Timg = Image.fromarray(Timg)
        #Timg = cv2.convertScaleAbs(Timg, alpha=1, beta=0)
        enhanced_img=ImageEnhance.Brightness(Timg)                         
        Timg=enhanced_img.enhance(1.2)
        enhanced_img=ImageEnhance.Color(Timg)
        Timg=enhanced_img.enhance(0.0)
                

       # Timg.save('%s' % ('T'+rc_image) ,dpi=(300,300))
       # Timg = Image.open('%s' % ('T'+rc_image))
        Ttext = pytesseract.image_to_string(Timg)
    except:
        print("")

    Nimg = Image.open('/content/RC/'+ rc_image)
    enhanced_img=ImageEnhance.Brightness(Nimg)                         
    Nimg=enhanced_img.enhance(1.2)
    enhanced_img=ImageEnhance.Color(Nimg)
    Nimg=enhanced_img.enhance(0.0)

  #  Nimg.save('%s' % ('N'+rc_image) ,dpi=(300,300))
  #  Nimg = Image.open('%s' % ('N'+rc_image))
    Ntext = pytesseract.image_to_string(Nimg)


    if ((len(Ttext) - Ttext.count(" ")) < (len(Ntext) - Ntext.count(" "))):

          text = Ntext
    else:
          text = Ttext
        

    
    # spliting the text according the spaces
    new_str=text.split()  

   # print("-------------------------"+str(j)+rc_image+"--------------------------\n"+ text)



    # spliting text on the basis of space
    #initializing list which hold all 6 values needed from a image
   
    aa=[None]*7
    aa[0]=rc_image                                             
    data.append(aa)
    two=0

    #======  Fetching License plate number or Regn number   ===============================================================
    
    pattern =  "[A-Z]{1,3}\d\w[-]?[A-Z][-]?\D?\d{4}" 
    regn = re.search(pattern, text)
    if regn is not None:
        data[j][1]= regn.group()
    
    else:
     for i in range(len(new_str)-6):
        if new_str[i]=='REGN':
            if len(new_str[i+4])<9:
                    a=new_str[i+4]+new_str[i+5]
                    data[j][1]=a
            else:
                data[j][1]=new_str[i+4]
        else:
            if new_str[i]=='REGN.':
                if len(new_str[i+3])<9:
                    a=new_str[i+3]+new_str[i+4]
                    data[j][1]=a
                else:
                    data[j][1]=new_str[i+3]
        if new_str[i]=="Registration":
            if new_str[i+1]=="No.":
                if len(new_str[i+2])<9:
                    a=new_str[i+2]+new_str[i+3]
                    data[j][1]=a
                else:
                    data[j][1]=new_str[i+2]
                
        if 'HR' in new_str[i]:
            data[j][1]==new_str[i]

        
    #========== Fetching VIN number or Chassis number (typically 17 digit long)  ==========================================
    pattern = "\w{9,11}\s?\d{5,6}" 
    chass = re.search(pattern, text)
    if chass is not None:
        data[j][2]= chass.group()
        
    else: 
     for i in range(len(new_str)-6):
        if new_str[i]=='CH.' or new_str[i]=='CH'  :    
            data[j][1]=new_str[i+3]
            
        if new_str[i]=="Chassis" or new_str[i]=="Chasis":
            if new_str[i+1]=="No." or new_str[i]=='CHINO' :
                data[j][1]=new_str[i+2]
          


# =========== Fetching Name  ==========================================================================================
    for i in range(len(new_str)-6):
        if new_str[i]=='NAME' or new_str[i]=='Name' or new_str[i]=='name':
            if '_' or '-' in new_str[i+1]:
                if new_str[i+2]==':' or new_str[i+1]==':.':
                    if new_str[i+3]=='MR' or new_str[i+3]=='mr' or new_str[i+3]=='Mr' or new_str[i+3]=='MR.' or new_str[i+3]=='mr.' or new_str[i+3]=='Mr.':
                        name=new_str[i+4]
                    else:
                        name=new_str[i+3]
                else:
                    if new_str[i+2]=='MR' or new_str[i+2]=='mr' or new_str[i+2]=='Mr' or new_str[i+2]=='MR.' or new_str[i+2]=='mr.' or new_str[i+2]=='Mr.':
                        name=new_str[i+3]
                    else:
                        name=new_str[i+2]
                    
            elif new_str[i+1]==':' or new_str[i+1]==':.':
                    if new_str[i+2]=='MR' or new_str[i+2]=='mr' or new_str[i+2]=='Mr' or new_str[i+2]=='MR.' or new_str[i+2]=='mr.' or new_str[i+2]=='Mr.':
                        name=new_str[i+3]
                    else:
                        name=new_str[i+2]
            elif new_str[i+1]=='MR' or new_str[i+1]=='mr' or new_str[i+1]=='Mr' or new_str[i+1]=='MR.' or new_str[i+1]=='mr.' or new_str[i+1]=='Mr.' :
                name=new_str[i+2]
            
            elif new_str[i-1]=="Owner's":
                name=new_str[i+1]
            elif "&" in new_str[i+1]:
                name=new_str[i+4] #i+4
            else:
                name=new_str[i+1]
    data[j][3]= name


    
              
    #=========== Fetching Engine number  ====================================================================================

    pattern = "[A-Z][0-9]\w{3,4}\s?\d{5,6}"
    eno = re.search(pattern, text)
    if eno is not None:
        data[j][4]= eno.group()

    else:
     for i in range(len(new_str)-6):
        if new_str[i]=='ENO':
            if new_str[i+1]=='-' or new_str[i+1]=='_':
                data[j][3]=new_str[i+3]
            else:
                if new_str[i+2][-1]=='-':
                    a=str(new_str[i+2])+str(new_str[i+3])
                    data[j][3]=a
                else:
                    data[j][3]=new_str[i+2]
        if new_str[i]=="Engine":
            if len(new_str[i+2])<11:
                a=new_str[i+2]+new_str[i+3]
                data[j][3]=a
            else:
                data[j][3]=new_str[i+2]
                

      

    # ======== Fetching Registration date  ==============================================================================
 
    pattern = "\d{1,2}[/-]\w{2,}[/-]\d{4}" 
    dates = re.findall(pattern, text)
    if (len(dates)!=0):
        if len(dates)==1:
          for date in dates:
            if "-" in date:
                day, month, year = date.split("-")
            else:
                day, month, year = date.split("/")
            if int(year)>2020:
            #    data[j][5]= date[0]
                data[j][6]= dates[0]
            else:
                data[j][5]= date[0]
              #   data[j][6]= date
        else:
                data[j][5]= dates[0]
                data[j][6]= dates[1]
            
    else:
      for i in range(len(new_str)-6): 
        if new_str[i]=="REG." or new_str[i]=="REGN":
            if "DT" in new_str[i+1]:
                if new_str[i+2]==":":
                    reg_date=new_str[i+3]
                else:
                    reg_date=new_str[i+2]
                length=len(reg_date)
                reg_date=reg_date[:length-8]+" / "+reg_date[length-7:length-5]+" / "+reg_date[length-4:]        
            
                data[j][5]=reg_date
        
        if new_str[i]=="Year":
            if new_str[i+1]=="of":
                if "Manu" in new_str[i+2]:
                    manufacture_date=new_str[i+3]
            length=len(manufacture_date)
            manufacture_date=manufacture_date[:length-5]+" / "+manufacture_date[length-4:]
            data[j][6]=manufacture_date
            
        if new_str[i]=="Yr":
            if new_str[i+1]=="of":
                manufacture_date=new_str[i+2]
            length=len(manufacture_date)
            manufacture_date=manufacture_date[:length-5]+" / "+manufacture_date[length-4:]
            data[j][6]=manufacture_date
        
        
        
        if new_str[i]=="Month":
            if new_str[i+3]=="of":
                if new_str[i+4]=="Mfg." or new_str[i+4]=="Mfg" or new_str[i+4]=="Mig." or new_str[i+4]=="Mig":
                    manufacture_date=new_str[i+5]
            length=len(manufacture_date)
            manufacture_date=manufacture_date[:length-5]+"/"+manufacture_date[length-4:]
            data[j][6]=manufacture_date   
        
        if "MFG" in new_str[i]:
            if new_str[i+1]==":":
                manufacture_date=new_str[i+2]
            elif "DT" in new_str[i+1]:
                if new_str[i+2]==":":
                    manufacture_date=new_str[i+3]
                else:
                    manufacture_date=new_str[i+2]
            else:
                manufacture_date=new_str[i+1]
            length=len(manufacture_date)
            manufacture_date=manufacture_date[:length-5]+"/"+manufacture_date[length-4:]
            data[j][6]=manufacture_date
        
      
            

  
    j+=1
  return data



In [175]:

#Creating a list to store the images from RC folder
Rc_list=os.listdir('RC')

# test images will go here... create a folder named - "RC" and copy paste all the test photos in it.
for i in range(len(Rc_list)):
    print(i,Rc_list[i])


0 txt_mudit_b11_1343.jpg
1 txt_mudit_b8_1_574.jpg
2 txt_mudit_b11_961.jpg
3 txt_mudit_b11_11597.jpg
4 txt_mudit_b11_1524.jpg
5 txt_mudit_b11_1332.jpg
6 txt_mudit_b11_11599.jpg
7 txt_mudit_b8_1_554.jpg
8 txt_mudit_b8_1_555.jpg
9 txt_mudit_b11_1535.jpg
10 txt_mudit_b11_1536.jpg
11 txt_mudit_b8_1_571.jpg
12 txt_mudit_b11_12.jpg
13 txt_mudit_b11_1364.jpg
14 txt_mudit_b8_1_741.jpg
15 txt_mudit_b11_1534.jpg
16 txt_mudit_b11_1361.jpg
17 txt_mudit_b11_1369.jpg
18 txt_mudit_b8_1_513.jpg
19 txt_mudit_b8_1_742.jpg
20 txt_mudit_b11_1330.jpg
21 txt_mudit_b11_1363.jpg
22 txt_mudit_b8_1_573.jpg
23 txt_mudit_b8_1_806.jpg
24 txt_mudit_b11_1354.jpg
25 txt_mudit_b11_1347.jpg
26 txt_mudit_b8_1_740.jpg
27 txt_mudit_b11_1355.jpg
28 txt_mudit_b11_1339.jpg
29 txt_mudit_b8_1_546.jpg
30 txt_mudit_b8_1_807.jpg
31 txt_mudit_b11_1529.jpg
32 txt_mudit_b11_1365.jpg
33 txt_mudit_b11_480.jpg
34 txt_mudit_b8_1_545.jpg
35 txt_mudit_b8_2_213.jpg
36 txt_mudit_b8_1_854.jpg
37 txt_mudit_b11_439.jpg
38 txt_mudit_b11_1334.jpg

In [176]:

#Creating a Dataframe for the data extracted
data = extractdata(Rc_list)
































In [177]:

df=pd.DataFrame(data)
df.columns=["File name","Regn number","Chassis number","Name","Engine number","Registration date","Mfg. date"]
df

Unnamed: 0,File name,Regn number,Chassis number,Name,Engine number,Registration date,Mfg. date
0,txt_mudit_b11_1343.jpg,S00B5268,A3FHEB1S00B52684,Address,B1S00B52684,6-Nov-2016,20-Oct-2031
1,txt_mudit_b8_1_574.jpg,DL3CAx6515,07146KRZwWO006272,BSCS,,2 / 04 / 2009,/-
2,txt_mudit_b11_961.jpg,DL5CJ 6088,MA3FHEB1S00520199,RAJNISH,B1S00520199,06/10/2013,05/10/2028
3,txt_mudit_b11_11597.jpg,DL9CAC6215,,RAJNISH,,,
4,txt_mudit_b11_1524.jpg,5387OSNo,,RAJNISH,,,/CD
5,txt_mudit_b11_1332.jpg,HR49D 0002,,Fue!Used,,3-Nov-2013,08-Oct-2028
6,txt_mudit_b11_11599.jpg,DL9CAC6215,MA3FHEB1S00358580,Address,B1S00358580,2,
7,txt_mudit_b8_1_554.jpg,HR1O-P-0840,MALCG41GLAM255721,Address,G4EB9M256677,2,
8,txt_mudit_b8_1_555.jpg,,,R.gistration,,19/01/2025,03/03/2010
9,txt_mudit_b11_1535.jpg,DL5CJ 4987,MA3EWDE1S00526415,MANJULA,E1S00526415,12/04/2013,14/04/2028
