# 1) Convert PDF to JPG 

In [13]:
import glob
from pdf2image import convert_from_path
import os
from PIL import Image

# ---------------------------------------------------------------------------

input_path = 'pdf_forms/credit card form pdf/'
output_path = 'jpg_forms/raw_images/'

# ---------------------------------------------------------------------------

!mkdir jpg_forms
!mkdir jpg_forms/raw_images

# ---------------------------------------------------------------------------

pdf_list = glob.glob1(input_path,"*.pdf")
print("There are",len(pdf_list),"pdf files")

# ---------------------------------------------------------------------------

from tqdm import tqdm
j = 0
for i in tqdm(range(len(pdf_list))):
    pages = convert_from_path(input_path+pdf_list[i])
    for page in pages:
        page.save(output_path + 'image_' + str(j).zfill(3) + ".jpg", 'JPEG')
        j+=1
        
print("Successfully converted to jpg images. See the output at",output_path)

# ---------------------------------------------------------------------------

jpg_list = glob.glob1(output_path,"*.jpg")
print("\nThere are", len(jpg_list), "images to be labelled")

# ---------------------------------------------------------------------------

  0%|          | 0/8 [00:00<?, ?it/s]

There are 8 pdf files


100%|██████████| 8/8 [00:11<00:00,  1.48s/it]

Successfully converted to jpg images. See the output at jpg_forms/raw_images/

There are 62 images to be labelled





# 2) Resize JPG dimensions to majority image

In [22]:
rows = []

jpg_list = glob.glob1(output_path,"*.jpg")
for i in tqdm(range(len(jpg_list))):
    img = Image.open(output_path+jpg_list[i])
    img_name = jpg_list[i]
    img_size = img.size
    row = list([img_name, img_size])
    rows.append(row)

import pandas as pd
img_df = pd.DataFrame(rows, columns=['Image Name', 'Dimension'])
img_df['Dimension'].value_counts()

100%|██████████| 62/62 [00:00<00:00, 1515.34it/s]


(1654, 2339)    56
(1655, 2339)     4
(2725, 3862)     1
(2728, 3870)     1
Name: Dimension, dtype: int64

In [23]:
from PIL import Image
jpg_list = sorted(glob.glob1(output_path,"*.jpg"))
output_path2 = 'jpg_forms/resized_images/'

!mkdir jpg_forms/resized_images

for i in tqdm(range(len(jpg_list))):
    img = Image.open(output_path+jpg_list[i])
    img = img.resize((1000,1000))
    img.save(output_path2 + 'resized_image_' + str(i).zfill(3) + ".jpg", 'JPEG')

100%|██████████| 62/62 [00:06<00:00,  9.07it/s]


# 3) Label objects in images using labelImg

In [19]:
!python3 ../labelImg/labelImg.py

  self.dock.setFeatures(self.dock.features() ^ self.dockFeatures)
  self.zoomWidget.setValue(value)
  v_bar.setValue(new_v_bar_value)
  bar.setValue(bar.value() + bar.singleStep() * units)
  h_bar.setValue(new_h_bar_value)
Image:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resized_image_000.jpg -> Annotation:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resized_image_000.xml
Image:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resized_image_000.jpg -> Annotation:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resized_image_000.xml
Image:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resized_image_001.jpg -> Annotation:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resized_image_001.xml
Image:/home/azzubair/Desktop/DataMicron/RND/myOCR_banking_form/jpg_forms/resized_images/resiz

# 4) Augment labelled images

In [88]:
import lxml.etree

output_path2 = 'jpg_forms/resized_images/'
output_path3 = 'jpg_forms/augmented_images/'
output_path4 = 'jpg_forms/selected_images/'

!mkdir jpg_forms/augmented_images/
xml_list = sorted(glob.glob1(output_path2,"*.xml"))

k = 1
for i in tqdm(range(len(xml_list))):
    img = Image.open(output_path2+xml_list[i][:-4]+'.jpg')
    img.save(output_path3 + 'augmented_image_' + str(i).zfill(3) + ".jpg", 'JPEG')
    tree = lxml.etree.parse(output_path2 +xml_list[i])  
    root = tree.getroot()
    for member in root.findall('object'):
            root.find('filename').text = 'augmented_image_' + str(i).zfill(3) + ".jpg"
    tree.write(output_path3+ 'augmented_image_' + str(i).zfill(3) + ".xml")
    for j in range(4):
        img.save(output_path3 + 'augmented_image_copy_' + str(k).zfill(3) + ".jpg", 'JPEG')
        for member in root.findall('object'):
            root.find('filename').text = 'augmented_image_copy_' + str(k).zfill(3) + ".jpg"
        tree.write(output_path3+ 'augmented_image_copy_' + str(k).zfill(3) + ".xml")
        k+=1
total_image = sorted(glob.glob1(output_path3,"*.jpg"))
total_xml   = sorted(glob.glob1(output_path3,"*.xml"))
print("\nThere are " + str(len(total_image)) + " images after augmentation")

!mkdir jpg_forms/selected_images/

for i in tqdm(range(len(total_image))):
    img = Image.open(output_path3+total_image[i][:-4]+'.jpg')
    img.save(output_path4 +total_image[i], 'JPEG')

print("\nSuccessfully select " + str(len(total_image)) + " images only for model training")

100%|██████████| 45/45 [00:02<00:00, 15.98it/s]



There are 225 images after augmentation


100%|██████████| 225/225 [00:04<00:00, 53.21it/s]


Successfully select 225 images only for model training





In [89]:
%%bash

mv jpg_forms/selected_images jpg_forms/train
cd jpg_forms
zip -r train.zip train/

  adding: train/ (stored 0%)
  adding: train/augmented_image_copy_022.jpg (deflated 7%)
  adding: train/augmented_image_copy_114.jpg (deflated 13%)
  adding: train/augmented_image_044.jpg (deflated 13%)
  adding: train/augmented_image_copy_104.jpg (deflated 5%)
  adding: train/augmented_image_copy_046.jpg (deflated 28%)
  adding: train/augmented_image_copy_047.jpg (deflated 28%)
  adding: train/augmented_image_014.jpg (deflated 24%)
  adding: train/augmented_image_023.jpg (deflated 16%)
  adding: train/augmented_image_copy_034.jpg (deflated 5%)
  adding: train/augmented_image_copy_098.jpg (deflated 23%)
  adding: train/augmented_image_copy_171.jpg (deflated 7%)
  adding: train/augmented_image_copy_123.jpg (deflated 11%)
  adding: train/augmented_image_copy_108.jpg (deflated 15%)
  adding: train/augmented_image_copy_020.jpg (deflated 11%)
  adding: train/augmented_image_copy_131.jpg (deflated 14%)
  adding: train/augmented_image_copy_080.jpg (deflated 20%)
  adding: train/augmented_imag

# 4) Convert XML to CSV files

In [90]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET


def xml_to_csv(input_path, output_path):
    xml_list = []
    for xml_file in glob.glob(input_path + '/*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for member in root.findall('object'):
            value = (root.find('filename').text,
                     int(root.find('size')[0].text),
                     int(root.find('size')[1].text),
                     member[0].text,
                     int(member[4][0].text),
                     int(member[4][1].text),
                     int(member[4][2].text),
                     int(member[4][3].text)
                     )
            xml_list.append(value)
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    xml_df.to_csv(output_path + 'labels.csv', index=None)
    return xml_df

input_path = 'jpg_forms/augmented_images/'
output_path = 'jpg_forms/'
xml_to_csv(input_path , output_path)

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,augmented_image_copy_131.jpg,1000,1000,Signature,290,784,488,860
1,augmented_image_copy_131.jpg,1000,1000,Signature,610,784,812,861
2,augmented_image_copy_131.jpg,1000,1000,Name,265,861,500,879
3,augmented_image_copy_131.jpg,1000,1000,Name,583,862,838,879
4,augmented_image_copy_131.jpg,1000,1000,IC,263,880,502,890
...,...,...,...,...,...,...,...,...
1195,augmented_image_copy_097.jpg,1000,1000,Date,441,926,831,959
1196,augmented_image_copy_097.jpg,1000,1000,Date,522,727,930,758
1197,augmented_image_copy_097.jpg,1000,1000,Signature,66,713,513,757
1198,augmented_image_003.jpg,1000,1000,Logo,672,905,936,955
