<a href="https://colab.research.google.com/github/arpit-parejiya01/Text_Extraction_OCR/blob/main/custom_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import cv2
from sklearn.model_selection import train_test_split

# Image dimensions (in pixels) shared by the model definition, the dummy data
# generator and the inference-time resize below.
img_height = 32
img_width = 128
num_classes = 80  # Number of possible characters (adjust to your charset)

# Build CNN Model for feature extraction
def build_cnn(input_shape):
    """Build the convolutional feature extractor of the OCR model.

    Four Conv2D (3x3, ReLU) + MaxPooling2D (2x2) stages with 32, 64, 128 and
    256 filters, followed by a reshape that flattens the remaining spatial
    grid into a sequence of 256-dimensional feature vectors.

    Args:
        input_shape: (height, width, channels) of a single input image.

    Returns:
        A ``tf.keras.Sequential`` whose output has shape (batch, steps, 256).
    """
    # 'same' padding stops the convolutions from shrinking the feature map.
    # With the default 'valid' padding a 32-pixel-high image is reduced to a
    # height of 2 before the fourth convolution, which is smaller than its
    # 3x3 kernel, so the network could not be built for the image size used
    # in this notebook.
    conv_options = dict(kernel_size=(3, 3), activation='relu', padding='same')
    model = tf.keras.Sequential([
        layers.Conv2D(32, input_shape=input_shape, **conv_options),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, **conv_options),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(128, **conv_options),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(256, **conv_options),
        layers.MaxPooling2D(pool_size=(2, 2)),
        # Flatten the (rows, cols) grid into one sequence axis of 256-d vectors.
        layers.Reshape((-1, 256))
    ])
    return model

# Build RNN for sequence prediction
def build_rnn(input_shape):
    """Build the recurrent head that turns feature sequences into class scores.

    Two stacked 256-unit LSTMs (both returning full sequences) feed a
    per-step softmax over ``num_classes``.

    Args:
        input_shape: Accepted for symmetry with ``build_cnn``; the layers
            below do not read it.

    Returns:
        A ``tf.keras.Sequential`` producing (batch, steps, num_classes).
    """
    recurrent_head = tf.keras.Sequential()
    for _ in range(2):
        recurrent_head.add(layers.LSTM(256, return_sequences=True))
    recurrent_head.add(layers.Dense(num_classes, activation='softmax'))
    return recurrent_head

# Define the CTC Loss Function
def ctc_loss_lambda_func(y_true, y_pred=None):
    """Compute the per-sample CTC loss for a batch.

    Every time step of ``y_pred`` and every position of ``y_true`` is treated
    as valid: the lengths handed to the CTC cost are simply the sizes of the
    second axis of each tensor.

    Args:
        y_true: Label tensor of shape (batch, max_label_len), or - when the
            function is used directly as a ``Lambda`` layer function - the
            list ``[y_true, y_pred]`` that Keras passes as a single argument.
        y_pred: Softmax output of shape (batch, steps, num_classes). Omit it
            when both tensors are packed into ``y_true``.

    Returns:
        The tensor returned by ``tf.keras.backend.ctc_batch_cost``.
    """
    if y_pred is None:
        # Lambda layers call their function with one positional argument, so
        # unpack the [labels, predictions] pair here.
        y_true, y_pred = y_true

    # keepdims=True yields (batch, 1) length tensors, the shape that
    # ctc_batch_cost expects; without it the lengths are (batch,).
    input_length = tf.math.reduce_sum(
        tf.ones_like(y_pred[:, :, 0]), axis=1, keepdims=True)
    label_length = tf.math.reduce_sum(
        tf.ones_like(y_true), axis=1, keepdims=True)
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

# Build the OCR model using CNN+RNN with CTC loss
def build_ocr_model(img_height, img_width, num_classes):
    """Assemble the trainable CNN + RNN model whose output is the CTC loss.

    Args:
        img_height: Height of the grayscale input images in pixels.
        img_width: Width of the grayscale input images in pixels.
        num_classes: Not read inside this function; ``build_rnn`` takes the
            class count from the module-level ``num_classes``.

    Returns:
        A compiled ``tf.keras.Model`` with inputs
        ``[image, label, input_length, label_length]`` and the CTC loss as
        its single output. The two length inputs are part of the model
        signature but are not wired into the loss layer, which derives the
        lengths from the tensors themselves.
    """
    input_img = layers.Input(shape=(img_height, img_width, 1))  # Input image (Grayscale)
    cnn_output = build_cnn((img_height, img_width, 1))(input_img)
    rnn_output = build_rnn((None, 256))(cnn_output)

    # Extra inputs needed at training time
    labels = layers.Input(name='label', shape=[None], dtype='float32')
    input_length = layers.Input(name='input_length', shape=[1], dtype='int64')
    label_length = layers.Input(name='label_length', shape=[1], dtype='int64')

    # A Lambda layer hands its inputs to the function as ONE list argument,
    # so unpack it before calling the two-argument loss function.
    loss_out = layers.Lambda(
        lambda tensors: ctc_loss_lambda_func(tensors[0], tensors[1]),
        output_shape=(1,),
        name='ctc',
    )([labels, rnn_output])

    model = tf.keras.Model(inputs=[input_img, labels, input_length, label_length], outputs=loss_out)

    # The model output already IS the loss, so the compiled loss only has to
    # pass it through; compiling without any loss leaves fit() with nothing
    # to minimise.
    model.compile(optimizer='adam', loss=lambda y_true, y_pred: y_pred)

    return model

# Build the training model once and print its layer-by-layer summary
ocr_model = build_ocr_model(
    img_height=img_height,
    img_width=img_width,
    num_classes=num_classes,
)
ocr_model.summary()

# Generate random data (for illustration purposes)
def generate_dummy_data(num_samples):
    """Create random images and labels for smoke-testing the training loop.

    Args:
        num_samples: Number of samples to generate.

    Returns:
        A tuple ``(images, labels, input_lengths, label_lengths)``:
        images of shape (num_samples, img_height, img_width, 1) with values
        in [0, 1), integer labels of shape (num_samples, 10), and two
        (num_samples, 1) arrays holding constant lengths.
    """
    images = np.random.rand(num_samples, img_height, img_width, 1)  # Random image data
    # The CTC cost reserves the last class index (num_classes - 1) for the
    # blank symbol, so real labels must stay strictly below it.
    labels = np.random.randint(1, num_classes - 1, (num_samples, 10))  # Random text labels
    input_lengths = np.ones((num_samples, 1)) * (img_width // 4 - 2)  # Dummy input lengths
    label_lengths = np.ones((num_samples, 1)) * 10  # Dummy label lengths
    return images, labels, input_lengths, label_lengths

# Generate dummy data
num_samples = 100
images, labels, input_lengths, label_lengths = generate_dummy_data(num_samples)

# Train-test split. All four arrays are split together so that every model
# input keeps the same number of rows; splitting only images and labels would
# pair 80 training images with 100 length rows.
(X_train, X_test,
 y_train, y_test,
 input_lengths_train, input_lengths_test,
 label_lengths_train, label_lengths_test) = train_test_split(
    images, labels, input_lengths, label_lengths, test_size=0.2)

# Training the model with dummy data
ocr_model.fit(
    [X_train, y_train, input_lengths_train, label_lengths_train],
    y_train,
    epochs=10,
    batch_size=16,
)


In [None]:
# Replace this function to load actual images and labels
def load_custom_dataset():
    """Load the real training data (placeholder - must be implemented).

    Returns:
        Once implemented, a tuple
        ``(images, labels, input_lengths, label_lengths)``, which is how the
        caller unpacks the result.

    Raises:
        NotImplementedError: Always, until the body is replaced. Failing here
            is clearer than returning None and letting the caller crash while
            unpacking the result.
    """
    raise NotImplementedError(
        "load_custom_dataset() is a placeholder: implement it to return "
        "(images, labels, input_lengths, label_lengths)."
    )

# Swap the dummy arrays for the real dataset
images, labels, input_lengths, label_lengths = load_custom_dataset()

# Fit the model on the real data
ocr_model.fit(
    [images, labels, input_lengths, label_lengths],
    labels,
    epochs=100,
    batch_size=32,
)


In [None]:
# Load a new image
def preprocess_image(image_path):
    """Load an image and shape it like a one-element model input batch.

    Args:
        image_path: Path of the image file to read.

    Returns:
        An array of shape (1, img_height, img_width, 1) holding the resized
        grayscale image.

    Raises:
        ValueError: If OpenCV cannot read the file (``cv2.imread`` returns
            None for missing or undecodable files).
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Load the image in grayscale
    if img is None:
        raise ValueError(f"Failed to load image at {image_path}.")
    img = cv2.resize(img, (img_width, img_height))  # cv2.resize takes (width, height)
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    return img

# Inference: Extract text from new image
new_image_path = 'path_to_your_image.png'
new_image = preprocess_image(new_image_path)

# ocr_model expects four inputs and outputs the CTC loss, so it cannot be
# used for prediction directly. Build an image -> per-step class probability
# model from the same layers: the second input of the 'ctc' layer is the
# softmax output of the recurrent head.
softmax_output = ocr_model.get_layer('ctc').input[1]
prediction_model = tf.keras.Model(inputs=ocr_model.inputs[0], outputs=softmax_output)

# Predict with the trained layers
pred = prediction_model.predict(new_image)

# Decode the output (using CTC decoding). Every time step is treated as valid.
# NOTE(review): the result holds class indices; mapping them to characters
# needs the charset, which is not defined in this notebook.
decoded_text = tf.keras.backend.ctc_decode(pred, input_length=np.ones(pred.shape[0]) * pred.shape[1])[0][0]

print("Extracted text: ", decoded_text)


In [None]:
!pip install pytesseract

In [None]:
!apt-get install tesseract-ocr
!apt-get install libtesseract-dev


In [None]:
import cv2
import pytesseract
import numpy as np
import matplotlib.pyplot as plt

# Tell pytesseract where the Tesseract executable lives. Tesseract itself
# must already be installed (see the apt-get cell above).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Adjust the path to tesseract executable

# Function to preprocess the image and extract text
def extract_text(image_path):
    """Run Tesseract on an image after OpenCV clean-up and return the text.

    The image is converted to grayscale, blurred, binarised with adaptive
    thresholding, morphologically closed and contrast-scaled before OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with non-printable characters removed; line
        breaks and tabs are kept.

    Raises:
        ValueError: If OpenCV cannot read the file.
    """
    # Load the image using OpenCV; imread returns None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Failed to load image at {image_path}.")

    # Convert to grayscale for better OCR performance
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply Gaussian Blur to reduce image noise
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Use adaptive thresholding to improve text and symbol recognition
    processed_img = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                          cv2.THRESH_BINARY, 11, 2)

    # Use morphological operations to enhance text areas
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    processed_img = cv2.morphologyEx(processed_img, cv2.MORPH_CLOSE, kernel)

    # Enhance contrast and brightness
    processed_img = cv2.convertScaleAbs(processed_img, alpha=1.5, beta=0)

    # Use pytesseract to extract text with enhanced configuration for symbols
    custom_config = r"--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+-=~`[]{}|\\:;\'<>,./?₹$€¥¢£"

    # Extract text from the image
    extracted_text = pytesseract.image_to_string(processed_img, config=custom_config)

    # Drop control characters but keep line breaks and tabs: str.isprintable()
    # is False for '\n', so filtering on it alone glues every line of the
    # page together. The currency symbols are printable and need no special
    # case.
    extracted_text = ''.join(
        char for char in extracted_text
        if char.isprintable() or char in '\n\t'
    )

    return extracted_text

# Main function to run the OCR process
def main():
    """Run OCR on the sample invoice image and print the recognised text."""
    # Replace with the actual image path
    image_path = '/content/1724676987981-AWSInvoice.png'

    # Echo the path first so a failure can be tied to the file being read
    print("Processing image:", image_path)

    # Recognise and display the text
    print("Extracted Text:\n", extract_text(image_path))


# Example usage
if __name__ == "__main__":
    main()


Processing image: /content/1724676987981-AWSInvoice.png
Extracted Text:
 amazon. Nn TaxInvoice/BillofSupply/CashMemo~~) (OriginalforRecipient)IRN/QRCode:GyyentaswynreeeoneBEAOSIes a if eeta SeisEe AOSoe PatniReaaeesarneeae SearsSoldBy: BillingAddress;RKWorldinfocomPvt.Ltd. ASPIRESOFTSERVPRIVATELIMITEDAnganBanquethall,Groundfloor,ONYXroom,9PARISHRAMCOMPLEX,202,SECONDFLOOR,Nandanvan-4,,NearPrernatirthDerasar,satellite, 5BRASHMISOCIETY,AhmedabadAhmedabad-380015 AHMEDABAD,GJ,380007Ahmedabad,Gujarat,380015 ININ GSTRegistratlonNo:24AAQCA1169D1ZUState/UTCode:24PANNo:AAECROS64MGSTReglstrationNo:24AAECROS64M1Z9 ShippingAddress;ASPIRESOFTSERVPRIVATELIMITEDFSSAILicenseNo, PuravGandhi11222999000045 DharmkrupaBunglow,Bhatta,PaldiAHMEDABAD,GUJARAT,380007INState/UTCode:24GSTRegistratlonNo:24AAQCA1169D1ZUPlaceofsupply:GJPlaceofdellvery:GUJARATOrderNumber:405-5277972-7380334 InvolceNumber:FAMA-42378OrderDate:20.06.2024 InvolceDetalls;GJ-FAMA-1293787125-2425InvolceDate:20.06.2024st. a Unit Net[Taxfrax]7

# **EasyOCR**

In [None]:
!pip install easyocr
!pip install torch torchvision torchaudio


In [None]:
import easyocr
import cv2
import re


# Create the EasyOCR reader once: English only, GPU requested
reader = easyocr.Reader(lang_list=['en'], gpu=True)



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


In [None]:
# Define the path to the image
image_path = '/content/1724676987981-AWSInvoice.png'

# Read the image using OpenCV. imread returns None (it does not raise) when
# the file is missing or cannot be decoded, so fail early with a clear message.
image = cv2.imread(image_path)
if image is None:
    raise ValueError(f"Failed to load image at {image_path}.")

# Perform OCR using EasyOCR
results = reader.readtext(image)

# Function to clean unwanted symbols from amounts
def clean_text(text):
    """Return ``text`` without symbols outside the allowed set.

    Kept: digits, word characters (letters, digits, underscore), whitespace,
    commas, dots and the currency signs ₹ $ € ¥ ¢ £. Everything else is
    deleted.
    """
    unwanted_symbols = re.compile(r'[^\d\w\s,.₹$€¥¢£]')
    return unwanted_symbols.sub('', text)

# Clean every recognised fragment; each one becomes its own output line
cleaned_lines = []
for (bbox, text, prob) in results:
    cleaned_lines.append(clean_text(text))
extracted_text = "".join(line + "\n" for line in cleaned_lines)

# Show the cleaned text
print("Extracted Text:\n", extracted_text)


Extracted Text:
 amazonin
Tax InvoiceBill of SupplyCash Memo
Original for Recipient
IRNQR Code
Sold By
Billing Address
R K WorldInfocom Pvt. Ltd.
ASPIRE SOFTSERV PRIVATE LIMITED
Angan Banquet hall, Ground floor, ONYX room_
PARISHRAM COMPLEX, 202, SECOND FLOOR
Nandanvan 4,, Near Prernatirth Derasar, satellite
SB RASHMI SOCIETY, Ahmedabad
Ahmedabad
380015
AHMEDABAD, GJ, 380007
Ahmedabad, Gujarat, 380015
IN
IN
GST Registration No 24AAQCAII6IDIZU
StateUT Code 24
PAN No AAECROS6AM
GST Registration No 24AAECRO564M1Z9
Shipping Address
ASPIRE SOFTSERV PRIVATE LIMITED
FSSAI License No.
Purav Gandhi
11222999000045
Dharmkrupa Bunglow Bhatta, Paldi
AHMEDABAD GUJARAT, 380007
IN
StatelUT Code 24
GST Registration No 24AAQCAI16IDIZU
Place of supply GJ
Place of delivery GUJARAT
Order Number 40552779727380334
Invoice Number
FAMA42378
Order Date 20.06.2024
Invoice Details
GJFAMA12937871252425
Invoice Date
20.06.2024
SI.
Unit
Net
Tax
Tax
Tax
Total
Description
Qtyl
No
Price
Amount Rate
Type
Amount Amount
H

# **Tesseract OCR**

In [None]:
!pip install pytesseract
!sudo apt install tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,207 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [None]:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

# Ensure Tesseract is installed and configured correctly
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

def preprocess_image(image_path):
    """
    Prepare an image for OCR.

    Pipeline: load with OpenCV -> grayscale -> 5x5 Gaussian blur ->
    adaptive Gaussian threshold (block size 11, constant 2).

    Raises FileNotFoundError when the path is not an existing file and
    ValueError when OpenCV cannot decode it. Returns the binarised image.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"The file at {image_path} does not exist.")

    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Failed to load image at {image_path}.")

    # Grayscale, then smooth to suppress noise before thresholding
    smoothed = cv2.GaussianBlur(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), (5, 5), 0)

    # Binarise with a locally adaptive threshold
    return cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

def extract_text(image):
    """
    Run Tesseract on an already preprocessed image and return the text.

    Tesseract is invoked with ``--oem 3 --psm 6``.
    """
    return pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')

def main():
    """Preprocess the sample invoice, OCR it and print the text.

    Any exception raised along the way is reported on stdout instead of
    propagating.
    """
    # Replace with your image path
    image_path = '/content/1724676987981-AWSInvoice.png'

    try:
        # Preprocess, recognise and display in one pass
        print("Extracted Text:\n", extract_text(preprocess_image(image_path)))
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()


Extracted Text:
 a iv a 20 ni | nN Tax Invoice/Bill of Supply/Cash Memo
~eeee) (Original for Recipient)
IRN/QR Code: Opuraae oe seen
uaa
SUES ARATE
DAN ae
Be Ae Loe!
Sold By: Billing Address ;
R K Worldinfocom Pvt. Ltd. ASPIRE SOFTSERV PRIVATE LIMITED
* angan Banquet hall, Ground floor, ONYX room, © PARISHRAM COMPLEX, 202, SECOND FLOOR,
Nandanvan -4,, Near Prernatirth Derasar, satellite, 5B RASHMI SOCIETY, Ahmedabad
Ahmedabad - 380015 AHMEDABAD, GJ, 380007
Ahmedabad, Gujarat, 380015 IN
IN GST Registration No: 24AAQCA1169D1ZU
State/UT Code: 24
PAN No: AAECROS64M
GST Reglstration No: 24AAECRO564M1Z9 Shipping Address :
ASPIRE SOFTSERV PRIVATE LIMITED
FSSAI License No. Purav Gandhi
11222998000045 Dharmkrupa Bunglow, Bhatta, Paldi
AHMEDABAD, GUJARAT, 380007
IN
State/UT Code: 24
GST Registration No: 24AAQCA1169D1ZU
Place of supply: GJ
Place of dellvery: GUJARAT
Order Number: 405-5277972-7380334 Involce Number : FAMA-42378
Order Date: 20.06.2024 Involce Detalls : GJ-FAMA-1293787 125-2425
Invo

In [None]:
!pip install pytesseract opencv-python


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,179 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [None]:
import re
import pytesseract
import cv2

# Function to extract text from an image using pytesseract
def extract_text_from_image(image_path):
    """OCR an image file with Tesseract after Otsu binarisation.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text recognised by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If OpenCV cannot read the file (``cv2.imread`` returns
            None rather than raising).
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Failed to load image at {image_path}.")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarization for better OCR accuracy
    _, binary_img = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Extract text using pytesseract
    text = pytesseract.image_to_string(binary_img)
    return text

# Function to create regex patterns for each section
def create_regex_patterns():
    """Return the regex used to locate each resume field, keyed by field name.

    Keys, in order: Name, Phone, Email, LinkedIn, Skills, Education,
    Experience, Projects, Achievements, Location.
    """
    return dict(
        # Optional title followed by two or more capitalised words
        Name=r'\b(?:Mr\.|Ms\.|Dr\.|Prof\.)?\s*([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)\b',
        # Optional '+', a digit, then 8-15 digits/spaces/dashes/parentheses
        Phone=r'\+?\d[\d\s\-\(\)]{8,15}',
        Email=r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',
        LinkedIn=r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+',
        # Section headings followed by their free-text body
        Skills=r'Skills[:\s]+([\w\s,]+)',
        Education=r'Education[:\s]+([\w\s,]+(?:\d{4})?[\w\s,]*)',
        Experience=r'(Experience[:\s]+)([\w\s,]+\d{4}\s*-\s*\d{4}[\w\s,.]*)',
        Projects=r'Projects[:\s]+([\w\s,.]+)',
        Achievements=r'Achievements[:\s]+([\w\s,.]+)',
        Location=r'(Location|Address)[:\s]+([A-Za-z0-9,\s]+)',
    )

# Function to extract sections based on regex patterns
def extract_details_using_regex(text, patterns):
    """Search ``text`` once per pattern and collect the first hits.

    Args:
        text: The text to search.
        patterns: Mapping of label -> regex; matching is case-insensitive.

    Returns:
        A dict with one entry per label: the full matched string, or
        'Not Found' when the pattern does not match anywhere.
    """
    hits = (
        (label, re.search(pattern, text, re.IGNORECASE))
        for label, pattern in patterns.items()
    )
    return {
        label: hit.group(0) if hit else 'Not Found'
        for label, hit in hits
    }

# Main function to extract details from the resume image
def extract_resume_details_from_image(image_path):
    """OCR a resume image and pull out the fields described by the patterns.

    Returns the label -> matched-text dict produced by
    ``extract_details_using_regex``.
    """
    field_patterns = create_regex_patterns()
    ocr_text = extract_text_from_image(image_path)
    return extract_details_using_regex(ocr_text, field_patterns)

# Example usage
if __name__ == "__main__":
    image_path = '/content/i1.jpg'

    # Run the full pipeline on the resume image
    resume_details = extract_resume_details_from_image(image_path)

    # One "label: value" line per field; empty values are shown as missing
    for label, content in resume_details.items():
        print(f"{label}: {content or 'Not Found'}")


Name: MEGHANSHU KUMRAWAT
Patel colony
Phone: 6265149219 
Email: 6@gmail.com
LinkedIn: Not Found
Skills: skills to a dynamic and
innovative team
Education: EDUCATION
Bachelor of Science 
Experience: Not Found
Projects: Not Found
Achievements: Not Found
Location: Not Found


In [None]:
!pip install tensorflow



In [None]:
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer  # used below; was never imported
from tensorflow.keras.utils import to_categorical

# Load annotated data
with open('/content/annotated_data.json', encoding='utf-8') as f:
    data = json.load(f)

# Collect the 'text' and 'label' entries of every item.
# NOTE(review): the recorded run of this cell stopped with KeyError: 'label',
# so the JSON items apparently use a different key - confirm the schema of
# annotated_data.json before relying on this.
texts = [item['text'] for item in data]
labels = [item['label'] for item in data]

# Flatten the lists
texts_flat = [word for sublist in texts for word in sublist]
labels_flat = [label for sublist in labels for label in sublist]

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_flat)
X_seq = tokenizer.texts_to_sequences(texts_flat)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels_flat)
# NOTE(review): this reshape only works when every item carries the same
# number of labels - verify against the data.
y_encoded = y_encoded.reshape(len(labels), -1)

# Pad sequences
X_padded = pad_sequences(X_seq, padding='post')
y_padded = pad_sequences(y_encoded, padding='post')

# Convert to one-hot encoding
num_labels = len(label_encoder.classes_)
y_padded = to_categorical(y_padded, num_classes=num_labels)

KeyError: 'label'

In [None]:
import cv2
import pytesseract
import re

# Point pytesseract at the Tesseract executable; adjust the path if Tesseract
# is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

def preprocess_image(image_path):
    """Load an image and return a binarised, denoised version for OCR.

    Steps: grayscale conversion, fixed threshold at 127, then non-local-means
    denoising.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The processed single-channel image.

    Raises:
        ValueError: If OpenCV cannot read the file (``cv2.imread`` returns
            None rather than raising).
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Failed to load image at {image_path}.")
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh_img = cv2.threshold(gray_img, 127, 255, cv2.THRESH_BINARY)
    denoised_img = cv2.fastNlMeansDenoising(thresh_img, None, 30, 7, 21)
    return denoised_img

def extract_text(image):
    """Return the text Tesseract reads from ``image`` (``--oem 3 --psm 6``)."""
    return pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')

def extract_section(text, section_name):
    """Return the body of a named section of ``text``.

    The body starts after the section name (plus any following whitespace,
    colons or dashes) and runs until the next line that begins with a
    capitalised word, or to the end of the text.

    Args:
        text: Full text to search.
        section_name: Literal heading to look for (case-sensitive).

    Returns:
        The stripped section body, or ``'<section_name> not found'`` when the
        heading does not occur.
    """
    # Escape the heading so characters such as '+' or '(' are matched
    # literally instead of being interpreted as regex syntax.
    heading = re.escape(section_name)
    pattern = rf'{heading}\s*[:\-]*\s*(.*?)(?=\n[A-Z][a-z]|$)'
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else f'{section_name} not found'

def main(image_path):
    """OCR a resume image, print the raw text, then print each known section."""
    extracted_text = extract_text(preprocess_image(image_path))

    # Dump the whole OCR result first to make section misses easy to debug
    print("Extracted Text:\n", extracted_text)

    for section in ('Skills', 'Education', 'Experience', 'Projects', 'Achievements'):
        print(f'{section}:\n{extract_section(extracted_text, section)}\n')


if __name__ == '__main__':
    main('/content/Data Scientist Resume Example.jpeg')



Extracted Text:
 88 Hendford Hill. London 822 0GX, United Kingdom |078 3515 0056 | emilysaavedraggmeil.com
2 Profile @ Education
Highly accurate and experienced Data Scientist IB Diploma Programme 09/2010 - 05/2012
adept at collecting, analyzing, and interpreting The international School Estepona, €4 Paraiso
large datasets, develo ping new forecasting Spain
madels, and performing data management tasks.
Possessing an extensive analytical skills, strong Graduated with Distinction (Grade | - A/excetlent
attention fa detail, and a significant ability to equivatent in all 6 subjects)
workin team environments, Emnly is presently
looking for a Data Scientist position with a & Skills
forward-moving company
Languages
m@ Work experience Spanish Native
English Full
SpyBiotech, inc. 09/2017 - 02/2018 Franch Limited
Data Scientist, London, United Kingdom |
© Assisted in scientific research on BNA ctoning Computer/Data A Skills
and analyzed the results. Microsoft Office EE
+ Collected, studied, and 