# Extracting Data from a Texas Driver License Using EasyOCR and Regex

## Library Needed

In [2]:
!pip install easyocr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting easyocr
  Downloading easyocr-1.7.0-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (813 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m813.9/813.9 kB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.0/146.0 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyclipper, ninja, python-bidi, easyocr
Successfully installed e

## Full Readout

In [76]:
import easyocr

def extract_text_from_image(image_path):
    """Run English EasyOCR over an image and return the recognised strings.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        list: Element 1 (the text) of every detection, in the order the
        reader reported them.
    """
    ocr = easyocr.Reader(['en'])
    detections = ocr.readtext(image_path)

    # Keep only the text component of each detection
    return [entry[1] for entry in detections]

# Location of the sample license image (an absolute /content/ path, so it
# has to be adjusted when the file lives elsewhere)
image_path = '/content/Texas_ID.png'

# Run OCR and collect every recognised string
text = extract_text_from_image(image_path)

# Print one recognised string per line
for line in text:
    print(line)




USA
TTX
DRIVER LICENSE
Id DL
12345678
ciass
AM
Iss
07/30/2006
07/30/2012
DOB
07/30/1976
SAMPLE
JANICE
2120 OLD MAIN STREET
ANYTOWN TX 12345-0000
Restriclions A
9 End
P
damcetQmpe
Hgt 5-04
15 Sex F
Eyes BLU
DA
Dd 1234567890o00oo0oo00
Texas
Exp


## Extracting Only the ID Number

In [8]:
import easyocr
import re

def extract_id_number(image_path):
    """Find the first standalone 8-digit number that OCR reads from an image.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        str or None: The first run of exactly eight digits bounded by word
        boundaries, scanning detections in reader order; ``None`` when no
        detection contains one.
    """
    ocr = easyocr.Reader(['en'])
    detections = ocr.readtext(image_path)

    eight_digits = re.compile(r'\b\d{8}\b')
    for entry in detections:
        found = eight_digits.search(entry[1])
        if found:
            # First hit wins; later detections are not inspected
            return found.group()

    return None

# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Yields the first standalone 8-digit run found by OCR, or None
id_number = extract_id_number(image_path)

# Print the extracted ID number
print(f"ID number: {id_number}")




ID number: 12345678


## Extracting Dates

In [83]:
import easyocr
import re
from datetime import datetime

def extract_id_details(image_path):
    """Extract the date of birth, issue date and expiry date from an ID image.

    Every ``M/D/YYYY``-shaped string read by OCR is collected and the dates
    are ordered chronologically.  The earliest is taken as the date of
    birth, the middle one as the issue date and the latest as the expiry
    date.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        tuple: ``(dob, issue_date, expiry_date)`` as date strings with any
        whitespace around the slashes removed, or ``(None, None, None)``
        when no valid date was read.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    date_pattern = re.compile(r'\b\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\b')

    parsed_dates = []  # (datetime, normalised text) pairs
    for detection in result:
        for raw_date in date_pattern.findall(detection[1]):
            # The pattern tolerates spaces around '/', but strptime does
            # not, so normalise before parsing.
            compact = re.sub(r'\s+', '', raw_date)
            try:
                parsed = datetime.strptime(compact, '%m/%d/%Y')
            except ValueError:
                # Date-shaped but impossible (e.g. month 99): ignore it
                continue
            parsed_dates.append((parsed, compact))

    # Chronological order; list.sort is stable, so equal dates keep OCR order
    parsed_dates.sort(key=lambda pair: pair[0])
    dates = [date_text for _, date_text in parsed_dates]

    dob = issue_date = expiry_date = None
    if dates:
        dob = dates[0]
        issue_date = dates[len(dates) // 2]
        expiry_date = dates[-1]

    return dob, issue_date, expiry_date

# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Unpack the three dates: earliest, middle and latest of those OCR found
dob, issue_date, expiry_date = extract_id_details(image_path)

# Print the extracted details
print(f"DOB: {dob}")
print(f"Issue Date: {issue_date}")
print(f"Expiry Date: {expiry_date}")




DOB: 07/30/1976
Issue Date: 07/30/2006
Expiry Date: 07/30/2012


## Code to Derive Data Other Than Name and Address

In [60]:
import easyocr
import re
from datetime import datetime

def extract_id_details(image_path):
    """Extract the labelled fields and the three dates from a license image.

    Each OCR detection is scanned with regular expressions for sex, dates,
    the 8-digit ID number, restrictions, height and eye colour.  Text that
    an earlier detection already yielded is stripped from later detections
    before matching.  Dates are ordered chronologically: the earliest is
    taken as the date of birth, the middle one as the issue date and the
    latest as the expiry date.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        tuple: ``(name, dob, sex, issue_date, expiry_date, id_number,
        restrictions, height, eye_color)``.  ``name`` is always ``None``
        (this function does not extract it); any field that was not found
        is ``None``.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    name = None  # never populated here; kept so the returned tuple keeps its shape
    dob = None
    sex = None
    issue_date = None
    expiry_date = None
    id_number = None
    restrictions = None
    height = None
    eye_color = None

    parsed_dates = []    # (datetime, normalised text) pairs
    extracted_text = []  # substrings already claimed by an earlier match

    for detection in result:
        text = detection[1]

        # Strip whatever earlier detections yielded so it cannot match twice
        for extracted in extracted_text:
            text = text.replace(extracted, '')

        # Extract sex
        if sex is None:
            match_sex = re.search(r'(?i)Sex\s+(\w)', text)
            if match_sex:
                sex = match_sex.group(1)
                extracted_text.append(match_sex.group())

        # Extract dates
        for raw_date in re.findall(r'\b\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\b', text):
            extracted_text.append(raw_date)
            # The pattern tolerates spaces around '/', strptime does not
            compact = re.sub(r'\s+', '', raw_date)
            try:
                parsed = datetime.strptime(compact, '%m/%d/%Y')
            except ValueError:
                # Date-shaped but impossible (e.g. month 99): ignore it
                continue
            parsed_dates.append((parsed, compact))

        # Extract the 8-digit ID number
        if id_number is None:
            match_id = re.search(r'\b\d{8}\b', text)
            if match_id:
                id_number = match_id.group()
                extracted_text.append(match_id.group())

        # Extract restrictions; accept both the correct spelling and the
        # "Restriclions" misreading seen in the sample OCR output
        if restrictions is None:
            match_restrictions = re.search(r'(?i)Restric[tl]ions\s+(.*)', text)
            if match_restrictions:
                restrictions = match_restrictions.group(1)
                extracted_text.append(match_restrictions.group())

        # Extract height
        if height is None:
            match_height = re.search(r'(?i)Hgt\s+(\d+-\d+)', text)
            if match_height:
                height = match_height.group(1)
                extracted_text.append(match_height.group())

        # Extract eye color
        if eye_color is None:
            match_eye_color = re.search(r'(?i)Eyes\s+(\w+)', text)
            if match_eye_color:
                eye_color = match_eye_color.group(1)
                extracted_text.append(match_eye_color.group())

    # Chronological order; list.sort is stable, so equal dates keep OCR order
    parsed_dates.sort(key=lambda pair: pair[0])
    dates = [date_text for _, date_text in parsed_dates]

    # Categorize dates by position in the chronological order
    if dates:
        dob = dates[0]
        issue_date = dates[len(dates) // 2]
        expiry_date = dates[-1]

    return name, dob, sex, issue_date, expiry_date, id_number, restrictions, height, eye_color

# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Unpack all nine returned fields; `name` is returned but not printed below
name, dob, sex, issue_date, expiry_date, id_number, restrictions, height, eye_color = extract_id_details(image_path)

# Print the extracted details
print(f"DOB: {dob}")
print(f"Sex: {sex}")
print(f"Issue Date: {issue_date}")
print(f"Expiry Date: {expiry_date}")
print(f"ID number: {id_number}")
print(f"Restrictions: {restrictions}")
print(f"Height: {height}")
print(f"Eye Color: {eye_color}")




DOB: 07/30/1976
Sex: F
Issue Date: 07/30/2006
Expiry Date: 07/30/2012
ID number: 12345678
Restrictions: A
Height: 5-04
Eye Color: BLU


## Code to Derive "Name and Address"

In [52]:
import easyocr
import re

def extract_capitalized_text(image_path):
    """Collect every run of upper-case words that OCR reads from an image.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        list: Every match of the capitalised-text pattern across all
        detections, in reader order.  Matches are returned as found and may
        carry trailing whitespace.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    # Runs of capital letters and whitespace that start at the beginning of
    # the string or right after whitespace
    pattern = r'(?<!\S)[A-Z\s]+\b'

    extracted_text = []
    for detection in result:
        # Bind this detection's text locally; without this the loop body
        # depended on a `text` name existing outside the function.
        text = detection[1]
        extracted_text.extend(re.findall(pattern, text))

    return extracted_text

# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Collect every all-caps match found by OCR
capitalized_text = extract_capitalized_text(image_path)

# The pattern also admits whitespace, so trim each match before printing
for text in capitalized_text:
    print(text.strip())




USA
TTX
DRIVER LICENSE
DL
AM
DOB
SAMPLE
JANICE
OLD MAIN STREET
ANYTOWN TX
A
P
F
BLU
DA


In [71]:
import easyocr
import re

def extract_capitalized_text(image_path):
    """Return the all-caps phrases read from an image, minus known label text.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        list: Stripped matches of the capitalised-text pattern, in reader
        order, excluding empty matches and anything listed in
        ``excluded_keywords``.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    # Runs of capital letters and whitespace that start at the beginning of
    # the string or right after whitespace
    pattern = r'(?<!\S)[A-Z\s]+\b'

    # Label text and single-letter codes that are not part of name or address
    excluded_keywords = ['USA', 'TTX', 'DRIVER LICENSE', 'DL', 'AM', 'DOB', 'A', 'P', 'F', 'BLU', 'DA', 'M', 'RESTRICTIONS', 'HGT']

    extracted_text = []
    for detection in result:
        for match in re.findall(pattern, detection[1]):
            match = match.strip()
            # The pattern can match a whitespace-only run (e.g. the second
            # space in "12  34"), which strips to ''; that is not text.
            if match and match not in excluded_keywords:
                extracted_text.append(match)

    return extracted_text

# NOTE: this cell points at a different image (a screenshot) than the
# Texas_ID.png sample used by the other cells
image_path = '/content/Screenshot 2023-06-05 at 4.13.45 PM.png'

# Collect the all-caps matches that survive the keyword filter
capitalized_text = extract_capitalized_text(image_path)

# Print one surviving match per line
for text in capitalized_text:
    print(text)




TX
ROSE STREET ROAD
KING TOWN TX
DD




TX
ROSE STREET ROAD
KING TOWN TX
DD


In [59]:
import easyocr
import re

def extract_information(image_path):
    """Extract the two name lines and the address from a license image.

    The all-caps phrases that survive the keyword filter are interpreted by
    position: the first two are the name lines and the next two are the
    address lines.  The full address is rebuilt from the complete OCR
    detections that contain those address phrases, so it also carries the
    parts the pattern does not match (e.g. house number, ZIP code).

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        tuple: ``(first_name, second_name, first_line_address,
        second_line_address, full_address)``.  A positional field is
        ``None`` when OCR yielded too few phrases; ``full_address`` is
        ``""`` when no address line was found.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    # Runs of capital letters and whitespace that start at the beginning of
    # the string or right after whitespace
    pattern = r'(?<!\S)[A-Z\s]+\b'

    # Label text and single-letter codes that are not part of name or address
    excluded_keywords = ['USA', 'TTX', 'DRIVER LICENSE', 'DL', 'AM', 'DOB', 'A', 'P', 'F', 'BLU', 'DA', 'M', 'RESTRICTIONS', 'HGT']

    extracted_text = []
    for detection in result:
        for match in re.findall(pattern, detection[1]):
            match = match.strip()
            # Whitespace-only matches strip to '' and would shift the
            # positional fields below, so drop them.
            if match and match not in excluded_keywords:
                extracted_text.append(match)

    # Pad to four entries so a sparse OCR result gives None fields instead
    # of an IndexError.
    leading = extracted_text[:4]
    leading += [None] * (4 - len(leading))
    first_name, second_name, first_line_address, second_line_address = leading

    # Rebuild the address from the full detections holding each address line
    full_address = ""
    if first_line_address is not None:
        for detection in result:
            if first_line_address in detection[1]:
                full_address += detection[1] + " "
            elif second_line_address is not None and second_line_address in detection[1]:
                full_address += detection[1]
                break

    return first_name, second_name, first_line_address, second_line_address, full_address

# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Unpack the name lines, the two address lines and the rebuilt full address
first_name, second_name, first_line_address, second_line_address, full_address = extract_information(image_path)

# Print the name lines and full address (the separate address lines are not printed)
print(f"First Name: {first_name}")
print(f"Second Name: {second_name}")
print(f"Full Address: {full_address}")




First Name: SAMPLE
Second Name: JANICE
Full Address: 2120 OLD MAIN STREET ANYTOWN TX 12345-0000


## Full Operation

In [72]:
import easyocr
import re
from datetime import datetime

def extract_id_details(image_path):
    """Extract the labelled fields and the three dates from a license image.

    Each OCR detection is scanned with regular expressions for sex, dates,
    the 8-digit ID number, restrictions, height and eye colour.  Text that
    an earlier detection already yielded is stripped from later detections
    before matching.  Dates are ordered chronologically: the earliest is
    taken as the date of birth, the middle one as the issue date and the
    latest as the expiry date.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        tuple: ``(dob, sex, issue_date, expiry_date, id_number,
        restrictions, height, eye_color)``; any field that was not found is
        ``None``.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    dob = None
    sex = None
    issue_date = None
    expiry_date = None
    id_number = None
    restrictions = None
    height = None
    eye_color = None

    parsed_dates = []    # (datetime, normalised text) pairs
    extracted_text = []  # substrings already claimed by an earlier match

    for detection in result:
        text = detection[1]

        # Strip whatever earlier detections yielded so it cannot match twice
        for extracted in extracted_text:
            text = text.replace(extracted, '')

        # Extract sex
        if sex is None:
            match_sex = re.search(r'(?i)Sex\s+(\w)', text)
            if match_sex:
                sex = match_sex.group(1)
                extracted_text.append(match_sex.group())

        # Extract dates
        for raw_date in re.findall(r'\b\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\b', text):
            extracted_text.append(raw_date)
            # The pattern tolerates spaces around '/', strptime does not
            compact = re.sub(r'\s+', '', raw_date)
            try:
                parsed = datetime.strptime(compact, '%m/%d/%Y')
            except ValueError:
                # Date-shaped but impossible (e.g. month 99): ignore it
                continue
            parsed_dates.append((parsed, compact))

        # Extract the 8-digit ID number
        if id_number is None:
            match_id = re.search(r'\b\d{8}\b', text)
            if match_id:
                id_number = match_id.group()
                extracted_text.append(match_id.group())

        # Extract restrictions; accept both the correct spelling and the
        # "Restriclions" misreading seen in the sample OCR output
        if restrictions is None:
            match_restrictions = re.search(r'(?i)Restric[tl]ions\s+(.*)', text)
            if match_restrictions:
                restrictions = match_restrictions.group(1)
                extracted_text.append(match_restrictions.group())

        # Extract height
        if height is None:
            match_height = re.search(r'(?i)Hgt\s+(\d+-\d+)', text)
            if match_height:
                height = match_height.group(1)
                extracted_text.append(match_height.group())

        # Extract eye color
        if eye_color is None:
            match_eye_color = re.search(r'(?i)Eyes\s+(\w+)', text)
            if match_eye_color:
                eye_color = match_eye_color.group(1)
                extracted_text.append(match_eye_color.group())

    # Chronological order; list.sort is stable, so equal dates keep OCR order
    parsed_dates.sort(key=lambda pair: pair[0])
    dates = [date_text for _, date_text in parsed_dates]

    # Categorize dates by position in the chronological order
    if dates:
        dob = dates[0]
        issue_date = dates[len(dates) // 2]
        expiry_date = dates[-1]

    return  dob, sex, issue_date, expiry_date, id_number, restrictions, height, eye_color


def extract_information(image_path):
    """Extract the two name lines and the address from a license image.

    The all-caps phrases that survive the keyword filter are interpreted by
    position: the first two are the name lines and the next two are the
    address lines.  The full address is rebuilt from the complete OCR
    detections that contain those address phrases, so it also carries the
    parts the pattern does not match (e.g. house number, ZIP code).

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        tuple: ``(first_name, second_name, first_line_address,
        second_line_address, full_address)``.  A positional field is
        ``None`` when OCR yielded too few phrases; ``full_address`` is
        ``""`` when no address line was found.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    # Runs of capital letters and whitespace that start at the beginning of
    # the string or right after whitespace
    pattern = r'(?<!\S)[A-Z\s]+\b'

    # Label text and single-letter codes that are not part of name or address
    excluded_keywords = ['USA', 'TTX', 'DRIVER LICENSE', 'DL', 'AM', 'DOB', 'A', 'P', 'F', 'BLU', 'DA', 'M', 'RESTRICTIONS', 'HGT']

    extracted_text = []
    for detection in result:
        for match in re.findall(pattern, detection[1]):
            match = match.strip()
            # Whitespace-only matches strip to '' and would shift the
            # positional fields below, so drop them.
            if match and match not in excluded_keywords:
                extracted_text.append(match)

    # Pad to four entries so a sparse OCR result gives None fields instead
    # of an IndexError.
    leading = extracted_text[:4]
    leading += [None] * (4 - len(leading))
    first_name, second_name, first_line_address, second_line_address = leading

    # Rebuild the address from the full detections holding each address line
    full_address = ""
    if first_line_address is not None:
        for detection in result:
            if first_line_address in detection[1]:
                full_address += detection[1] + " "
            elif second_line_address is not None and second_line_address in detection[1]:
                full_address += detection[1]
                break

    return first_name, second_name, first_line_address, second_line_address, full_address


# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Labelled fields and dates (the image is OCR'd once here ...)
dob, sex, issue_date, expiry_date, id_number, restrictions, height, eye_color = extract_id_details(image_path)

# ... and once more here for the name and address
first_name, second_name, first_line_address, second_line_address, full_address = extract_information(image_path)

# Print everything that was extracted; the separate address lines are
# covered by the full address and are not printed on their own
print("\nInformation:")
print(f"First Name: {first_name}")
print(f"Second Name: {second_name}")
print(f"Full Address: {full_address}")
print(f"DOB: {dob}")
print(f"Sex: {sex}")
print(f"Issue Date: {issue_date}")
print(f"Expiry Date: {expiry_date}")
print(f"ID number: {id_number}")
print(f"Restrictions: {restrictions}")
print(f"Height: {height}")
print(f"Eye Color: {eye_color}")





Information:
First Name: SAMPLE
Second Name: JANICE
Full Address: 2120 OLD MAIN STREET ANYTOWN TX 12345-0000
DOB: 07/30/1976
Sex: F
Issue Date: 07/30/2006
Expiry Date: 07/30/2012
ID number: 12345678
Restrictions: A
Height: 5-04
Eye Color: BLU


## Returning the Results as a Dictionary

In [77]:
import easyocr
import re
from datetime import datetime

def extract_id_details(image_path):
    """Extract the labelled fields and the three dates into a dictionary.

    Each OCR detection is scanned with regular expressions for sex, dates,
    the 8-digit ID number, restrictions, height and eye colour.  Text that
    an earlier detection already yielded is stripped from later detections
    before matching.  Dates are ordered chronologically: the earliest is
    taken as the date of birth, the middle one as the issue date and the
    latest as the expiry date.

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        dict: May contain ``'sex'``, ``'id_number'``, ``'restrictions'``,
        ``'height'``, ``'eye_color'``, ``'dob'``, ``'issue_date'`` and
        ``'expiry_date'``.  A key is present only when its value was found.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    details = {}

    parsed_dates = []    # (datetime, normalised text) pairs
    extracted_text = []  # substrings already claimed by an earlier match

    for detection in result:
        text = detection[1]

        # Strip whatever earlier detections yielded so it cannot match twice
        for extracted in extracted_text:
            text = text.replace(extracted, '')

        # Extract sex
        if 'sex' not in details:
            match_sex = re.search(r'(?i)Sex\s+(\w)', text)
            if match_sex:
                details['sex'] = match_sex.group(1)
                extracted_text.append(match_sex.group())

        # Extract dates
        for raw_date in re.findall(r'\b\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\b', text):
            extracted_text.append(raw_date)
            # The pattern tolerates spaces around '/', strptime does not
            compact = re.sub(r'\s+', '', raw_date)
            try:
                parsed = datetime.strptime(compact, '%m/%d/%Y')
            except ValueError:
                # Date-shaped but impossible (e.g. month 99): ignore it
                continue
            parsed_dates.append((parsed, compact))

        # Extract the 8-digit ID number
        if 'id_number' not in details:
            match_id = re.search(r'\b\d{8}\b', text)
            if match_id:
                details['id_number'] = match_id.group()
                extracted_text.append(match_id.group())

        # Extract restrictions; accept both the correct spelling and the
        # "Restriclions" misreading seen in the sample OCR output
        if 'restrictions' not in details:
            match_restrictions = re.search(r'(?i)Restric[tl]ions\s+(.*)', text)
            if match_restrictions:
                details['restrictions'] = match_restrictions.group(1)
                extracted_text.append(match_restrictions.group())

        # Extract height
        if 'height' not in details:
            match_height = re.search(r'(?i)Hgt\s+(\d+-\d+)', text)
            if match_height:
                details['height'] = match_height.group(1)
                extracted_text.append(match_height.group())

        # Extract eye color
        if 'eye_color' not in details:
            match_eye_color = re.search(r'(?i)Eyes\s+(\w+)', text)
            if match_eye_color:
                details['eye_color'] = match_eye_color.group(1)
                extracted_text.append(match_eye_color.group())

    # Chronological order; list.sort is stable, so equal dates keep OCR order
    parsed_dates.sort(key=lambda pair: pair[0])
    dates = [date_text for _, date_text in parsed_dates]

    # Categorize dates by position in the chronological order
    if dates:
        details['dob'] = dates[0]
        details['issue_date'] = dates[len(dates) // 2]
        details['expiry_date'] = dates[-1]

    return details


def extract_information(image_path):
    """Extract the two name lines and the address into a dictionary.

    The all-caps phrases that survive the keyword filter are interpreted by
    position: the first two are the name lines and the next two are the
    address lines.  The full address is rebuilt from the complete OCR
    detections that contain those address phrases, so it also carries the
    parts the pattern does not match (e.g. house number, ZIP code).

    Args:
        image_path: Image location handed straight to ``Reader.readtext``.

    Returns:
        dict: Keys ``'name'``, ``'second_name'``, ``'first_line_address'``,
        ``'second_line_address'`` and ``'full_address'``.  A positional
        field is ``None`` when OCR yielded too few phrases;
        ``'full_address'`` is ``""`` when no address line was found.
    """
    # Initialize the EasyOCR reader and read the image
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_path)

    # Runs of capital letters and whitespace that start at the beginning of
    # the string or right after whitespace
    pattern = r'(?<!\S)[A-Z\s]+\b'

    # Label text and single-letter codes that are not part of name or address
    excluded_keywords = ['USA', 'TTX', 'DRIVER LICENSE', 'DL', 'AM', 'DOB', 'A', 'P', 'F', 'BLU', 'DA', 'M', 'RESTRICTIONS', 'HGT']

    extracted_text = []
    for detection in result:
        for match in re.findall(pattern, detection[1]):
            match = match.strip()
            # Whitespace-only matches strip to '' and would shift the
            # positional fields below, so drop them.
            if match and match not in excluded_keywords:
                extracted_text.append(match)

    # Pad to four entries so a sparse OCR result gives None fields instead
    # of an IndexError.
    leading = extracted_text[:4]
    leading += [None] * (4 - len(leading))

    details = {}
    details['name'] = leading[0]
    details['second_name'] = leading[1]
    details['first_line_address'] = leading[2]
    details['second_line_address'] = leading[3]

    # Rebuild the address from the full detections holding each address line
    full_address = ""
    if leading[2] is not None:
        for detection in result:
            if leading[2] in detection[1]:
                full_address += detection[1] + " "
            elif leading[3] is not None and leading[3] in detection[1]:
                full_address += detection[1]
                break

    details['full_address'] = full_address

    return details


# Location of the sample license image
image_path = '/content/Texas_ID.png'

# Labelled fields and dates, as a dictionary
id_details = extract_id_details(image_path)

# Name and address, as a dictionary
information = extract_information(image_path)

# Merge both dictionaries; on a shared key the value from `information` wins
combined_data = {**id_details, **information}

# Print one "key: value" line per extracted field
print("Extracted Details and Information:")
for key, value in combined_data.items():
    print(f"{key}: {value}")




Extracted Details and Information:
id_number: 12345678
restrictions: A
height: 5-04
sex: F
eye_color: BLU
dob: 07/30/1976
issue_date: 07/30/2006
expiry_date: 07/30/2012
name: SAMPLE
second_name: JANICE
first_line_address: OLD MAIN STREET
second_line_address: ANYTOWN TX
full_address: 2120 OLD MAIN STREET ANYTOWN TX 12345-0000


In [80]:
# Bare expression: the notebook renders the merged dictionary as the cell output
combined_data

{'id_number': '12345678',
 'restrictions': 'A',
 'height': '5-04',
 'sex': 'F',
 'eye_color': 'BLU',
 'dob': '07/30/1976',
 'issue_date': '07/30/2006',
 'expiry_date': '07/30/2012',
 'name': 'SAMPLE',
 'second_name': 'JANICE',
 'first_line_address': 'OLD MAIN STREET',
 'second_line_address': 'ANYTOWN TX',
 'full_address': '2120 OLD MAIN STREET ANYTOWN TX 12345-0000'}