In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from imutils.perspective import four_point_transform
from skimage import exposure, img_as_ubyte
from pdf2image import convert_from_path
from ModelUnetPlus import PrepareImage
from ModelUnetPlus import CreateModel
from itertools import combinations
from os.path import join as pjoin
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import numpy as np
import collections
import imutils
import time
import cv2
import sys


In [2]:
!pip install streamlit



In [3]:
!pip install -U openmim
!mim install mmcv-lite

Looking in links: https://download.openmmlab.com/mmcv/dist/cpu/torch2.0.0/index.html


In [2]:
def decode_map(label_mask):
    """Turn a label mask into an RGB image using the card colormap.

    Labels 0 and 1 are replaced, channel by channel, with the matching row of
    ``get_card_colormap()``; any other value in ``label_mask`` is left as it
    is in every channel.

    Args:
        label_mask: array indexed as (rows, columns) holding class labels.

    Returns:
        ``uint8`` array of shape (rows, columns, 3).
    """
    palette = get_card_colormap()
    n_rows, n_cols = label_mask.shape[0], label_mask.shape[1]
    rgb = np.zeros((n_rows, n_cols, 3))

    for channel in range(3):
        # Each channel starts as a copy of the labels and gets recoloured.
        plane = label_mask.copy()
        for class_id in (0, 1):
            plane[label_mask == class_id] = palette[class_id, channel]
        rgb[:, :, channel] = plane

    return rgb.astype(np.uint8)

In [3]:
def get_card_colormap():
    """Return the two-entry colormap: row 0 is black, row 1 is white.

    Returns:
        Integer array of shape (2, 3), one RGB triple per label.
    """
    black = [0, 0, 0]
    white = [255, 255, 255]
    return np.asarray([black, white])

In [4]:
def get_cnt(edged, img, ratio):
    """Estimate four corner points of the dominant outline in an edge map.

    The edge map is dilated, a 10-pixel frame around it is cleared, and the
    longest external contours are rasterised into a working image.  Hough
    lines are detected on that image, up to four mutually distinct lines are
    kept, and the pairwise intersections that fall inside ``img`` become the
    corner candidates.

    Args:
        edged: edge image indexed as (rows, columns).
        img: image whose height and width scale the contour-area threshold
            and bound the accepted corner coordinates.  It is modified in
            place: corner markers are always drawn on it, and the selected
            lines as well when ``debug`` is truthy.
        ratio: factor the corner coordinates are multiplied by before they
            are returned.

    Returns:
        Array of shape (4, 1, 2) holding the corner coordinates times
        ``ratio``.

    Raises:
        Exception: "Lines not found." when the Hough transform yields fewer
            than four lines.  Exceptions raised by ``checked_valid_transform``
            propagate unchanged.

    Note:
        Reads the module-level names ``debug``, ``debug_dir`` and ``name``.
        NOTE(review): none of them is defined in the visible part of the
        notebook; they must exist before this function is called.
    """
    kernel = np.ones((3, 3), np.uint8)
    edged = cv2.dilate(edged, kernel, iterations=1)
    # Keep only the interior: everything within 10 pixels of the border is zeroed.
    mask = np.zeros((edged.shape[0], edged.shape[1]), np.uint8)
    mask[10:edged.shape[0] - 10, 10:edged.shape[1] - 10] = 1
    edged = edged * mask

    cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    # grab_contours picks the contour list whatever tuple layout this OpenCV
    # version returns (2-tuple or 3-tuple).
    cnts = imutils.grab_contours(cnts)
    cnts = sorted(cnts, key=lambda c: cv2.arcLength(c, True), reverse=True)
    edgelines = np.zeros(edged.shape, np.uint8)
    cNum = 4
    TH = 1 / 20.0

    # Walk the (at most) four longest contours.
    for i in range(min(cNum, len(cnts))):
        if cv2.contourArea(cnts[i]) < TH * img.shape[0] * img.shape[1]:
            # Small contour (under 1/20 of the image area): paint it filled.
            cv2.drawContours(edgelines, [cnts[i]], 0, (255, 255, 255), -1)
        else:
            # First large contour: fill it with 1 and multiply by the edge
            # map, then stop looking at further contours.
            cv2.drawContours(edgelines, [cnts[i]], 0, (1, 1, 1), -1)
            edgelines = edgelines * edged
            break

    if debug:
        cv2.imwrite(pjoin(debug_dir, name + "_edgelines.png"), edgelines)

    lines = cv2.HoughLines(edgelines, 1, np.pi / 180, 200)

    if lines is None or len(lines) < 4:
        if debug:
            print("Detection Error: Could not find enough lines (must be at least 4) from the detected edge.")
        raise Exception("Lines not found.")

    if debug:
        # Dump every Hough line onto a copy of the image for inspection.
        lines_draw = np.zeros((len(lines), 4), dtype=int)
        img_draw = img.copy()
        for i in range(0, len(lines)):
            rho, theta = lines[i][0][0], lines[i][0][1]
            a = np.cos(theta)
            b = np.sin(theta)
            x0 = a * rho
            y0 = b * rho
            lines_draw[i][0] = int(x0 + 1000 * (-b))
            lines_draw[i][1] = int(y0 + 1000 * (a))
            lines_draw[i][2] = int(x0 - 1000 * (-b))
            lines_draw[i][3] = int(y0 - 1000 * (a))
            cv2.line(img_draw, (lines_draw[i][0], lines_draw[i][1]), (lines_draw[i][2], lines_draw[i][3]), (0, 255, 0),
                     1)
        cv2.imwrite(pjoin(debug_dir, name + '_hough1.png'), img_draw)

    # Up to four accepted (rho, theta) pairs; unused rows stay at zero.
    strong_lines = np.zeros([4, 1, 2])
    n2 = 0

    for n1 in range(0, len(lines)):
        if n2 == 4:
            break
        for rho, theta in lines[n1]:
            if n1 == 0:
                # The first Hough line is always accepted.
                strong_lines[n2] = lines[n1]
                n2 = n2 + 1
            else:
                # Reject a candidate whose |rho| is within 80 of an accepted
                # line's |rho| while pi - theta is within pi/36 of that
                # line's theta (presumably the same line seen with the
                # opposite orientation).
                c1 = np.isclose(abs(rho), abs(strong_lines[0:n2, 0, 0]), atol=80)
                c2 = np.isclose(np.pi - theta, strong_lines[0:n2, 0, 1], atol=np.pi / 36)
                c = np.all([c1, c2], axis=0)
                if any(c):
                    continue
                # Accept only if no accepted line is close in both rho
                # (within 40) and theta (within pi/36), and theta is non-zero.
                closeness_rho = np.isclose(rho, strong_lines[0:n2, 0, 0], atol=40)
                closeness_theta = np.isclose(theta, strong_lines[0:n2, 0, 1], atol=np.pi / 36)
                closeness = np.all([closeness_rho, closeness_theta], axis=0)
                if not any(closeness) and n2 < 4 and theta != 0:
                    strong_lines[n2] = lines[n1]
                    n2 = n2 + 1

    # Convert each accepted (rho, theta) into two far-apart end points.
    lines1 = np.zeros((len(strong_lines), 4), dtype=int)
    for i in range(0, len(strong_lines)):
        rho, theta = strong_lines[i][0][0], strong_lines[i][0][1]
        a = np.cos(theta)
        b = np.sin(theta)
        x0 = a * rho
        y0 = b * rho
        lines1[i][0] = int(x0 + 1000 * (-b))
        lines1[i][1] = int(y0 + 1000 * (a))
        lines1[i][2] = int(x0 - 1000 * (-b))
        lines1[i][3] = int(y0 - 1000 * (a))

        if debug:
            cv2.line(img, (lines1[i][0], lines1[i][1]), (lines1[i][2], lines1[i][3]), (0, 255, 0), 3)

    # Intersect every pair of lines; keep at most four points inside the image.
    approx = np.zeros((len(strong_lines), 1, 2), dtype=int)
    index = 0
    combs = list((combinations(lines1, 2)))
    for twoLines in combs:
        x1, y1, x2, y2 = twoLines[0]
        x3, y3, x4, y4 = twoLines[1]
        [x, y] = cross_point([x1, y1, x2, y2], [x3, y3, x4, y4])
        if 0 < x < img.shape[1] and 0 < y < img.shape[0] and index < 4:
            # The marker is drawn on img even when debug is off.
            cv2.circle(img, (int(x), int(y)), 5, (0, 0, 255), 3)
            approx[index] = (int(x), int(y))
            index = index + 1

    if debug:
        cv2.imwrite(pjoin(debug_dir, name + '_hough2.png'), img)

    # checked_valid_transform raises on an invalid quadrilateral.
    if checked_valid_transform(approx):
        return approx * ratio

In [5]:
def cross_point(line1, line2):
    """Intersect two lines, each given by two points ``[x1, y1, x2, y2]``.

    Args:
        line1: first line as ``[x1, y1, x2, y2]``.
        line2: second line as ``[x3, y3, x4, y4]``.

    Returns:
        ``[x, y]`` of the intersection.  ``[0, 0]`` is returned when both
        lines are vertical or when their slopes are equal.
    """
    x1, y1, x2, y2 = line1[0], line1[1], line1[2], line1[3]
    x3, y3, x4, y4 = line2[0], line2[1], line2[2], line2[3]

    first_is_vertical = (x2 - x1) == 0
    if not first_is_vertical:
        slope_a = (y2 - y1) * 1.0 / (x2 - x1)
        offset_a = y1 * 1.0 - x1 * slope_a * 1.0

    second_is_vertical = (x4 - x3) == 0
    if not second_is_vertical:
        slope_b = (y4 - y3) * 1.0 / (x4 - x3)
        offset_b = y3 * 1.0 - x3 * slope_b * 1.0

    if first_is_vertical and second_is_vertical:
        # Two vertical lines: no unique crossing.
        return [0, 0]
    if first_is_vertical:
        return [x1, slope_b * x1 + offset_b]
    if second_is_vertical:
        return [x3, slope_a * x3 + offset_a]
    if slope_b == slope_a:
        # Parallel lines: no unique crossing.
        return [0, 0]

    cross_x = (offset_b - offset_a) * 1.0 / (slope_a - slope_b)
    cross_y = slope_a * cross_x * 1.0 + offset_a * 1.0
    return [cross_x, cross_y]

In [6]:
def checked_valid_transform(approx):
    """Check that the corner candidates form a usable quadrilateral.

    The convex hull of ``approx`` must have exactly four points, and the
    angle at each hull point must lie strictly between 45 and 135 degrees.

    Args:
        approx: corner points in the layout accepted by ``cv2.convexHull``.

    Returns:
        ``True`` when every check passes.

    Raises:
        Exception: "Corner points less than 4." when the hull does not have
            four points; "Corner points invalid." when an angle is out of
            range.

    Note:
        Reads the module-level flag ``debug`` to decide whether to print a
        diagnostic before raising.
    """
    hull = cv2.convexHull(approx)
    TH_ANGLE = 45

    if len(hull) != 4:
        if debug:
            print("Detection Error: Could not find four corners from the detected edge.")
        raise Exception("Corner points less than 4.") from None

    lower_bound = 90 - TH_ANGLE
    upper_bound = 90 + TH_ANGLE
    for idx in range(4):
        previous_pt = hull[(idx - 1) % 4]
        current_pt = hull[idx]
        next_pt = hull[(idx + 1) % 4]
        corner_angle = get_angle(previous_pt, current_pt, next_pt)
        if not (lower_bound < corner_angle < upper_bound):
            if debug:
                print("Detection Error: The detected corners could not form a valid quadrilateral for transformation.")
            raise Exception("Corner points invalid.") from None

    return True

In [7]:
def get_angle(sta_point, mid_point, end_point):
    """Return the angle, in degrees, at ``mid_point`` of the triangle formed
    with ``sta_point`` and ``end_point`` (law of cosines).

    Args:
        sta_point: point stored as ``[[x, y]]``.
        mid_point: vertex of the angle, stored as ``[[x, y]]``.
        end_point: point stored as ``[[x, y]]``.

    Returns:
        The angle at ``mid_point`` in degrees.
    """
    start_x, start_y = sta_point[0][0], sta_point[0][1]
    mid_x, mid_y = mid_point[0][0], mid_point[0][1]
    end_x, end_y = end_point[0][0], end_point[0][1]

    def squared_length(dx, dy):
        return dx * dx + dy * dy

    # Squared side lengths: mid->start, mid->end, and start->end (opposite side).
    mid_to_start = squared_length(start_x - mid_x, start_y - mid_y)
    mid_to_end = squared_length(end_x - mid_x, end_y - mid_y)
    opposite = squared_length(start_x - end_x, start_y - end_y)

    cosine = (mid_to_start + mid_to_end - opposite) / (2 * np.sqrt(mid_to_start) * np.sqrt(mid_to_end))
    return np.arccos(cosine) / np.pi * 180

In [8]:
def finetune(img, ratio):
    """Crop, rescale, round the corners of, and orient a rectified image.

    Steps, in order:
      1. Trim a margin derived from ``ratio`` (15 extra pixels at the top,
         twice the margin on the left and right).
      2. Resize, keeping the width, so the sides are in an 856:540
         proportion (presumably the ID-card aspect ratio -- TODO confirm).
      3. Add an alpha channel with rounded corners via ``set_corner``.
      4. If the result is taller than wide, transpose and flip it vertically.

    Args:
        img: image with three axes (rows, columns, channels).
        ratio: scale factor; the crop margin is ``int(2 * ratio)``.

    Returns:
        The processed image as returned by ``set_corner`` (possibly rotated).
    """
    margin = int(2 * ratio)
    side_margin = int(margin * 2)
    cropped = img[margin + 15:img.shape[0] - margin,
                  side_margin:img.shape[1] - side_margin, :]

    width = cropped.shape[1]
    if cropped.shape[0] < width:
        # Wider than tall.
        width_units, height_units = 856, 540
    else:
        width_units, height_units = 540, 856

    resized = cv2.resize(cropped, (width, int(width / width_units * height_units)))
    corner_radius = int(resized.shape[1] / width_units * 31.8)

    result = set_corner(resized, corner_radius)
    if result.shape[0] > result.shape[1]:
        result = cv2.flip(cv2.transpose(result), 0)
    return result

In [9]:
def set_corner(img, r):
    """Append an alpha channel that makes the four corners rounded.

    The alpha channel is 255 everywhere except in the ``r`` x ``r`` square at
    each corner, where pixels lying outside a circle of radius ``r`` are set
    to 0.

    Args:
        img: image that ``cv2.split`` separates into exactly three channels.
        r: corner radius in pixels.

    Returns:
        Four-channel image: the three input channels followed by alpha.
    """
    first, second, third = cv2.split(img)
    alpha = np.ones(first.shape, dtype=first.dtype) * 255
    n_rows = img.shape[0]
    n_cols = img.shape[1]
    radius_sq = r * r

    # (index, distance to the circle centre along that axis) for each band.
    top_rows = [(i, r - i) for i in range(0, r)]
    bottom_rows = [(i, r - n_rows + i + 1) for i in range(n_rows - r, n_rows)]
    left_cols = [(j, r - j) for j in range(0, r)]
    right_cols = [(j, r - n_cols + j + 1) for j in range(n_cols - r, n_cols)]

    corners = (
        (top_rows, left_cols),
        (top_rows, right_cols),
        (bottom_rows, left_cols),
        (bottom_rows, right_cols),
    )
    for row_band, col_band in corners:
        for i, dy in row_band:
            for j, dx in col_band:
                if dy * dy + dx * dx > radius_sq:
                    alpha[i][j] = 0

    return cv2.merge((first, second, third, alpha))


In [10]:
def detect_edge(img):
    """Compute a Canny edge map of a BGR image.

    The image is converted to grayscale and min-max normalised to 0..255.
    When its mean exceeds 150 it is additionally gamma-adjusted (gamma 6) and
    adaptively equalised.  A 15x15 morphological closing, a median blur and a
    bilateral filter are applied before Canny (thresholds 75 and 200).

    Args:
        img: image accepted by ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.

    Returns:
        The edge map produced by ``cv2.Canny``.

    Note:
        Reads the module-level names ``debug``, ``debug_dir`` and ``name``;
        when ``debug`` is truthy the edge map is also written to disk.
    """
    grey = cv2.normalize(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), None, 0, 255, cv2.NORM_MINMAX)

    TH_LIGHT = 150
    brightness = cv2.mean(grey)[0]
    if brightness > TH_LIGHT:
        # Bright input: apply gamma, equalise locally, convert back to uint8.
        adjusted = exposure.adjust_gamma(grey, gamma=6)
        equalised = exposure.equalize_adapthist(adjusted, kernel_size=None, clip_limit=0.02)
        grey = img_as_ubyte(equalised)

    closed = cv2.morphologyEx(grey, cv2.MORPH_CLOSE, np.ones((15, 15), np.uint8))
    smoothed = cv2.medianBlur(closed, 5)
    smoothed = cv2.bilateralFilter(smoothed, d=0, sigmaColor=15, sigmaSpace=10)

    edge_map = cv2.Canny(smoothed, 75, 200)

    if debug:
        cv2.imwrite(pjoin(debug_dir, name + "_Cannyedge.png"), edge_map)

    return edge_map

In [11]:
def AprroxmateContour(thresh):
    """Approximate the largest external contour of ``thresh`` by a 4-point polygon.

    Ten tolerances, evenly spaced from 0.1% to 5% of the contour perimeter,
    are tried in increasing order; the first approximation with exactly four
    points is returned.

    Args:
        thresh: image accepted by ``cv2.findContours``.

    Returns:
        The four-point polygon from ``cv2.approxPolyDP``.

    Raises:
        Exception: "Lines not found." when no tolerance gives four points.
    """
    found = imutils.grab_contours(
        cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    )
    largest = max(found, key=cv2.contourArea)
    perimeter = cv2.arcLength(largest, True)

    for tolerance in np.linspace(0.001, 0.05, 10):
        polygon = cv2.approxPolyDP(largest, tolerance * perimeter, True)
        if len(polygon) == 4:
            return polygon

    raise Exception("Lines not found.")

In [12]:
# Build the network with CreateModel (imported from ModelUnetPlus) and load its
# weights from "best_model.hdf5", a path relative to the working directory.
# The resulting global `Model` is used by convert_pdf_to_jpeg.
Model =CreateModel()
Model.load_weights("best_model.hdf5")

In [20]:
import os
from pdf2image import convert_from_path
import multiprocessing as mp

# Folder scanned for input PDFs; convert_pdf_to_jpeg joins each filename onto it.
# NOTE(review): absolute, machine-specific path -- update it before running elsewhere.
folder_path = r"C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\23-07-09"

def convert_pdf_to_jpeg(filename):
    """Render one PDF page by page and sort the pages into two folders.

    Each page is passed through ``PrepareImage`` and ``Model.predict``; the
    prediction is thresholded at 0.9 and the number of pixels above the
    threshold in the first output is counted (and printed).  Pages with more
    than 350 such pixels are saved as JPEG in the ``IDs_2`` folder, all other
    pages in the ``Contracts_2`` folder.  Output files are named
    ``<pdf name without extension>_<page number>.jpg``.

    Any exception is caught, reported with ``print`` and processing of this
    PDF stops, so that a batch run can continue with the next file.

    Args:
        filename: name of a PDF file inside the global ``folder_path``.

    Returns:
        None.

    Note:
        Relies on the globals ``folder_path`` and ``Model``.
        NOTE(review): this notebook defines ``convert_pdf_to_jpeg`` in two
        cells; whichever cell ran last is the definition in effect.
    """
    # NOTE(review): absolute, machine-specific locations.
    poppler_bin = r'C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\poppler-0.68.0_x86\poppler-0.68.0\bin'
    ids_dir = r'C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\IDs_2'
    contracts_dir = r'C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\Contracts_2'
    mask_threshold = 0.9   # prediction values above this count as "on"
    min_id_pixels = 350    # more "on" pixels than this => page goes to IDs_2

    try:
        # Convert the PDF file to a list of image pages using pdf2image
        pages = convert_from_path(os.path.join(folder_path, filename), poppler_path=poppler_bin)
        # splitext copes with any extension length, unlike slicing off 4 chars.
        stem = os.path.splitext(filename)[0]

        for i, image in enumerate(pages):
            image_input = PrepareImage(image)
            predicted = np.where(Model.predict(image_input) > mask_threshold, 255, 0)
            # Count the "on" pixels of the first (only) prediction directly,
            # without assuming a fixed output size.
            id_pixels = int(np.count_nonzero(predicted[0] == 255))
            print(id_pixels)

            target_dir = ids_dir if id_pixels > min_id_pixels else contracts_dir
            image.save(os.path.join(target_dir, f'{stem}_{i+1}.jpg'), 'JPEG')

    except Exception as e:
        print(f"Error processing PDF file: {filename}")
        print(f"Error message: {str(e)}")
        return

# Collect the PDFs in folder_path. The extension test is case-insensitive so
# that files named "*.PDF" are not silently skipped.
pdf_files = [filename for filename in os.listdir(folder_path) if filename.lower().endswith(".pdf")]

# Process each PDF in turn; convert_pdf_to_jpeg reports its own errors.
for filename in pdf_files:
    convert_pdf_to_jpeg(filename)


3121
0


In [None]:
# Number of entries in the local "imagesPdf" folder. Raw string: "\i" is not a
# valid escape sequence and triggers a warning in a normal string literal.
print(len(os.listdir(r".\imagesPdf")))

1750


In [None]:
# Number of entries in the local "Contract" folder. Raw string: "\C" is not a
# valid escape sequence and triggers a warning in a normal string literal.
print(len(os.listdir(r".\Contract")))

729


In [19]:
import os
from pdf2image import convert_from_path
import multiprocessing as mp

# Folder scanned for input PDFs; convert_pdf_to_jpeg joins each filename onto it.
# NOTE(review): absolute, machine-specific path -- update it before running elsewhere.
folder_path = r"C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\23-07-09"

def convert_pdf_to_jpeg(filename):
    """Render every page of one PDF to a JPEG file in the ``test`` folder.

    Output files are named ``<pdf name without extension>_<page number>.jpg``.
    Any exception is caught, reported with ``print`` and processing of this
    PDF stops, so that a batch run can continue with the next file.

    Args:
        filename: name of a PDF file inside the global ``folder_path``.

    Returns:
        None.

    Note:
        NOTE(review): this notebook defines ``convert_pdf_to_jpeg`` in two
        cells; whichever cell ran last is the definition in effect.
    """
    # NOTE(review): absolute, machine-specific locations.
    poppler_bin = r'C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\poppler-0.68.0_x86\poppler-0.68.0\bin'
    output_dir = r'C:\Users\v23ASayed2\Desktop\Vodafone\National_IDs_splitting\test'

    try:
        # Convert the PDF file to a list of image pages using pdf2image
        pages = convert_from_path(os.path.join(folder_path, filename), poppler_path=poppler_bin)
        # splitext copes with any extension length, unlike slicing off 4 chars.
        stem = os.path.splitext(filename)[0]

        # Save each page of the PDF as a separate image file
        for i, image in enumerate(pages):
            image.save(os.path.join(output_dir, f'{stem}_{i+1}.jpg'), 'JPEG')

    except Exception as e:
        print(f"Error processing PDF file: {filename}")
        print(f"Error message: {str(e)}")
        return

# Collect the PDFs in folder_path. The extension test is case-insensitive so
# that files named "*.PDF" are not silently skipped.
pdf_files = [filename for filename in os.listdir(folder_path) if filename.lower().endswith(".pdf")]

# Process each PDF in turn; convert_pdf_to_jpeg reports its own errors.
for filename in pdf_files:
    convert_pdf_to_jpeg(filename)


Error processing PDF file: 202211_C_2021011_1037_01090743579.pdf
Error message: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Error processing PDF file: 202211_C_2021011_1046_01016246328.pdf
Error message: Unable to get page count.

Error processing PDF file: 202211_C_2021011_104_01093089496.pdf
Error message: Unable to get page count.

Error processing PDF file: 202211_C_2021011_1099_01090692685.pdf
Error message: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Error processing PDF file: 202211_C_2021011_1169_01030176623.pdf
Error message: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Error processing PDF file: 202211_C_2021011_1205_01091576431.pdf
Er