In [8]:
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

from pdf2image import convert_from_path
import os
import cv2
import re

import pandas as pd

from fuzzysearch import find_near_matches
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [126]:
class Item:
    """OCR a scanned PDF and look up the numbers that follow known labels.

    Constructing an instance does all the heavy work: every page of the PDF is
    rendered to a JPEG file in the current working directory, each JPEG is
    OCR'd with Tesseract, the per-page texts are concatenated into ``txt``,
    and the keyword spreadsheet is used to detect which bank the document
    mentions.

    Attributes:
        list_jpg: Paths of the JPEG files written for the pages, in page order.
        catch_length: Number of characters of ``txt`` inspected after a matched
            label by ``catch_result``.
        txt: OCR text of all pages; each page is preceded by a ``page_N`` line.
        df: Keyword table read from ``keywords_path`` with ``pd.read_excel``.
        banks: Candidate bank names, i.e. every column of ``df`` but the first.
        bank: First entry of ``banks`` that occurs in ``txt``, or ``None``.
        keywords: Values of the detected bank's column, or ``[]`` when no bank
            was detected.
    """

    # Machine-specific settings. They are class attributes so they can be
    # overridden (on the class or in a subclass) without editing the methods.
    TESSERACT_CMD = r'/usr/local/bin/tesseract'
    POPPLER_PATH = r'/usr/local/Cellar/poppler/21.09.0/bin'
    PDF_DPI = 350

    # Pattern used to pull the first number out of the text following a label.
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    NUMBER_PATTERN = r'\d+[\d\s,.]\d+|[\d\s,.]\s\d+|\d'

    # Sentinel returned by catch_result when nothing usable is found.
    NOT_FOUND = "Not found"

    def __init__(self, pdf_path, keywords_path):
        """
        Convert the PDF, OCR it and detect the bank.
        @param pdf_path path of the PDF to process
        @param keywords_path path of the Excel file whose columns (after the
               first) are named after banks
        """
        self.list_jpg = self.pdf_to_img(pdf_path)
        self.catch_length = 20
        self.txt = ""
        self.df = pd.read_excel(keywords_path)
        self.banks = self.df.columns[1:]
        self.extract_basic()
        self.choose_bank()
        # No detected bank means no keyword column to read; catch_result only
        # needs ``txt`` so the instance stays usable.
        if self.bank is not None:
            self.keywords = self.df[self.bank].tolist()
        else:
            self.keywords = []

    def extract_basic(self):
        """
        OCR every page image and store the concatenated text in ``self.txt``.
        Each page's text is preceded by a "\\n page_N\\n" marker (N starts at 1).
        @return none
        """
        # enumerate() gives the true page position; list.index() would return
        # the first occurrence for repeated paths and rescans the list per page.
        chunks = [
            f'\n page_{page_number}\n' + self.img_to_string(item_jpg)
            for page_number, item_jpg in enumerate(self.list_jpg, start=1)
        ]
        self.txt = ''.join(chunks)

    def choose_bank(self):
        """
        Set ``self.bank`` to the first bank name found verbatim in ``self.txt``.
        ``self.bank`` is ``None`` when no bank name occurs in the text.
        @return none
        """
        # Always define the attribute so later reads never raise AttributeError.
        self.bank = None
        for candidate in self.banks:
            if candidate in self.txt:
                self.bank = candidate
                break

    def img_to_string(self, img_path):
        """
        OCR a single image file with Tesseract.
        @param img_path path of the image to scan
        @return string with all the text recognised in the image
        """
        pytesseract.pytesseract.tesseract_cmd = self.TESSERACT_CMD
        # Context manager so the underlying file handle is released promptly.
        with Image.open(img_path) as page_image:
            return pytesseract.image_to_string(page_image)

    def pdf_to_img(self, pdf_path):
        """
        Convert every page of the PDF to a JPEG saved in the working directory.
        Files are named "<pdf base name><page number>.jpg", numbering from 1.
        @param pdf_path path of the PDF to convert
        @return list of the JPEG file names, in page order
        """
        pages = convert_from_path(pdf_path, self.PDF_DPI, poppler_path=self.POPPLER_PATH)

        # Base name is everything before the first dot of the file name.
        name = os.path.basename(pdf_path).split('.')[0]
        tab_jpg = []

        for page_number, page in enumerate(pages, start=1):
            image_name = name + str(page_number) + ".jpg"
            page.save(image_name, "JPEG")
            tab_jpg.append(image_name)

        return tab_jpg

    # Disabled caller of mark_region, kept for reference:
    #
    # def extract_with_cv2(self):
    #     """
    #     Extract string items from element scanned by cv2
    #     @print string
    #     @return none
    #     """
    #     self.string_cv2 = ""
    #     for img_item in self.list_jpg:
    #         add_string = self.mark_region(img_item)
    #         self.string_cv2 = self.string_cv2 + f'\n page_{self.list_jpg.index(img_item)+1}\n' + add_string
    #     print(self.string_cv2)

    def mark_region(self, jpg_path):
        """
        Locate text regions in a page image with OpenCV and OCR each of them.
        @param jpg_path path of the page image
        @return the OCR text of every selected region, each preceded by "\\n"
        """
        im = cv2.imread(jpg_path)

        gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (9, 9), 0)
        thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30)

        # Dilate to combine adjacent text contours
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
        dilate = cv2.dilate(thresh, kernel, iterations=4)

        # Find contours, highlight text areas, and extract ROIs
        cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # findContours returns a 2-tuple or a 3-tuple; pick the contour list.
        cnts = cnts[0] if len(cnts) == 2 else cnts[1]

        # Pixel thresholds below are hard-coded for a specific page layout.
        # NOTE(review): the rectangles are drawn on ``im`` itself, so their
        # borders are part of the crops OCR'd below, and a contour satisfying
        # both conditions is appended twice — confirm this is intended.
        line_items_coordinates = []
        for c in cnts:
            area = cv2.contourArea(c)
            x, y, _w, h = cv2.boundingRect(c)

            if y >= 600 and x <= 1000:
                if area > 10000:
                    cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
                    line_items_coordinates.append([(x, y), (2200, y + h)])

            if y >= 2400 and x <= 2000:
                cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
                line_items_coordinates.append([(x, y), (2200, y + h)])

        textes = ""

        for (x0, y0), (x1, y1) in line_items_coordinates:
            # cropping image img = image[y0:y1, x0:x1]
            img = im[y0:y1, x0:x1]

            # convert the image to black and white for better OCR
            _, thresh1 = cv2.threshold(img, 120, 255, cv2.THRESH_BINARY)

            # pytesseract image to string to get results
            text = str(pytesseract.image_to_string(thresh1, config='--psm 6'))

            textes = textes + '\n' + text

        return textes

    def catch_result(self, key_word):
        """
        Find the number that follows a label in the OCR text.

        The label is searched fuzzily (Levenshtein distance <= 1). Among the
        hits, the one with the highest token_sort_ratio against ``key_word``
        wins (first one on ties). The ``catch_length`` characters after that
        hit, with spaces removed, are scanned for the first number.
        @param key_word label to look for
        @return the first number found as a string, or "Not found" when the
                label is absent or no number follows it
        """
        results = find_near_matches(key_word, self.txt, max_l_dist=1)
        if not results:
            return self.NOT_FOUND

        # max() keeps the first of equally scored matches.
        best = max(results, key=lambda match: fuzz.token_sort_ratio(key_word, match.matched))

        start = best.end
        # The window is sliced before spaces are stripped, so catch_length
        # counts characters of the raw text.
        window = self.txt[start:start + self.catch_length].replace(' ', '')
        final_list = re.findall(self.NUMBER_PATTERN, window)
        return final_list[0] if final_list else self.NOT_FOUND
    

In [127]:
# Build an Item for the sample PDF. The constructor writes one JPEG per page,
# OCRs every page and detects the bank from the keyword spreadsheet's columns.
test = Item('num_files/Banque_courtois.pdf', 'alim_keywords.xlsx')

In [131]:
# Fuzzy-find the "Revenus mensuels" label in the OCR text and return the first number after it.
test.catch_result("Revenus mensuels")

'4514'

In [50]:
# Label whose following number is looked up in the next cells.
variable = "Frais de Notaire"

In [51]:
# Same lookup through the Item method, using the label stored in `variable`.
test.catch_result(variable)

'15450'

In [30]:
#def catch_result(key_word, global_text):
    
    #results = find_near_matches( key_word, global_text, max_l_dist = 1)
    #results_score = [fuzz.token_sort_ratio(key_word, i.matched) for i in results]
    
    #start = results[results_score.index(max(results_score))].end
    
    #return re.sub("[^\d,\.]+","", global_text[start:start+10])

In [38]:
def catch_result(key_word, global_text, catch_length=50):
    """Return the first number that follows a label in a text.

    The label is searched fuzzily (Levenshtein distance <= 1). Among the hits,
    the one with the highest ``fuzz.token_sort_ratio`` against ``key_word`` is
    used (first one on ties). The ``catch_length`` characters after that hit,
    with spaces removed, are scanned for the first number.

    Args:
        key_word: Label to look for.
        global_text: Text to search, e.g. the OCR output of a document.
        catch_length: Number of characters inspected after the matched label.

    Returns:
        The first number found, as a string, or ``"Not found"`` when the label
        is absent or no number follows it (same sentinel as
        ``Item.catch_result``).
    """
    results = find_near_matches(key_word, global_text, max_l_dist=1)
    # Without this guard max() raises ValueError on an empty match list.
    if not results:
        return "Not found"

    # max() keeps the first of equally scored matches.
    best = max(results, key=lambda match: fuzz.token_sort_ratio(key_word, match.matched))

    start = best.end
    window = global_text[start:start + catch_length].replace(' ', '')
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    numbers = re.findall(r'\d+[\d\s,.]\d+|[\d\s,.]\s\d+|\d', window)
    # Without this guard [0] raises IndexError when no number follows the label.
    return numbers[0] if numbers else "Not found"

In [41]:
# Module-level variant of the lookup, run on the OCR text held by `test`.
catch_result(variable, test.txt)

'15450'

In [None]:
def extract_item(string_item, len_item=10, texte=""):
    """Return the characters that follow an exact label in a text.

    Args:
        string_item: Label to search for (exact, case-sensitive match).
        len_item: Maximum number of characters to return.
        texte: Text to search. It has a default only because it comes after
            ``len_item`` in the positional order, and a parameter without a
            default cannot follow one that has a default.

    Returns:
        Up to ``len_item`` characters starting one character past the end of
        the first occurrence of ``string_item`` (the skipped character is the
        separator after the label), or ``""`` when the label is not in
        ``texte``.
    """
    position = texte.find(string_item)
    # find() returns -1 when absent; using it as an offset would return
    # unrelated text from the start of the document.
    if position == -1:
        return ""
    start = position + len(string_item) + 1
    return texte[start:start + len_item]

In [None]:
# Show the 10 characters that follow the "Montant d'emprunt" label in the OCR text.
print(extract_item("Montant d'emprunt", 10, test.txt))

In [27]:
# The pattern needs digits directly on both sides of the separator; in '12a,38'
# the 'a' breaks that, so findall returns [] and indexing [0] raises IndexError.
re.findall('\d+[,.]\d+', '@ 12a,38')[0]

IndexError: list index out of range

In [28]:
# Alternative: delete every run of characters that are not digits, commas or dots.
result = re.sub("[^\d,\.]+","", '@ 12a,38')
result

'12,38'