# Customised Text Extractions from Pictures

In [2]:
###############################################################################
#### EXTRACT DATA FROM THE PDF FILES ####
###############################################################################
# code is from: https://github.com/zoumdatascience/OCR/blob/main/TextFromAnyPDF.ipynb
!pip install pdf2image
!pip install pytesseract


from pdf2image import convert_from_path
from pytesseract import image_to_string
from PIL import Image

[0m

In [3]:
# Since the invoices and the bank statements are scanned PDFs,
# each page must first be converted to an image and run through OCR before any data can be extracted.

#convert pdf to image, returns an iterable containing image format of all the pages of the pdf
def convert_pdf_to_img(pdf_file):
    """Render the PDF located at ``pdf_file`` into page images.

    Thin wrapper: the path is handed to ``convert_from_path`` and whatever that
    call returns is passed back to the caller unchanged.
    """
    page_images = convert_from_path(pdf_file)
    return page_images

# convert image to text, returns the textual content of a single image
def convert_image_to_text(file):
    """Run OCR on a single image and return the text given by ``image_to_string``."""
    return image_to_string(file)

# combining the two previous functions, returns the textual content of all pages
def get_text_from_any_pdf(pdf_file):
    """Return the OCR text of every page of ``pdf_file``, concatenated in page order.

    Pages are rendered with ``convert_pdf_to_img`` and each one is read with
    ``convert_image_to_text``; the per-page strings are joined without any
    separator. A document that yields no pages gives an empty string.
    """
    page_images = convert_pdf_to_img(pdf_file)
    page_texts = [convert_image_to_text(page_image) for page_image in page_images]
    return "".join(page_texts)

# Smoke test on a single scanned invoice (the path is relative to the notebook's working directory).
pdf_file = 'data/invoice_scan/invoice_3_only_first_page.pdf'

# `text` is read again by the extraction cells further down, so the name must stay as it is.
text = get_text_from_any_pdf(pdf_file)

# Print the raw OCR output so it can be compared by eye with the scanned document.
print(text)

Snack Bar
IL RIFUGIO Sagl
Riva Caccia 1D
6900 Lugano
Fattura 983 360
Numero cliente 7726 T2A
Collaboratore Isabella
Novazzano 26.02.2024 Pagina 1/2
Indirizzo di fornitura: Snack Bar, IL RIFUGIO Sagl, Riva Caccia 1D, 6900 Lugano
FD - Pagamento E-Banking
Numero articolo / Descrizione Quantita Prezzo unit. Ribasso % Totale
E201G Caffé Extra-Milano in grani 4.00 pz 23.00 92.00 23
10009
Lotto diproduzione 241 T29GE, 4.00 pz
29.07.2025
D304 Decaffeinato in grani 1.00 pz 11.50 11.50 23
500g
Lotto di produzione L11N, 31.12.2025 1.00 pz
1778 Teé menta 1.00 box 2.40 2.40 23
box 15 bustine
Gardenhouse
Lotto diproduzione £202G, 21.07.2026 1.00 box
01080 Choco & Cereals 1.00 pz 21.00 21.00 23
1000g
Mangini
Lotto diproduzione 1223, 22.01.2026 1.00 pz
Totale intermedio 126.90
IVA 2.6% netta (Codice 23) di 126.90 3.30

Totale CHF
Condizioni di pagamento: 10 giorni netto.




In [5]:
#############################################
# Different Language Text Extraction
#############################################

import re
import pandas as pd

#  'input_string' contains the invoice text
# Field name -> compiled, case-insensitive regex. Every pattern is an alternation
# over the German / French / Italian wording of the same label, so exactly one of
# its capture groups is filled when it matches. Compiled once instead of per call.
_INVOICE_FIELD_PATTERNS = {
    field: re.compile(pattern, re.IGNORECASE)
    for field, pattern in {
        'Invoice Number': r'Rechnung\s+([\d ]+)\n|Fattura\s+([\d ]+)\n|Facture\s+([\d ]+)\n',
        'Customer Number': r"Kundennummer\s+(\d{1,3}(?:'\d{3})*)|Numéro de client\s+(\d{1,3}(?:'\d{3})*)|Numero cliente\s+(\d{1,3}'?\d{3}|\d{5})",
        # "Sachbearbeiter" is the correct German spelling; the optional "e" keeps
        # the previously used misspelling "Sachebearbeiter" matching as well.
        'Employee Name': r"Sache?bearbeiter\s+(\w+)\n|Collaboratore\s+(\w+)\n|Collaborateur\s+(\w+)\n",
        'Delivery Note': r"Lieferschein\s*-\s*(\d{3}\s{1}\d{3})|Bollettino di consegna\s*-\s*(\d{3}\s{1}\d{3})\n|Bulletin de Livraison\s*-\s*(\d{3}\s{1}\d{3})",
        'Delivery Address': r'Lieferadresse \s*(.*?)\n|Adresse de fourniture \s*(.*?)\n|Indirizzo di fornitura:\s*(.*?)\n|\d+\s+-\s+(.*?,\s+\d{4}.*?)(?=\n)',
        'Date': r'(\d{2}\.\d{2}\.\d{4})|(\d{2}/\d{2}/\d{4})',
        # Amounts may carry thousands separators: whitespace ("1 234.50") or the
        # apostrophe already accepted for customer numbers ("1'234.50").
        'Subtotal': r"Total intermédiaire\s+([\d\s']+\.\d{2})|Totale intermedio\s+([\d\s']+\.\d{2})|Zwischentotal\s+([\d\s']+\.\d{2})",
        'Total': r"Total CHF\s+([\d\s']+\.\d{2})|Totale CHF\s+([\d\s']+\.\d{2})",
    }.items()
}


def extract_invoice_data(input_string):
    """Pull the header fields of an invoice out of its OCR text.

    Parameters
    ----------
    input_string : str or None
        Text of the invoice. ``None`` or an empty string is treated as
        "nothing found" instead of raising.

    Returns
    -------
    dict
        Keys, in this order: 'Invoice Number', 'Customer Number',
        'Employee Name', 'Delivery Note', 'Delivery Address', 'Date',
        'Subtotal', 'Total'. Each value is the first match of the field's
        pattern with surrounding whitespace removed, or the string 'N/A' when
        the field is not found.
    """
    text = input_string or ''

    extracted_data = {}
    for field, regex in _INVOICE_FIELD_PATTERNS.items():
        match = regex.search(text)
        captured = None
        if match:
            # Only the alternative that matched has a non-None group.
            captured = next((group for group in match.groups() if group is not None), None)
        # Strip stray whitespace picked up before the line break; a capture that
        # is blank after stripping counts as missing.
        extracted_data[field] = (captured or '').strip() or 'N/A'

    return extracted_data

# Run the extractor on the OCR text of the sample invoice; as the last expression
# of the cell, the returned dict is shown as the cell output.
extract_invoice_data(text)

{'Invoice Number': '983 360',
 'Customer Number': '7726',
 'Employee Name': 'Isabella',
 'Delivery Note': 'N/A',
 'Delivery Address': 'Snack Bar, IL RIFUGIO Sagl, Riva Caccia 1D, 6900 Lugano',
 'Date': '26.02.2024',
 'Subtotal': '126.90',
 'Total': 'N/A'}

In [8]:
#############################################
# Collect the Extracted Fields in a DataFrame
#############################################

# Prepare the DataFrame outside the function as a global variable
# Empty frame that accumulates one row per processed invoice. The columns mirror
# the keys returned by extract_invoice_data; 'Date' was missing from this schema
# and only appeared after the first append. It is listed last so the column
# order of the filled frame stays exactly what it was before.
df_orders = pd.DataFrame({
    'Invoice Number':  [],
    'Customer Number': [],
    'Employee Name': [],
    'Delivery Note': [],
    'Delivery Address': [],
    'Subtotal': [],
    'Total': [],
    'Date': []
})

# Function to add data to the DF
def add_data_to_df(orders_data):
    """Append one record to the module-level ``df_orders`` frame.

    ``orders_data`` is a dict of scalar values (field name -> value); it becomes
    a single-row frame that is concatenated below the existing rows, and the
    index is renumbered. Nothing is returned: the global ``df_orders`` is
    rebound to the enlarged frame.
    """
    global df_orders  # the name is rebound below, so it has to be declared global
    single_row = pd.DataFrame(orders_data, index=[0])
    frames = [df_orders, single_row]
    df_orders = pd.concat(frames, ignore_index=True)

# Example for a single document; the same two calls would be repeated for the
# OCR text of every further document.
extracted_data = extract_invoice_data(text)  # 'text' is the OCR output of the sample invoice; replace it with your document text
add_data_to_df(extracted_data)  # appends to the global df_orders, so re-running this cell adds the same row again

df_orders

Unnamed: 0,Invoice Number,Customer Number,Employee Name,Delivery Note,Delivery Address,Subtotal,Total,Date
0,983 360,7726,Isabella,,"Snack Bar, IL RIFUGIO Sagl, Riva Caccia 1D, 69...",126.9,,26.02.2024
