In [1]:
import pytesseract
import numpy as np
from PIL import Image, ImageTk
import os
import re
from typing import List, Dict
import csv
import json
import sqlite3
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading


# Configure the Tesseract executable path if needed (e.g. when it is not on PATH)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Extra command-line flags handed to Tesseract by extract_text (--oem 3, --psm 12).
custom_oem_psm_config = r'--oem 3 --psm 12'

Partie interface : cette section de code sert à la création de l'interface graphique de notre application.

In [3]:
class InvoiceExtractorApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Invoice Information Extractor")
        self.root.geometry("1000x700")
        
        self.results = []
        self.current_images = []
        self.processing_lock = threading.Lock()
        
        self.create_widgets()



##############################################################################
# The methods below build the user interface: the application window and the
# buttons for loading images, exporting the results, etc.
# NOTE: AI assistance was used for this part.
##############################################################################



    def create_widgets(self):
        """Build the toolbar, the results table and the image preview area."""
        # --- Toolbar: action buttons, progress bar, status text -------------
        toolbar = tk.Frame(self.root)
        toolbar.pack(pady=10)

        # (caption, callback) pairs, created left to right in this order.
        button_specs = (
            ("Load Folder", self.load_folder),
            ("Load Files", self.load_files),
            ("Export to CSV", lambda: self.export_to_file('csv')),
            ("Export to JSON", lambda: self.export_to_file('json')),
            ("Export to SQLite", lambda: self.export_to_file('sqlite')),
            ("Clear Results", self.clear_results),
        )
        for caption, callback in button_specs:
            button = tk.Button(toolbar, text=caption, command=callback)
            button.pack(side=tk.LEFT, padx=5)

        self.progress = ttk.Progressbar(toolbar, orient=tk.HORIZONTAL, length=200, mode='determinate')
        self.progress.pack(side=tk.LEFT, padx=10)

        self.status_label = tk.Label(toolbar, text="Ready")
        self.status_label.pack(side=tk.LEFT, padx=5)

        # --- Results table ---------------------------------------------------
        table_frame = tk.Frame(self.root)
        table_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

        # Column identifier (also used as the heading text) -> width in pixels.
        column_widths = {
            "Image": 150,
            "Bill Number": 100,
            "Date": 100,
            "Client Name": 200,
            "Total Amount": 100,
            "VAT": 100,
        }
        self.tree = ttk.Treeview(table_frame, columns=tuple(column_widths), show="headings")
        for column_id, width in column_widths.items():
            self.tree.heading(column_id, text=column_id)
            self.tree.column(column_id, width=width)

        # Vertical scrollbar wired to the table in both directions.
        scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=self.tree.yview)
        self.tree.configure(yscroll=scrollbar.set)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.tree.pack(fill=tk.BOTH, expand=True)

        # --- Image preview ---------------------------------------------------
        self.image_frame = tk.Frame(self.root, height=200, bg='white')
        self.image_frame.pack(fill=tk.X, padx=10, pady=10)
        self.image_label = tk.Label(self.image_frame)
        self.image_label.pack()

        # Selecting a row refreshes the preview.
        self.tree.bind('<<TreeviewSelect>>', self.show_selected_image)



    def update_status(self, message):
        self.status_label.config(text=message)
        self.root.update_idletasks()
    
    def load_folder(self):
        folder_path = filedialog.askdirectory(title="Select Folder with Invoice Images")
        if not folder_path:
            return
            
        jpg_files = self.get_jpg_files(folder_path)
        if not jpg_files:
            messagebox.showwarning("No Images", "No JPG files found in the selected folder.")
            return
            
        self.process_files(jpg_files)
    
    def load_files(self):
        file_paths = filedialog.askopenfilenames(
            title="Select Invoice Images",
            filetypes=[("Image files", "*.jpg;*.jpeg;*.png"), ("All files", "*.*")]
        )
        
        if not file_paths:
            return
            
        self.process_files(file_paths)



    def process_files(self, file_paths):
        self.clear_results()
        self.current_images = file_paths
        total_files = len(file_paths)
        
        self.progress['maximum'] = total_files
        self.progress['value'] = 0
        
        self.results = []
        self.update_status(f"Processing {total_files} files...")





        
#######UTILISATION des threads pour reduire le temps de processing, (En premier lieu, le temps d'execution de notre application été trop long, donc on 
####### décidé d'implementer des threads pour réduire le temps) on a implementer 10 threads.




        
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {executor.submit(self.process_single_image, file_path): file_path for file_path in file_paths}
            
            for i, future in enumerate(as_completed(futures), 1):
                file_path = futures[future]
                try:
                    result = future.result()
                    if result:
                        with self.processing_lock:
                            self.results.append(result)
                            self.add_to_treeview(result)
                except Exception as e:
                    messagebox.showerror("Processing Error", f"Error processing {file_path}:\n{str(e)}")
                
                self.progress['value'] = i
                self.update_status(f"Processed {i}/{total_files}: {os.path.basename(file_path)}")
                self.root.update_idletasks()
        
        self.update_status(f"Processed {len(self.results)} of {total_files} files")
        
        if self.results:
            messagebox.showinfo("Processing Complete", f"Successfully processed {len(self.results)} invoice(s).")
        else:
            messagebox.showwarning("No Results", "No invoice data was extracted from the selected files.")
    
    def add_to_treeview(self, result):
        self.tree.insert("", tk.END, values=(
            os.path.basename(result["Image Path"]),
            result["Bill Number"],
            result["Date"],
            result["Client Name"],
            result["Total Amount"],
            result["VAT"]
        ))



    def show_selected_image(self, event):
        selected_item = self.tree.focus()
        if not selected_item:
            return
            
        item_data = self.tree.item(selected_item)
        image_name = item_data['values'][0]
        
        # Find the full path of the selected image
        image_path = next((path for path in self.current_images if os.path.basename(path) == image_name), None)
        
        if image_path:
            try:
                image = Image.open(image_path)
                # Resize for display
                image.thumbnail((300, 300))
                photo = ImageTk.PhotoImage(image)
                
                self.image_label.config(image=photo)
                self.image_label.image = photo  # Keep reference
            except Exception as e:
                messagebox.showerror("Image Error", f"Cannot display image: {str(e)}")



###
# The functions below let the user choose the format in which the extracted data is exported
###


    def export_to_file(self, file_type):
        if not self.results:
            messagebox.showwarning("No Data", "No results to export.")
            return
            
        if file_type == 'csv':
            output_file = filedialog.asksaveasfilename(
                title="Save CSV File",
                defaultextension=".csv",
                filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
            )
            if output_file:
                try:
                    self.save_to_csv(self.results, output_file)
                    messagebox.showinfo("Success", f"Results successfully saved to:\n{output_file}")
                except Exception as e:
                    messagebox.showerror("Export Error", f"Failed to save CSV file:\n{str(e)}")
        
        elif file_type == 'json':
            output_file = filedialog.asksaveasfilename(
                title="Save JSON File",
                defaultextension=".json",
                filetypes=[("JSON files", "*.json"), ("All files", "*.*")]
            )
            if output_file:
                try:
                    self.save_to_json(self.results, output_file)
                    messagebox.showinfo("Success", f"Results successfully saved to:\n{output_file}")
                except Exception as e:
                    messagebox.showerror("Export Error", f"Failed to save JSON file:\n{str(e)}")
        
        elif file_type == 'sqlite':
            output_file = filedialog.asksaveasfilename(
                title="Save SQLite Database",
                defaultextension=".db",
                filetypes=[("SQLite files", "*.db"), ("All files", "*.*")]
            )
            if output_file:
                try:
                    self.save_to_sqlite(self.results, output_file)
                    messagebox.showinfo("Success", f"Results successfully saved to:\n{output_file}")
                except Exception as e:
                    messagebox.showerror("Export Error", f"Failed to save SQLite database:\n{str(e)}")



    def clear_results(self):
        self.tree.delete(*self.tree.get_children())
        self.image_label.config(image='')
        self.image_label.image = None
        self.results = []
        self.current_images = []
        self.progress['value'] = 0
        self.update_status("Ready")
    
    def get_jpg_files(self, folder_path: str) -> List[str]:
        """List the image files located directly inside ``folder_path``.

        Despite its name, the method accepts ``.jpg``, ``.jpeg`` and ``.png``
        files (case-insensitive). Sub-directories are not searched.

        Args:
            folder_path: Directory to scan.

        Returns:
            Full paths of the matching regular files, sorted by file name so
            the processing order does not depend on the file system.
        """
        image_extensions = ('.jpg', '.jpeg', '.png')
        image_files = []
        for entry in sorted(os.listdir(folder_path)):
            full_path = os.path.join(folder_path, entry)
            # A directory named like an image (e.g. "scans.jpg") is not an image.
            if entry.lower().endswith(image_extensions) and os.path.isfile(full_path):
                image_files.append(full_path)
        return image_files




############
# The methods below implement the processing pipeline applied to the images selected in the application
############





    def process_single_image(self, image_path: str) -> Dict[str, str]:
        image = self.load_image(image_path)
        if image is None:
            return {}
        
        deskewed_image = self.deskew_tesseract(image)
        preprocessed_image = self.preprocess_image(deskewed_image)
        final_image = self.deskew_tesseract(preprocessed_image)
        text = self.extract_text(final_image)
        invoice_info = self.extract_invoice_info(text)
        invoice_info["Image Path"] = image_path
        
        return invoice_info
    
    def load_image(self, image_path: str) -> Image.Image:
        """Open the image stored at ``image_path``.

        Args:
            image_path: Path of the image file.

        Returns:
            The opened image, or ``None`` when it cannot be opened. The error
            is shown in a dialog when called from the main thread and printed
            otherwise.
        """
        try:
            return Image.open(image_path)
        except Exception as e:
            message = f"Error loading image {image_path}:\n{str(e)}"
            # This method runs inside worker threads during batch processing,
            # and Tk dialogs must only be opened from the main thread.
            if threading.current_thread() is threading.main_thread():
                messagebox.showerror("Image Error", message)
            else:
                print(message)
            return None
    
    def deskew_tesseract(self, image: Image.Image) -> Image.Image:
        """Rotate ``image`` according to Tesseract's orientation detection.

        Args:
            image: Image to straighten.

        Returns:
            The rotated image, or ``image`` itself when the reported rotation
            is 0 or when detection fails (the error is printed).
        """
        try:
            osd_report = pytesseract.image_to_osd(image, config='--psm 0')
            # The report contains a line such as "Rotate: 90".
            rotation = int(re.search(r'Rotate: (\d+)', osd_report).group(1))
            return image if rotation == 0 else image.rotate(-rotation, expand=True)
        except Exception as e:
            print(f"Deskewing error: {e}")
            return image
    
    def preprocess_image(self, image: Image.Image, threshold: int = 150) -> Image.Image:
        """Convert ``image`` to grayscale and binarize it.

        Args:
            image: Image to preprocess.
            threshold: Gray level above which a pixel becomes white (255);
                pixels at or below it become black (0). Defaults to 150.

        Returns:
            The binarized image, or ``image`` unchanged when preprocessing
            fails (the error is printed).
        """
        try:
            gray_img = image.convert("L")
            return gray_img.point(lambda level: 255 if level > threshold else 0)
        except Exception as e:
            print(f"Preprocessing error: {e}")
            return image
    
    def extract_text(self, image: Image.Image) -> str:
        """Run Tesseract OCR on ``image`` and return the recognized text.

        Args:
            image: Preprocessed image to read.

        Returns:
            The recognized text, or an empty string when OCR fails. The error
            is shown in a dialog when called from the main thread and printed
            otherwise.
        """
        try:
            return pytesseract.image_to_string(image, lang='eng', config=custom_oem_psm_config)
        except Exception as e:
            message = f"Error during OCR processing:\n{str(e)}"
            # This method runs inside worker threads during batch processing,
            # and Tk dialogs must only be opened from the main thread.
            if threading.current_thread() is threading.main_thread():
                messagebox.showerror("OCR Error", message)
            else:
                print(message)
            return ""
    
    def extract_invoice_info(self, text: str) -> Dict[str, str]:
        """Pick the invoice fields out of the OCR text with regular expressions.

        The OCR output is not cleanly laid out, so the patterns rely on line
        breaks and on the exact labels printed on the invoice.

        Args:
            text: Text recognized on the invoice image.

        Returns:
            A dict with "Bill Number", "Date", "Client Name", "Total Amount",
            "VAT" (each "Not found" when its pattern does not match) and an
            empty "Image Path" to be filled by the caller. An empty dict is
            returned when extraction raises; the error is shown in a dialog
            when called from the main thread and printed otherwise.
        """
        def capture(pattern: str) -> str:
            # First capture group of the first match, or the placeholder.
            match = re.search(pattern, text)
            return match.group(1) if match else "Not found"

        try:
            bill_id = capture(r"Invoice no:\s*(\d+)")
            date = capture(r"Date of issue:?\s*\n\s*(\d{2}/\d{2}/\d{4})")
            # The line right after "Client:" is skipped; the one after it is kept.
            client_name = capture(r"Client:\s*\n\s*[^\n]+\s*\n\s*([^\n]+)").strip()
            # The "Total" row carries three "$" amounts: the third one is the
            # total amount and the second one is the VAT.
            amount = capture(r"Total\s*\$\s*[\d\s]+,\d+\s*\$\s*[\d\s]+,\d+\s*\$\s*([\d\s]+,\d+)")
            VAT = capture(r"Total\s*\$\s*[\d\s]+,\d+\s*\$\s*([\d\s]+,\d+)")

            return {
                "Bill Number": bill_id,
                "Date": date,
                "Client Name": client_name,
                "Total Amount": amount,
                "VAT": VAT,
                "Image Path": ""  # Will be filled later
            }
        except Exception as e:
            message = f"Error extracting invoice info:\n{str(e)}"
            # This method runs inside worker threads during batch processing,
            # and Tk dialogs must only be opened from the main thread.
            if threading.current_thread() is threading.main_thread():
                messagebox.showerror("Extraction Error", message)
            else:
                print(message)
            return {}



    def save_to_csv(self, results: List[Dict[str, str]], output_file: str):
        """Write the extracted invoices to a CSV file.

        Args:
            results: Invoice dicts as produced by the extraction step.
            output_file: Destination path; an existing file is overwritten.

        Raises:
            Exception: When the file cannot be written (the original error is
                chained as the cause).
        """
        if not results:
            messagebox.showwarning("No Data", "No results to save")
            return

        fieldnames = ["Bill Number", "Date", "Client Name", "Total Amount", "VAT"]

        def compact_amount(value: str) -> str:
            # Captured amounts may contain spacing ("1 100,00"), which is
            # removed. Placeholders without digits, such as "Not found", are
            # kept readable instead of being collapsed to "Notfound".
            if any(char.isdigit() for char in value):
                return "".join(value.split())
            return value.strip()

        try:
            with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(
                    {
                        "Bill Number": result.get("Bill Number", "").strip(),
                        "Date": result.get("Date", "").strip(),
                        "Client Name": result.get("Client Name", "").strip(),
                        "Total Amount": compact_amount(result.get("Total Amount", "")),
                        "VAT": compact_amount(result.get("VAT", "")),
                    }
                    for result in results
                )
        except Exception as e:
            raise Exception(f"Error saving CSV file: {e}") from e
    
    def save_to_json(self, results: List[Dict[str, str]], output_file: str):
        """Write the extracted invoices to a JSON file (a list of objects).

        Args:
            results: Invoice dicts as produced by the extraction step.
            output_file: Destination path; an existing file is overwritten.

        Raises:
            Exception: When the file cannot be written (the original error is
                chained as the cause).
        """
        if not results:
            messagebox.showwarning("No Data", "No results to save")
            return

        def compact_amount(value: str) -> str:
            # Captured amounts may contain spacing ("1 100,00"), which is
            # removed. Placeholders without digits, such as "Not found", are
            # kept readable instead of being collapsed to "Notfound".
            if any(char.isdigit() for char in value):
                return "".join(value.split())
            return value.strip()

        try:
            # Prepare data for JSON export; only the image file name is kept.
            export_data = [
                {
                    "Bill Number": result.get("Bill Number", "").strip(),
                    "Date": result.get("Date", "").strip(),
                    "Client Name": result.get("Client Name", "").strip(),
                    "Total Amount": compact_amount(result.get("Total Amount", "")),
                    "VAT": compact_amount(result.get("VAT", "")),
                    "Image": os.path.basename(result.get("Image Path", ""))
                }
                for result in results
            ]

            with open(output_file, 'w', encoding='utf-8') as jsonfile:
                json.dump(export_data, jsonfile, indent=4, ensure_ascii=False)
        except Exception as e:
            raise Exception(f"Error saving JSON file: {e}") from e
    
    def save_to_sqlite(self, results: List[Dict[str, str]], output_file: str):
        """Append the extracted invoices to an ``invoices`` table in SQLite.

        The database file and the table are created when they do not exist.

        Args:
            results: Invoice dicts as produced by the extraction step.
            output_file: Path of the SQLite database file.

        Raises:
            Exception: When the database cannot be written (the original
                error is chained as the cause).
        """
        if not results:
            messagebox.showwarning("No Data", "No results to save")
            return

        def compact_amount(value: str) -> str:
            # Captured amounts may contain spacing ("1 100,00"), which is
            # removed. Placeholders without digits, such as "Not found", are
            # kept readable instead of being collapsed to "Notfound".
            if any(char.isdigit() for char in value):
                return "".join(value.split())
            return value.strip()

        try:
            # Connect to the SQLite database (the file is created if needed).
            conn = sqlite3.connect(output_file)
            try:
                cursor = conn.cursor()

                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS invoices (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        bill_number TEXT,
                        date TEXT,
                        client_name TEXT,
                        total_amount TEXT,
                        vat TEXT,
                        image_path TEXT
                    )
                ''')

                # Insert data; values are bound as parameters and only the
                # image file name is stored.
                cursor.executemany('''
                    INSERT INTO invoices (bill_number, date, client_name, total_amount, vat, image_path)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', [
                    (
                        result.get("Bill Number", "").strip(),
                        result.get("Date", "").strip(),
                        result.get("Client Name", "").strip(),
                        compact_amount(result.get("Total Amount", "")),
                        compact_amount(result.get("VAT", "")),
                        os.path.basename(result.get("Image Path", ""))
                    )
                    for result in results
                ])

                conn.commit()
            finally:
                # Close the connection even when an insert fails.
                conn.close()
        except Exception as e:
            raise Exception(f"Error saving SQLite database: {e}") from e




Code pour charger une image

In [5]:
    def load_image(self, image_path: str) -> Image.Image:
        """Open the image stored at ``image_path``.

        Args:
            image_path: Path of the image file.

        Returns:
            The opened image, or ``None`` when it cannot be opened. The error
            is shown in a dialog when called from the main thread and printed
            otherwise.
        """
        try:
            return Image.open(image_path)
        except Exception as e:
            message = f"Error loading image {image_path}:\n{str(e)}"
            # This method runs inside worker threads during batch processing,
            # and Tk dialogs must only be opened from the main thread.
            if threading.current_thread() is threading.main_thread():
                messagebox.showerror("Image Error", message)
            else:
                print(message)
            return None

Code pour le redressement de l'image

In [7]:
    def deskew_tesseract(self, image: Image.Image) -> Image.Image:
        """Rotate ``image`` according to Tesseract's orientation detection.

        Args:
            image: Image to straighten.

        Returns:
            The rotated image, or ``image`` itself when the reported rotation
            is 0 or when detection fails (the error is printed).
        """
        try:
            osd_report = pytesseract.image_to_osd(image, config='--psm 0')
            # The report contains a line such as "Rotate: 90".
            rotation = int(re.search(r'Rotate: (\d+)', osd_report).group(1))
            return image if rotation == 0 else image.rotate(-rotation, expand=True)
        except Exception as e:
            print(f"Deskewing error: {e}")
            return image

Code pour la conversion de l'image en niveaux de gris et sa binarisation

In [9]:
    def preprocess_image(self, image: Image.Image, threshold: int = 150) -> Image.Image:
        """Convert ``image`` to grayscale and binarize it.

        Args:
            image: Image to preprocess.
            threshold: Gray level above which a pixel becomes white (255);
                pixels at or below it become black (0). Defaults to 150.

        Returns:
            The binarized image, or ``image`` unchanged when preprocessing
            fails (the error is printed).
        """
        try:
            gray_img = image.convert("L")
            return gray_img.point(lambda level: 255 if level > threshold else 0)
        except Exception as e:
            print(f"Preprocessing error: {e}")
            return image

Code pour l'extraction du texte

In [11]:
    def extract_text(self, image: Image.Image) -> str:
        """Run Tesseract OCR on ``image`` and return the recognized text.

        Args:
            image: Preprocessed image to read.

        Returns:
            The recognized text, or an empty string when OCR fails. The error
            is shown in a dialog when called from the main thread and printed
            otherwise.
        """
        try:
            return pytesseract.image_to_string(image, lang='eng', config=custom_oem_psm_config)
        except Exception as e:
            message = f"Error during OCR processing:\n{str(e)}"
            # This method runs inside worker threads during batch processing,
            # and Tk dialogs must only be opened from the main thread.
            if threading.current_thread() is threading.main_thread():
                messagebox.showerror("OCR Error", message)
            else:
                print(message)
            return ""
    

Les expressions régulières servant à sélectionner les informations requises dans le texte (NOTE : le texte extrait est désorganisé, donc nos expressions
reposent généralement sur des sauts de ligne et sur une identification exacte des libellés).

In [13]:
    def extract_invoice_info(self, text: str) -> Dict[str, str]:
        """Pick the invoice fields out of the OCR text with regular expressions.

        The OCR output is not cleanly laid out, so the patterns rely on line
        breaks and on the exact labels printed on the invoice.

        Args:
            text: Text recognized on the invoice image.

        Returns:
            A dict with "Bill Number", "Date", "Client Name", "Total Amount",
            "VAT" (each "Not found" when its pattern does not match) and an
            empty "Image Path" to be filled by the caller. An empty dict is
            returned when extraction raises; the error is shown in a dialog
            when called from the main thread and printed otherwise.
        """
        def capture(pattern: str) -> str:
            # First capture group of the first match, or the placeholder.
            match = re.search(pattern, text)
            return match.group(1) if match else "Not found"

        try:
            bill_id = capture(r"Invoice no:\s*(\d+)")
            date = capture(r"Date of issue:?\s*\n\s*(\d{2}/\d{2}/\d{4})")
            # The line right after "Client:" is skipped; the one after it is kept.
            client_name = capture(r"Client:\s*\n\s*[^\n]+\s*\n\s*([^\n]+)").strip()
            # The "Total" row carries three "$" amounts: the third one is the
            # total amount and the second one is the VAT.
            amount = capture(r"Total\s*\$\s*[\d\s]+,\d+\s*\$\s*[\d\s]+,\d+\s*\$\s*([\d\s]+,\d+)")
            VAT = capture(r"Total\s*\$\s*[\d\s]+,\d+\s*\$\s*([\d\s]+,\d+)")

            return {
                "Bill Number": bill_id,
                "Date": date,
                "Client Name": client_name,
                "Total Amount": amount,
                "VAT": VAT,
                "Image Path": ""  # Will be filled later
            }
        except Exception as e:
            message = f"Error extracting invoice info:\n{str(e)}"
            # This method runs inside worker threads during batch processing,
            # and Tk dialogs must only be opened from the main thread.
            if threading.current_thread() is threading.main_thread():
                messagebox.showerror("Extraction Error", message)
            else:
                print(message)
            return {}

In [14]:
# Entry point: create the Tk root window, build the application on it and
# hand control to the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    app = InvoiceExtractorApp(root)
    # Runs the Tk event loop.
    root.mainloop()