In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages joined with a newline between
        consecutive pages. Pages that yield no text contribute an empty
        string.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` guards against a page whose extraction result is None/empty,
        # which would otherwise break string concatenation.
        page_texts = [(page.extract_text() or '') for page in reader.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page (which would corrupt word-level features).
    return '\n'.join(page_texts)

# Example usage: extract the text of a single PDF and print it.
# NOTE: 'path/to/my/pdf/file.pdf' is a placeholder; replace it with the path
# of a real PDF before running this cell.
# The resulting `text` variable is the input of the extract_features()
# example in the next cell.
pdf_path = 'path/to/my/pdf/file.pdf'
text = extract_text_from_pdf(pdf_path)
print(text)


In [None]:
import re
from collections import defaultdict

def extract_features(text):
    """Pull structured fields and a keyword set out of raw invoice text.

    Args:
        text: Plain text of an invoice.

    Returns:
        collections.defaultdict: A ``defaultdict(str)``, so fields that were
        not found read as ``''``. Keys that may be set:

        * ``'text'``: the unmodified input text, kept so whole documents can
          be compared by similarity scoring.
        * ``'invoice_number'``: word characters following "Invoice Number: ".
        * ``'date'``: first NN/NN/NNNN date following "Date: ".
        * ``'total_amount'``: digits/commas with two decimals following
          "Total Amount: " and a euro sign (the sign itself is not captured).
        * ``'keywords'``: set of every distinct word token in the text.
    """
    features = defaultdict(str)

    # Keep the raw text: the similarity search reads features['text'], and
    # without it the defaultdict would silently hand back ''.
    features['text'] = text

    # One capture group per field; only fields that actually match are stored.
    field_patterns = {
        'invoice_number': r'Invoice Number: (\w+)',
        'date': r'Date: (\d{2}/\d{2}/\d{4})',
        # \u20ac is the euro sign, written as an escape so the pattern cannot
        # be damaged by a wrong file encoding.
        'total_amount': r'Total Amount: \u20ac([\d,]+\.\d{2})',
    }
    for field, pattern in field_patterns.items():
        match = re.search(pattern, text)
        if match:
            features[field] = match.group(1)

    # Distinct word tokens (case-sensitive, digits and underscores included).
    features['keywords'] = set(re.findall(r'\b\w+\b', text))

    return features

# Example usage: run feature extraction on `text` and print the result.
# `text` is the module-level variable assigned by the PDF extraction example
# in the previous cell, so that cell must have been run first.
features = extract_features(text)
print(features)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    A TfidfVectorizer with default settings is fitted on just the two texts,
    and the cosine similarity of the two resulting rows is returned.

    Args:
        text1: First document, as a string.
        text2: Second document, as a string.

    Returns:
        float: Cosine similarity of the two TF-IDF vectors. ``0.0`` is
        returned when the vectorizer finds no usable token in either text
        (for example when both are empty).
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError as exc:
        # The vectorizer raises ValueError("empty vocabulary; ...") when no
        # token survives tokenization. Two such texts share nothing, so treat
        # them as dissimilar; re-raise any other ValueError unchanged.
        if 'empty vocabulary' not in str(exc):
            raise
        return 0.0

    # cosine_similarity accepts the sparse matrix directly, so no dense copy
    # is needed. Entry [0, 1] is text1 versus text2.
    return float(cosine_similarity(tfidf_matrix)[0, 1])



In [None]:
import os
class InvoiceDatabase:
    """In-memory store of invoice feature mappings, searchable by similarity.

    Args:
        similarity_fn: Optional callable ``(text1, text2) -> number`` used to
            score two invoice texts. When omitted, the module-level
            ``calculate_similarity`` function is looked up at query time.
    """

    def __init__(self, similarity_fn=None):
        # invoice_id -> feature mapping, in insertion order.
        self.invoices = {}
        self._similarity_fn = similarity_fn

    def add_invoice(self, invoice_id, features):
        """Store ``features`` under ``invoice_id``, replacing any previous entry."""
        self.invoices[invoice_id] = features

    @staticmethod
    def _comparison_text(features):
        """Return the text used to compare an invoice, or '' if there is none.

        Prefers ``features['text']``. If that is missing or empty, falls back
        to the ``'keywords'`` collection joined into one space-separated
        string. ``.get`` is used so a defaultdict is not mutated by the lookup.
        """
        text = features.get('text')
        if text:
            return text
        keywords = features.get('keywords') or ()
        if isinstance(keywords, str):
            return keywords
        # Sorted so the fallback text is deterministic for set inputs.
        return ' '.join(sorted(str(keyword) for keyword in keywords))

    def find_most_similar(self, features):
        """Find the stored invoice whose text is most similar to ``features``.

        Args:
            features: Feature mapping of the query invoice (needs ``'text'``
                or, failing that, ``'keywords'``).

        Returns:
            tuple: ``(invoice_id, similarity)`` of the best match, or
            ``(None, 0)`` when nothing scores above zero (including an empty
            database or a query without any text). On ties the invoice added
            first wins.
        """
        best_match = None
        highest_similarity = 0

        query_text = self._comparison_text(features)
        if not query_text:
            # Nothing to compare against; scoring empty text is meaningless.
            return best_match, highest_similarity

        score = self._similarity_fn or calculate_similarity
        for invoice_id, stored_features in self.invoices.items():
            stored_text = self._comparison_text(stored_features)
            if not stored_text:
                continue
            sim_score = score(query_text, stored_text)
            if sim_score > highest_similarity:
                highest_similarity = sim_score
                best_match = invoice_id
        return best_match, highest_similarity

def _load_invoice_features(pdf_path):
    """Extract the features of one PDF and make sure they carry its raw text."""
    text = extract_text_from_pdf(pdf_path)
    features = extract_features(text)
    # InvoiceDatabase.find_most_similar() compares features['text']; make sure
    # it is present even if extract_features() did not store it.
    if not features.get('text'):
        features['text'] = text
    return features


# Main function to process an input invoice and find the most similar invoice in the database
def main(input_pdf_path, database_folder):
    """Print the invoice in ``database_folder`` most similar to the input PDF.

    Every file in ``database_folder`` whose name ends in ``.pdf`` (any letter
    case) is loaded into a fresh InvoiceDatabase, keyed by its file name
    without the extension. The input invoice is then compared against all of
    them and the best match and its score are printed.

    Args:
        input_pdf_path: Path of the invoice PDF to look up.
        database_folder: Directory containing the reference invoice PDFs.
    """
    # Initialize the database
    db = InvoiceDatabase()

    # Load existing invoices into the database. Sorting makes the load order,
    # and therefore tie-breaking between equally similar invoices,
    # deterministic (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(database_folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        invoice_id = os.path.splitext(filename)[0]
        pdf_path = os.path.join(database_folder, filename)
        db.add_invoice(invoice_id, _load_invoice_features(pdf_path))

    # Extract features from the input invoice
    input_features = _load_invoice_features(input_pdf_path)

    # Find the most similar invoice in the database
    best_match, similarity_score = db.find_most_similar(input_features)

    # Output the result
    print(f'Most similar invoice: {best_match}')
    print(f'Similarity score: {similarity_score}')

# Example usage: look up one invoice against a folder of reference invoices.
# NOTE: both paths below are placeholders; replace them with a real PDF file
# and a real directory of PDFs before running this cell.
input_pdf_path = 'path/to/input_invoice.pdf'
database_folder = 'path/to/database_folder'
main(input_pdf_path, database_folder)