In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [2]:

# Load both datasets once. The invoice columns are read as text so pandas does
# not parse numeric-looking IDs into numbers (which drops leading zeros and,
# when blank cells are present, turns the column into floats such as "1149.0").
df1 = pd.read_csv("data/dataset1.csv", dtype={"Data Set 1": str})
df2 = pd.read_csv("data/dataset2.csv", dtype={"Data Set 2": str})

# Extract invoice numbers, skipping blank cells: astype(str) alone would turn a
# missing value into the literal string "nan" and let it take part in matching.
invoices1 = df1["Data Set 1"].dropna().astype(str).tolist()
invoices2 = df2["Data Set 2"].dropna().astype(str).tolist()


In [3]:
len(invoices1)

11573

In [4]:
type(invoices1)

list

In [5]:

# Create TF-IDF Vectorizer (train on both datasets)
# Fitting on the concatenation of both invoice lists gives the two datasets one
# shared vocabulary and one set of IDF weights. Features are character n-grams
# of length 2 to 4 (analyzer="char_wb" builds them from text inside word
# boundaries).
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
vectorizer.fit(invoices1 + invoices2)


In [6]:

# Transform invoices
# Each list is converted with the already-fitted vectorizer; the result has one
# row per invoice string, in the same order as invoices1 / invoices2.
tfidf_matrix1 = vectorizer.transform(invoices1)
tfidf_matrix2 = vectorizer.transform(invoices2)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def batch_cosine_similarity(matrix1, matrix2, batch_size=1000):
    """Compute the full cosine-similarity matrix between two sets of row vectors.

    Rows of ``matrix1`` are handed to ``cosine_similarity`` ``batch_size`` at a
    time and the partial results are written into one preallocated dense array.

    Parameters
    ----------
    matrix1, matrix2
        Objects exposing ``.shape`` and row slicing (the notebook passes the
        two TF-IDF matrices).
    batch_size : int, default 1000
        Number of ``matrix1`` rows processed per step. Must be positive.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(matrix1.shape[0], matrix2.shape[0])`` where entry
        ``[i, j]`` is the similarity between row ``i`` of ``matrix1`` and row
        ``j`` of ``matrix2``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which previously returned an
        # all-zero matrix without any error.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    num_rows = matrix1.shape[0]
    similarity_matrix = np.zeros((num_rows, matrix2.shape[0]))

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        similarity_matrix[start:end] = cosine_similarity(matrix1[start:end], matrix2)

    return similarity_matrix

In [9]:
import numpy as np
import pandas as pd

similarity_matrix = batch_cosine_similarity(tfidf_matrix1, tfidf_matrix2)

# Acceptance cut-off: one standard deviation above the mean of all pairwise scores
threshold = similarity_matrix.mean() + similarity_matrix.std()

# Position of the highest-scoring dataset-2 invoice for every dataset-1 invoice
best_match_indices = similarity_matrix.argmax(axis=1)

matches = []
matched_invoice2 = set()  # dataset-2 positions that have already been paired

for row_idx, best_match_idx in enumerate(best_match_indices):
    best_match_score = similarity_matrix[row_idx, best_match_idx]
    is_strong_enough = best_match_score >= threshold
    is_still_free = best_match_idx not in matched_invoice2

    # First come, first served: a dataset-2 invoice is paired at most once
    if is_strong_enough and is_still_free:
        matched_invoice2.add(best_match_idx)
        matches.append((invoices1[row_idx], invoices2[best_match_idx], best_match_score))

matches_df = pd.DataFrame(
    matches,
    columns=["Invoice1", "Matched_Invoice2", "Similarity_Score"],
)

# Persist the matches; a failed write is reported but does not stop the notebook
try:
    matches_df.to_csv("matched_invoices.csv", index=False)
except IOError as e:
    print(f"Error saving matches to CSV: {e}")

In [10]:

# **Feedback Handling & Model Retraining**
# Load previously collected feedback when feedback.csv is present; otherwise
# start from an empty frame that already has the expected columns so the
# merges below still work.
# NOTE(review): the feedback-collection code later in this notebook writes to
# "feedback_history.csv", not "feedback.csv" — confirm which file is intended.
try:
    feedback_df = pd.read_csv("feedback.csv")  # Load feedback if it exists
except FileNotFoundError:
    feedback_df = pd.DataFrame(columns=["invoice1", "invoice2", "similarity_score", "label"])


In [11]:
feedback_df

Unnamed: 0,invoice1,invoice2,similarity_score,label


In [12]:
# Fail fast when feedback_df cannot serve as a merge partner
required_columns = ["invoice1", "invoice2"]
if any(col not in feedback_df.columns for col in required_columns):
    raise ValueError(f"feedback_df missing required columns: {required_columns}")

# Left-join the candidate matches with any feedback recorded for the same pair
train_df = pd.DataFrame(matches, columns=["invoice1", "invoice2", "similarity_score"])
merged_df = pd.merge(
    train_df,
    feedback_df,
    how="left",
    on=["invoice1", "invoice2"],
)


In [13]:
merged_df

Unnamed: 0,invoice1,invoice2,similarity_score_x,similarity_score_y,label
0,5514104810,5514104892,0.668748,,
1,5719101009,5719101588,0.624549,,
2,5514105044,5514105198,0.587657,,
3,B2064/23-24,210 /23-24,0.264735,,
4,B2855/23-24,SALES B - Bill # 000B2855,0.276246,,
...,...,...,...,...,...
9359,KAA-2223/332,KAA/2223/332,0.700428,,
9360,SI2223/4067,KIN/SI2223/4067,0.653983,,
9361,SPB-0049-0423,SPB 0049-0423,0.739689,,
9362,TAC-23-24-0002,TAC/23-24/0002,0.415166,,


In [14]:

# Write the merged frame to disk for inspection; a failed write is reported
# with a message instead of stopping the notebook.
try:
    merged_df.to_csv("mergedf.csv", index=False)
except IOError as e:
    print(f"Error saving matches to CSV: {e}")

In [15]:
merged_df.shape

(9364, 5)

### Cleaning


In [16]:
# Check merge inputs
# Prints the shape of both frames, repeats the left merge and shows the first
# rows that did not receive a feedback label.
print("Shape before merge:", train_df.shape)
print("Shape of feedback_df:", feedback_df.shape)
print("\nSample of unmatched records:")
# how="left" keeps every train_df row; pairs absent from feedback_df get NaN label
merged_df = train_df.merge(feedback_df, on=["invoice1", "invoice2"], how="left")
unmatched = merged_df[merged_df["label"].isna()]
print(unmatched.head())

Shape before merge: (9364, 3)
Shape of feedback_df: (0, 4)

Sample of unmatched records:
      invoice1                   invoice2  similarity_score_x  \
0   5514104810                 5514104892            0.668748   
1   5719101009                 5719101588            0.624549   
2   5514105044                 5514105198            0.587657   
3  B2064/23-24                 210 /23-24            0.264735   
4  B2855/23-24  SALES B - Bill # 000B2855            0.276246   

  similarity_score_y label  
0                NaN   NaN  
1                NaN   NaN  
2                NaN   NaN  
3                NaN   NaN  
4                NaN   NaN  


In [17]:
# Data cleaning before merge
def clean_and_merge(train=None, feedback=None):
    """Normalise the invoice key columns and left-join feedback onto the matches.

    Parameters
    ----------
    train : pandas.DataFrame, optional
        Candidate matches with ``invoice1`` and ``invoice2`` columns. When
        omitted, the notebook-level ``train_df`` is used.
    feedback : pandas.DataFrame, optional
        Feedback rows keyed by the same two columns. When omitted, the
        notebook-level ``feedback_df`` is used.

    Returns
    -------
    pandas.DataFrame
        ``train`` left-merged with the feedback columns it does not already
        have (for the notebook's frames that is ``label``), so shared columns
        such as ``similarity_score`` are not duplicated with ``_x``/``_y``
        suffixes.

    Notes
    -----
    The key columns of both input frames are converted to stripped strings
    *in place*, exactly as before, so later cells that merge the same frames
    see the normalised keys.
    """
    key_columns = ["invoice1", "invoice2"]
    if train is None:
        train = train_df
    if feedback is None:
        feedback = feedback_df

    # Standardize invoice IDs (in place, see Notes)
    for frame in (train, feedback):
        for column in key_columns:
            frame[column] = frame[column].astype(str).str.strip()

    # Check for mismatched keys
    train_keys = set(zip(train["invoice1"], train["invoice2"]))
    feedback_keys = set(zip(feedback["invoice1"], feedback["invoice2"]))
    missing_keys = train_keys - feedback_keys

    if missing_keys:
        print(f"Found {len(missing_keys)} invoice pairs without feedback")
        # sorted() keeps the printed sample stable between runs; plain set
        # iteration order is not.
        print("Sample missing pairs:", sorted(missing_keys)[:5])

    # Perform merge, bringing over only the feedback columns that train lacks
    extra_columns = [col for col in feedback.columns if col not in train.columns]
    return train.merge(feedback[key_columns + extra_columns], on=key_columns, how="left")

merged_df = clean_and_merge()

Found 9364 invoice pairs without feedback
Sample missing pairs: [('JP/222301149', '1149'), ('1066', '1066'), ('JBH/4795/23-24', '4795'), ('JP/222301484', '1484'), ('JP/222301639', '1639')]


In [18]:
print("Train columns:", train_df.columns.tolist())
print("Feedback columns:", feedback_df.columns.tolist())

Train columns: ['invoice1', 'invoice2', 'similarity_score']
Feedback columns: ['invoice1', 'invoice2', 'similarity_score', 'label']


In [19]:
train_df

Unnamed: 0,invoice1,invoice2,similarity_score
0,5514104810,5514104892,0.668748
1,5719101009,5719101588,0.624549
2,5514105044,5514105198,0.587657
3,B2064/23-24,210 /23-24,0.264735
4,B2855/23-24,SALES B - Bill # 000B2855,0.276246
...,...,...,...
9359,KAA-2223/332,KAA/2223/332,0.700428
9360,SI2223/4067,KIN/SI2223/4067,0.653983
9361,SPB-0049-0423,SPB 0049-0423,0.739689
9362,TAC-23-24-0002,TAC/23-24/0002,0.415166


In [20]:
# Merge again, taking only the key columns and "label" from feedback_df so the
# similarity_score column that both frames share is not duplicated with
# _x / _y suffixes in the result.
merged_df = train_df.merge(
    feedback_df[["invoice1", "invoice2", "label"]],  # Only select needed columns
    on=["invoice1", "invoice2"],
    how="left"
)

In [21]:
merged_df

Unnamed: 0,invoice1,invoice2,similarity_score,label
0,5514104810,5514104892,0.668748,
1,5719101009,5719101588,0.624549,
2,5514105044,5514105198,0.587657,
3,B2064/23-24,210 /23-24,0.264735,
4,B2855/23-24,SALES B - Bill # 000B2855,0.276246,
...,...,...,...,...
9359,KAA-2223/332,KAA/2223/332,0.700428,
9360,SI2223/4067,KIN/SI2223/4067,0.653983,
9361,SPB-0049-0423,SPB 0049-0423,0.739689,
9362,TAC-23-24-0002,TAC/23-24/0002,0.415166,


In [22]:
train_df

Unnamed: 0,invoice1,invoice2,similarity_score
0,5514104810,5514104892,0.668748
1,5719101009,5719101588,0.624549
2,5514105044,5514105198,0.587657
3,B2064/23-24,210 /23-24,0.264735
4,B2855/23-24,SALES B - Bill # 000B2855,0.276246
...,...,...,...
9359,KAA-2223/332,KAA/2223/332,0.700428
9360,SI2223/4067,KIN/SI2223/4067,0.653983
9361,SPB-0049-0423,SPB 0049-0423,0.739689
9362,TAC-23-24-0002,TAC/23-24/0002,0.415166


In [23]:


# Create label based on similarity score thresholds
def assign_label(similarity_score, unmatched_max=0.034, exact_min=0.95):
    """Map a similarity score to a match label.

    Parameters
    ----------
    similarity_score : float
        Score to classify.
    unmatched_max : float, default 0.034
        Scores less than or equal to this value are labelled unmatched.
    exact_min : float, default 0.95
        Scores greater than or equal to this value are labelled exact match.

    Returns
    -------
    int
        0 (unmatched), 2 (exact match), or 1 (partial match) for everything
        in between. A NaN score compares false against both thresholds and is
        therefore labelled 1.

    Raises
    ------
    ValueError
        If ``unmatched_max`` is not below ``exact_min``.
    """
    if unmatched_max >= exact_min:
        raise ValueError("unmatched_max must be smaller than exact_min")

    # Define labels
    UNMATCHED = 0
    PARTIAL_MATCH = 1
    EXACT_MATCH = 2

    if similarity_score <= unmatched_max:
        return UNMATCHED
    if similarity_score >= exact_min:
        return EXACT_MATCH
    return PARTIAL_MATCH

# Apply labeling logic
# The "label" column already carries any human feedback merged in from
# feedback_df, so the score-based label only fills the pairs that have no
# feedback instead of overwriting the whole column.
score_based_labels = merged_df["similarity_score"].apply(assign_label)
if "label" in merged_df.columns:
    feedback_labels = pd.to_numeric(merged_df["label"], errors="coerce")
    merged_df["label"] = feedback_labels.fillna(score_based_labels).astype(int)
else:
    merged_df["label"] = score_based_labels

# Feature Engineering: one "<invoice1> | <invoice2>" string per candidate pair
X_train = vectorizer.transform(merged_df["invoice1"] + " | " + merged_df["invoice2"])
y_train = merged_df["label"].astype(int)

# Print distribution of labels
print("Label distribution:")
print(merged_df["label"].value_counts().sort_index())

Label distribution:
label
1    5593
2    3771
Name: count, dtype: int64


In [24]:
y_train

0       1
1       1
2       1
3       1
4       1
       ..
9359    1
9360    1
9361    1
9362    1
9363    1
Name: label, Length: 9364, dtype: int32

In [None]:

# Initialize model if not exists
# (at notebook top level locals() is the global namespace, so a `model` left
# over from an earlier run of this cell is reused rather than replaced)
if 'model' not in locals():
    model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)

# Train model once
# NOTE(review): partial_fit updates the existing model, so re-running this cell
# without restarting the kernel feeds the same training data in again.
try:
    model.partial_fit(X_train, y_train, classes=np.array([0, 1, 2]))
except ValueError as e:
    print(f"Training error: {e}")
    # Verify input shapes
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"Unique labels: {np.unique(y_train)}")
    raise

# Validate and save
# NOTE(review): both artefacts are written here unconditionally; the later
# evaluation cell writes the same two file names only when accuracy >= 0.80,
# so that check cannot stop a weak model from being saved by this cell.
if hasattr(model, 'classes_') and hasattr(vectorizer, 'vocabulary_'):
    joblib.dump(model, "invoice_match_model.joblib")
    joblib.dump(vectorizer, "vectorizer.joblib")
else:
    raise ValueError("Model or vectorizer not properly fitted")

In [26]:
test_df = pd.read_csv("data/exact_match_dataset.csv")

test_df

Unnamed: 0,Match ID,Data Set 1,Data Set 2,Status
0,1,JP/232400001,1,Partial Matched
1,2,OMPL/003/2018-19,3,Partial Matched
2,3,JP/222300006,6,Partial Matched
3,4,JP/222300007,7,Partial Matched
4,5,JP/222300008,8,Partial Matched
...,...,...,...,...
11628,11629,24303BO1123,,Unmatched
11629,11630,24303BO963,,Unmatched
11630,11631,24303BO369,,Unmatched
11631,11632,24303BO967,,Unmatched


In [27]:
# Numeric codes for the three match classes
UNMATCHED = 0
PARTIAL_MATCH = 1
EXACT_MATCH = 2

def assign_label(status):
    """Translate a ``Status`` value into its numeric class code.

    "Unmatched" maps to UNMATCHED and "Exact Matched" to EXACT_MATCH; every
    other value (including missing ones) maps to PARTIAL_MATCH.
    """
    if status == "Exact Matched":
        return EXACT_MATCH
    return UNMATCHED if status == "Unmatched" else PARTIAL_MATCH

# Apply labeling logic
test_df["label"] = test_df["Status"].apply(assign_label)

In [28]:
test_df

Unnamed: 0,Match ID,Data Set 1,Data Set 2,Status,label
0,1,JP/232400001,1,Partial Matched,1
1,2,OMPL/003/2018-19,3,Partial Matched,1
2,3,JP/222300006,6,Partial Matched,1
3,4,JP/222300007,7,Partial Matched,1
4,5,JP/222300008,8,Partial Matched,1
...,...,...,...,...,...
11628,11629,24303BO1123,,Unmatched,0
11629,11630,24303BO963,,Unmatched,0
11630,11631,24303BO369,,Unmatched,0
11631,11632,24303BO967,,Unmatched,0


In [30]:
test_df

Unnamed: 0,Match ID,Data Set 1,Data Set 2,Status,label
0,1,JP/232400001,1,Partial Matched,1
1,2,OMPL/003/2018-19,3,Partial Matched,1
2,3,JP/222300006,6,Partial Matched,1
3,4,JP/222300007,7,Partial Matched,1
4,5,JP/222300008,8,Partial Matched,1
...,...,...,...,...,...
11628,11629,24303BO1123,,Unmatched,0
11629,11630,24303BO963,,Unmatched,0
11630,11631,24303BO369,,Unmatched,0
11631,11632,24303BO967,,Unmatched,0


In [31]:
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# Work on a copy so test_df keeps its raw values
test_df_clean = test_df.copy()

# Missing invoice numbers become empty strings
for invoice_column in ("Data Set 1", "Data Set 2"):
    test_df_clean[invoice_column] = test_df_clean[invoice_column].fillna("")

# -1 is the sentinel for "no label available"
test_df_clean["label"] = test_df_clean["label"].fillna(-1)

# Keep only the rows that carry a real label
valid_mask = test_df_clean["label"].ne(-1)
test_df_valid = test_df_clean.loc[valid_mask]


In [32]:
test_df_valid

Unnamed: 0,Match ID,Data Set 1,Data Set 2,Status,label
0,1,JP/232400001,1,Partial Matched,1
1,2,OMPL/003/2018-19,3,Partial Matched,1
2,3,JP/222300006,6,Partial Matched,1
3,4,JP/222300007,7,Partial Matched,1
4,5,JP/222300008,8,Partial Matched,1
...,...,...,...,...,...
11628,11629,24303BO1123,,Unmatched,0
11629,11630,24303BO963,,Unmatched,0
11630,11631,24303BO369,,Unmatched,0
11631,11632,24303BO967,,Unmatched,0


In [33]:

# Validate model performance
# The test text uses the same "<invoice 1> | <invoice 2>" layout as X_train,
# and y_pred holds one prediction per row of test_df_valid.
X_test = vectorizer.transform(test_df_valid["Data Set 1"].astype(str) + " | " + test_df_valid["Data Set 2"].astype(str))
y_pred = model.predict(X_test)

In [34]:
y_pred

array([1, 1, 1, ..., 2, 2, 2])

In [35]:
# Convert model predictions to labels
# y_pred has one entry per row of test_df_valid, which is shorter than test_df
# whenever rows were filtered out. Aligning on the index gives those rows NaN
# instead of failing the assignment with a length mismatch.
test_df["y_pred"] = pd.Series(y_pred, index=test_df_valid.index)

# If you need to map predictions to specific labels, use:
def map_predictions(pred):
    """Return the display name for a numeric class code, or "Unknown"."""
    names_by_code = dict(enumerate(("Unmatched", "Partial Match", "Exact Match")))
    if pred in names_by_code:
        return names_by_code[pred]
    return "Unknown"

# Apply mapping if needed
test_df["y_pred_label"] = test_df["y_pred"].map(map_predictions)

In [36]:
import joblib
# Generate classification report for valid predictions only
print(classification_report(test_df_valid["label"], y_pred))

# Calculate accuracy on valid data
# (share of rows whose prediction equals the label)
accuracy = (y_pred == test_df_valid["label"]).mean()
print(f"Number of valid test samples: {len(test_df_valid)}")
print(f"Number of samples with missing data: {len(test_df) - len(test_df_valid)}")

# Save the model and vectorizer only when accuracy reaches 0.80.
# NOTE(review): accuracy is printed only on the failing branch, and the
# training cell earlier in this notebook already dumps the same two files
# unconditionally — confirm whether that earlier save should remain.
if accuracy >= 0.80:
    joblib.dump(model, "invoice_match_model.joblib")
    joblib.dump(vectorizer, "vectorizer.joblib")
else:
    print(f"Model accuracy {accuracy:.2f} below threshold, not saving")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1021
           1       0.92      0.99      0.96      6826
           2       0.89      1.00      0.94      3786

    accuracy                           0.91     11633
   macro avg       0.60      0.66      0.63     11633
weighted avg       0.83      0.91      0.87     11633

Number of valid test samples: 11633
Number of samples with missing data: 0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:

# Write test_df to final_csv.csv; a failed write is reported with a message
# instead of stopping the notebook.
try:
    test_df.to_csv("final_csv.csv", index=False)
except IOError as e:
    print(f"Error saving matches to CSV: {e}")

In [38]:
train_df

Unnamed: 0,invoice1,invoice2,similarity_score
0,5514104810,5514104892,0.668748
1,5719101009,5719101588,0.624549
2,5514105044,5514105198,0.587657
3,B2064/23-24,210 /23-24,0.264735
4,B2855/23-24,SALES B - Bill # 000B2855,0.276246
...,...,...,...
9359,KAA-2223/332,KAA/2223/332,0.700428
9360,SI2223/4067,KIN/SI2223/4067,0.653983
9361,SPB-0049-0423,SPB 0049-0423,0.739689
9362,TAC-23-24-0002,TAC/23-24/0002,0.415166


In [39]:
merged_df

Unnamed: 0,invoice1,invoice2,similarity_score,label
0,5514104810,5514104892,0.668748,1
1,5719101009,5719101588,0.624549,1
2,5514105044,5514105198,0.587657,1
3,B2064/23-24,210 /23-24,0.264735,1
4,B2855/23-24,SALES B - Bill # 000B2855,0.276246,1
...,...,...,...,...
9359,KAA-2223/332,KAA/2223/332,0.700428,1
9360,SI2223/4067,KIN/SI2223/4067,0.653983,1
9361,SPB-0049-0423,SPB 0049-0423,0.739689,1
9362,TAC-23-24-0002,TAC/23-24/0002,0.415166,1


In [40]:
import os

In [42]:
class InvoiceMatchingSystem:
    """Pairs a classifier with a fitted vectorizer to score invoice pairs and learn from feedback.

    Attributes
    ----------
    model
        Classifier used through ``predict`` and ``partial_fit``.
    vectorizer
        Fitted text vectorizer; only its ``transform`` method is called.
    feedback_buffer : list of dict
        Every feedback record received through ``update_model``.
    """

    def __init__(self, model=None, vectorizer=None):
        """Store the collaborators; build a default classifier when none is given."""
        # `model or ...` would silently discard a supplied estimator that
        # evaluates as falsy (for example one defining __len__), so compare
        # against None explicitly.
        if model is None:
            model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
        self.model = model
        self.vectorizer = vectorizer
        self.feedback_buffer = []
        # Number of feedback_buffer entries already written by save_model
        self._flushed_count = 0

    def get_next_prediction(self, invoice1, invoice2):
        """Get prediction for a single pair.

        Returns a dict with the two invoice strings, the model's predicted
        label for the text ``"<invoice1> | <invoice2>"`` and the cosine
        similarity between the two invoices' individual vectors.
        """
        pair_features = self.vectorizer.transform([f"{invoice1} | {invoice2}"])
        prediction = self.model.predict(pair_features)[0]
        similarity = cosine_similarity(
            self.vectorizer.transform([invoice1]),
            self.vectorizer.transform([invoice2]),
        )[0][0]
        return {
            'Data Set 1': invoice1,
            'Data Set 2': invoice2,
            'predicted_label': prediction,
            'similarity_score': similarity,
        }

    def update_model(self, invoice1, invoice2, user_label):
        """Update model with user feedback.

        Runs one ``partial_fit`` step on the pair, records the feedback in
        ``feedback_buffer`` and saves after every tenth record.
        """
        X = self.vectorizer.transform([f"{invoice1} | {invoice2}"])
        self.model.partial_fit(X, np.array([user_label]), classes=np.array([0, 1, 2]))

        # Save feedback for later analysis
        self.feedback_buffer.append({
            'Data Set 1': invoice1,
            'Data Set 2': invoice2,
            'user_label': user_label,
            'timestamp': pd.Timestamp.now()
        })

        # Periodically save model
        if len(self.feedback_buffer) % 10 == 0:  # Save every 10 feedbacks
            self.save_model()

    def save_model(self):
        """Save the model and append not-yet-written feedback to the history file.

        Only buffer entries added since the previous save are appended, so
        repeated saves no longer duplicate earlier rows, and nothing is written
        when there is no new feedback (an empty write used to create a
        header-less file that suppressed the header of later appends).

        NOTE(review): ``process_batch_feedback`` in this notebook writes rows
        with different columns (invoice1, invoice2, label, ...) to the same
        "feedback_history.csv" — confirm whether both writers are meant to
        share that file.
        """
        joblib.dump(self.model, "invoice_match_model.joblib")

        history_path = "feedback_history.csv"
        flushed = getattr(self, "_flushed_count", 0)
        if flushed > len(self.feedback_buffer):
            # The buffer was replaced or cleared from outside; treat all of
            # its current entries as unsaved.
            flushed = 0

        pending = self.feedback_buffer[flushed:]
        if not pending:
            return

        # Save feedback history
        pd.DataFrame(pending).to_csv(
            history_path,
            mode='a',
            header=not os.path.exists(history_path),
            index=False
        )
        self._flushed_count = len(self.feedback_buffer)

    import pandas as pd
import numpy as np
import os
from sklearn.utils.class_weight import compute_class_weight
from datetime import datetime

def process_batch_feedback(matcher, batch_size=5):
    """Process feedback in batches and update model incrementally.

    For each of the first ``batch_size`` rows of the notebook-level
    ``matches_df`` the prediction is shown and the user is prompted for a label
    (``0``, ``1``, ``2`` or ``s`` to skip). The collected feedback is merged
    into "feedback_history.csv", the model receives one ``partial_fit`` step on
    the newly collected rows only, and summary statistics are printed.

    Parameters
    ----------
    matcher
        Object providing ``get_next_prediction(inv1, inv2)``, ``vectorizer``,
        ``model`` and ``save_model()``.
    batch_size : int, default 5
        Number of leading ``matches_df`` rows to present.

    Returns
    -------
    int
        Number of pairs that received a label (skipped pairs are not counted).
    """
    import inspect

    feedback_file = "feedback_history.csv"
    # Only the first batch_size rows are shown, so slice before iterating
    # instead of walking the whole frame.
    candidates = matches_df.iloc[:batch_size]
    potential_matches = list(zip(candidates['Invoice1'], candidates['Matched_Invoice2']))

    feedback_batch = []
    batch_stats = {'matches': 0, 'non_matches': 0, 'partial': 0}

    for inv1, inv2 in potential_matches:
        result = matcher.get_next_prediction(inv1, inv2)

        print(f"\nPotential Match:")
        print(f"Invoice 1: {inv1}")
        print(f"Invoice 2: {inv2}")
        print(f"Similarity Score: {result['similarity_score']:.3f}")
        print(f"Predicted Label: {result['predicted_label']}")

        # Keep asking until one of the accepted answers is entered
        while True:
            feedback = input("Enter feedback (0=no match, 1=partial, 2=exact, s=skip): ")
            if feedback in ['0', '1', '2', 's']:
                break

        if feedback != 's':
            label = int(feedback)
            feedback_batch.append({
                'invoice1': inv1,
                'invoice2': inv2,
                'label': label,
                'similarity_score': result['similarity_score'],
                'predicted_label': result['predicted_label'],
                'timestamp': datetime.now()
            })

            # Update stats
            if label == 0:
                batch_stats['non_matches'] += 1
            elif label == 1:
                batch_stats['partial'] += 1
            else:
                batch_stats['matches'] += 1

    if feedback_batch:
        new_feedback = pd.DataFrame(feedback_batch)

        # **Load Previous Feedback (Avoid Duplicates)**
        if os.path.exists(feedback_file):
            existing_feedback = pd.read_csv(feedback_file)
            feedback_df = pd.concat([existing_feedback, new_feedback], ignore_index=True).drop_duplicates()
        else:
            feedback_df = new_feedback

        # **Save Updated Feedback**
        feedback_df.to_csv(feedback_file, index=False)

        # **Train Model on New Feedback Only**
        # Built from this batch alone; feeding the whole history back through
        # partial_fit on every call would replay old feedback repeatedly.
        X_batch = [
            f"{new_inv1} | {new_inv2}"
            for new_inv1, new_inv2 in zip(new_feedback["invoice1"], new_feedback["invoice2"])
        ]
        y_batch = new_feedback["label"].to_numpy()

        # **Vectorize & Incrementally Train Model**
        X_transformed = matcher.vectorizer.transform(X_batch)
        fit_kwargs = {"classes": np.array([0, 1, 2])}

        # **Handle Class Imbalance**
        # Not every estimator's partial_fit takes sample_weight
        # (PassiveAggressiveClassifier does not), so pass weights only when
        # the method accepts them.
        if "sample_weight" in inspect.signature(matcher.model.partial_fit).parameters:
            # compute_class_weight rejects classes that do not occur in y, so
            # weight only the labels present in this batch.
            present_classes = np.unique(y_batch)
            class_weights = compute_class_weight("balanced", classes=present_classes, y=y_batch)
            class_weight_dict = dict(zip(present_classes, class_weights))
            fit_kwargs["sample_weight"] = np.array([class_weight_dict[y] for y in y_batch])

        matcher.model.partial_fit(X_transformed, y_batch, **fit_kwargs)

        matcher.save_model()

        # **Display Batch Statistics**
        print("\nBatch Statistics:")
        print(f"Exact Matches: {batch_stats['matches']}")
        print(f"Partial Matches: {batch_stats['partial']}")
        print(f"Non-matches: {batch_stats['non_matches']}")

        # **Calculate and Display Accuracy**
        # Share of all recorded feedback rows whose stored prediction equals the label
        print("\nModel Performance:")
        accuracy = (feedback_df['predicted_label'] == feedback_df['label']).mean()
        print(f"Accuracy: {accuracy:.2f}")

        # **Show Recent Daily Feedback Counts**
        feedback_df['timestamp'] = pd.to_datetime(feedback_df['timestamp'])
        daily_counts = feedback_df.groupby(feedback_df['timestamp'].dt.date)['label'].count()
        print("\nDaily Feedback Counts:")
        print(daily_counts.tail())

    return len(feedback_batch)




In [43]:
# Check column names
print("Available columns:", matches_df.columns.tolist())

Available columns: ['Invoice1', 'Matched_Invoice2', 'Similarity_Score']


In [45]:
from typing import List, Dict, Tuple
from datetime import datetime
import pandas as pd
import numpy as np
import os

In [46]:
def process_batch_feedback(batch_size=5):
    """Collect interactive feedback for the first matches and keep a feedback history.

    Presents the first ``batch_size`` rows of the notebook-level ``matches_df``
    through the notebook-level ``matcher``, asks for a label per pair
    (``0``, ``1``, ``2`` or ``s`` to skip), appends the answers to
    "feedback_history.csv", runs one ``partial_fit`` step on them and prints
    batch and overall statistics.

    Returns the number of pairs that received a label.
    """
    feedback_file = "feedback_history.csv"
    accepted_answers = ('0', '1', '2', 's')
    stat_key_by_label = {0: 'non_matches', 1: 'partial'}

    all_pairs = []
    for _, match_row in matches_df.iterrows():
        all_pairs.append((match_row['Invoice1'], match_row['Matched_Invoice2']))
    potential_matches = all_pairs[:batch_size]

    feedback_batch = []
    batch_stats = {'matches': 0, 'non_matches': 0, 'partial': 0}

    for inv1, inv2 in potential_matches:
        result = matcher.get_next_prediction(inv1, inv2)

        print("\nPotential Match:")
        print(f"Invoice 1: {inv1}")
        print(f"Invoice 2: {inv2}")
        print(f"Similarity Score: {result['similarity_score']:.3f}")
        print(f"Predicted Label: {result['predicted_label']}")

        # Re-prompt until an accepted answer is typed
        feedback = None
        while feedback not in accepted_answers:
            feedback = input("Enter feedback (0=no match, 1=partial, 2=exact, s=skip): ")

        if feedback == 's':
            continue

        label = int(feedback)
        feedback_batch.append({
            'invoice1': inv1,
            'invoice2': inv2,
            'label': label,
            'similarity_score': result['similarity_score'],
            'predicted_label': result['predicted_label'],
            'timestamp': pd.Timestamp.now()
        })

        # Labels other than 0 and 1 count as exact matches
        batch_stats[stat_key_by_label.get(label, 'matches')] += 1

    # Nothing was labelled: no file, model or report updates
    if not feedback_batch:
        return len(feedback_batch)

    # Append this batch to the history file, writing the header only once
    write_header = not os.path.exists(feedback_file)
    feedback_df = pd.DataFrame(feedback_batch)
    feedback_df.to_csv(feedback_file, mode='a', header=write_header, index=False)

    # One incremental training step on the newly labelled pairs
    pair_texts = []
    pair_labels = []
    for entry in feedback_batch:
        pair_texts.append(f"{entry['invoice1']} | {entry['invoice2']}")
        pair_labels.append(entry['label'])
    X_transformed = matcher.vectorizer.transform(pair_texts)
    matcher.model.partial_fit(X_transformed, np.array(pair_labels), classes=np.array([0, 1, 2]))
    matcher.save_model()

    # Summary for this batch
    print("\nBatch Statistics:")
    print(f"Exact Matches: {batch_stats['matches']}")
    print(f"Partial Matches: {batch_stats['partial']}")
    print(f"Non-matches: {batch_stats['non_matches']}")

    # Summary over everything recorded in the history file
    if os.path.exists(feedback_file):
        all_feedback = pd.read_csv(feedback_file)
        print("\nOverall Statistics:")
        print("Label Distribution:")
        print(all_feedback['label'].value_counts())
        print("\nModel Performance:")
        hits = all_feedback['predicted_label'] == all_feedback['label']
        accuracy = hits.mean()
        print(f"Accuracy: {accuracy:.2f}")

        # Feedback volume per calendar day
        all_feedback['timestamp'] = pd.to_datetime(all_feedback['timestamp'])
        feedback_dates = all_feedback['timestamp'].dt.date
        daily_counts = all_feedback.groupby(feedback_dates)['label'].count()
        print("\nDaily Feedback Counts:")
        print(daily_counts.tail())

    return len(feedback_batch)

# Usage
matcher = InvoiceMatchingSystem(model=model, vectorizer=vectorizer)
processed_count = process_batch_feedback(batch_size=5)


Potential Match:
Invoice 1: 5514104810
Invoice 2: 5514104892
Similarity Score: 0.669
Predicted Label: 1

Potential Match:
Invoice 1: 5719101009
Invoice 2: 5719101588
Similarity Score: 0.625
Predicted Label: 1

Potential Match:
Invoice 1: 5514105044
Invoice 2: 5514105198
Similarity Score: 0.588
Predicted Label: 1

Potential Match:
Invoice 1: B2064/23-24
Invoice 2: 210 /23-24
Similarity Score: 0.265
Predicted Label: 1

Potential Match:
Invoice 1: B2855/23-24
Invoice 2: SALES B - Bill # 000B2855
Similarity Score: 0.276
Predicted Label: 1

Batch Statistics:
Exact Matches: 0
Partial Matches: 5
Non-matches: 0

Overall Statistics:
Label Distribution:
label
1    5
Name: count, dtype: int64

Model Performance:
Accuracy: 1.00

Daily Feedback Counts:
timestamp
2025-02-08    5
Name: label, dtype: int64


In [48]:
# Usage Example
matcher = InvoiceMatchingSystem(model=model, vectorizer=vectorizer)
processed_count = process_batch_feedback(batch_size=5) 


Potential Match:
Invoice 1: 5514104810
Invoice 2: 5514104892
Similarity Score: 0.669
Predicted Label: 1

Potential Match:
Invoice 1: 5719101009
Invoice 2: 5719101588
Similarity Score: 0.625
Predicted Label: 1

Potential Match:
Invoice 1: 5514105044
Invoice 2: 5514105198
Similarity Score: 0.588
Predicted Label: 1

Potential Match:
Invoice 1: B2064/23-24
Invoice 2: 210 /23-24
Similarity Score: 0.265
Predicted Label: 1

Potential Match:
Invoice 1: B2855/23-24
Invoice 2: SALES B - Bill # 000B2855
Similarity Score: 0.276
Predicted Label: 1

Batch Statistics:
Exact Matches: 0
Partial Matches: 4
Non-matches: 1

Overall Statistics:
Label Distribution:
label
1    13
0     2
Name: count, dtype: int64

Model Performance:
Accuracy: 0.87

Daily Feedback Counts:
timestamp
2025-02-08    15
Name: label, dtype: int64
