In [3]:
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import random

# Load JSON receipt files
def load_receipts(folder_path):
    """Read every ``*.json`` file directly inside ``folder_path``.

    Files are visited in sorted name order so the resulting list is the same
    on every run (``os.listdir`` makes no ordering guarantee). A file whose
    top-level JSON value is a list contributes each of its elements; a file
    holding any other single JSON value (e.g. one receipt object) contributes
    that value as one entry.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        list: Receipts from all files, concatenated in file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    receipts = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        # Explicit encoding: do not depend on the platform's locale default.
        with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            receipts.extend(data)
        else:
            # extend() on a dict would add its keys (strings), not the receipt.
            receipts.append(data)
    return receipts

# Convert receipts to a DataFrame
def preprocess_receipts(receipts):
    """Flatten receipt dicts into a DataFrame with one row per receipt.

    Each receipt must provide ``receipt_number``, ``merchant_name``,
    ``transaction_date`` (a ``YYYY-MM-DD`` string), ``total`` and
    ``subtotal``. ``tax`` and ``tip`` default to 0 when missing or ``None``;
    ``items`` counts as zero items when missing or ``None``.

    Args:
        receipts: Iterable of receipt dicts.

    Returns:
        pandas.DataFrame: Columns ``receipt_number``, ``merchant_name``,
        ``transaction_date``, ``total_spent``, ``num_items``, ``subtotal``,
        ``tax``, ``tip``, ``month`` and ``year``. An empty input yields an
        empty frame that still has all of these columns.

    Raises:
        KeyError: If a required key is absent from a receipt.
        ValueError: If ``transaction_date`` is not in ``YYYY-MM-DD`` form.
    """
    columns = [
        "receipt_number",
        "merchant_name",
        "transaction_date",
        "total_spent",
        "num_items",
        "subtotal",
        "tax",
        "tip",
    ]
    rows = []
    for receipt in receipts:
        tax = receipt.get("tax")
        tip = receipt.get("tip")
        rows.append({
            "receipt_number": receipt["receipt_number"],
            "merchant_name": receipt["merchant_name"],
            "transaction_date": datetime.strptime(receipt["transaction_date"], "%Y-%m-%d"),
            "total_spent": receipt["total"],
            "num_items": len(receipt.get("items") or []),
            "subtotal": receipt["subtotal"],
            "tax": 0 if tax is None else tax,
            "tip": 0 if tip is None else tip,
        })
    # Passing `columns` keeps the schema intact when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    # Guarantees a datetime dtype (and a working `.dt` accessor) even for an
    # empty frame, whose columns would otherwise be plain object dtype.
    df["transaction_date"] = pd.to_datetime(df["transaction_date"])
    df["month"] = df["transaction_date"].dt.month
    df["year"] = df["transaction_date"].dt.year
    return df

# Assign users and generate spending time frames
def assign_users_and_timeframes(df, num_users=50, seed=None):
    """Attach a random ``user_id`` and a per-user ``cutoff_date`` to receipts.

    Every row gets a user id drawn uniformly from ``1..num_users``. For each
    user that received at least one row, the cutoff is that user's earliest
    ``transaction_date`` plus a random 7-30 days (both ends inclusive).

    The input frame is left untouched; a modified copy is returned.

    Args:
        df: DataFrame with a datetime ``transaction_date`` column.
        num_users: Number of synthetic users to spread the rows across.
        seed: Optional integer. When given, both random draws come from
            private generators seeded with it, so the result is reproducible
            and the global RNG state is not consumed. When ``None`` the
            global ``np.random`` / ``random`` state is used.

    Returns:
        pandas.DataFrame: Copy of ``df`` with ``user_id`` and ``cutoff_date``
        columns added.
    """
    if seed is None:
        np_rng, py_rng = np.random, random
    else:
        np_rng, py_rng = np.random.RandomState(seed), random.Random(seed)

    out = df.copy()
    out["user_id"] = np_rng.randint(1, num_users + 1, size=len(out))

    # One pass over the frame; sort=False keeps users in order of first
    # appearance so the sequence of random draws is stable for a given seed.
    first_seen = out.groupby("user_id", sort=False)["transaction_date"].min()
    user_time_frames = {
        user: start_date + timedelta(days=py_rng.randint(7, 30))
        for user, start_date in first_seen.items()
    }
    out["cutoff_date"] = out["user_id"].map(user_time_frames)
    return out

# Extrapolate a month-level spending target from a single receipt row
def set_target_spending(row):
    """Scale one receipt's total up to a full-month figure.

    The receipt total is divided by the day of the month on which it occurred
    and multiplied by the number of days in that calendar month. Using the
    real month length (28-31) instead of a fixed 30 keeps the projection from
    falling below the amount itself for receipts dated on the 31st.

    Args:
        row: Mapping or Series with ``transaction_date`` (anything
            ``pd.Timestamp`` accepts) and numeric ``total_spent``.

    Returns:
        float: ``total_spent / day_of_month * days_in_month``.
    """
    when = pd.Timestamp(row["transaction_date"])
    return (row["total_spent"] / when.day) * when.days_in_month

# Load and preprocess receipts
folder_path = "path_to_your_receipts"  # Update this path

# User assignment draws from both numpy's and the stdlib's global RNG, so
# seed both to make Restart & Run All reproduce the same train/test split.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

receipts = load_receipts(folder_path)
df = preprocess_receipts(receipts)
df = assign_users_and_timeframes(df)


In [1]:
# Compute total spending so far (per user and calendar month)
total_spending_so_far = df.groupby(["user_id", "year", "month"])["total_spent"].sum().reset_index()
df["target_monthly_spending"] = df.apply(set_target_spending, axis=1)

# Split data into train and test: rows on or before the user's cutoff date
# are used for training, later rows for testing.
df_train = df[df["transaction_date"] <= df["cutoff_date"]]
df_test = df[df["transaction_date"] > df["cutoff_date"]]

# Columns that must not be fed to the model: the target, the value it is
# derived from, and the two datetime columns.
NON_FEATURE_COLUMNS = ["total_spent", "transaction_date", "target_monthly_spending", "cutoff_date"]

# XGBoost only accepts int/float/bool (or explicitly enabled category)
# DataFrame columns, so keep numeric/bool features only. This drops string
# columns such as merchant_name.
# NOTE(review): merchant_name may carry signal; consider encoding it instead.
# NOTE(review): subtotal/tax/tip appear to be components of total_spent, from
# which the target is derived; confirm they are legitimate features.
X_train = df_train.drop(columns=NON_FEATURE_COLUMNS).select_dtypes(include=["number", "bool"])
y_train = df_train["target_monthly_spending"]
X_test = df_test.drop(columns=NON_FEATURE_COLUMNS).select_dtypes(include=["number", "bool"])
y_test = df_test["target_monthly_spending"]


In [None]:
# Fit the gradient-boosted regressor on the training split
_xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}
model = XGBRegressor(**_xgb_params)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

In [None]:
# Compare predictions with actual targets (one row per test receipt)
df_results = pd.DataFrame(
    {"Actual Spending Target": y_test, "Predicted Spending": y_pred}
).reset_index(drop=True)

# Plot actual vs predicted spending
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_results["Actual Spending Target"], label="Actual Monthly Target", linestyle="--", marker="o")
ax.plot(df_results["Predicted Spending"], label="Predicted Spending", linestyle="-", marker="s")
# Each point is one row of the test split (a receipt), not one user.
ax.set_xlabel("Test receipt (row position)")
ax.set_ylabel("Spending ($)")
ax.set_title("Actual vs Predicted Monthly Spending")
ax.legend()
plt.show()

# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = float(np.sqrt(mse))  # same units as the target, unlike MSE
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")