# In [None]:
import pickle
import os
import json
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# === Load features and targets from user_total.json ===
def load_user_totals(root_dir):
    """Build a feature table and target vector from per-user summary files.

    Every entry of ``root_dir`` is treated as a user folder. A folder
    contributes one row when it contains a readable ``user_total.json`` whose
    ``total``, ``date_range`` and ``projected_monthly_spending`` values are
    all non-null and whose ``date_range`` is greater than zero. Files that
    cannot be read or processed are reported on stdout and skipped.

    Args:
        root_dir: Directory whose entries are per-user folders.

    Returns:
        A ``(features, targets)`` tuple. ``features`` is a DataFrame with the
        columns ``total``, ``date_range``, ``recurring_total``,
        ``daily_nonrec`` and ``projected_baseline`` (the columns are present
        even when no user qualifies). ``targets`` is a NumPy array with the
        matching ``projected_monthly_spending`` values in the same row order.

    Raises:
        OSError: If ``root_dir`` itself cannot be listed.
    """
    feature_columns = [
        'total',
        'date_range',
        'recurring_total',
        'daily_nonrec',
        'projected_baseline',
    ]
    rows = []
    targets = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded model trained on it) reproducible.
    for user in sorted(os.listdir(root_dir)):
        total_path = os.path.join(root_dir, user, 'user_total.json')
        if not os.path.isfile(total_path):
            continue

        # Deliberately best-effort: one malformed user file must not abort the
        # whole load, so any failure is reported and that user is skipped.
        try:
            with open(total_path, 'r', encoding='utf-8') as f:
                info = json.load(f)

            total = info.get('total')
            date_range = info.get('date_range')
            projected = info.get('projected_monthly_spending')

            # Treat an explicit JSON null the same as a missing key.
            recurring_total = info.get('recurring_total')
            if recurring_total is None:
                recurring_total = 0

            if None in (total, date_range, projected) or not date_range > 0:
                continue

            # Average daily spend outside of recurring charges, then scaled to
            # a 30-day month with the recurring charges added back.
            daily_nonrec = (total - recurring_total) / date_range
            projected_baseline = daily_nonrec * 30 + recurring_total

            rows.append({
                'total': total,
                'date_range': date_range,
                'recurring_total': recurring_total,
                'daily_nonrec': daily_nonrec,
                'projected_baseline': projected_baseline,
            })
            targets.append(projected)

        except Exception as e:
            print(f"Skipping {user} due to error: {e}")

    # Passing the column list keeps the schema stable even when rows is empty,
    # so callers can still select or drop columns by name.
    return pd.DataFrame(rows, columns=feature_columns), np.array(targets)

# === Load datasets ===
# Each split is read from its own directory of per-user folders.
X_train_full, y_train = load_user_totals('train')
X_test_full, y_test = load_user_totals('test')

# The model inputs exclude 'projected_baseline'; the *_full frames keep it so
# the baseline comparison further down can still read that column.
X_train, X_test = (
    frame.drop(columns=['projected_baseline'])
    for frame in (X_train_full, X_test_full)
)

print(f"\n📚 Training on {len(X_train)} users with features: {list(X_train.columns)}")
print(f"🧪 Testing on {len(X_test)} users\n")

# === Phase 1: Train the model ===
# fit() hands back the fitted estimator, so construction and fitting chain.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
train_preds = model.predict(X_train)

# Training metrics: MAE, plus an "accuracy" defined as 1 - MAE / mean(target).
train_mae = mean_absolute_error(y_train, train_preds)
train_acc = 1 - train_mae / y_train.mean()

print("✅ Training Complete.")
print(f"📈 Training MAE: {train_mae:.2f}")
print(f"📈 Training Accuracy: {train_acc * 100:.2f}%")

# === Phase 2: Test the model ===
# Score the already-fitted forest on the held-out users.
test_preds = model.predict(X_test)

# Test metrics, using the same MAE-relative-to-mean "accuracy" as for training.
test_mae = mean_absolute_error(y_test, test_preds)
test_acc = 1 - test_mae / y_test.mean()

print("\n🎯 Final Evaluation on Test Set")
print(f"📊 Test MAE: {test_mae:.2f}")
print(f"📊 Test Accuracy: {test_acc * 100:.2f}%")

# === Optional: Print per-user prediction breakdown ===
# Walk predictions and targets in lockstep; users are numbered from 1 in the
# order the test rows were loaded.
print("\n🔍 Per-user predictions:")
for user_no, (predicted, actual) in enumerate(zip(test_preds, y_test), start=1):
    print(f"User {user_no:3}: Predicted = {predicted:9.2f} | Actual = {actual:9.2f} | Error = {abs(predicted - actual):7.2f}")

# === Optional: Baseline model ===
# The baseline "prediction" is the precomputed 'projected_baseline' column,
# scored with the same MAE and MAE-relative-to-mean accuracy as the models.
baseline_preds = X_test_full.loc[:, 'projected_baseline']
baseline_mae = mean_absolute_error(y_test, baseline_preds)
baseline_acc = 1 - baseline_mae / y_test.mean()
print(f"\n📉 Baseline MAE: {baseline_mae:.2f}, Accuracy: {baseline_acc * 100:.2f}%")

# === Optional: Linear Regression model ===
# Fit ordinary least squares on the same inputs as the forest for comparison.
lr = LinearRegression().fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_mae = mean_absolute_error(y_test, lr_preds)
lr_acc = 1 - lr_mae / y_test.mean()
print(f"🔎 Linear Regression MAE: {lr_mae:.2f}, Accuracy: {lr_acc * 100:.2f}%")
