In [8]:
import os
import json
import random

users_dir = 'users'
valid_categories = {"Housing & Utilities", "Leisure", "Personal Care & Education"}

for user_folder in os.listdir(users_dir):
    user_path = os.path.join(users_dir, user_folder)
    if not os.path.isdir(user_path) or not user_folder.startswith('user_'):
        continue

    eligible_receipt_paths = []

    for filename in os.listdir(user_path):
        if not filename.endswith('.json') or filename == 'user_total.json':
            continue

        filepath = os.path.join(user_path, filename)

        try:
            with open(filepath, 'r') as f:
                content = json.load(f)

            if (
                isinstance(content, list) and
                len(content) == 1 and
                content[0].get("receipt_type") in valid_categories
            ):
                eligible_receipt_paths.append(filepath)

        except Exception as e:
            print(f"Error reading {filepath}: {e}")

    if len(eligible_receipt_paths) < 2:
        print(f"{user_folder}: only {len(eligible_receipt_paths)} eligible receipts. Skipping.")
        continue

    num_to_mark = min(len(eligible_receipt_paths), random.randint(2, 3))
    selected_paths = random.sample(eligible_receipt_paths, num_to_mark)

    for path in selected_paths:
        try:
            with open(path, 'r') as f:
                content = json.load(f)

            content[0]['recurring'] = True  # Since each file contains a list of 1 receipt

            with open(path, 'w') as f:
                json.dump(content, f, indent=4)

        except Exception as e:
            print(f"Error updating {path}: {e}")

    print(f"{user_folder}: marked {num_to_mark} receipt(s) as recurring.")


user_115: marked 2 receipt(s) as recurring.
user_112: marked 2 receipt(s) as recurring.
user_124: marked 3 receipt(s) as recurring.
user_123: marked 3 receipt(s) as recurring.
user_177: marked 2 receipt(s) as recurring.
user_183: marked 3 receipt(s) as recurring.
user_148: marked 3 receipt(s) as recurring.
user_5: marked 2 receipt(s) as recurring.
user_184: marked 2 receipt(s) as recurring.
user_170: marked 3 receipt(s) as recurring.
user_2: marked 3 receipt(s) as recurring.
user_146: marked 2 receipt(s) as recurring.
user_179: marked 2 receipt(s) as recurring.
user_141: marked 3 receipt(s) as recurring.
user_122: only 0 eligible receipts. Skipping.
user_125: only 0 eligible receipts. Skipping.
user_113: marked 2 receipt(s) as recurring.
user_114: marked 3 receipt(s) as recurring.
user_140: marked 2 receipt(s) as recurring.
user_147: marked 2 receipt(s) as recurring.
user_178: marked 2 receipt(s) as recurring.
user_171: marked 3 receipt(s) as recurring.
user_185: marked 3 receipt(s) as

In [16]:
import os
import json

users_dir = 'test'

for user_folder in os.listdir(users_dir):
    user_path = os.path.join(users_dir, user_folder)
    if not os.path.isdir(user_path) or not user_folder.startswith('user_'):
        continue

    recurring_total = 0.0

    # Iterate through all receipt JSON files (excluding user_total.json)
    for filename in os.listdir(user_path):
        if not filename.endswith('.json') or filename == 'user_total.json':
            continue

        filepath = os.path.join(user_path, filename)
        try:
            with open(filepath, 'r') as f:
                receipts = json.load(f)

            for receipt in receipts:
                total = receipt.get("total")
                if receipt.get("recurring", False) and isinstance(total, (int, float)):
                    recurring_total += total
                elif receipt.get("recurring", False):
                    print(f"Skipping receipt in {filename} (user {user_folder}) with invalid total: {total}")

        except Exception as e:
            print(f"Error reading {filename} in {user_folder}: {e}")
            continue

    # Update or create user_total.json
    user_total_path = os.path.join(user_path, 'user_total.json')
    if os.path.exists(user_total_path):
        with open(user_total_path, 'r') as f:
            data = json.load(f)
    else:
        data = {}

    data['recurring_total'] = round(recurring_total, 2)

    with open(user_total_path, 'w') as f:
        json.dump(data, f, indent=4)

    print(f"Updated {user_folder} with recurring_total: ${data['recurring_total']}")


Updated user_112 with recurring_total: $128.54
Updated user_124 with recurring_total: $207.17
Updated user_123 with recurring_total: $365.94
Updated user_183 with recurring_total: $265.14
Updated user_184 with recurring_total: $306.45
Updated user_140 with recurring_total: $66.73
Updated user_171 with recurring_total: $3121.32
Updated user_185 with recurring_total: $118.55
Updated user_96 with recurring_total: $2053.4
Updated user_62 with recurring_total: $1403.9
Updated user_14 with recurring_total: $964.35
Updated user_13 with recurring_total: $9193.35
Updated user_40 with recurring_total: $114.21
Updated user_15 with recurring_total: $237.35
Updated user_23 with recurring_total: $95.72
Updated user_77 with recurring_total: $5742.33
Updated user_131 with recurring_total: $35.9
Updated user_163 with recurring_total: $2735.87
Updated user_106 with recurring_total: $107.79
Updated user_137 with recurring_total: $623.1
Updated user_6 with recurring_total: $55.13
Updated user_142 with rec

In [11]:
import os
import shutil
import random

# Set your path to the 'users' directory
users_dir = 'users'
train_dir = 'train'
test_dir = 'test'

# Create train and test directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Get all user folders
user_folders = [f for f in os.listdir(users_dir) if os.path.isdir(os.path.join(users_dir, f))]

# Shuffle the folders for random split
random.shuffle(user_folders)

# Calculate split index
split_index = int(len(user_folders) * 0.8)

# Split into train and test
train_users = user_folders[:split_index]
test_users = user_folders[split_index:]

# Move the folders
for user in train_users:
    shutil.move(os.path.join(users_dir, user), os.path.join(train_dir, user))

for user in test_users:
    shutil.move(os.path.join(users_dir, user), os.path.join(test_dir, user))

print(f"Moved {len(train_users)} users to '{train_dir}/' and {len(test_users)} users to '{test_dir}/'.")


Moved 157 users to 'train/' and 40 users to 'test/'.


In [1]:
import os
import json

users_dir = 'test'

for user_folder in os.listdir(users_dir):
    user_path = os.path.join(users_dir, user_folder)
    if not os.path.isdir(user_path) or not user_folder.startswith('user_'):
        continue

    user_total_path = os.path.join(user_path, 'user_total.json')
    if not os.path.exists(user_total_path):
        print(f"Skipping {user_folder}: no user_total.json found.")
        continue

    try:
        with open(user_total_path, 'r') as f:
            data = json.load(f)

        total = data.get("total")
        date_range = data.get("date_range")
        recurring_total = data.get("recurring_total", 0.0)

        if not all(isinstance(x, (int, float)) for x in [total, date_range, recurring_total]) or date_range == 0:
            print(f"Skipping {user_folder}: invalid or missing values.")
            continue

        # Normalize
        non_recurring = total - recurring_total
        monthly_non_recurring = (non_recurring / date_range) * 30
        projected_monthly = round(monthly_non_recurring + recurring_total, 2)

        # Update JSON
        data["projected_monthly_spending"] = projected_monthly

        with open(user_total_path, 'w') as f:
            json.dump(data, f, indent=4)

        print(f"{user_folder}: projected_monthly_spending = ${projected_monthly}")

    except Exception as e:
        print(f"Error processing {user_folder}: {e}")

user_103: projected_monthly_spending = $19718.66
user_106: projected_monthly_spending = $100009.95
user_112: projected_monthly_spending = $9057.89
user_123: projected_monthly_spending = $182288.81
user_124: projected_monthly_spending = $1330369.39
user_126: projected_monthly_spending = $505325.36
user_13: projected_monthly_spending = $21776.09
user_131: projected_monthly_spending = $1092.5
user_134: projected_monthly_spending = $346.8
user_137: projected_monthly_spending = $5852.51
user_14: projected_monthly_spending = $1052.32
user_140: projected_monthly_spending = $1384.33
user_142: projected_monthly_spending = $95854.86
user_145: projected_monthly_spending = $16891.35
user_15: projected_monthly_spending = $3054.02
user_160: projected_monthly_spending = $64962.1
user_163: projected_monthly_spending = $501278.44
user_171: projected_monthly_spending = $7660.24
user_172: projected_monthly_spending = $33145.34
user_181: projected_monthly_spending = $87425.45
user_183: projected_monthly_s

In [17]:
import os
import json

test_dir = 'test'
recurring_totals_file = 'recurring_totals_test.json'
projected_log = {}

# Load recurring totals
with open(recurring_totals_file, 'r') as f:
    recurring_totals = json.load(f)

for user_folder in os.listdir(test_dir):
    user_path = os.path.join(test_dir, user_folder)
    if not os.path.isdir(user_path) or not user_folder.startswith('user_'):
        continue

    user_total_path = os.path.join(user_path, 'user_total.json')
    if not os.path.exists(user_total_path):
        continue

    try:
        with open(user_total_path, 'r') as f:
            data = json.load(f)

        total = data.get("total")
        date_range = data.get("date_range")
        recurring_total = recurring_totals.get(user_folder, 0.0)

        if not isinstance(total, (int, float)) or not isinstance(date_range, (int, float)) or date_range == 0:
            print(f"Skipping {user_folder}: invalid total or date_range.")
            continue

        non_recurring = total - recurring_total
        monthly_non_recurring = (non_recurring / date_range) * 30
        projected = round(monthly_non_recurring + recurring_total, 2)

        projected_log[user_folder] = projected
        print(f"{user_folder}: projected_monthly_spending = ${projected}")

    except Exception as e:
        print(f"Error processing {user_folder}: {e}")

# Save final results
with open("projected_spending_test.json", "w") as f:
    json.dump(projected_log, f, indent=4)

print(f"\nSaved projected monthly spendings for {len(projected_log)} users to 'projected_spending_test.json'")


user_112: projected_monthly_spending = $9057.89
user_124: projected_monthly_spending = $1330369.39
user_123: projected_monthly_spending = $182288.81
user_183: projected_monthly_spending = $95069.39
user_184: projected_monthly_spending = $38298.12
user_140: projected_monthly_spending = $1384.33
user_171: projected_monthly_spending = $7660.24
user_185: projected_monthly_spending = $16858.63
user_96: projected_monthly_spending = $24176.15
user_62: projected_monthly_spending = $22742.99
user_14: projected_monthly_spending = $1052.32
user_13: projected_monthly_spending = $21776.09
user_40: projected_monthly_spending = $626.07
user_15: projected_monthly_spending = $3054.02
user_23: projected_monthly_spending = $111600.76
user_77: projected_monthly_spending = $6145.59
user_131: projected_monthly_spending = $1092.5
user_163: projected_monthly_spending = $501278.44
user_106: projected_monthly_spending = $100009.95
user_137: projected_monthly_spending = $5852.51
user_6: projected_monthly_spendin

In [5]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Directories
train_dir = 'users'

def load_training_data(train_dir):
    train_data = []
    category_totals = {}

    for user_id in range(1, 198):  # Users are named user_1 to user_157
        user_folder = f'user_{user_id}'
        user_path = os.path.join(train_dir, user_folder)
        user_total_path = os.path.join(user_path, 'user_total.json')

        if not os.path.exists(user_total_path):
            continue

        try:
            with open(user_total_path, 'r') as f:
                user_data = json.load(f)

            total = user_data.get("total")
            date_range = user_data.get("date_range")
            recurring_total = user_data.get("recurring_total", 0.0)
            projected_spending = user_data.get("projected_monthly_spending")

            if not all(isinstance(x, (int, float)) for x in [total, date_range, recurring_total, projected_spending]) or date_range == 0:
                continue

            avg_daily_spending = total / date_range
            train_data.append([total, date_range, recurring_total, avg_daily_spending, projected_spending])

            # Process category breakdown from receipts
            for receipt_file in os.listdir(user_path):
                if receipt_file.endswith(".json") and receipt_file != "user_total.json":
                    receipt_path = os.path.join(user_path, receipt_file)
                    with open(receipt_path, 'r') as f:
                        receipts = json.load(f)
                    for receipt in receipts:
                        category = receipt.get("receipt_type", "Unknown")
                        category_total = receipt.get("total", 0.0)
                        if isinstance(category_total, (int, float)):
                            category_totals[category] = category_totals.get(category, 0) + category_total

        except Exception as e:
            print(f"Skipping {user_folder} due to error: {e}")

    return train_data, category_totals

# Load Data
train_data, category_totals = load_training_data(train_dir)
train_df = pd.DataFrame(train_data, columns=["total", "date_range", "recurring_total", "avg_daily_spending", "projected_spending"])

# Train-test split
X = train_df.drop(columns=["projected_spending"])
y = train_df["projected_spending"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)
for epoch in range(1, 11):  # Simulating 10 epochs
    model.fit(X_train, y_train)
    print(f"Epoch {epoch} completed")

# Validate model
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Validation MAE: {mae:.2f}")
print(f"Validation R² Score: {r2:.2f}")

# Normalize category totals for proportional distribution
total_category_spending = sum(category_totals.values())
category_distribution = {cat: val / total_category_spending for cat, val in category_totals.items()}

# Save category distribution
with open("category_distribution.json", "w") as f:
    json.dump(category_distribution, f, indent=4)

print("Training complete. Category distribution saved.")


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
Epoch 6 completed
Epoch 7 completed
Epoch 8 completed
Epoch 9 completed
Epoch 10 completed
Validation MAE: 29429.13
Validation R² Score: 0.91
Training complete. Category distribution saved.
