# Needs vs. Wants Transaction Classifier

This notebook demonstrates a machine learning model that classifies financial transactions as either 'needs' or 'wants'.

In [3]:
# Import necessary libraries
import datetime
import random
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import ipywidgets as widgets
from IPython.display import clear_output
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


## Generate Synthetic Transaction Data

Since we're not using the Plaid API, we'll create realistic synthetic transaction data for our model.

In [4]:
# Function to create synthetic transaction data
def generate_synthetic_transactions(num_transactions=500, seed=None, end_date=None):
    """Generate a DataFrame of random, pre-labelled transactions.

    Each transaction is drawn by picking need vs. want (60% / 40%), then a
    category from the matching list, then a merchant name and an amount from
    that category's lookup tables, and finally a date within the 365 days
    before ``end_date``.

    Args:
        num_transactions: Number of rows to generate. ``0`` yields an empty
            frame that still has all five columns.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global ``random`` state
            is left untouched. When ``None`` the global ``random`` module is
            used, exactly as before.
        end_date: Optional ``datetime.date`` that transaction dates are
            counted back from. Defaults to today's date.

    Returns:
        pandas.DataFrame with columns ``date``, ``name``, ``amount``,
        ``category`` and ``need_want``, sorted by ``date`` descending.
    """
    # Transaction categories
    need_categories = [
        'Food and Drink > Groceries',
        'Housing > Rent',
        'Housing > Mortgage',
        'Transfer > Deposit',
        'Payment > Credit Card',
        'Travel > Public Transportation',
        'Healthcare > Pharmacy',
        'Service > Utilities',
        'Service > Phone',
        'Service > Internet'
    ]

    want_categories = [
        'Food and Drink > Restaurants',
        'Food and Drink > Coffee Shop',
        'Shopping > Clothing',
        'Shopping > Electronics',
        'Travel > Vacation',
        'Travel > Rideshare',
        'Recreation > Gym',
        'Recreation > Entertainment',
        'Personal Care > Spa',
        'Food and Drink > Alcohol & Bars'
    ]

    # Merchant names for each category
    merchants = {
        'Food and Drink > Groceries': ['Kroger', 'Safeway', 'Whole Foods', 'Trader Joe\'s', 'Aldi', 'Publix'],
        'Housing > Rent': ['RENT PAYMENT', 'APARTMENT MANAGEMENT', 'PROPERTY MGMT'],
        'Housing > Mortgage': ['MORTGAGE PAYMENT', 'HOME LOAN', 'BANK MORTGAGE'],
        'Transfer > Deposit': ['DIRECT DEPOSIT', 'ACH TRANSFER', 'BANK TRANSFER'],
        'Payment > Credit Card': ['CREDIT CARD PAYMENT', 'CARD SERVICES', 'AMEX PAYMENT', 'CHASE CARD'],
        'Travel > Public Transportation': ['METRO', 'SUBWAY', 'BUS SERVICE', 'TRAIN TICKET', 'TRANSIT AUTHORITY'],
        'Healthcare > Pharmacy': ['CVS', 'Walgreens', 'Rite Aid', 'PHARMACY', 'PRESCRIPTION'],
        'Service > Utilities': ['ELECTRIC COMPANY', 'WATER UTILITY', 'GAS BILL', 'UTILITY PAYMENT'],
        'Service > Phone': ['VERIZON', 'AT&T', 'T-MOBILE', 'SPRINT', 'PHONE BILL'],
        'Service > Internet': ['COMCAST', 'SPECTRUM', 'INTERNET SERVICE', 'WIFI PROVIDER', 'BROADBAND'],
        'Food and Drink > Restaurants': ['McDonald\'s', 'Chipotle', 'Olive Garden', 'Cheesecake Factory', 'Chili\'s', 'Restaurant'],
        'Food and Drink > Coffee Shop': ['Starbucks', 'Dunkin', 'Peet\'s Coffee', 'Coffee Shop', 'Cafe'],
        'Shopping > Clothing': ['H&M', 'Zara', 'Nike', 'Gap', 'Old Navy', 'Macy\'s', 'CLOTHING STORE'],
        'Shopping > Electronics': ['Best Buy', 'Apple Store', 'Amazon', 'Samsung Store', 'Electronics'],
        'Travel > Vacation': ['AIRLINE TICKET', 'HOTEL STAY', 'Airbnb', 'VRBO', 'TRAVEL AGENCY', 'BOOKING.COM'],
        'Travel > Rideshare': ['Uber', 'Lyft', 'TAXI', 'CAB SERVICE'],
        'Recreation > Gym': ['LA Fitness', 'Planet Fitness', 'GYM MEMBERSHIP', 'FITNESS CENTER', '24 Hour Fitness'],
        'Recreation > Entertainment': ['MOVIE THEATER', 'Concert Tickets', 'THEME PARK', 'Netflix', 'Spotify', 'HBO'],
        'Personal Care > Spa': ['MASSAGE', 'SPA SERVICE', 'HAIR SALON', 'NAIL SALON', 'BARBERSHOP'],
        'Food and Drink > Alcohol & Bars': ['BAR', 'LIQUOR STORE', 'Wine Shop', 'Brewery', 'NIGHTCLUB']
    }

    # (min, max) amount range for each category
    amount_ranges = {
        'Food and Drink > Groceries': (30, 200),
        'Housing > Rent': (800, 2500),
        'Housing > Mortgage': (1000, 3000),
        'Transfer > Deposit': (500, 3000),
        'Payment > Credit Card': (200, 1500),
        'Travel > Public Transportation': (2, 15),
        'Healthcare > Pharmacy': (10, 100),
        'Service > Utilities': (50, 200),
        'Service > Phone': (50, 150),
        'Service > Internet': (50, 120),
        'Food and Drink > Restaurants': (15, 100),
        'Food and Drink > Coffee Shop': (3, 15),
        'Shopping > Clothing': (20, 200),
        'Shopping > Electronics': (50, 1000),
        'Travel > Vacation': (200, 2000),
        'Travel > Rideshare': (10, 50),
        'Recreation > Gym': (20, 100),
        'Recreation > Entertainment': (10, 150),
        'Personal Care > Spa': (30, 200),
        'Food and Drink > Alcohol & Bars': (20, 100)
    }

    # Label every category explicitly as a need or a want
    need_want_map = {category: 'need' for category in need_categories}
    need_want_map.update({category: 'want' for category in want_categories})

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; with no seed we fall back to the module-level
    # functions, which is what this function has always used.
    rng = random if seed is None else random.Random(seed)

    if end_date is None:
        end_date = datetime.datetime.now().date()

    columns = ['date', 'name', 'amount', 'category', 'need_want']
    transactions = []

    # The order of the random draws below is kept stable on purpose so that a
    # given random state always produces the same transactions.
    for _ in range(num_transactions):
        # Roughly 60% needs, 40% wants
        is_need = rng.random() < 0.6
        category = rng.choice(need_categories if is_need else want_categories)

        # Merchant name and amount come from the chosen category's tables
        name = rng.choice(merchants[category])
        min_amount, max_amount = amount_ranges[category]
        amount = round(rng.uniform(min_amount, max_amount), 2)

        # triangular(low, high, mode): dates span the 365 days before
        # end_date, with the density peaking about 30 days back and tapering
        # off towards a year ago.
        days_ago = int(rng.triangular(0, 365, 30))
        date = end_date - datetime.timedelta(days=days_ago)

        transactions.append({
            'date': date,
            'name': name,
            'amount': amount,
            'category': category,
            'need_want': need_want_map[category]
        })

    # Passing the column list keeps the schema intact when no rows were
    # generated, so the sort below cannot fail on a missing 'date' column.
    df = pd.DataFrame(transactions, columns=columns)
    df = df.sort_values('date', ascending=False)

    return df

# Seed Python's random module so the same merchants, categories and amounts
# are drawn on every Restart & Run All (dates are still counted back from the
# day the cell is run).
random.seed(42)

# Generate synthetic transaction data
df = generate_synthetic_transactions(500)

# Display the first few rows
print(f"Generated {len(df)} synthetic transactions")
df.head()

Generated 500 synthetic transactions


Unnamed: 0,date,name,amount,category,need_want
359,2025-04-04,RENT PAYMENT,2400.18,Housing > Rent,need
104,2025-04-03,CREDIT CARD PAYMENT,1464.14,Payment > Credit Card,need
95,2025-04-02,APARTMENT MANAGEMENT,1774.5,Housing > Rent,need
128,2025-04-01,BANK TRANSFER,1724.91,Transfer > Deposit,need
4,2025-03-30,Cheesecake Factory,63.53,Food and Drink > Restaurants,want


## Education Category Override

Let's ensure education-related transactions are always classified as needs by applying a keyword-based override on top of the model's prediction.

In [None]:
# Update the prediction function to ensure education expenses are always classified as needs
def predict_single_transaction_with_education_override(transaction_data, education_terms=None):
    """Predict need/want, forcing education-related 'want' results to 'need'.

    The regular prediction comes from ``predict_single_transaction`` (defined
    elsewhere in the notebook). This wrapper reads ``result['prediction']``
    and ``result['confidence']`` from it and, when overriding, writes a flag
    into ``result['features']``.

    Args:
        transaction_data: Mapping describing the transaction. Only the
            optional ``'name'`` and ``'category'`` entries are inspected here;
            missing, ``None`` or non-string values are tolerated.
        education_terms: Optional iterable of keywords that mark a transaction
            as education-related. Defaults to the built-in list below.

    Returns:
        The result of ``predict_single_transaction`` with ``override_applied``
        added. When the override fires, ``prediction`` becomes ``'need'``,
        ``confidence`` becomes ``100.0`` and the model's own answer is kept
        under ``original_prediction`` / ``original_confidence``.
    """
    # First get the regular prediction
    result = predict_single_transaction(transaction_data)

    if education_terms is None:
        education_terms = [
            'education', 'tuition', 'university', 'college', 'school', 'textbook', 'student',
            'course', 'class', 'degree', 'academic', 'campus', 'professor', 'lecture',
            'scholarship', 'financial aid'
        ]
    # Compare in lower case and drop empty terms, since '' is a substring of
    # every string and would turn every transaction into an education expense.
    terms = [str(term).lower() for term in education_terms if term]

    # `or ''` covers keys that are present but None; str() covers non-string values
    name = str(transaction_data.get('name') or '').lower()
    category = str(transaction_data.get('category') or '').lower()

    # NOTE(review): this is plain substring matching, so short terms can match
    # inside unrelated words (e.g. 'class' in 'classic') - confirm that is acceptable.
    is_education = any(term in name or term in category for term in terms)

    # If it's education-related but was classified as a want, override to need
    if is_education and result['prediction'] == 'want':
        # Keep the model's own answer before overwriting it
        result['original_prediction'] = result['prediction']
        result['original_confidence'] = result['confidence']

        result['prediction'] = 'need'
        result['confidence'] = 100.0  # Maximum confidence for override
        result['override_applied'] = True

        # Record the override alongside the model features, creating the
        # container only when the base prediction did not provide one.
        if result.get('features') is None:
            result['features'] = {}
        result['features']['education_override'] = 1
    else:
        result['override_applied'] = False

    return result

# Try the override on a handful of education-style transactions,
# listed as (name, category, amount)
education_test_cases = [
    {"name": case_name, "category": case_category, "amount": case_amount}
    for case_name, case_category, case_amount in (
        ("UNIVERSITY TUITION", "Education > Tuition", 3000.00),
        ("COLLEGE BOOKSTORE", "Education > Books", 250.00),
        ("STUDENT LOANS", "Payment > Education", 300.00),
        ("SCHOOL SUPPLIES", "Education > Supplies", 75.00),
        ("ONLINE COURSE", "Education > Online Learning", 49.99),
    )
]

print("Testing education-related transactions with override:")
for tx in education_test_cases:
    # Run the same transaction through both paths: plain model first, then the override wrapper
    original_result = predict_single_transaction(tx)
    override_result = predict_single_transaction_with_education_override(tx)

    # Side-by-side report of the two outcomes
    print(f"\nTransaction: {tx['name']} - ${tx['amount']}")
    print(f"  Original: {original_result['prediction'].upper()} with {original_result['confidence']:.1f}% confidence")
    print(f"  With override: {override_result['prediction'].upper()} with {override_result['confidence']:.1f}% confidence")

    # Call out the cases where the wrapper changed the model's answer
    if override_result['override_applied']:
        print("  → OVERRIDE APPLIED: Education expense automatically classified as NEED")

In [None]:
# Update save function to include the education override functionality
def save_models_with_education_override(output_dir='.', education_terms=None):
    """Write the model, its supporting objects and the education-override files.

    Reads the notebook globals ``rf_model``, ``label_encoder``, ``X``,
    ``need_indicators`` and ``want_indicators`` (all defined in other cells)
    and dumps each one with ``joblib``. Two further files are written: the
    education keyword list and a ``True`` flag marking that the override
    should be used.

    Args:
        output_dir: Directory that receives the ``.pkl`` files; created if it
            does not exist. Defaults to the current working directory, which
            is where the files were always written.
        education_terms: Optional iterable of education keywords to persist.
            Defaults to the built-in list below.
    """
    if education_terms is None:
        education_terms = [
            'education', 'tuition', 'university', 'college', 'school', 'textbook', 'student',
            'course', 'class', 'degree', 'academic', 'campus', 'professor', 'lecture',
            'scholarship', 'financial aid'
        ]

    # Resolve every object before writing anything, so a missing notebook
    # global fails up front instead of leaving a partial set of files behind.
    artifacts = {
        'rf_model.pkl': rf_model,
        'label_encoder.pkl': label_encoder,
        'model_columns.pkl': X.columns.tolist(),
        'need_indicators.pkl': need_indicators,
        'want_indicators.pkl': want_indicators,
        # Keyword list used to detect education expenses
        'education_terms.pkl': list(education_terms),
        # Flag file indicating that the education override should be used
        'use_education_override.pkl': True,
    }

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for filename, artifact in artifacts.items():
        joblib.dump(artifact, str(target_dir / filename))

    print("Model and supporting files saved successfully with education override!")
    
# Run the save now: with no arguments the .pkl files are written using
# relative file names, i.e. into the current working directory
save_models_with_education_override()

## Feature Analysis and Visualization

Let's examine all the features used in the model and how they influence the classification.

In [None]:
# Comprehensive analysis of all features
def analyze_all_features(top_n=20):
    """Summarise the model's features by group and plot their importance.

    Reads the notebook globals ``X`` (for its ``columns``), ``rf_model`` (for
    ``feature_importances_``), ``need_indicators`` and ``want_indicators``,
    all defined in other cells. Prints a per-group feature listing, draws a
    pie chart of summed importance per group and a horizontal bar chart of
    the most important individual features.

    Args:
        top_n: How many individual features to show in the bar chart and to
            return under ``'top_features'``. Defaults to 20.

    Returns:
        dict with keys ``'all_features'`` (list of column names),
        ``'feature_groups'`` (group name -> list of features),
        ``'feature_importance'`` (feature -> importance) and
        ``'top_features'`` (list of ``(feature, importance)`` pairs, most
        important first).
    """
    # Get all features used in the model
    all_features = X.columns.tolist()
    print(f"Total number of features: {len(all_features)}")

    # Group features by type. Groups are independent name filters: a feature
    # can land in several groups, or in none of them.
    feature_groups = {
        'Amount-based': [f for f in all_features if any(x in f for x in ['amount', 'small', 'medium', 'large'])],
        'Category-based': [f for f in all_features if 'category' in f],
        'Time-based': [f for f in all_features if any(x in f for x in ['weekend', 'month', 'recurring'])],
        'Need indicators': [f for f in all_features if f.startswith('name_has_') and
                            f.replace('name_has_', '').replace('_', ' ') in need_indicators],
        'Want indicators': [f for f in all_features if f.startswith('name_has_') and
                            f.replace('name_has_', '').replace('_', ' ') in want_indicators]
    }

    # List the features in each group, truncating long groups to the first five
    for group, features in feature_groups.items():
        print(f"\n{group}: {len(features)} features")
        if len(features) <= 10:
            for f in features:
                print(f"  - {f}")
        else:
            for f in features[:5]:
                print(f"  - {f}")
            print(f"  - ... and {len(features)-5} more")

    # Importance of each individual feature, keyed by column name
    feature_importance = dict(zip(all_features, rf_model.feature_importances_))

    # Total importance per group
    group_importance = {
        group: sum(feature_importance[f] for f in features)
        for group, features in feature_groups.items()
    }

    # Pie chart of importance by feature group. The shares are relative to the
    # grouped features only: features matching no group are left out, and a
    # feature matching several groups is counted in each of them.
    # Pass real lists, not dict views: NumPy cannot turn a dict_values object
    # into the float array that pie() builds from its first argument.
    group_names = list(group_importance.keys())
    group_totals = [float(total) for total in group_importance.values()]
    if sum(group_totals) > 0:
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.pie(group_totals, labels=group_names,
               autopct='%1.1f%%', startangle=90, shadow=True)
        ax.set_title('Feature Importance by Feature Group', fontsize=16)
        ax.axis('equal')  # Equal aspect ratio ensures pie is drawn as a circle
        fig.tight_layout()
        plt.show()
    else:
        # A pie cannot be normalised when every slice is zero
        print("\nNo grouped feature has non-zero importance; skipping the pie chart.")

    # Top individual features by importance
    sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    top_features = sorted_features[:top_n]

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh([name for name, _ in top_features], [score for _, score in top_features])
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title(f'Top {top_n} Features by Importance', fontsize=16)
    ax.invert_yaxis()  # Invert to show most important at top
    fig.tight_layout()
    plt.show()

    # Return detailed feature data for further analysis
    return {
        'all_features': all_features,
        'feature_groups': feature_groups,
        'feature_importance': feature_importance,
        'top_features': top_features
    }

# Run the analysis; this prints the group summary and draws both charts
feature_data = analyze_all_features()

# One row per feature, labelled with the first group that lists it
# ('Other' when no group does)
feature_importance_df = pd.DataFrame(
    [
        {
            'Feature': feat_name,
            'Importance': feat_score,
            'Group': next(
                (
                    group_name
                    for group_name, members in feature_data['feature_groups'].items()
                    if feat_name in members
                ),
                'Other',
            ),
        }
        for feat_name, feat_score in feature_data['feature_importance'].items()
    ]
)
# Most important features first
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

print("\nDetailed feature importance table:")
display(feature_importance_df.head(30))