# Needs vs. Wants Transaction Classifier

This notebook demonstrates a machine learning model that classifies financial transactions as either 'needs' or 'wants'.

In [1]:
# Import necessary libraries
import datetime
import random
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


## Generate Synthetic Transaction Data

Since we're not using the Plaid API, we'll create realistic synthetic transaction data for our model.

In [None]:
# Function to create synthetic transaction data
def generate_synthetic_transactions(num_transactions=500, seed=None):
    """Build a DataFrame of synthetic, pre-labelled financial transactions.

    Each row is drawn by first choosing 'need' (probability 0.6) or 'want',
    then a category of that kind, a merchant name listed for the category,
    a uniformly distributed amount inside the category's range, and a date
    within the 365 days before today (triangular distribution, mode 30 days
    ago).

    Parameters
    ----------
    num_transactions : int, default 500
        Number of rows to generate. Zero (or a negative value) yields an
        empty frame that still has all five columns.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the same
        seed reproduces the same rows on the same day. When None, the
        module-level ``random`` generator is used, so a prior
        ``random.seed(...)`` call by the caller is still honoured.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime.date), ``name``, ``amount``, ``category``
        and ``need_want`` ('need' or 'want'), sorted by date, newest first.
    """
    # Transaction categories
    need_categories = [
        'Food and Drink > Groceries',
        'Housing > Rent',
        'Housing > Mortgage',
        'Housing > Utilities',
        'Transfer > Deposit',
        'Payment > Credit Card',
        'Payment > Loan',
        'Travel > Public Transportation',
        'Healthcare > Medical',
        'Healthcare > Pharmacy',
        'Healthcare > Insurance',
        'Service > Utilities',
        'Service > Phone',
        'Service > Internet',
        'Service > Subscription',
        'Education > Tuition',
        'Education > Books'
    ]

    want_categories = [
        'Food and Drink > Restaurants',
        'Food and Drink > Coffee Shop',
        'Food and Drink > Alcohol & Bars',
        'Shopping > Clothing',
        'Shopping > Electronics',
        'Shopping > Home',
        'Shopping > Gifts',
        'Travel > Vacation',
        'Travel > Rideshare',
        'Travel > Hotel',
        'Travel > Air Travel',
        'Recreation > Gym',
        'Recreation > Entertainment',
        'Recreation > Sports',
        'Recreation > Hobbies',
        'Personal Care > Spa',
        'Personal Care > Beauty'
    ]

    # Merchant names for each category (expanded)
    merchants = {
        # Need Categories
        'Food and Drink > Groceries': ['Kroger', 'Safeway', 'Whole Foods', 'Trader Joe\'s', 'Aldi', 'Publix', 'Walmart Grocery', 'Target Grocery', 'Food Lion', 'Giant Eagle', 'Meijer'],
        'Housing > Rent': ['RENT PAYMENT', 'APARTMENT MANAGEMENT', 'PROPERTY MGMT', 'LANDLORD', 'LEASING OFFICE', 'REAL ESTATE MGMT'],
        'Housing > Mortgage': ['MORTGAGE PAYMENT', 'HOME LOAN', 'BANK MORTGAGE', 'LOAN SERVICES', 'WELLS FARGO MORTGAGE', 'CHASE MORTGAGE'],
        'Housing > Utilities': ['ELECTRIC BILL', 'WATER COMPANY', 'GAS SERVICES', 'UTILITY PROVIDER', 'ENERGY COMPANY', 'LOCAL UTILITIES'],
        'Transfer > Deposit': ['DIRECT DEPOSIT', 'ACH TRANSFER', 'BANK TRANSFER', 'PAYROLL DEPOSIT', 'ELECTRONIC DEPOSIT', 'ZELLE TRANSFER'],
        'Payment > Credit Card': ['CREDIT CARD PAYMENT', 'CARD SERVICES', 'AMEX PAYMENT', 'CHASE CARD', 'DISCOVER PAYMENT', 'CITI CARD SERVICE'],
        'Payment > Loan': ['LOAN PAYMENT', 'STUDENT LOAN', 'CAR PAYMENT', 'AUTO LOAN', 'PERSONAL LOAN', 'LOAN SERVICES'],
        'Travel > Public Transportation': ['METRO', 'SUBWAY', 'BUS SERVICE', 'TRAIN TICKET', 'TRANSIT AUTHORITY', 'PUBLIC TRANSIT', 'LIGHT RAIL'],
        'Healthcare > Medical': ['DOCTOR OFFICE', 'MEDICAL CENTER', 'HOSPITAL BILL', 'CLINIC PAYMENT', 'MEDICAL SERVICE', 'HEALTH CENTER'],
        'Healthcare > Pharmacy': ['CVS', 'Walgreens', 'Rite Aid', 'PHARMACY', 'PRESCRIPTION', 'DRUG STORE', 'MEDICINE SHOPPE'],
        'Healthcare > Insurance': ['HEALTH INSURANCE', 'MEDICAL INSURANCE', 'INSURANCE PREMIUM', 'DENTAL INSURANCE', 'VISION INSURANCE', 'INSURANCE PROVIDER'],
        'Service > Utilities': ['ELECTRIC COMPANY', 'WATER UTILITY', 'GAS BILL', 'UTILITY PAYMENT', 'ENERGY PROVIDER', 'POWER COMPANY'],
        'Service > Phone': ['VERIZON', 'AT&T', 'T-MOBILE', 'SPRINT', 'PHONE BILL', 'WIRELESS SERVICE', 'MOBILE PROVIDER'],
        'Service > Internet': ['COMCAST', 'SPECTRUM', 'INTERNET SERVICE', 'WIFI PROVIDER', 'BROADBAND', 'XFINITY', 'COX INTERNET'],
        'Service > Subscription': ['ESSENTIAL SUBSCRIPTION', 'SECURITY SERVICE', 'CLOUD STORAGE', 'ANTIVIRUS', 'SOFTWARE LICENSE', 'WORK TOOLS'],
        'Education > Tuition': ['UNIVERSITY PAYMENT', 'COLLEGE TUITION', 'SCHOOL PAYMENT', 'EDUCATION FEE', 'TUITION PAYMENT', 'ACADEMIC SERVICES'],
        'Education > Books': ['TEXTBOOKS', 'SCHOOL BOOKS', 'ACADEMIC MATERIALS', 'EDUCATION SUPPLIES', 'LEARNING MATERIALS', 'STUDENT BOOKS'],

        # Want Categories
        'Food and Drink > Restaurants': ['McDonald\'s', 'Chipotle', 'Olive Garden', 'Cheesecake Factory', 'Chili\'s', 'Restaurant', 'Applebee\'s', 'Outback', 'IHOP', 'Burger King', 'Wendy\'s', 'Taco Bell', 'Pizza Hut', 'Subway'],
        'Food and Drink > Coffee Shop': ['Starbucks', 'Dunkin', 'Peet\'s Coffee', 'Coffee Shop', 'Cafe', 'Dutch Bros', 'Coffee Bean', 'Local Cafe', 'Costa Coffee'],
        'Food and Drink > Alcohol & Bars': ['BAR', 'LIQUOR STORE', 'Wine Shop', 'Brewery', 'NIGHTCLUB', 'PUB', 'COCKTAIL BAR', 'WINE BAR', 'DISTILLERY'],
        'Shopping > Clothing': ['H&M', 'Zara', 'Nike', 'Gap', 'Old Navy', 'Macy\'s', 'CLOTHING STORE', 'Forever 21', 'Nordstrom', 'TJ Maxx', 'Ross'],
        'Shopping > Electronics': ['Best Buy', 'Apple Store', 'Amazon', 'Samsung Store', 'Electronics', 'GameStop', 'Micro Center', 'Newegg'],
        'Shopping > Home': ['IKEA', 'Bed Bath & Beyond', 'HomeGoods', 'FURNITURE STORE', 'HOME DECOR', 'Pottery Barn', 'Crate & Barrel', 'At Home'],
        'Shopping > Gifts': ['GIFT SHOP', 'Hallmark', 'Gift Purchase', 'NOVELTY STORE', 'SOUVENIR SHOP', 'CARD STORE', 'SPECIALTY GIFTS'],
        'Travel > Vacation': ['AIRLINE TICKET', 'HOTEL STAY', 'Airbnb', 'VRBO', 'TRAVEL AGENCY', 'BOOKING.COM', 'EXPEDIA', 'CRUISE LINE', 'RESORT'],
        'Travel > Rideshare': ['Uber', 'Lyft', 'TAXI', 'CAB SERVICE', 'RIDESHARE APP', 'RIDE SERVICE', 'TRANSPORT APP'],
        'Travel > Hotel': ['MARRIOTT', 'HILTON', 'HYATT', 'HOTEL BOOKING', 'LODGING', 'ACCOMMODATION', 'INN', 'MOTEL'],
        'Travel > Air Travel': ['DELTA', 'UNITED', 'AMERICAN AIRLINES', 'SOUTHWEST', 'FLIGHT BOOKING', 'AIRPORT PURCHASE', 'JETBLUE'],
        'Recreation > Gym': ['LA Fitness', 'Planet Fitness', 'GYM MEMBERSHIP', 'FITNESS CENTER', '24 Hour Fitness', 'EQUINOX', 'CRUNCH FITNESS', 'GOLD\'S GYM'],
        'Recreation > Entertainment': ['MOVIE THEATER', 'Concert Tickets', 'THEME PARK', 'Netflix', 'Spotify', 'HBO', 'Disney+', 'HULU', 'AMC THEATERS', 'EVENT TICKETS'],
        'Recreation > Sports': ['SPORTING EVENT', 'TICKET MASTER', 'STADIUM', 'ARENA', 'SPORTS TICKETS', 'GOLF COURSE', 'BOWLING'],
        'Recreation > Hobbies': ['HOBBY SHOP', 'CRAFT STORE', 'HOBBY LOBBY', 'MICHAELS', 'GUITAR CENTER', 'ART SUPPLIES', 'GAME STORE'],
        'Personal Care > Spa': ['MASSAGE', 'SPA SERVICE', 'HAIR SALON', 'NAIL SALON', 'BARBERSHOP', 'BEAUTY SPA', 'DAY SPA'],
        'Personal Care > Beauty': ['SEPHORA', 'ULTA', 'BEAUTY STORE', 'COSMETICS', 'MAKEUP', 'BEAUTY PRODUCTS', 'LUSH']
    }

    # Amount ranges (min, max) for each category
    amount_ranges = {
        # Need categories
        'Food and Drink > Groceries': (30, 200),
        'Housing > Rent': (800, 2500),
        'Housing > Mortgage': (1000, 3000),
        'Housing > Utilities': (80, 300),
        'Transfer > Deposit': (500, 3000),
        'Payment > Credit Card': (200, 1500),
        'Payment > Loan': (200, 1000),
        'Travel > Public Transportation': (2, 15),
        'Healthcare > Medical': (20, 300),
        'Healthcare > Pharmacy': (10, 100),
        'Healthcare > Insurance': (100, 500),
        'Service > Utilities': (50, 200),
        'Service > Phone': (50, 150),
        'Service > Internet': (50, 120),
        'Service > Subscription': (5, 50),
        'Education > Tuition': (500, 3000),
        'Education > Books': (50, 300),

        # Want categories
        'Food and Drink > Restaurants': (15, 100),
        'Food and Drink > Coffee Shop': (3, 15),
        'Food and Drink > Alcohol & Bars': (20, 100),
        'Shopping > Clothing': (20, 200),
        'Shopping > Electronics': (50, 1000),
        'Shopping > Home': (30, 500),
        'Shopping > Gifts': (20, 150),
        'Travel > Vacation': (200, 2000),
        'Travel > Rideshare': (10, 50),
        'Travel > Hotel': (100, 500),
        'Travel > Air Travel': (200, 800),
        'Recreation > Gym': (20, 100),
        'Recreation > Entertainment': (10, 150),
        'Recreation > Sports': (30, 200),
        'Recreation > Hobbies': (20, 200),
        'Personal Care > Spa': (30, 200),
        'Personal Care > Beauty': (20, 150)
    }

    # A private generator when a seed is supplied; otherwise the shared
    # module-level generator (same draw order as before, so callers that
    # seed `random` themselves keep getting the same data).
    rng = random.Random(seed) if seed is not None else random

    # Dates are counted backwards from today
    end_date = datetime.datetime.now().date()

    # Ground-truth label for every category
    need_want_map = {category: 'need' for category in need_categories}
    need_want_map.update({category: 'want' for category in want_categories})

    # Generate transactions
    transactions = []
    for _ in range(num_transactions):
        # Roughly 60% needs, 40% wants
        is_need = rng.random() < 0.6
        category = rng.choice(need_categories if is_need else want_categories)

        # Merchant name and amount are both conditioned on the category
        name = rng.choice(merchants[category])
        min_amount, max_amount = amount_ranges[category]
        amount = round(rng.uniform(min_amount, max_amount), 2)

        # Triangular over [0, 365] days ago with the mode at 30 days ago
        days_ago = int(rng.triangular(0, 365, 30))
        date = end_date - datetime.timedelta(days=days_ago)

        transactions.append({
            'date': date,
            'name': name,
            'amount': amount,
            'category': category,
            'need_want': need_want_map[category]
        })

    # Passing the column list keeps the schema intact even when no rows were
    # generated, so the sort below cannot fail on a missing 'date' column.
    df = pd.DataFrame(transactions, columns=['date', 'name', 'amount', 'category', 'need_want'])
    df = df.sort_values('date', ascending=False)

    return df

# Build the 500-row demo dataset that the rest of the notebook works on
df = generate_synthetic_transactions(num_transactions=500)

# Report the row count, then preview the most recent transactions
print(f"Generated {len(df)} synthetic transactions")
df.head()

Generated 500 synthetic transactions


Unnamed: 0,date,name,amount,category,need_want
199,2025-03-27,WATER UTILITY,183.05,Service > Utilities,need
158,2025-03-26,FITNESS CENTER,95.75,Recreation > Gym,want
196,2025-03-26,BUS SERVICE,3.16,Travel > Public Transportation,need
421,2025-03-25,H&M,122.61,Shopping > Clothing,want
39,2025-03-24,GAS BILL,134.54,Service > Utilities,need


In [None]:
# Rule-based labelling of transactions as 'need' or 'want' from category and name.

# Keywords whose presence marks a transaction as a NEED.
_NEEDS_KEYWORDS = (
    # Housing
    'rent', 'mortgage', 'housing', 'hoa', 'apartment', 'property', 'landlord', 'lease',
    # Utilities
    'utilities', 'electric', 'power', 'gas', 'water', 'sewer', 'trash', 'utility', 'energy',
    # Bills & Services
    'bill', 'phone', 'mobile', 'internet', 'cable', 'wifi', 'broadband', 'fiber',
    # Essential Subscriptions
    'essential subscription', 'security', 'antivirus', 'cloud storage', 'password',
    # Food Necessities
    'grocery', 'groceries', 'supermarket', 'food lion', 'market', 'produce', 'farmer',
    # Health
    'pharmacy', 'medical', 'healthcare', 'doctor', 'hospital', 'clinic', 'dental', 'health',
    'medicine', 'prescription', 'drug store', 'urgent care', 'emergency',
    # Insurance
    'insurance', 'life insurance', 'health insurance', 'auto insurance', 'car insurance',
    'home insurance', 'renter', 'policy',
    # Transportation
    'gas', 'fuel', 'petrol', 'transit', 'transportation', 'bus', 'train', 'metro', 'subway',
    'commute', 'transport', 'light rail', 'toll', 'parking',
    # Education
    'education', 'tuition', 'school', 'university', 'college', 'student', 'book', 'textbook',
    'class', 'course', 'academic', 'educational',
    # Debt and Loans
    'loan', 'debt', 'credit card payment', 'student loan', 'finance charge', 'payment', 'transfer',
    'interest', 'principal', 'refinance', 'consolidation',
)

# Keywords whose presence marks a transaction as a WANT.
_WANTS_KEYWORDS = (
    # Dining & Drinks
    'restaurant', 'dining', 'bar', 'cafe', 'coffee', 'takeout', 'fast food', 'alcohol',
    'brewery', 'pub', 'bistro', 'eatery', 'grill', 'diner', 'steakhouse', 'pizzeria',
    'taco', 'burger', 'sushi', 'delivery', 'doordash', 'grubhub', 'ubereats', 'starbucks',
    # Entertainment
    'entertainment', 'movie', 'theater', 'cinema', 'concert', 'sport', 'game', 'show',
    'netflix', 'hulu', 'disney+', 'spotify', 'music', 'streaming', 'subscription', 'premium',
    'stadium', 'ticket', 'event', 'festival', 'amusement', 'museum', 'zoo', 'aquarium',
    # Shopping
    'shopping', 'clothing', 'shoes', 'accessories', 'electronics', 'gadget', 'fashion',
    'jewelry', 'watch', 'handbag', 'sunglasses', 'boutique', 'retail', 'mall', 'outlet',
    'department store', 'online shopping', 'amazon', 'ebay', 'etsy', 'luxury',
    # Recreation
    'hobby', 'recreation', 'gym', 'fitness', 'sport', 'golf', 'ski', 'tennis', 'yoga',
    'outdoor', 'camping', 'hiking', 'fishing', 'gaming', 'video game', 'playstation', 'xbox',
    'craft', 'art', 'pottery', 'photography',
    # Travel
    'travel', 'hotel', 'vacation', 'trip', 'flight', 'airline', 'resort', 'cruise', 'airbnb',
    'booking', 'tourism', 'tour', 'sightseeing', 'souvenir', 'beach', 'island', 'holiday',
    'rental car', 'airfare', 'international', 'passport', 'luggage',
    # Personal Care & Beauty
    'beauty', 'spa', 'salon', 'cosmetics', 'haircut', 'manicure', 'pedicure', 'massage',
    'facial', 'makeup', 'skincare', 'perfume', 'cologne', 'grooming', 'barbershop',
    'non-essential subscription', 'lifestyle', 'membership',
)


def _compile_keyword_pattern(keywords):
    """Compile one regex matching any keyword as a whole token (optional -s/-es)."""
    ordered = sorted(set(keywords), key=len, reverse=True)
    alternation = '|'.join(re.escape(keyword) for keyword in ordered)
    # Look-arounds are used instead of \b so keywords ending in a symbol
    # (e.g. 'disney+') still match at the end of the text.
    return re.compile(r'(?<![a-z0-9])(?:' + alternation + r')(?:e?s)?(?![a-z0-9])')


_NEEDS_PATTERN = _compile_keyword_pattern(_NEEDS_KEYWORDS)
_WANTS_PATTERN = _compile_keyword_pattern(_WANTS_KEYWORDS)


def categorize_need_want(category, name=None):
    """Label a transaction 'need' or 'want' from its category and merchant name.

    Matching is case-insensitive and token-based: a keyword only counts when
    it is not embedded in a longer alphanumeric run, optionally followed by a
    plural ``s``/``es``. This stops short keywords from firing inside
    unrelated words ('art' in "Walmart"/"APARTMENT", 'pub' in "Publix",
    'sport' in "Transportation").

    Precedence is unchanged: want keywords are checked first, so a text that
    contains both kinds (e.g. 'Travel > Public Transportation', where
    'travel' is a want keyword) is labelled 'want'. Categories containing
    'transfer' match the need keyword of the same name and are labelled
    'need'.

    Parameters
    ----------
    category : str or any
        Category text such as 'Food and Drink > Groceries'. Non-string
        values are treated as empty.
    name : str or any, optional
        Merchant / description text. Non-string values are treated as empty.

    Returns
    -------
    str
        'want' if any want keyword matches; otherwise 'need' if any need
        keyword matches; otherwise 'want' (default for unrecognized input).
    """
    # Lowercase once so the precompiled lowercase patterns apply
    category_lower = category.lower() if isinstance(category, str) else ''
    name_lower = name.lower() if isinstance(name, str) else ''
    texts = (category_lower, name_lower)

    # First check explicit wants (stronger indicator)
    if any(_WANTS_PATTERN.search(text) for text in texts):
        return 'want'

    # Then check for needs
    if any(_NEEDS_PATTERN.search(text) for text in texts):
        return 'need'

    # Unrecognized transactions default to 'want'
    return 'want'

# Smoke-test the rule-based labeller on a handful of hand-picked transactions
sample_transactions = [
    dict(name='STARBUCKS COFFEE', category='Food and Drink > Coffee Shop'),
    dict(name='KROGER', category='Food and Drink > Groceries'),
    dict(name='RENT PAYMENT', category='Housing > Rent'),
    dict(name='APPLE STORE', category='Shopping > Electronics'),
]

for tx in sample_transactions:
    # Unpack the record, classify it, and print one summary line
    name, category = tx['name'], tx['category']
    classification = categorize_need_want(category, name)
    print(f"{name} - {category}: Classified as {classification.upper()}")

STARBUCKS COFFEE - Food and Drink > Coffee Shop: Classified as WANT
KROGER - Food and Drink > Groceries: Classified as NEED
RENT PAYMENT - Housing > Rent: Classified as NEED
APPLE STORE - Shopping > Electronics: Classified as WANT


In [None]:
# Feature engineering for the model
#
# This cell adds engineered columns to the shared `df` in place and then
# selects them into `features_df`. It also defines three module-level names
# that later cells read: `need_indicators`, `want_indicators` (keyword lists)
# and `features_cols` (the feature column order).
# NOTE(review): numpy and the sklearn encoders imported below are already
# imported in the first cell; the re-import is redundant but harmless.
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Enhanced need and want indicators for better classification
# Each keyword becomes one binary `name_has_<keyword>` column further down.
# 'utility' and 'tuition' appear twice in this list; the second occurrence
# simply rewrites the same column.
need_indicators = [
    # Basic necessities
    'grocery', 'bill', 'utility', 'gas', 'rent', 'medical', 'insurance', 'health', 'pharmacy',
    'prescription', 'doctor', 'hospital', 'mortgage', 'housing', 'electric', 'water', 'education',
    'tuition', 'loan', 'payment', 'deposit', 'transfer', 'banking', 'withdrawal', 'atm', 'fee',
    # Transportation necessities
    'transit', 'bus', 'subway', 'metro', 'train', 'commute', 'toll', 'parking',
    # Communication necessities
    'phone', 'mobile', 'internet', 'service', 'broadband', 'utility',
    # Education necessities
    'school', 'college', 'university', 'books', 'textbook', 'tuition', 'student'
]

want_indicators = [
    # Food & Drink wants
    'restaurant', 'coffee', 'cafe', 'dining', 'takeout', 'delivery', 'bar', 'pub', 'brewery',
    'alcohol', 'wine', 'beer', 'liquor', 'fast food', 'starbucks', 'mcdonald', 'doordash',
    # Entertainment wants
    'entertainment', 'movie', 'theater', 'cinema', 'concert', 'show', 'ticket', 'netflix',
    'hulu', 'spotify', 'disney', 'subscription', 'streaming', 'game', 'gaming',
    # Shopping wants
    'shopping', 'clothing', 'apparel', 'shoes', 'electronics', 'accessory', 'gadget', 'amazon',
    'ebay', 'etsy', 'retail', 'store', 'mall', 'boutique', 'jewelry', 'gift',
    # Recreation wants
    'recreation', 'hobby', 'gym', 'fitness', 'sport', 'golf', 'yoga', 'workout',
    'outdoor', 'camping', 'fishing', 'hiking', 'craft', 'art',
    # Travel wants
    'travel', 'vacation', 'hotel', 'airbnb', 'flight', 'airline', 'trip', 'uber', 'lyft',
    'rideshare', 'rental', 'resort', 'cruise', 'booking', 'expedia', 'tour',
    # Personal care wants
    'beauty', 'spa', 'salon', 'cosmetics', 'makeup', 'skincare', 'massage', 'manicure',
    'pedicure', 'haircut', 'barbershop'
]

# 1. Process amount (likely our most important feature)
# Amount is often the strongest predictor of need vs want
df['amount_abs'] = df['amount'].abs()  # Use absolute value since expenses might be negative

# Create amount bins - larger purchases more likely to be wants
# The first three bins are mutually exclusive; `is_very_large_purchase` is a
# subset of `is_large_purchase` (both are 1 for amounts >= 500).
df['is_small_purchase'] = (df['amount_abs'] < 20).astype(int)  # Small purchases
df['is_medium_purchase'] = ((df['amount_abs'] >= 20) & (df['amount_abs'] < 100)).astype(int)  # Medium
df['is_large_purchase'] = (df['amount_abs'] >= 100).astype(int)  # Large purchases
df['is_very_large_purchase'] = (df['amount_abs'] >= 500).astype(int)  # Very large purchases

# 2. Extract features from transaction name
# Create keyword indicators for common needs/wants
# Matching is a case-insensitive substring test on the merchant name
# (`str.contains` treats the keyword as a regular expression by default).
for keyword in need_indicators:
    df[f'name_has_{keyword}'] = df['name'].str.lower().str.contains(keyword, na=False).astype(int)

for keyword in want_indicators:
    df[f'name_has_{keyword}'] = df['name'].str.lower().str.contains(keyword, na=False).astype(int)

# 3. Time-based features
df['date'] = pd.to_datetime(df['date'])
df['is_weekend'] = df['date'].dt.dayofweek.isin([5, 6]).astype(int)  # Weekend purchases more likely wants
df['day_of_month'] = df['date'].dt.day
df['is_end_of_month'] = (df['day_of_month'] > 25).astype(int)  # End of month often bills/needs
df['is_beginning_of_month'] = (df['day_of_month'] < 5).astype(int)  # Beginning of month often bills/needs
df['month'] = df['date'].dt.month
# NOTE(review): the synthetic generator above produces calendar dates with no
# time of day, so the hour is 0 after `pd.to_datetime` and this flag is always
# 0 for that data. It only carries signal if `date` includes a time component.
df['is_weekend_night'] = (df['is_weekend'] & (df['date'].dt.hour >= 20)).astype(int)  # Weekend night purchases

# 4. Category-based features
# Fix: Ensure all categories are strings
# (list-valued categories are flattened into "A > B" strings)
df['category'] = df['category'].apply(lambda x: " > ".join(x) if isinstance(x, list) else x)

# Create category groups based on need vs want patterns
# These are case-insensitive substring tests, and the two groups overlap:
# 'Travel > Public Transportation' is listed as a need and also contains the
# want entry 'Travel', so both flags are 1 for that category.
need_categories = ['Bank Fees', 'Food and Drink > Groceries', 'Housing', 'Transfer', 'Payment',
                  'Travel > Public Transportation', 'Healthcare', 'Service', 'Utilities', 'Education']
want_categories = ['Food and Drink > Restaurants', 'Shopping', 'Travel', 'Recreation', 
                  'Food and Drink > Coffee', 'Entertainment', 'Personal Care']

df['category_is_likely_need'] = df['category'].apply(lambda x: 
                                             any(need_cat.lower() in x.lower() 
                                                 for need_cat in need_categories if isinstance(x, str))).astype(int)
df['category_is_likely_want'] = df['category'].apply(lambda x: 
                                             any(want_cat.lower() in x.lower() 
                                                 for want_cat in want_categories if isinstance(x, str))).astype(int)

# 5. Periodic spending patterns
# Recurring transactions of same amount are often needs
# "Recurring" here means only that the exact absolute amount occurs more than
# once anywhere in `df`; merchant and date are not considered.
amount_counts = df.groupby('amount_abs')['amount_abs'].transform('count')
df['is_recurring_amount'] = (amount_counts > 1).astype(int)

# Essential timing patterns (bills are often paid at beginning/end of month)
df['is_likely_bill_timing'] = (df['is_end_of_month'] | df['is_beginning_of_month']).astype(int)

# Context-specific features
# Large weekend purchases are often wants
df['is_large_weekend_purchase'] = (df['is_weekend'] & df['is_large_purchase']).astype(int)

# Small regular purchases might be daily needs
df['is_small_regular_purchase'] = (df['is_small_purchase'] & df['is_recurring_amount']).astype(int)

# Combine all features
# `features_cols` fixes the feature order: the hand-written columns first,
# then every `name_has_*` column in the order it appears in `df`.
# `preprocess_transactions` reorders its output to this same list.
# `day_of_month` is computed above but deliberately not part of this list.
features_cols = ['amount_abs', 'is_small_purchase', 'is_medium_purchase', 'is_large_purchase', 'is_very_large_purchase',
                'is_weekend', 'is_end_of_month', 'is_beginning_of_month', 'month', 'is_weekend_night',
                'category_is_likely_need', 'category_is_likely_want', 'is_recurring_amount', 'is_likely_bill_timing',
                'is_large_weekend_purchase', 'is_small_regular_purchase'] + [col for col in df.columns if col.startswith('name_has_')]

features_df = df[features_cols]

# Show the features dataframe
# NOTE(review): head(300) renders up to 300 rows in the notebook output; a
# smaller preview is usually enough.
print(f"Number of features: {features_df.shape[1]}")
features_df.head(300)

In [None]:
def preprocess_transactions(transactions):
    """Turn raw transactions into the feature matrix the model expects.

    Relies on three module-level names defined by the feature-engineering
    cell: ``need_indicators`` and ``want_indicators`` (keyword lists) and
    ``features_cols`` (the expected output columns, in order).

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``date``, ``name``, ``amount`` and ``category`` columns.
        The frame is not modified; cleaning happens on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index) with exactly the columns in
        ``features_cols``, in that order. Columns in ``features_cols`` that
        this function does not compute are filled with 0.

    Notes
    -----
    Unparseable dates become NaT (day defaults to 15, month to 1, weekend
    flags to 0), missing names become 'Unknown', non-numeric amounts become
    0 and missing categories become 'Uncategorized'.
    """
    # Clean on a copy so the caller's DataFrame is left untouched
    tx = transactions.copy()
    tx['date'] = pd.to_datetime(tx['date'], errors='coerce')
    tx['name'] = tx['name'].fillna('Unknown').astype(str)
    tx['amount'] = pd.to_numeric(tx['amount'], errors='coerce').fillna(0)
    tx['category'] = tx['category'].fillna('Uncategorized').astype(str)

    features = pd.DataFrame(index=tx.index)

    # Amount features (same thresholds as the training cell)
    features['amount_abs'] = tx['amount'].abs()
    features['is_small_purchase'] = (features['amount_abs'] < 20).astype(int)
    features['is_medium_purchase'] = ((features['amount_abs'] >= 20) & (features['amount_abs'] < 100)).astype(int)
    features['is_large_purchase'] = (features['amount_abs'] >= 100).astype(int)
    features['is_very_large_purchase'] = (features['amount_abs'] >= 500).astype(int)

    # Time-based features
    features['is_weekend'] = tx['date'].dt.dayofweek.isin([5, 6]).astype(int)
    features['day_of_month'] = tx['date'].dt.day.fillna(15).astype(int)
    features['is_end_of_month'] = (features['day_of_month'] > 25).astype(int)
    features['is_beginning_of_month'] = (features['day_of_month'] < 5).astype(int)
    features['month'] = tx['date'].dt.month.fillna(1).astype(int)
    features['is_weekend_night'] = ((features['is_weekend'] == 1) & (tx['date'].dt.hour >= 20)).astype(int)

    # Category-based features: case-insensitive substring tests
    needs_categories = ['bank fees', 'groceries', 'housing', 'utilities', 'healthcare', 'transportation',
                        'education', 'insurance', 'mortgage', 'rent', 'loan', 'bill', 'pharmacy', 'medical',
                        'service', 'deposit', 'transfer', 'payment', 'books', 'tuition']
    wants_categories = ['restaurants', 'shopping', 'travel', 'entertainment', 'coffee', 'vacation',
                        'rideshare', 'hotel', 'air travel', 'gym', 'recreation', 'alcohol', 'spa',
                        'beauty', 'electronics', 'clothing', 'gifts', 'sports', 'hobbies']
    category_lower = tx['category'].str.lower()
    features['category_is_likely_need'] = category_lower.apply(
        lambda text: any(need in text for need in needs_categories)
    ).astype(int)
    features['category_is_likely_want'] = category_lower.apply(
        lambda text: any(want in text for want in wants_categories)
    ).astype(int)

    # Recurrence is counted on the absolute amount, exactly as the training
    # cell does, so -50 and 50 are treated as the same recurring amount.
    amount_counts = features['amount_abs'].groupby(features['amount_abs']).transform('count')
    features['is_recurring_amount'] = (amount_counts > 1).astype(int)

    # Bills are often paid at the beginning/end of the month
    features['is_likely_bill_timing'] = (features['is_end_of_month'] | features['is_beginning_of_month']).astype(int)

    # Context-specific combinations
    features['is_large_weekend_purchase'] = (features['is_weekend'] & features['is_large_purchase']).astype(int)
    features['is_small_regular_purchase'] = (features['is_small_purchase'] & features['is_recurring_amount']).astype(int)

    # Name keyword flags are built in one go and concatenated, which avoids
    # inserting ~100 columns one at a time. Keywords are literal substrings,
    # hence regex=False.
    name_lower = tx['name'].str.lower()
    keyword_flags = {
        f'name_has_{keyword}': name_lower.str.contains(keyword, na=False, regex=False).astype(int)
        for keyword in list(need_indicators) + list(want_indicators)
    }
    features = pd.concat([features, pd.DataFrame(keyword_flags, index=tx.index)], axis=1)

    # Keep only the expected columns, in training order; absent ones become 0
    return features.reindex(columns=features_cols, fill_value=0)

In [None]:
# Function to predict need vs want for a single transaction
def predict_single_transaction(transaction_data):
    """Classify one transaction as need/want with the trained model.

    Relies on module-level names that must exist before this is called:
    ``need_indicators`` / ``want_indicators`` (keyword lists), ``X`` (the
    training feature frame; only ``X.columns`` is read), ``rf_model`` (the
    fitted classifier) and ``label_encoder`` (maps class codes to labels).

    Parameters
    ----------
    transaction_data : dict-like
        May contain ``date``, ``name``, ``amount`` and ``category``. Every
        key is optional; missing, ``None`` or unparseable values fall back
        to defaults (amount 0, empty name/category, mid-month weekday in
        the current month) instead of raising.

    Returns
    -------
    dict
        ``prediction`` (decoded label), ``confidence`` (probability of the
        predicted class, in percent) and ``features`` (the engineered
        feature dict before alignment to ``X.columns``).
    """
    features = {}

    # Amount features. `is_large_purchase` is >= 100 with no upper bound,
    # matching the training cell and `preprocess_transactions`; very large
    # amounts therefore set both the large and very-large flags.
    try:
        amount_abs = abs(float(transaction_data.get('amount', 0)))
    except (TypeError, ValueError):
        amount_abs = 0.0
    if pd.isna(amount_abs):
        amount_abs = 0.0
    features['amount_abs'] = amount_abs
    features['is_small_purchase'] = 1 if amount_abs < 20 else 0
    features['is_medium_purchase'] = 1 if 20 <= amount_abs < 100 else 0
    features['is_large_purchase'] = 1 if amount_abs >= 100 else 0
    features['is_very_large_purchase'] = 1 if amount_abs >= 500 else 0

    # Name keyword flags (case-insensitive substring match)
    raw_name = transaction_data.get('name')
    name = raw_name.lower() if isinstance(raw_name, str) else ''
    for keyword in need_indicators:
        features[f'name_has_{keyword}'] = 1 if keyword in name else 0
    for keyword in want_indicators:
        features[f'name_has_{keyword}'] = 1 if keyword in name else 0

    # Time-based features; a missing or unparseable date uses neutral defaults
    parsed_date = pd.to_datetime(transaction_data.get('date'), errors='coerce')
    if pd.isna(parsed_date):
        features['is_weekend'] = 0
        features['day_of_month'] = 15  # Default to middle of month
        features['is_end_of_month'] = 0
        features['is_beginning_of_month'] = 0
        features['month'] = datetime.datetime.now().month
    else:
        features['is_weekend'] = 1 if parsed_date.dayofweek >= 5 else 0
        features['day_of_month'] = parsed_date.day
        features['is_end_of_month'] = 1 if parsed_date.day > 25 else 0
        features['is_beginning_of_month'] = 1 if parsed_date.day < 5 else 0
        features['month'] = parsed_date.month
    # Hour information is assumed to be unavailable for single transactions
    features['is_weekend_night'] = 0

    # Category-based features (case-insensitive substring match)
    raw_category = transaction_data.get('category')
    category = raw_category.lower() if isinstance(raw_category, str) else ''
    needs_categories = ['bank fees', 'groceries', 'housing', 'utilities', 'healthcare', 'transportation',
                        'education', 'insurance', 'mortgage', 'rent', 'loan', 'bill', 'pharmacy', 'medical',
                        'service', 'deposit', 'transfer', 'payment', 'books', 'tuition']
    wants_categories = ['restaurants', 'shopping', 'travel', 'entertainment', 'coffee', 'vacation',
                        'rideshare', 'hotel', 'air travel', 'gym', 'recreation', 'alcohol', 'spa',
                        'beauty', 'electronics', 'clothing', 'gifts', 'sports', 'hobbies']
    features['category_is_likely_need'] = 1 if any(need_cat in category for need_cat in needs_categories) else 0
    features['category_is_likely_want'] = 1 if any(want_cat in category for want_cat in wants_categories) else 0

    # A single transaction carries no history, so it is never "recurring"
    features['is_recurring_amount'] = 0

    # Additional context features
    features['is_likely_bill_timing'] = features['is_end_of_month'] | features['is_beginning_of_month']
    features['is_large_weekend_purchase'] = features['is_weekend'] & features['is_large_purchase']
    features['is_small_regular_purchase'] = features['is_small_purchase'] & features['is_recurring_amount']

    # Align to the training columns: same order, missing features set to 0
    features_df = pd.DataFrame([features]).reindex(columns=X.columns, fill_value=0)

    # Predict the class code and decode it back to its label
    prediction = rf_model.predict(features_df)[0]
    prediction_label = label_encoder.inverse_transform([prediction])[0]

    # Probability of the predicted class, as a percentage
    proba = rf_model.predict_proba(features_df)[0]
    confidence = proba[prediction] * 100

    return {
        'prediction': prediction_label,
        'confidence': confidence,
        'features': features
    }

# Example transaction used to exercise the trained classifier
test_transaction = dict(
    date='2023-06-15',
    name='pharmacy',
    amount=49,
    category='healthcare',
)

# Classify it
result = predict_single_transaction(test_transaction)

# Echo the input back
print(f"\n===== TRANSACTION DETAILS =====")
print(f"Date: {test_transaction['date']}")
print(f"Name: {test_transaction['name']}")
print(f"Amount: ${test_transaction['amount']:.2f}")
print(f"Category: {test_transaction['category']}")

# Predicted label and the model's confidence in it
print(f"\n===== PREDICTION RESULTS =====")
print(f"Classification: This is a {result['prediction'].upper()}")
print(f"Confidence: {result['confidence']:.1f}%")

# Explain which engineered features were active
print(f"\n===== KEY FEATURES =====")
features = result['features']

# Amount and its size bucket (anything not small or medium is reported as large)
print(f"Amount: ${features['amount_abs']:.2f}")
if features['is_small_purchase']:
    size_note = "- This is a small purchase (under $20)"
elif features['is_medium_purchase']:
    size_note = "- This is a medium-sized purchase ($20-$100)"
else:
    size_note = "- This is a large purchase (over $100)"
print(size_note)

# Keywords from each indicator list that were found in the name
matched_needs = [
    kw for kw in need_indicators
    if features.get(f'name_has_{kw}', 0) == 1
]
if matched_needs:
    print(f"- Contains need-related keywords: {', '.join(matched_needs)}")

matched_wants = [
    kw for kw in want_indicators
    if features.get(f'name_has_{kw}', 0) == 1
]
if matched_wants:
    print(f"- Contains want-related keywords: {', '.join(matched_wants)}")

# Category hints (both may be printed if both flags are set)
if features['category_is_likely_need']:
    print("- Category typically associated with needs")
if features['category_is_likely_want']:
    print("- Category typically associated with wants")


===== TRANSACTION DETAILS =====
Date: 2023-06-15
Name: pharmacy
Amount: $49.00
Category: healthcare

===== PREDICTION RESULTS =====
Classification: This is a NEED
Confidence: 77.9%

===== KEY FEATURES =====
Amount: $49.00
- This is a medium-sized purchase ($20-$100)
- Category typically associated with needs


In [None]:
import joblib

# Persist everything needed to serve predictions outside the notebook, in
# this order: the trained model, the label encoder, the list of training
# feature columns, and the need/want keyword lists (so an API can reuse them).
artifacts = {
    'rf_model.pkl': rf_model,
    'label_encoder.pkl': label_encoder,
    'model_columns.pkl': X.columns.tolist(),
    'need_indicators.pkl': need_indicators,
    'want_indicators.pkl': want_indicators,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

print("Enhanced model and supporting files saved successfully!")

Model and supporting files saved successfully!
