# Model
Building on the original model from the initial v0.1.0 release, I switched to a synthetic dataset designed specifically for this project. The model architecture and training process have also been refined, which improved accuracy. Notably, Sam's Club is now correctly categorized as **Retail** instead of **Restaurant**.

## Notebook Outline
1. **Load & Explore Data**
2. **Tokenize Merchant Names & Prepare Vocabulary**
3. **Convert Data to PyTorch Tensors**
4. **Define Model**
5. **Train Model**
6. **Make Predictions**
7. **Export Model and Vocab**

## 1. Load and Explore Data

In [2]:
import pandas as pd

# Read the synthetic transaction records from disk into a DataFrame
DATA_PATH = "../data/synthetic_transactions.csv"
df = pd.read_csv(DATA_PATH)

# Preview the top of the table as a quick sanity check on the columns
print(df.head())


                         transaction_id customer_id  card_number  \
0  feec7465-afb5-4de2-9c73-3a753004b084   CUST-2219         5173   
1  2273843f-df20-4532-a295-f4daed302890   CUST-1938         2099   
2  8a732724-0eb6-4a77-a3c7-1fbac59542a3   CUST-3634         2146   
3  83562abd-848f-4e03-99a6-f19a12a4c6d9   CUST-9425         7495   
4  4a52fa36-00a4-438d-af9d-446e6d1f5918   CUST-7861         1539   

             timestamp merchant_category merchant_name  amount  \
0  2024-04-10 22:19:27          Wireless        Sprint  138.02   
1  2025-01-13 16:15:58         Groceries        Kroger  168.69   
2  2024-03-07 07:08:36          Wireless  Boost Mobile   30.60   
3  2024-07-25 12:07:13         Groceries  Trader Joe's  155.86   
4  2024-12-21 15:58:34         Education     Princeton  609.30   

      card_provider   channel   device  
0              VISA  Physical      POS  
1  American Express  Physical  Desktop  
2          Discover  Physical      POS  
3              VISA  Physical 

## 2. Tokenize Merchant Names & Prepare Vocabulary

In [3]:
import re
from collections import Counter

def tokenize_merchant(merchant):
    """
    Split a merchant name into lowercase word tokens.

    The value is converted to ``str`` first, so non-string inputs are
    tokenized by their string form. Every character other than ``a-z``,
    ``0-9`` and the space is deleted (not replaced), so "Sam's" becomes
    "sams" and "7-Eleven" becomes "7eleven". The cleaned text is then split
    on whitespace.

    Returns:
        A list of tokens; empty when nothing survives the cleaning step.
    """
    lowered = str(merchant).lower()
    # Delete anything that is not a lowercase letter, digit or space
    cleaned = re.sub(r"[^a-z0-9 ]", "", lowered)
    tokens = cleaned.split()
    return tokens

# Attach the token list of every merchant name to the frame
df["merchant_tokens"] = df["merchant_name"].apply(tokenize_merchant)

# Flatten all token lists and count occurrences; first-seen order is kept
all_tokens = [tok for row_tokens in df["merchant_tokens"] for tok in row_tokens]
token_counts = Counter(all_tokens)

# Number the distinct tokens from 1 upward; ID 0 stays reserved for padding
vocab = {word: idx for idx, word in enumerate(token_counts, start=1)}

# Reserve an "unknown" entry for out-of-vocabulary tokens (CS50P structure);
# an existing "unknown" token keeps the ID it already has
vocab.setdefault("unknown", len(vocab) + 1)


def _encode_tokens(tokens):
    """Map a token list to vocabulary IDs, using the "unknown" ID as fallback."""
    return [vocab.get(token, vocab["unknown"]) for token in tokens]


# Store the ID sequence of every merchant next to its tokens
df["merchant_encoded"] = df["merchant_tokens"].apply(_encode_tokens)

# Report the vocabulary size and show the first row as an example
print(f"Vocabulary Size: {len(vocab)}")
print(f"Example Tokenized Merchant: {df.iloc[0]['merchant_tokens']} → {df.iloc[0]['merchant_encoded']}")

Vocabulary Size: 392
Example Tokenized Merchant: ['sprint'] → [1]


## 3. Convert Data to PyTorch Tensors

In [4]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Turn the category names into integer class labels (the training targets);
# the fitted encoder is kept so predictions can be mapped back to names
category_encoder = LabelEncoder()
encoded_categories = category_encoder.fit_transform(df["merchant_category"])
df["merchant_category_encoded"] = encoded_categories

def encode_and_pad(tokens, max_length=10):
    """
    Encode a token list as a fixed-length tensor of vocabulary IDs.

    Tokens are looked up in the module-level ``vocab``; tokens that are not
    present map to the ``"unknown"`` ID. Sequences longer than ``max_length``
    are truncated, shorter ones are right-padded with 0.

    Returns:
        A 1-D ``torch.long`` tensor with ``max_length`` elements.
    """
    ids = [vocab.get(tok, vocab["unknown"]) for tok in tokens]
    ids = ids[:max_length]  # keep only what fits into the fixed length

    result = torch.zeros(max_length, dtype=torch.long)
    result[:len(ids)] = torch.tensor(ids, dtype=torch.long)
    return result

# Build one fixed-length ID tensor per merchant
df["padded_tokens"] = df["merchant_tokens"].apply(
    lambda tokens: encode_and_pad(tokens, max_length=10)
)

# Gather the integer class labels into a single long tensor
category_tensors = torch.tensor(
    df["merchant_category_encoded"].values, dtype=torch.long
)

# Hold out 20% of the rows for validation; the seed is fixed for repeatability
X_train, X_val, y_train, y_val = train_test_split(
    df["padded_tokens"].tolist(),
    category_tensors,
    test_size=0.2,
    random_state=42,
)

# The split returns lists of 1-D tensors; stack each list into one 2-D tensor
X_train, X_val = torch.stack(X_train), torch.stack(X_val)

print(f"Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}")

Train Shape: torch.Size([60000, 10]), Validation Shape: torch.Size([15000, 10])


## 4. Define Model

In [5]:
import torch.nn as nn

class TransactionClassifier(nn.Module):
    """
    Bag-of-embeddings classifier over padded token-ID sequences.

    Token IDs are embedded, averaged along the sequence dimension, and passed
    through a ReLU hidden layer with dropout before the output layer.

    Args:
        vocab_size: The embedding table is created with ``vocab_size + 1`` rows.
        embed_dim: Size of each token embedding.
        hidden_size: Width of the hidden linear layer.
        output_size: Number of output logits (one per category).
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, output_size):
        super().__init__()
        # Index 0 is the padding index: its embedding is a zero vector that
        # receives no gradient updates
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size + 1,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        self.fc1 = nn.Linear(in_features=embed_dim, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return unnormalized class scores for a batch of ID sequences."""
        embedded = self.embedding(x)
        # The mean runs over the whole padded length, so padding positions
        # contribute zero vectors to the average
        pooled = embedded.mean(dim=1)
        hidden = self.relu(self.fc1(pooled))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

# Model Parameters (CS50P Settings)
# Prefer the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# NOTE: TransactionClassifier adds another +1 to this value when it sizes its
# embedding table, so the table ends up with len(vocab) + 2 rows
vocab_size = len(vocab) + 1
embed_dim, hidden_size = 32, 64
output_size = len(df["merchant_category"].unique())  # one logit per category

# Build the network, move it to the selected device and show its layers
model = TransactionClassifier(vocab_size, embed_dim, hidden_size, output_size)
model = model.to(device)
print(model)

TransactionClassifier(
  (embedding): Embedding(394, 32, padding_idx=0)
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=64, out_features=13, bias=True)
)


## 5. Train Model

In [6]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Wrap the tensors in datasets and serve them in mini-batches
batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# NOTE: the validation loader is created here but the loop below never reads it
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Cross-entropy over the class logits, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Run the training epochs and report the mean batch loss after each one
epochs = 20
for epoch in range(epochs):
    model.train()  # enable dropout
    total_loss = 0

    for merchant_batch, labels in train_loader:
        merchant_batch = merchant_batch.to(device)
        labels = labels.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(merchant_batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(train_loader):.4f}")

Epoch [1/20], Loss: 0.8841
Epoch [2/20], Loss: 0.0447
Epoch [3/20], Loss: 0.0210
Epoch [4/20], Loss: 0.0165
Epoch [5/20], Loss: 0.0152
Epoch [6/20], Loss: 0.0143
Epoch [7/20], Loss: 0.0138
Epoch [8/20], Loss: 0.0139
Epoch [9/20], Loss: 0.0136
Epoch [10/20], Loss: 0.0136
Epoch [11/20], Loss: 0.0135
Epoch [12/20], Loss: 0.0134
Epoch [13/20], Loss: 0.0134
Epoch [14/20], Loss: 0.0133
Epoch [15/20], Loss: 0.0135
Epoch [16/20], Loss: 0.0133
Epoch [17/20], Loss: 0.0134
Epoch [18/20], Loss: 0.0133
Epoch [19/20], Loss: 0.0133
Epoch [20/20], Loss: 0.0132


## 6. Make Predictions

In [7]:
import numpy as np

def predict_transaction(model, merchant, vocab, category_list, max_length=10):
    """
    Predicts the category of a transaction using padded merchant names.

    The merchant name is tokenized with ``tokenize_merchant``, mapped to token
    IDs through ``vocab`` (tokens that are missing use the ``"unknown"`` ID),
    truncated or right-padded with 0 to ``max_length`` and passed to ``model``
    as a batch of one. A name that yields no tokens is passed as all padding.

    Args:
        model: Trained classifier; it is switched to eval mode by this call.
        merchant: Merchant name to classify.
        vocab: Mapping from token to integer ID; must contain ``"unknown"``.
        category_list: Sequence that maps a predicted class index to its
            category name (for example ``category_encoder.classes_``).
        max_length: Number of token positions fed to the model. The default
            of 10 matches the length used to build the training tensors.

    Returns:
        The predicted category, or ``None`` when prediction fails (the error
        is printed instead of raised).
    """
    try:
        # Tokenize & encode merchant name, keeping only what fits
        tokens = tokenize_merchant(merchant)
        token_ids = [vocab.get(token, vocab["unknown"]) for token in tokens]
        token_ids = token_ids[:max_length]
        padded_tokens = torch.zeros(1, max_length, dtype=torch.long)
        padded_tokens[0, :len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)

        # Run on the device the model's weights live on, so the function does
        # not depend on a notebook-level `device` variable being in sync
        first_param = next(model.parameters(), None)
        if first_param is not None:
            model_device = first_param.device
        else:
            model_device = torch.device("cpu")

        # Predict category
        model.eval()
        with torch.no_grad():
            output = model(padded_tokens.to(model_device))
            _, predicted = torch.max(output, 1)  # Index of the highest score

        predicted_category_index = int(predicted.item())  # Convert tensor to Python int
        return category_list[predicted_category_index]  # Map index to category name

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None
        print(f"Error during prediction: {e}")
        return None


# Classify a handful of merchant names as a smoke test
test_merchants = ["Sams Club", "Amazon", "United", "McDonald's", "Target"]
predicted_categories = [
    predict_transaction(model, merchant_name, vocab, category_encoder.classes_)
    for merchant_name in test_merchants
]

# Show the inputs and the predicted category for each, in the same order
print(f"Test Transactions: {test_merchants}")
print(f"Predicted Categories: {predicted_categories}")

Test Transactions: ['Sams Club', 'Amazon', 'United', "McDonald's", 'Target']
Predicted Categories: ['Retail', 'Retail', 'Airlines', 'Fast Food', 'Retail']


## 7. Export Model and Vocab

In [9]:
import torch
import pickle

# Write the learned weights to disk (the state dict only, not the module object)
MODEL_PATH = '../model/trained_model.pth'
torch.save(model.state_dict(), MODEL_PATH)

# Write the token -> ID mapping next to it so inference can reuse the same IDs
VOCAB_PATH = '../model/tokenizer.pkl'
with open(VOCAB_PATH, 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)