<a href="https://colab.research.google.com/github/ahanam05/deep-learning/blob/main/Transaction_Classifier_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [105]:
import torch
import torch.nn as nn
import torch.optim as optim
from time import time
from torch.utils.data import Dataset, DataLoader, random_split
from collections import Counter

# configurations
EMBEDDING_DIM = 50
HIDDEN_SIZE = 100
MAX_LENGTH = 7
NUM_EPOCHS = 20
BATCH_SIZE = 7
TEST_SPLIT_RATIO = 0.2

# sample data (173 examples)
DATA = [
    ("bought vegetables and rice from supermarket", "grocery"),
    ("picked up fresh chicken from local butcher", "grocery"),
    ("bought snacks and juice from convenience store", "grocery"),
    ("morning coffee from café near office", "grocery"),
    ("purchased fruits for the week", "grocery"),
    ("bought bread and butter from bakery", "grocery"),
    ("weekly grocery run at hypermarket", "grocery"),
    ("ordered pizza for dinner", "grocery"),
    ("purchased chocolates from duty free shop", "grocery"),
    ("bought spices and lentils for cooking", "grocery"),
    ("picked up cereal and milk", "grocery"),
    ("bought onions, tomatoes, and coriander", "grocery"),
    ("monthly stock-up of packaged food", "grocery"),
    ("purchased cookies and tea bags", "grocery"),
    ("grabbed a sandwich on the go", "grocery"),
    ("picked up frozen food items", "grocery"),
    ("purchased pasta and sauce bottles", "grocery"),
    ("bought protein bars at store", "grocery"),
    ("weekly fresh produce market visit", "grocery"),
    ("picked up biscuits and wafers", "grocery"),
    ("bought cheese slices and yoghurt", "grocery"),
    ("purchased coffee beans", "grocery"),
    ("bought cooking oil and sugar", "grocery"),
    ("picked up chocolates for friends", "grocery"),
    ("bought water bottles and soft drinks", "grocery"),
    ("purchased instant noodles pack", "grocery"),
    ("bought salad ingredients", "grocery"),
    ("picked up green tea packets", "grocery"),
    ("ordered sushi from restaurant", "grocery"),
    ("bought ice cream for dessert", "grocery"),
    ("purchased eggs and bread for breakfast", "grocery"),
    ("picked up burger meal", "grocery"),
    ("bought kitchen spices kit", "grocery"),
    ("purchased bakery pastries", "grocery"),
    ("picked up fresh herbs from store", "grocery"),
    ("bought frozen vegetables", "grocery"),
    ("ordered biryani for lunch", "grocery"),
    ("purchased tea and sugar", "grocery"),
    ("bought tortillas and salsa", "grocery"),
    ("picked up canned beans", "grocery"),
    ("bought peanut butter and jam", "grocery"),
    ("purchased energy drinks", "grocery"),
    ("grabbed donuts for office", "grocery"),
    ("bought flour and yeast", "grocery"),
    ("picked up packaged chips", "grocery"),
    ("ordered burger and fries", "grocery"),
    ("bought marshmallows for camping", "grocery"),
    ("monthly grocery supply", "grocery"),
    ("picked up avocados and bananas", "grocery"),
    ("purchased soup packets", "grocery"),
    ("uber ride to office", "travel"),
    ("taxi to railway station", "travel"),
    ("bus ticket to downtown", "travel"),
    ("flight booking to Mumbai", "travel"),
    ("train reservation for weekend trip", "travel"),
    ("metro pass recharge", "travel"),
    ("cab ride to friend’s house", "travel"),
    ("airport shuttle service payment", "travel"),
    ("fuel refill for car", "travel"),
    ("toll booth payment", "travel"),
    ("hotel stay during conference", "travel"),
    ("car parking fee", "travel"),
    ("bike rental for city tour", "travel"),
    ("flight upgrade to premium economy", "travel"),
    ("bus fare to work", "travel"),
    ("long distance train ticket", "travel"),
    ("auto rickshaw fare", "travel"),
    ("ferry ride ticket", "travel"),
    ("car wash before road trip", "travel"),
    ("tourist sightseeing bus ticket", "travel"),
    ("cab ride to airport early morning", "travel"),
    ("overnight stay at roadside motel", "travel"),
    ("petrol top-up at gas station", "travel"),
    ("commuter bus monthly pass", "travel"),
    ("day trip bus rental", "travel"),
    ("airport lounge access fee", "travel"),
    ("checked baggage upgrade fee", "travel"),
    ("intercity taxi payment", "travel"),
    ("flight ticket cancellation fee", "travel"),
    ("hotel booking at beach resort", "travel"),
    ("auto ride to market", "travel"),
    ("weekly train commute pass", "travel"),
    ("car rental for business meeting", "travel"),
    ("taxi from workplace to home", "travel"),
    ("reserved sleeper coach ticket", "travel"),
    ("paid for shared cab ride", "travel"),
    ("purchase of travel insurance", "travel"),
    ("bike petrol refill", "travel"),
    ("bridge toll tax", "travel"),
    ("cruise boarding ticket", "travel"),
    ("interstate bus ticket", "travel"),
    ("flight seat selection add-on", "travel"),
    ("uber ride during rain", "travel"),
    ("taxi fare after midnight", "travel"),
    ("booking bus to hill station", "travel"),
    ("travel bag storage fee", "travel"),
    ("boat ride on weekend trip", "travel"),
    ("hotel late checkout fee", "travel"),
    ("visa fee payment", "travel"),
    ("parking ticket at mall", "travel"),
    ("bus ride to college", "travel"),
    ("train journey meal charge", "travel"),
    ("metro one-day pass", "travel"),
    ("uber share ride", "travel"),
    ("taxi hired for entire day", "travel"),
    ("flight drinks purchase", "travel"),
    ("shuttle to hotel from airport", "travel"),
    ("late night cab back home", "travel"),
    ("electricity bill payment", "bills"),
    ("internet broadband monthly bill", "bills"),
    ("water bill payment", "bills"),
    ("gas cylinder booking", "bills"),
    ("mobile postpaid plan payment", "bills"),
    ("spotify subscription renewal", "bills"),
    ("netflix membership fee", "bills"),
    ("amazon prime annual subscription", "bills"),
    ("credit card bill settlement", "bills"),
    ("insurance monthly premium", "bills"),
    ("rent payment for apartment", "bills"),
    ("cloud storage subscription", "bills"),
    ("youtube premium renewal", "bills"),
    ("phone recharge plan", "bills"),
    ("gym membership monthly fee", "bills"),
    ("newspaper subscription", "bills"),
    ("water service maintenance fee", "bills"),
    ("property tax payment", "bills"),
    ("car loan EMI", "bills"),
    ("bike EMI installment", "bills"),
    ("electricity surcharge fee", "bills"),
    ("annual health insurance payment", "bills"),
    ("wifi router maintenance bill", "bills"),
    ("monthly rent for co-working space", "bills"),
    ("office software license renewal", "bills"),
    ("vpn subscription fee", "bills"),
    ("website domain renewal cost", "bills"),
    ("server hosting charges", "bills"),
    ("gas pipeline monthly fee", "bills"),
    ("tv cable bill", "bills"),
    ("home loan EMI", "bills"),
    ("digital magazine subscription", "bills"),
    ("kindle unlimited subscription", "bills"),
    ("antivirus software plan", "bills"),
    ("school fee installment", "bills"),
    ("tuition payment", "bills"),
    ("electricity reconnection charge", "bills"),
    ("late payment fee for phone bill", "bills"),
    ("cloud computing usage bill", "bills"),
    ("landline phone bill", "bills"),
    ("music app yearly plan", "bills"),
    ("mobile insurance renewal", "bills"),
    ("home maintenance charges", "bills"),
    ("college exam fee", "bills"),
    ("canteen monthly card top-up", "bills"),
    ("apartment maintenance charges", "bills"),
    ("gas usage bill", "bills"),
    ("water tanker purchase", "bills"),
    ("laundry subscription fee", "bills"),
    ("online course subscription", "bills"),
    ("software update license fee", "bills"),
    ("streaming service add-on pack", "bills"),
    ("health checkup fee", "bills"),
    ("doctor consultation bill", "bills"),
    ("medical insurance co-pay", "bills"),
    ("mobile data add-on pack", "bills"),
    ("app subscription renewal", "bills"),
    ("electricity meter service fee", "bills"),
    ("club membership renewal", "bills"),
    ("internet late fee payment", "bills"),
    ("wifi installation charge", "bills"),
    ("school bus fee", "bills"),
    ("storage locker rent", "bills"),
    ("mortgage repayment", "bills"),
    ("kindergarten monthly fee", "bills"),
]

In [106]:
# preprocessing and data utilities

# tokenization and normalization
def preprocess(text):
  return [word.lower() for word in text.split(" ") if word.isalpha()]

# numericalization (building vocabulary and label maps)
def build_maps(data):
  # building the vocabulary map
  vocab_map = {}
  all_tokens = []

  for text, _ in data:
    tokens = preprocess(text)
    all_tokens.extend(tokens)

  unique_tokens = list(set(all_tokens))

  for index, token in enumerate(unique_tokens, start = 1):
    vocab_map[token] = index

  vocab_map['<PAD>']  = 0

  # building the label map
  label_map = {}

  unique_labels = sorted(list(set(label for _, label in data)))

  for index, label in enumerate(unique_labels):
    label_map[label] = index

  return vocab_map, label_map

class TransactionDataset(Dataset):
  def __init__(self, data, vocab_map, label_map, max_len):
    self.data = data
    self.vocab_map = vocab_map
    self.label_map = label_map
    self.max_len = max_len
    self.pad_idx = vocab_map['<PAD>']

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    text, label = self.data[idx]
    tokens = preprocess(text)

    token_integers = [self.vocab_map.get(token, self.pad_idx) for token in tokens]

    # padding and truncation
    if len(token_integers) < self.max_len:
        token_integers.extend([self.pad_idx] * (self.max_len - len(token_integers)))
    elif len(token_integers) > self.max_len:
        token_integers = token_integers[:self.max_len]

    text_tensor = torch.tensor(token_integers, dtype=torch.long)
    label_tensor = torch.tensor(self.label_map[label], dtype=torch.long)

    return text_tensor, label_tensor

In [107]:
# model architecture
class TransactionClassifier(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim = embedding_dim, padding_idx = 0)
    self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_size, num_layers = 1, bias = True)
    self.linear = nn.Linear(in_features = hidden_size, out_features = num_classes)

  def forward(self, text_input):
    embedded = self.embedding(text_input)
    _, (h_n, _) = self.lstm(embedded)
    final_hidden_state = h_n[0]
    logits = self.linear(final_hidden_state)
    return logits

In [108]:
# evaluation logic
def evaluate(model, data_loader, criterion):
  model.eval()
  total_loss = 0
  correct_predictions = 0
  total_samples = 0

  with torch.no_grad():
    for inputs, labels in data_loader:
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      total_loss += loss.item() * inputs.size(0)

      _, predicted = torch.max(outputs, 1)
      correct_predictions += (predicted == labels).sum().item()
      total_samples += labels.size(0)

  accuracy = (correct_predictions/total_samples)*100
  average_loss = (total_loss/total_samples)

  return average_loss , accuracy


In [111]:
# execution
vocab_map, label_map = build_maps(DATA)
VOCAB_SIZE = len(vocab_map)
NUM_CLASSES = len(label_map)

# create and split the datasets (training and testing datasets)
full_dataset = TransactionDataset(DATA, vocab_map, label_map, MAX_LENGTH)
training_size = int((1 - TEST_SPLIT_RATIO) * len(full_dataset))
testing_size = len(full_dataset) - training_size
training_data, testing_data = random_split(full_dataset, [training_size, testing_size])

training_loader = DataLoader(training_data, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)
testing_loader = DataLoader(testing_data, batch_size = BATCH_SIZE, shuffle = False)

# initialize model, optimizer and loss function
model = TransactionClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

print(f"Total Examples: {len(full_dataset)} | Train: {training_size} | Test: {testing_size} | Vocab Size: {VOCAB_SIZE}")
print(f"Starting Training ({NUM_EPOCHS} epochs)")
start_time = time()

# training loop
for epoch in range(NUM_EPOCHS):
  model.train()
  for inputs, labels in training_loader:
    # forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # backward pass (gradient descent, parameter updates)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  train_loss, train_accuracy = evaluate(model, training_loader, criterion)
  test_loss, test_accuracy = evaluate(model, testing_loader, criterion)
  print(f"Epoch: {epoch} | Training loss: {train_loss} | Training accuracy: {train_accuracy} | Testing loss: {test_loss} | Testing accuracy: {test_accuracy}")

end_time = time()
print(f"Time taken to train model: {(end_time - start_time)} seconds")

Total Examples: 173 | Train: 138 | Test: 35 | Vocab Size: 313
Starting Training (20 epochs)
Epoch: 0 | Training loss: 1.0864427340658087 | Training accuracy: 33.08270676691729 | Testing loss: 1.1142228364944458 | Testing accuracy: 22.857142857142858
Epoch: 1 | Training loss: 1.0937497929522866 | Training accuracy: 36.09022556390977 | Testing loss: 1.1196852684020997 | Testing accuracy: 28.57142857142857
Epoch: 2 | Training loss: 1.0874060831571881 | Training accuracy: 42.857142857142854 | Testing loss: 1.1231327056884766 | Testing accuracy: 28.57142857142857
Epoch: 3 | Training loss: 1.0837098265949048 | Training accuracy: 37.59398496240601 | Testing loss: 1.1255239248275757 | Testing accuracy: 25.71428571428571
Epoch: 4 | Training loss: 1.1012830922478123 | Training accuracy: 38.34586466165413 | Testing loss: 1.1297470808029175 | Testing accuracy: 28.57142857142857
Epoch: 5 | Training loss: 1.0832093295298124 | Training accuracy: 43.609022556390975 | Testing loss: 1.1282519578933716 |