In [None]:
# ================================================================================
# LIBRARY IMPORTS - LSTM Sentiment Analysis Model
# ================================================================================
# Import necessary libraries for deep learning sentiment analysis

# Numerical and data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Machine learning utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# PyTorch deep learning framework
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Custom tokenizer for text preprocessing
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers, processors

# Google Drive integration
from google.colab import drive
import os

In [None]:
# Mount Google Drive to access datasets and save models
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Add custom module path for importing helper functions
import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')

In [None]:
%%writefile /content/drive/MyDrive/AAI-590/text_helpers.py

# ================================================================================
# TEXT PREPROCESSING AND SENTIMENT LABELING UTILITIES
# ================================================================================
# Helper functions for text cleaning and sentiment label conversion

import re
import pandas as pd
from typing import *

def build_text(title: str, review: str) -> str:
    """
    Concatenate review title and review text into a single lowercase string.
    
    Args:
        title (str): Review title
        review (str): Review text content
        
    Returns:
        str: Combined and lowercased text with title and review separated by newline
    """
    return (str(title or "") + " \n" + str(review or "")).strip().lower()

def safe_get(row: pd.Series, col: str) -> str:
    """
    Safely extract and clean text from a DataFrame row.
    
    Handles missing values (NaN) and removes non-English characters while preserving
    basic punctuation. Keeps only: letters, numbers, spaces, and .,!?-
    
    Args:
        row (pd.Series): DataFrame row
        col (str): Column name to extract
        
    Returns:
        str: Cleaned text string, empty string if value is missing
    """
    value = row[col] if col in row and not pd.isna(row[col]) else ""
    if isinstance(value, str):
        # Keep only English letters, numbers, spaces, and basic punctuation
        return re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', value)
    return str(value)

def rating_to_sentiment_label(rating):
    """
    Convert numerical rating to sentiment label for classification.
    
    Rating mapping:
        - 4-5 stars: 'positive'
        - 3 stars: 'neutral'
        - 1-2 stars: 'negative'
    
    Args:
        rating (float): Numerical rating value
        
    Returns:
        str or None: Sentiment label ('positive', 'neutral', 'negative') or None for invalid ratings
    """
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    elif rating <= 2:
        return 'negative'
    return None  # Handle cases where rating might be NaN or unexpected value

Overwriting /content/drive/MyDrive/AAI-590/text_helpers.py


In [None]:
# ================================================================================
# IMPORT HELPER FUNCTIONS
# ================================================================================
# Load text preprocessing and sentiment conversion utilities

import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')
from text_helpers import build_text, safe_get, rating_to_sentiment_label

print("build_text, safe_get, and rating_to_sentiment_label functions imported from text_helpers.py")

build_text, safe_get, and rating_to_sentiment_label functions imported from text_helpers.py


In [None]:
# ================================================================================
# LOAD AND COMBINE DATASETS
# ================================================================================
# Load product review datasets from CSV files and combine them

# Define file paths
csv_file_path_1 = '/content/drive/MyDrive/AAI-590/Womens-clothingReviews.csv'
csv_file_path_2 = '/content/drive/MyDrive/AAI-590/shoe-care-insolesReviews.csv'

# Load datasets
df1 = pd.read_csv(csv_file_path_1)
df2 = pd.read_csv(csv_file_path_2)

print("DataFrame loaded successfully.")
print("First 5 rows of the DataFrame:")
print(df1.head())

print("\nColumn names and their data types:")
print(df1.info())

# Select columns relevant for sentiment analysis
selected_columns = ['SourceClient', 'OriginalProductName', 'Title', 'ReviewText', 'Rating', 'IsRecommended']

# Add category labels to distinguish data sources
df_selected1 = df1[selected_columns]
df_selected1['Category'] = 'Women_Clothing'

df_selected2 = df2[selected_columns]
df_selected2['Category'] = 'Shoe_insole'

# Combine datasets
df_selected = pd.concat([df_selected1, df_selected2], ignore_index=True)

display(df_selected.head())

DataFrame loaded successfully.
First 5 rows of the DataFrame:
          Id                                   CID SourceClient  \
0  314505599  43bb9f86-6037-5f19-8dc4-06e2f59bb9f2     carhartt   
1  311030383  e2655a8b-e7d1-56e2-9196-193c93bd2dbf     carhartt   
2  311030373  287c81dc-05c3-5b2a-94b5-4c39e389cc21     carhartt   
3  276723966  5d33f88d-c9ec-52bb-ad02-7f59df3c8dd5     carhartt   
4  276451217  c146de06-eb01-5b8a-be47-c2574b577259     carhartt   

               LastModeratedTime           LastModificationTime  ProductId  \
0  2024-07-17T14:46:11.000+00:00  2024-07-17T14:46:11.000+00:00     328283   
1  2024-06-04T14:47:21.000+00:00  2024-06-04T14:47:21.000+00:00     328283   
2  2024-06-04T14:47:21.000+00:00  2024-06-04T14:47:21.000+00:00     328283   
3  2024-04-15T14:15:47.000+00:00  2024-04-15T14:15:47.000+00:00     328283   
4  2024-04-10T14:46:17.000+00:00  2024-04-10T14:46:17.000+00:00     328283   

  OriginalProductName    UserLocation                   AuthorId  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected1['Category'] = 'Women_Clothing'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected2['Category'] = 'Shoe_insole'


Unnamed: 0,SourceClient,OriginalProductName,Title,ReviewText,Rating,IsRecommended,Category
0,carhartt,Knit Cuffed Beanie,Great hat,"These hats are great. They are warm, comfortab...",5,,Women_Clothing
1,carhartt,Knit Cuffed Beanie,"Awkward fit, miss my old Carhartt beanie",I ordered this beanie to replace a Carhartt be...,2,,Women_Clothing
2,carhartt,Knit Cuffed Beanie,,But for these beanies and they all were great....,5,,Women_Clothing
3,carhartt,Knit Cuffed Beanie,Beanie2024Chicago,My son immediately worn his beanie to work. Ap...,5,,Women_Clothing
4,carhartt,Knit Cuffed Beanie,Best Beanie,Great fit and fast shipping. My Daughters need...,5,,Women_Clothing


In [7]:
%%writefile /content/drive/MyDrive/AAI-590/attribute_rules.py

rules_list = [
    # Warmth
    (r"\b(warmest|toasty|cozy|very warm|super warm|warm & toasty|warm hat)\b", "Warmth:high"),
    (r"\b(keeps .* head warm|keeps .* ears warm|kept .* head warm|keeps me warm|kept me warm)\b", "Warmth:high"),
    (r"\b(not warm|isn't warm|cold passes through|wind (?:passes|blows) right through|doesn't keep .* warm|not as warm)\b", "Warmth:low"),
    (r"\b(wind (?:blocking|proof)|blocks the wind)\b", "Warmth:wind_blocking"),
    # Fit / Sizing
    (r"\b(too tight|very tight|super tight|tight fit|gives me headaches)\b", "Fit:tight"),
    (r"\b(too small|runs small|child'?s size|youth size|smaller than|tiny)\b", "Fit:small"),
    (r"\b(snug fit|snug)\b", "Fit:snug"),
    (r"\b(too loose|loose fit|baggy|very loose|ill[- ]?fitting|slides off|slips off)\b", "Fit:loose"),
    (r"\b(rides up|creeps up|shrinks off my head)\b", "Fit:rides_up"),
    (r"\b(too long|very long|super long|tall|conehead|pointy|smurf|extra fabric|excess fabric|sticks up|floppy|looks like a cone|cat in the hat)\b", "Fit:excess_length"),
    (r"\b(one size fits all|osfa)\b", "Sizing:one_size"),
    (r"\b(inconsistent size|quality control|different sizes|wildly different sizes|manufacturing oversight)\b", "Sizing:inconsistent"),
    # Color
    (r"\b(color (?:exactly|as pictured|as described|true to (?:pic|picture|site)))\b", "Color:accurate"),
    (r"\b(color (?:wrong|off|not .* (?:as pictured|as described)|different in person)|looks (?:brown|dark) not .* (?:purple|burgundy|blackberry)|blackberry .* brown)\b", "Color:off"),
    (r"\b(too bright|very bright|hunter orange|blaze orange|brite lime|high(?:-)?vis|hi(?:-)?vis|osha)\b", "Color:bright_hi_vis"),
    (r"\b(darker than|much darker|very dark|almost black)\b", "Color:darker"),
    # Material / Hand feel
    (r"\b(soft|super soft|so soft)\b", "Material:soft"),
    (r"\b(itchy|makes my forehead itchy|not soft)\b", "Material:itchy"),
    (r"\b(thin|not thick|light ?weight)\b", "Material:thin"),
    (r"\b(thick|hefty knit|double layer)\b", "Material:thick"),
    (r"\bacrylic\b", "Material:acrylic"),
    (r"\b(quality (?:went down|declined|poor)|worse quality|quality change|not the same quality)\b", "Quality:declined"),
    # Stretch
    (r"\b(stretches (?:well|easily)|good stretch)\b", "Stretch:good"),
    (r"\b(doesn't stretch|won't stretch)\b", "Stretch:poor"),
    (r"\b(stretched out|gets baggy|stretch(?:es)? out easy)\b", "Stretch:stretches_out"),
    # Durability / Construction
    (r"\b(durable|lasts forever|last for years|very well made|well made)\b", "Durability:high"),
    (r"\b(stitching (?:crooked|came out)|logo (?:fell off|upside down|incorrect)|hole|defective|poor quality control)\b", "Durability:issues"),
    # Style / Use cases
    (r"\b(cute|stylish|looks great|fashion|trendy)\b", "Style:cute"),
    (r"\b(classic|staple|iconic)\b", "Style:classic"),
    (r"\b(hard hat|jobsite|work|warehouse|freezer|construction|farm)\b", "Use:work"),
    (r"\b(hunt(?:ing)?|blaze orange)\b", "Use:hunting"),
    (r"\b(ski(?:ing)?|snowboard|hiking|mountaineering|outdoors|camping)\b", "Use:outdoors"),
    (r"\b(gift|present|stocking stuffer)\b", "Use:gift"),
    # Country of origin
    (r"made in usa", "Made_in:USA"),
    (r"made in canada", "Made_in:Canada"),
    (r"made in china", "Made_in:China"),
    (r"made in vietnam", "Made_in:Vietnam"),
    # Price & Service
    (r"\b(overpriced|pricey)\b", "Price:overpriced"),
    (r"\b(great price|fair price|good price|value)\b", "Price:value"),
    (r"\b(fast shipping|arrived quickly|quick delivery)\b", "Shipping:fast"),
    (r"\b(slow shipping|arrived late|took (?:weeks|long)|delivery .* (?:late|slow))\b", "Shipping:slow"),
    # Care
    (r"\b(hand wash|do not (?:machine )?dry|shrinks|shrinkage|wash(?:es)? well|launder)\b", "Care:notes"),
    # ---------- Product types ----------
    (r"\bodor[- ]?x\b|\bodor[- ]?fighting foot powder\b|\bfoot powder\b", "Product:FootPowder"),
    (r"\binsoles?\b|\bshoe inserts?\b", "Product:Insole"),
    (r"\bheel (?:cushion|cup)s?\b", "Product:HeelCushion"),
    (r"\blaces?\b|\bshoelaces?\b", "Product:Laces"),
    (r"\bshoe ?horn\b", "Product:ShoeHorn"),
    (r"\bshoe trees?\b", "Product:ShoeTrees"),

    # ---------- Odor & sweat ----------
    (r"\b(kills the funk|odor fighter|no more (?:stinky|smelly) feet|"
      r"eliminat(?:e|es|ed) odor|destroy(?:s|ed)? odor|neutraliz(?:e|es|ed)|"
      r"odor[- ]?control)\b", "OdorControl:effective"),
    (r"\b(?:odor|smell) (?:still|worse|bad)|does(?:n't| not) (?:work|help).*(?:odor|smell)\b", "OdorControl:ineffective"),
    (r"\b(keeps|kept) (?:feet|socks) (?:dry|cool)\b", "Sweat:dry"),
    (r"\b(?:too|very|brutally) dry|dries out (?:feet|skin)\b", "Sweat:overdry"),
    (r"\b(messy|powder footprints?|white footprint|powder all over)\b", "Powder:messy"),
    (r"\b(light(?:ly)?|mild) (?:scent|smell)\b", "Scent:mild"),
    (r"\b(strong|heavy) (?:scent|smell)\b", "Scent:strong"),
    (r"\b(makes .*feet.* sweaty|feet (?:sweaty|hot))\b", "Sweat:increased"),

    # ---------- Comfort & support ----------
    (r"\b(walking on (?:air|clouds|pillows)|cushion(?:ing)?|cushy|massaging gel|"
      r"comfortable|comfort)\b", "Comfort:high"),
    (r"\b(reduce(?:s|d)? (?:fatigue|tired)|more (?:energy|support)|support(?:ive)?)\b", "Support:good"),
    (r"\b(plantar fasciitis|heel pain relief|help(?:s)? my heels)\b", "Health:heel_pain_relief"),
    (r"\barch support\b", "Support:arch"),
    (r"\b(no arch support|arch too (?:low|high)|painful arch support)\b", "Support:arch_issue"),
    (r"\b(hard|rigid|too (?:hard|stiff)|uncomfortable|painful)\b", "Comfort:low"),

    # ---------- Fit / size / install ----------
    (r"\b(trim to size|cut to size|easy to trim|sizing guide)\b", "Fit:trim_to_size"),
    (r"\b(too thick|bulky|makes shoes tight|snug fit|no room|raised heel)\b", "Fit:too_thick"),
    (r"\b(not wide enough|narrow|size mismark(?:ed)?|wrong size)\b", "Fit:size_issue"),
    (r"\b(slid(?:e|es|ing)|moves?|bunch(?:es)? up|curl(?:s|ed)|does(?:n't| not) stay in place)\b", "Fit:moves_in_shoe"),
    (r"\b(squeak|gurgle|nois(?:e|y))\b", "Fit:noise"),

    # ---------- Durability / build quality ----------
    (r"\b(fell apart|separat(?:e|ed)|delaminat(?:e|ed)|"
      r"gel (?:leaked|disintegrated|broke|flattened)|"
      r"wore out (?:quickly|fast)|poor lifespan|quality (?:declined|down))\b", "Durability:issues"),
    (r"\b(last(?:s|ed) (?:weeks|months|years)|durable)\b", "Durability:high"),

    # ---------- Use contexts ----------
    (r"\b(work boots?|steel toe|warehouse|factory|concrete|long shift|12 ?hours?|"
      r"standing all day|on my feet all day)\b", "Use:work_long_hours"),
    (r"\b(hiking|trail|outdoors|boots)\b", "Use:hiking"),
    (r"\b(nurse|teacher|barista|server|construction|landscap(?:er|ing))\b", "Use:profession"),
    (r"\b(dress shoes|heels?|cowboy boots|rain boots|sneakers|tennis shoes|keds|converse)\b", "Use:shoe_type"),

    # ---------- Laces ----------
    (r"\bstay tied\b", "Laces:stay_tied"),
    (r"\bthin\b", "Laces:thin"),
    (r"\bwaxed\b", "Laces:waxed"),

    # ---------- Shoe horn / trees ----------
    (r"\bshoe ?horn\b.*\b(?:useless|spring).*(?:rigid|support)|\bspring\b.*\bdoes(?:n't| not) allow support\b", "ShoeHorn:ineffective"),
    (r"\bprevent wrinkles|stretch the length\b", "ShoeTrees:effective"),
    # Recommendation
    (r"\b(highly recommend|would recommend|recommend to|definitely recommend)\b", "Recommendation:recommend"),
    (r"\b(would not recommend|won't recommend|not recommend)\b", "Recommendation:not_recommend")
]

Overwriting /content/drive/MyDrive/AAI-590/attribute_rules.py


In [None]:
# ================================================================================
# PREPARE TEXT AND SENTIMENT LABELS
# ================================================================================
# Combine title and review text, then convert ratings to sentiment labels

df_selected["__text__"] = df_selected.apply(lambda row: build_text(safe_get(row, "Title"),
                                                     safe_get(row, "ReviewText")), axis=1)
df_selected["sentiment"] = df_selected["Rating"].apply(rating_to_sentiment_label)

In [None]:
# Display first few rows to verify text and sentiment label creation
df_selected.head()

Unnamed: 0,SourceClient,OriginalProductName,Title,ReviewText,Rating,IsRecommended,Category,__text__,sentiment
0,carhartt,Knit Cuffed Beanie,Great hat,"These hats are great. They are warm, comfortab...",5,,Women_Clothing,great hat \nthese hats are great. they are war...,positive
1,carhartt,Knit Cuffed Beanie,"Awkward fit, miss my old Carhartt beanie",I ordered this beanie to replace a Carhartt be...,2,,Women_Clothing,"awkward fit, miss my old carhartt beanie \ni o...",negative
2,carhartt,Knit Cuffed Beanie,,But for these beanies and they all were great....,5,,Women_Clothing,but for these beanies and they all were great....,positive
3,carhartt,Knit Cuffed Beanie,Beanie2024Chicago,My son immediately worn his beanie to work. Ap...,5,,Women_Clothing,beanie2024chicago \nmy son immediately worn hi...,positive
4,carhartt,Knit Cuffed Beanie,Best Beanie,Great fit and fast shipping. My Daughters need...,5,,Women_Clothing,best beanie \ngreat fit and fast shipping. my ...,positive


In [None]:
# Extract only the columns needed for sentiment model training
selected_columns = ['__text__', 'sentiment']
rating_train = df_selected[selected_columns]

In [11]:
rating_train.head()

Unnamed: 0,__text__,sentiment
0,great hat \nthese hats are great. they are war...,positive
1,"awkward fit, miss my old carhartt beanie \ni o...",negative
2,but for these beanies and they all were great....,positive
3,beanie2024chicago \nmy son immediately worn hi...,positive
4,best beanie \ngreat fit and fast shipping. my ...,positive


In [None]:
# Convert DataFrame columns to lists for processing
texts = rating_train['__text__'].tolist()
labels = rating_train['sentiment'].tolist()

In [None]:
# ================================================================================
# TRAIN-TEST SPLIT
# ================================================================================
# Split data into training and testing sets (85% train, 15% test)

x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.15, random_state=23)

In [None]:
# ================================================================================
# INITIALIZE CUSTOM TOKENIZER
# ================================================================================
# Create a Byte Pair Encoding (BPE) tokenizer using HuggingFace Tokenizers library

# Initialize tokenizer with BPE model
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()  # Split on whitespace
tokenizer.normalizer = normalizers.Lowercase()  # Convert to lowercase

# Flatten the training text data for tokenizer training
flatten_data = [text for text in x_train]

In [None]:
# ================================================================================
# TRAIN TOKENIZER
# ================================================================================
# Train the BPE tokenizer on the training data with specified vocabulary size

max_length = 256  # Maximum sequence length for padding/truncation
vocab_size = 12000  # Size of vocabulary to learn
special_tokens = ["<unk>", "<pad>", "<cls>", "<sep>", "<mask>"]  # Special tokens

# Create trainer with configuration
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)

# Train tokenizer on the training corpus
tokenizer.train_from_iterator(flatten_data, trainer=trainer)

In [None]:
# ================================================================================
# CONFIGURE POST-PROCESSING
# ================================================================================
# Add special tokens and configure BERT-style post-processing

tokenizer.add_special_tokens(special_tokens)

# Set up post-processing to add [CLS] and [SEP] tokens like BERT
tokenizer.post_processor = processors.BertProcessing(
    ('[CLS]', tokenizer.token_to_id('<cls>')), 
    ('[SEP]', tokenizer.token_to_id('<sep>'))
)

In [None]:
# ================================================================================
# ENABLE TRUNCATION AND PADDING
# ================================================================================
# Configure tokenizer to handle variable-length sequences

# Enable truncation to max_length
tokenizer.enable_truncation(max_length=max_length)

# Enable padding to ensure all sequences are the same length
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("<pad>"),
    pad_token="<pad>",
    length=max_length
)

In [None]:
# Save the trained tokenizer to Google Drive for later use during inference
tokenizer.save("/content/drive/MyDrive/AAI-590/tokenizer.json")

In [None]:
# Load the saved tokenizer from file
tokenizer = Tokenizer.from_file("/content/drive/MyDrive/AAI-590/tokenizer.json")

In [None]:
# ================================================================================
# DATA PREPROCESSING UTILITIES
# ================================================================================
# Functions to tokenize text and create PyTorch DataLoaders

def tokenize_text(text):
    """
    Tokenize a batch of text using the trained tokenizer.
    
    Args:
        text (list): List of text strings
        
    Returns:
        list: List of token ID sequences
    """
    encodings = tokenizer.encode_batch(text)
    return [encode.ids for encode in encodings]

def create_dataloader(sequences, labels, batch_size=64):
    """
    Create a PyTorch DataLoader from token sequences and labels.
    
    Args:
        sequences (list): List of token ID sequences
        labels (list): List of sentiment labels ('negative', 'neutral', 'positive')
        batch_size (int): Batch size for training
        
    Returns:
        DataLoader: PyTorch DataLoader object
    """
    sequences = torch.tensor(sequences)
    
    # Map string labels to numerical indices for CrossEntropyLoss
    label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    numerical_labels = [label_map[label] for label in labels]
    
    # Labels must be LongTensor for CrossEntropyLoss
    labels = torch.tensor(numerical_labels, dtype=torch.long)
    print(f"DEBUG: Labels dtype after creation in create_dataloader: {labels.dtype}")
    
    # Create TensorDataset and DataLoader
    dataset = TensorDataset(sequences, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True, prefetch_factor=2)

def preprocess_texts(texts, labels, max_len=None, batch_size=64):
    """
    Tokenize texts and create a DataLoader.
    
    Args:
        texts (list): List of text strings
        labels (list): List of sentiment labels
        max_len (int): Maximum sequence length (unused, handled by tokenizer)
        batch_size (int): Batch size for training
        
    Returns:
        DataLoader: PyTorch DataLoader ready for training
    """
    padded_sequences = tokenize_text(texts)
    data_loader = create_dataloader(padded_sequences, labels, batch_size)
    return data_loader

def calculate_accuracy(outputs, original_labels):
    """
    Calculate accuracy from model outputs and true labels.
    
    Args:
        outputs (torch.Tensor): Raw logits from model (batch_size, num_classes)
        original_labels (torch.Tensor): True class indices (batch_size,)
        
    Returns:
        torch.Tensor: Accuracy as a scalar tensor
    """
    # Get predicted classes by taking argmax of logits
    predicted_classes = torch.argmax(outputs, dim=1)
    correct = (predicted_classes == original_labels).float()
    return correct.sum() / len(correct)

In [None]:
# ================================================================================
# MODEL CONFIGURATION
# ================================================================================
# Define hyperparameters for the LSTM sentiment analysis model

cfg = {
    "vocab_size": vocab_size,      # Size of vocabulary (12000)
    "emb_dim": 128,                # Embedding dimension
    "hidden_dim": 512,             # LSTM hidden state dimension
    "num_layers": 5,               # Number of LSTM layers
    "bidirectional": True,         # Use bidirectional LSTM
    "dropout": 0.15,               # Dropout probability for regularization
    "seq_len": 256,                # Maximum sequence length
}

In [None]:
# ================================================================================
# CREATE TRAIN DATALOADER
# ================================================================================
# Preprocess training data and create DataLoader

train_dataloader_path = '/content/drive/MyDrive/AAI-590/train_dataloader.pt'

# Force recreation of dataloader to ensure correct label dtype
if os.path.exists(train_dataloader_path):
    os.remove(train_dataloader_path)
    print(f"Removed old train_dataloader from {train_dataloader_path}")

print("Creating train_dataloader...")
train_dataloader = preprocess_texts(x_train, y_train)

# Save the dataloader for future use
torch.save(train_dataloader, '/content/drive/MyDrive/AAI-590/train_dataloader.pt')
print(f"New train_dataloader saved to {train_dataloader_path}")

Removed old train_dataloader from /content/drive/MyDrive/AAI-590/train_dataloader.pt
Creating train_dataloader...
DEBUG: Labels dtype after creation in create_dataloader: torch.int64
New train_dataloader saved to /content/drive/MyDrive/AAI-590/train_dataloader.pt


In [None]:
# ================================================================================
# CREATE TEST DATALOADER
# ================================================================================
# Preprocess test data and create DataLoader

test_dataloader_path = '/content/drive/MyDrive/AAI-590/test_dataloader.pt'

# Force recreation of dataloader to ensure correct label dtype
if os.path.exists(test_dataloader_path):
    os.remove(test_dataloader_path)
    print(f"Removed old test_dataloader from {test_dataloader_path}")

print("Creating test_dataloader...")
test_dataloader = preprocess_texts(x_test, y_test)

# Save the dataloader for future use
torch.save(test_dataloader, '/content/drive/MyDrive/AAI-590/test_dataloader.pt')
print(f"New test_dataloader saved to {test_dataloader_path}")

Removed old test_dataloader from /content/drive/MyDrive/AAI-590/test_dataloader.pt
Creating test_dataloader...
DEBUG: Labels dtype after creation in create_dataloader: torch.int64
New test_dataloader saved to /content/drive/MyDrive/AAI-590/test_dataloader.pt


In [None]:
# ================================================================================
# LSTM SENTIMENT ANALYSIS MODEL ARCHITECTURE
# ================================================================================
# Define a deep learning model using LSTM, CNN, and fully connected layers

import torch
import torch.nn as nn

class SentimentAnalysisModel(nn.Module):
    """
    Multi-layered sentiment analysis model combining:
    - Embedding layer for word representations
    - Bidirectional LSTM for sequence processing
    - 1D Convolution for feature extraction
    - Max pooling for dimensionality reduction
    - Fully connected layers for classification
    """
    
    def __init__(self, cfg):
        super(SentimentAnalysisModel, self).__init__()
        
        # Embedding layer: converts token IDs to dense vectors
        self.embedding = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        
        # Bidirectional LSTM: processes sequences in both directions
        self.lstm = nn.LSTM(
            input_size=cfg['emb_dim'],
            hidden_size=cfg['hidden_dim'],
            num_layers=cfg['num_layers'],
            batch_first=True,
            bidirectional=cfg['bidirectional'],
            dropout=cfg['dropout']
        )
        
        # 1D Convolution: extract local features from LSTM output
        # Input channels: hidden_dim * 2 (because bidirectional)
        self.conv1 = nn.Conv1d(
            in_channels=cfg['hidden_dim'] * 2,
            out_channels=128,
            kernel_size=3
        )
        
        # Max pooling: reduce sequence length by half
        self.pool = nn.MaxPool1d(kernel_size=2)
        
        # Calculate flattened size after convolution and pooling
        # Formula: (seq_len - kernel_size + 1) // pool_kernel_size
        # (256 - 3 + 1) // 2 = 254 // 2 = 127
        # Flattened size: 128 channels * 127 = 16256
        
        # First fully connected layer
        self.fc1 = nn.Linear(128 * 127, 64)
        
        # Output layer: 3 classes (negative, neutral, positive)
        self.fc2 = nn.Linear(64, 3)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(cfg['dropout'])

    def forward(self, x):
        """
        Forward pass through the model.
        
        Args:
            x (torch.Tensor): Input token IDs (batch_size, seq_len)
            
        Returns:
            torch.Tensor: Raw logits for 3 classes (batch_size, 3)
        """
        # Embedding: (batch_size, seq_len) -> (batch_size, seq_len, emb_dim)
        x = self.embedding(x)
        
        # LSTM: (batch_size, seq_len, emb_dim) -> (batch_size, seq_len, hidden_dim*2)
        lstm_out, _ = self.lstm(x)
        
        # Permute for Conv1d: (batch_size, seq_len, hidden_dim*2) -> (batch_size, hidden_dim*2, seq_len)
        lstm_out = lstm_out.permute(0, 2, 1)
        
        # Convolution: (batch_size, hidden_dim*2, seq_len) -> (batch_size, 128, seq_len-2)
        conv_out = self.conv1(lstm_out)
        
        # Pooling: (batch_size, 128, seq_len-2) -> (batch_size, 128, (seq_len-2)//2)
        pooled_out = self.pool(conv_out)
        
        # Flatten: (batch_size, 128, 127) -> (batch_size, 128*127)
        flattened = pooled_out.view(pooled_out.size(0), -1)
        
        # Fully connected layers with dropout and ReLU activation
        x = self.dropout(torch.relu(self.fc1(flattened)))
        
        # Output layer: return raw logits (no softmax, handled by CrossEntropyLoss)
        x = self.fc2(x)
        
        return x


# ================================================================================
# INITIALIZE MODEL
# ================================================================================

model = SentimentAnalysisModel(cfg)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f"Model initialized and moved to {device}")

SentimentAnalysisModel(
  (embedding): Embedding(12000, 128)
  (lstm): LSTM(128, 512, num_layers=5, batch_first=True, dropout=0.15, bidirectional=True)
  (conv1): Conv1d(1024, 128, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=16256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=3, bias=True)
  (dropout): Dropout(p=0.15, inplace=False)
)

In [None]:
# ================================================================================
# MODEL TRAINING SETUP AND EXECUTION
# ================================================================================
# Configure loss function, optimizer, and training loop

# Loss function: CrossEntropyLoss for multi-class classification
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam with learning rate 0.001
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Path to save trained model
model_save_path = '/content/drive/MyDrive/AAI-590/sentiment_analysis_model.pth'

def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    """
    Train the sentiment analysis model.
    
    Args:
        model: PyTorch model to train
        train_loader: DataLoader with training data
        criterion: Loss function
        optimizer: Optimization algorithm
        device: Device to train on (CPU or GPU)
        num_epochs: Number of training epochs
    """
    model.train()
    
    for epoch in range(num_epochs):
        print(f"Starting Epoch {epoch+1}/{num_epochs}")
        
        for batch_idx, (sequences, labels) in enumerate(train_loader):
            # Move data to device
            sequences = sequences.to(device)
            labels = labels.to(device)  # Labels are already LongTensor (0, 1, 2)

            # Debug: verify label dtype on first batch
            if batch_idx == 0:
                print(f"DEBUG: Labels dtype in train_model (on device): {labels.dtype}")

            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass: get model predictions (logits)
            outputs = model(sequences.long())
            
            # Compute loss (CrossEntropyLoss expects raw logits and class indices)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Print progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

# ================================================================================
# TRAIN OR LOAD MODEL
# ================================================================================

# Check if model already exists to avoid retraining
if not os.path.exists(model_save_path):
    print("Starting training...")
    train_model(model, train_dataloader, criterion, optimizer, device)
    print("Training complete.")
    
    # Save the trained model
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")
else:
    print(f"Model already found at {model_save_path}. Skipping training.")
    
    # Load pre-trained model weights
    model.load_state_dict(torch.load(model_save_path))
    model.to(device)  # Ensure model is on correct device
    print("Pre-trained model loaded successfully.")

Starting training...
Starting Epoch 1/10
DEBUG: Labels dtype in train_model (on device): torch.int64
Epoch [1/10], Batch [100/628], Loss: 0.5138
Epoch [1/10], Batch [200/628], Loss: 0.3953
Epoch [1/10], Batch [300/628], Loss: 0.3266
Epoch [1/10], Batch [400/628], Loss: 0.3216
Epoch [1/10], Batch [500/628], Loss: 0.2574
Epoch [1/10], Batch [600/628], Loss: 0.1564
Starting Epoch 2/10
DEBUG: Labels dtype in train_model (on device): torch.int64
Epoch [2/10], Batch [100/628], Loss: 0.3769
Epoch [2/10], Batch [200/628], Loss: 0.4733
Epoch [2/10], Batch [300/628], Loss: 0.2501
Epoch [2/10], Batch [400/628], Loss: 0.1998
Epoch [2/10], Batch [500/628], Loss: 0.1674
Epoch [2/10], Batch [600/628], Loss: 0.1457
Starting Epoch 3/10
DEBUG: Labels dtype in train_model (on device): torch.int64
Epoch [3/10], Batch [100/628], Loss: 0.1162
Epoch [3/10], Batch [200/628], Loss: 0.1858
Epoch [3/10], Batch [300/628], Loss: 0.1325
Epoch [3/10], Batch [400/628], Loss: 0.1438
Epoch [3/10], Batch [500/628], Loss

In [None]:
# Save DataLoaders to Google Drive for future use
torch.save(train_dataloader, '/content/drive/MyDrive/AAI-590/train_dataloader.pt')
torch.save(test_dataloader, '/content/drive/MyDrive/AAI-590/test_dataloader.pt')
print("DataLoaders saved successfully.")

In [None]:
# ================================================================================
# MODEL EVALUATION ON TEST SET
# ================================================================================
# Evaluate trained model performance and generate classification metrics

model.eval()  # Set model to evaluation mode
epoch_val_loss = 0
epoch_val_acc = 0
all_preds = []
all_labels = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch_seq, batch_labels in test_dataloader:
        # Move data to device
        batch_seq = batch_seq.to(device)
        original_batch_labels = batch_labels.to(device)  # Keep as LongTensor (0, 1, 2)

        # Debug: verify label dtype
        print(f"DEBUG: Labels dtype in evaluation (on device): {original_batch_labels.dtype}")

        # Forward pass: get predictions
        output = model(batch_seq.long())

        # Compute loss
        loss = criterion(output, original_batch_labels)
        epoch_val_loss += loss.item()
        
        # Compute accuracy
        epoch_val_acc += calculate_accuracy(output, original_batch_labels).item()

        # Collect predictions and labels for detailed metrics
        preds = torch.argmax(output, dim=1).cpu().numpy()
        labels = original_batch_labels.cpu().numpy().flatten()
        all_preds.extend(preds)
        all_labels.extend(labels)

# Calculate average loss and accuracy
val_loss = epoch_val_loss / len(test_dataloader)
val_acc = epoch_val_acc / len(test_dataloader)

# ================================================================================
# GENERATE CLASSIFICATION REPORT
# ================================================================================

# Map numerical labels to sentiment names for display
label_names = ['negative', 'neutral', 'positive']

# Generate confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)

# Generate detailed classification report
class_report = classification_report(
    all_labels, 
    all_preds, 
    target_names=label_names, 
    zero_division=0  # Avoid warnings for classes with no predictions
)

# ================================================================================
# DISPLAY RESULTS
# ================================================================================

print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
print("-" * 50)
print("Final Confusion Matrix:\n", conf_matrix)
print("\nFinal Classification Report:\n", class_report)

DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int64
DEBUG: Labels dtype in evaluation (on device): torch.int

In [31]:
def inference(model, text, tokenizer, device, cfg):
    model.eval()
    cleaned_text = clean_text(text)
    encoded = tokenizer.encode(cleaned_text)
    input_ids = torch.tensor([encoded.ids]).to(device)

    with torch.no_grad():
        output = model(input_ids)

    probability = output.item()
    sentiment = "Positive" if probability < 0.5 else "Negative"
    return sentiment, probability

In [None]:
# ================================================================================
# TEXT CLEANING UTILITY FOR INFERENCE
# ================================================================================

def clean_text(text):
    """
    Clean and normalize text for model inference.
    
    Removes URLs, hashtags, HTML tags, and non-alphabetic characters.
    Converts text to lowercase and strips whitespace.
    
    Args:
        text (str): Raw text to clean
        
    Returns:
        str: Cleaned and normalized text
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters and spaces
    text = text.strip()  # Remove leading/trailing whitespace
    return text

In [None]:
# ================================================================================
# INFERENCE FUNCTION FOR SENTIMENT PREDICTION
# ================================================================================

import re
import torch.nn.functional as F

def inference(model, text, tokenizer, device, cfg):
    """
    Predict sentiment for a single text input.
    
    Args:
        model: Trained sentiment analysis model
        text (str): Input text to analyze
        tokenizer: Trained tokenizer for text encoding
        device: Device to run inference on (CPU or GPU)
        cfg (dict): Model configuration dictionary
        
    Returns:
        tuple: (sentiment_label, confidence_probability)
            - sentiment_label (str): 'negative', 'neutral', or 'positive'
            - confidence_probability (float): Confidence score for prediction
    """
    model.eval()  # Set model to evaluation mode
    
    # Clean and preprocess text
    cleaned_text = clean_text(text)
    
    # Tokenize text
    encoded = tokenizer.encode(cleaned_text)
    input_ids = torch.tensor([encoded.ids]).to(device)

    # Get model prediction
    with torch.no_grad():
        output = model(input_ids)

    # Convert logits to probabilities using softmax
    probabilities = F.softmax(output, dim=1)

    # Get the class with highest probability
    predicted_probability, predicted_index = torch.max(probabilities, dim=1)

    # Map numerical index to sentiment label
    label_map_reverse = {0: 'negative', 1: 'neutral', 2: 'positive'}
    sentiment = label_map_reverse[predicted_index.item()]

    return sentiment, predicted_probability.item()

# ================================================================================
# TEST INFERENCE WITH SAMPLE TEXT
# ================================================================================

text_to_analyze = "Very Bad fit, I didn't like it"
sentiment, probability = inference(model, text_to_analyze, tokenizer, device, cfg)
print(f"Sentiment: {sentiment}")
print(f"Probability: {probability:.4f}")

Sentiment: negative
Probability: 1.0000


In [None]:
# ================================================================================
# ADDITIONAL INFERENCE TEST
# ================================================================================
# Test the model with another sample review

text_to_analyze = "design looked good but the cloth is bad. Don't buy it"
sentiment, probability = inference(model, text_to_analyze, tokenizer, device, cfg)
print(f"Sentiment: {sentiment}")
print(f"Probability: {probability:.4f}")

Sentiment: negative
Probability: 0.9999
