# Final Project
## CSCE 633
## Arya Rahmanian
## Summer 2024

### Load Dataset

In [65]:
import pandas as pd

# Load the data from CSV files
train_df = pd.read_csv("yelp_review_train.csv")
test_df = pd.read_csv("yelp_review_test.csv")

# Randomly sample data points for training, validation, and testing
train_data = train_df.sample(n=10000, random_state=42)
val_data = train_df.drop(train_data.index).sample(n=1000, random_state=42)
test_data = test_df.sample(n=2000, random_state=42)

# Display the first few rows to ensure data is loaded correctly
print(train_data.head())
print(val_data.head())
print(test_data.head())


                                                     text  stars
117921  This was a tough one.  After having heard good...    3.0
117777  I have been here probably 3 times in the last ...    3.0
172811  I don't usually do chain restaurants, but I st...    4.0
63245   This place is just out there... There main onl...    1.0
87607   Went here for Happy Hour one evening. Twas a g...    4.0
                                                     text  stars
102220  Nice people...working hard while under staffed...    5.0
129668  I had the worst experience today. Food was col...    1.0
97138   I've been to several burger joints in Tucson a...    5.0
113647  I would absolutely head to Cousins Maine Lobst...    5.0
93028   UPS is constantly losing our packages, every t...    1.0
                                                    text  stars
13742  Sadly I tried chipotle again after the last re...    3.0
12304  This place sells really good fresh corn tortil...    4.0
6107   Having only recently 

In [66]:
# Define a function to map stars to binary labels
def map_labels(stars):
    return 0 if stars < 4 else 1

# Apply the function to create a 'label' column
train_df['label'] = train_df['stars'].apply(map_labels)
test_data['label'] = test_data['stars'].apply(map_labels)
val_data['label'] = val_data['stars'].apply(map_labels)


In [67]:
import re
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Load stop words
stop_words = set(stopwords.words('english'))

# Define a function to preprocess text
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply preprocessing to the 'text' column
train_df['text'] = train_df['text'].apply(preprocess_text)
val_data['text'] = val_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Airsight\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
val_sequences = tokenizer.texts_to_sequences(val_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Define a maximum length for sequences
max_length = 300

# Pad/Truncate sequences to the same length
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Display the shape of the padded sequences
print(train_padded.shape)
print(val_padded.shape)
print(test_padded.shape)

(174757, 300)
(1000, 300)
(2000, 300)


In [92]:
# Load GloVe embeddings
def load_glove_embeddings(filepath):
    embeddings_index = {}
    with open(filepath, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_filepath = 'glove.twitter.27B.200d.txt'  # Change this to your GloVe file path
embeddings_index = load_glove_embeddings(glove_filepath)

In [103]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Create embedding matrix
word_index = tokenizer.word_index
embedding_dim = 200
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Convert data to PyTorch tensors
train_padded = torch.tensor(train_padded, dtype=torch.long)
val_padded = torch.tensor(val_padded, dtype=torch.long)
test_padded = torch.tensor(test_padded, dtype=torch.long)
train_labels = torch.tensor(train_df['label'].values, dtype=torch.long)
val_labels = torch.tensor(val_data['label'].values, dtype=torch.long)
test_labels = torch.tensor(test_data['label'].values, dtype=torch.long)

# Create DataLoaders
train_dataset = TensorDataset(train_padded, train_labels)
val_dataset = TensorDataset(val_padded, val_labels)
test_dataset = TensorDataset(test_padded, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [106]:
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [116]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(1), :]
        return x

In [118]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, num_classes, max_len, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False)
        self.pos_encoder = PositionalEncoding(embedding_dim, max_len)
        encoder_layers = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward=512, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(embedding_dim, num_classes)
        self.dropout = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, src, src_mask=None):
        src = self.embedding(src) * np.sqrt(embedding_dim)
        src = self.pos_encoder(src.permute(1, 0, 2))  # permute to (sequence_length, batch_size, embedding_dim)
        output = self.transformer_encoder(src, src_mask)
        output = output.mean(dim=0)
        output = self.dropout(output)
        output = self.fc(output)
        return output
# Define the model parameters
vocab_size = len(word_index) + 1
embedding_dim = 100
num_heads = 4
num_layers = 2
num_classes = 2  # Binary classification
max_len = 100

# Initialize the model, criterion, and optimizer
model = TransformerModel(vocab_size, embedding_dim, num_heads, num_layers, num_classes, max_len)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.to(device)



TransformerModel(
  (embedding): Embedding(26880, 200)
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=100, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [120]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Send inputs and labels to device
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {correct/total * 100:.2f}%')

train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)


RuntimeError: The size of tensor a (200) must match the size of tensor b (100) at non-singleton dimension 2