<a href="https://colab.research.google.com/github/amodkala/transformer/blob/main/ECON_424_PC6_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# fix for torch.compile() error on A100 runtime

!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link



In [None]:
# Imports, 'global' variables, and class definitions

import math
import torch
import os.path
import torchtext
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F

from torch import nn
from tqdm import tqdm
from torchtext.data import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.transforms import VocabTransform
from sklearn.model_selection import train_test_split
from torchtext.vocab import build_vocab_from_iterator
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, classification_report, confusion_matrix

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# Compute capabilities for which the notebook turns on torch.compile.
_COMPILE_CAPABILITIES = ((7, 0), (8, 0), (9, 0))
gpu_ok = device.type != "cpu" and torch.cuda.get_device_capability() in _COMPILE_CAPABILITIES

# On an A100, switch float32 matmuls to the 'high' precision setting.
if gpu_ok and "A100" in torch.cuda.get_device_name(0):
    torch.set_float32_matmul_precision('high')

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        max_len: Longest sequence length for which encodings are precomputed.
        device: Optional device on which the encoding table is first built.

    The table has shape ``(1, max_len, d_model)``. It is registered as a
    non-persistent buffer: it follows the module through ``.to()`` / ``.cuda()``
    but is not written to ``state_dict()``, so checkpoints keep the same keys
    as when the table was a plain attribute.
    """

    def __init__(self, d_model, max_len=200, device=None):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float32, device=device).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32, device=device)
            * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine column.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence dimension of ``x`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.encoding.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.encoding.size(1)}"
            )
        return x + self.encoding[:, :seq_len]

class TransformerClassifier(nn.Module):
    """Transformer-encoder text classifier on top of frozen pretrained embeddings.

    Args:
        embeddings: ``(vocab_size, d_model)`` array or tensor of pretrained
            word vectors; loaded into a frozen ``nn.Embedding``.
        d_model: Embedding / model width; must equal ``embeddings.shape[1]``.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_classes: Number of output logits.
        dropout: Dropout applied to the scaled embeddings.
        transformer_dropout: Dropout used inside each encoder layer.

    Inputs to ``forward`` are batch-first token-id tensors ``(batch, seq)``.
    Padding positions are not masked: they take part in attention and in the
    mean pooling.

    Raises:
        ValueError: If ``embeddings`` is not 2-D with ``d_model`` columns.
    """

    def __init__(self, embeddings, d_model, nhead, num_encoder_layers, num_classes, dropout=0.0, transformer_dropout=0.0):
        super(TransformerClassifier, self).__init__()
        # Use the argument that was passed in, not a notebook-level global.
        weights = torch.as_tensor(embeddings, dtype=torch.float32)
        if weights.dim() != 2 or weights.size(1) != d_model:
            raise ValueError(
                f"embeddings must have shape (vocab_size, {d_model}), got {tuple(weights.shape)}"
            )
        self.d_model = d_model
        # clone() so the module never shares storage with the caller's array.
        self.embedding = nn.Embedding.from_pretrained(weights.clone(), freeze=True)
        self.embedding_dropout = nn.Dropout(dropout)
        # `device` is the notebook-level torch.device chosen at start-up.
        self.pos_encoder = PositionalEncoding(d_model, device=device)
        # batch_first=True: inputs are (batch, seq, feature). With the default
        # (False) the encoder would treat the batch axis as the sequence axis
        # and attend across different reviews.
        encoder_layers = nn.TransformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward=d_model,
            dropout=transformer_dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)
        self.init_weights()

    def init_weights(self):
        """Initialise the classification head only.

        The embedding table is deliberately left untouched: re-initialising it
        would replace the frozen pretrained vectors with random values.
        """
        initrange = 0.1
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, src):
        """Map ``(batch, seq)`` token ids to ``(batch, num_classes)`` logits."""
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.embedding_dropout(src)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        # Mean-pool over the sequence axis to get one vector per example.
        output = output.mean(dim=1)
        return self.fc(output)

class ReviewDataset(Dataset):
    """Fixed-length, integer-encoded token sequences paired with labels.

    Args:
        texts: Sequence of token lists.
        labels: Sequence of labels, same length as ``texts``.
        vocab: Mapping from token to integer id (indexed as ``vocab[token]``);
            must resolve ``'<pad>'``.
        max_len: Length every returned sequence is padded or truncated to.
            ``None`` uses the longest encoded text.
    """

    def __init__(self, texts, labels, vocab, max_len=200):
        assert len(texts) == len(labels)
        self.labels = labels
        # Resolve the padding id from the vocab passed in, so the dataset does
        # not depend on a notebook-level `vocab` variable at lookup time.
        self.pad_idx = vocab['<pad>']
        self.texts = [torch.tensor([vocab[token] for token in text], dtype=torch.long) for text in texts]
        self.max_len = max_len if max_len is not None else max((len(t) for t in self.texts), default=0)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` with ``token_ids`` of length ``max_len``."""
        # Truncate first so every item has the same length and can be stacked.
        text = self.texts[idx][:self.max_len]
        missing = self.max_len - len(text)
        if missing > 0:
            text = F.pad(text, (0, missing), 'constant', self.pad_idx)
        return text, self.labels[idx]

    def collate_fn(self, batch):
        """Stack equal-length items into ``(batch, max_len)`` ids and a label tensor."""
        texts, labels = zip(*batch)
        return torch.stack(texts), torch.tensor(labels, dtype=torch.long)


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2023-12-14 18:29:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-12-14 18:29:52--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-12-14 18:29:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# implement preprocessing from https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1184/reports/6880837.pdf
#
# tokenize pros and cons columns
# limit them to 100 words in length
# combine them, add padding characters for consistent 200 length

MAX_FIELD_TOKENS = 100                   # per-column cap for pros / cons
MAX_TEXT_TOKENS = 2 * MAX_FIELD_TOKENS   # padded length of pros + cons

# Load CSV
size = "small"
filename = f"Econ424_F2023_PC6_glassdoor_training_{size}_v1.csv"

df = pd.read_csv(filename, low_memory=False)

tokenizer = get_tokenizer("basic_english")
specials = {
    "unknown": "<unk>",
    "padding": "<pad>",
}

# str(x) turns missing values into the literal token "nan" rather than failing.
df["pros"] = df["pros"].apply(lambda x: tokenizer(str(x))[:MAX_FIELD_TOKENS])
df["cons"] = df["cons"].apply(lambda x: tokenizer(str(x))[:MAX_FIELD_TOKENS])

# Build the padded sequence as a new list per row and assign it explicitly,
# instead of relying on list.extend() mutating the column's objects in place.
df["text"] = (df["pros"] + df["cons"]).apply(
    lambda tokens: tokens + [specials["padding"]] * (MAX_TEXT_TOKENS - len(tokens))
)

# 5 features (pros/cons counts are taken after the per-column cap above)
df["pros_tokens"] = df["pros"].apply(len)
df["cons_tokens"] = df["cons"].apply(len)
df["text_tokens"] = df["pros_tokens"] + df["cons_tokens"]
df["headline_tokens"] = df["headline"].apply(lambda x: len(tokenizer(str(x))))
df["total_tokens"] = df["headline_tokens"] + df["text_tokens"]

vocab = build_vocab_from_iterator((tokens for tokens in df["text"]), specials=list(specials.values()), max_tokens=10000)
vocab.set_default_index(vocab[specials["unknown"]])  # Set default index for unknown tokens

# basic integer encoding
#
# vt = VocabTransform(vocab)
# print(df["text"][0])
# print(vt(df["text"][0]))

# GloVe embeddings
glove_size = "6B"
glove_dim = "300d"
embedding_dim = int(glove_dim.rstrip("d"))  # derived so it cannot drift from the file that is loaded
vocab_size = len(vocab)

embeddings_dict = {}
with open(f"glove.{glove_size}.{glove_dim}.txt", 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

# Initialize the embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Seeded generator so rows for out-of-GloVe words are the same on every run.
rng = np.random.default_rng(0)

for word, idx in vocab.get_stoi().items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
    else:
        embedding_matrix[idx] = rng.normal(scale=0.6, size=(embedding_dim, ))

pad_index = vocab[specials["padding"]]  # Get the index of the padding token
embedding_matrix[pad_index] = np.zeros((embedding_dim, ))  # Set the padding token's embedding to a zero vector

In [None]:
# Step 1: mean overall_rating per firm (kept as its own table for inspection)
average_ratings = (
    df.groupby('firm')['overall_rating']
    .mean()
    .reset_index()
    .rename(columns={'overall_rating': 'average_rating'})
)

# Step 2: attach each firm's mean to its reviews.
# Mapping into a column (rather than pd.merge) keeps this cell idempotent:
# re-running it overwrites 'average_rating' instead of producing
# 'average_rating_x' / 'average_rating_y' duplicates.
df['average_rating'] = df['firm'].map(average_ratings.set_index('firm')['average_rating'])


In [None]:
low_threshold = 3  # Ratings below this are considered low
high_threshold = 4  # Ratings above this are considered high

# Split reviews into three groups by their firm's average rating; both
# threshold values themselves fall into the medium group.
firm_avg = df['average_rating']
low_ratings_df = df[firm_avg.lt(low_threshold)]
medium_ratings_df = df[firm_avg.between(low_threshold, high_threshold)]
high_ratings_df = df[firm_avg.gt(high_threshold)]

for bucket in (low_ratings_df, medium_ratings_df, high_ratings_df):
    print(len(bucket))

4313
96479
9937


In [None]:
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Make sure NLTK's stopword corpus is available locally before it is read.
nltk.download('stopwords')
# English stopwords as a set, for fast membership tests in the term filters below.
stop_words = set(stopwords.words('english'))

def get_unique_top_terms(df, column, exclude_terms, num_terms=10, stopword_set=None):
    """Return the most frequent alphabetic tokens in a column of token lists.

    Args:
        df: DataFrame whose ``column`` holds one list of tokens per row.
        column: Name of that column.
        exclude_terms: Collection of tokens to ignore in addition to stopwords.
        num_terms: Number of ``(token, count)`` pairs to return.
        stopword_set: Optional collection of stopwords. Defaults to the
            notebook-level ``stop_words`` set when omitted.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    if stopword_set is None:
        stopword_set = stop_words  # notebook-level NLTK English stopwords
    # Count straight from a generator so the flattened token list is never
    # materialised in memory.
    counts = Counter(
        word
        for tokens in df[column]
        for word in tokens
        if word not in stopword_set and word not in exclude_terms and word.isalpha()
    )
    return counts.most_common(num_terms)

# Top 'cons' terms per rating group, made mutually exclusive: each group
# skips every term already reported for the groups processed before it
# (high first, then medium, then low).
high_common_terms = get_unique_top_terms(high_ratings_df, 'cons', set(), num_terms=25)

seen_terms = {term for term, _ in high_common_terms}
medium_common_terms = get_unique_top_terms(medium_ratings_df, 'cons', seen_terms, num_terms=25)

seen_terms = seen_terms | {term for term, _ in medium_common_terms}
low_common_terms = get_unique_top_terms(low_ratings_df, 'cons', seen_terms, num_terms=25)

for ranked_terms in (high_common_terms, medium_common_terms, low_common_terms):
    print([term for term, _ in ranked_terms])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['work', 'company', 'management', 'hours', 'people', 'get', 'long', 'time', 'sometimes', 'much', 'balance', 'hard', 'working', 'many', 'lot', 'like', 'cons', 'good', 'life', 'really', 'career', 'difficult', 'culture', 'job', 'big']
['pay', 'employees', 'managers', 'salary', 'low', 'poor', 'staff', 'lack', 'team', 'bad', 'high', 'little', 'growth', 'one', 'less', 'times', 'environment', 'years', 'politics', 'slow', 'always', 'even', 'need', 'new', 'make']
['training', 'senior', 'manager', 'support', 'progression', 'customers', 'never', 'care', 'business', 'store', 'targets', 'day', 'would', 'sales', 'enough', 'pressure', 'office', 'worked', 'everything', 'communication', 'could', 'paid', 'know', 'hr', 'wage']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def get_top_n_grams(corpus, n=None, n_grams=2):
    """Return the ``n`` most frequent word n-grams in ``corpus``.

    Args:
        corpus: Iterable of documents (strings).
        n: How many ``(ngram, count)`` pairs to return; ``None`` returns all.
        n_grams: Length of the n-grams to count.

    English stop words are dropped by the vectorizer before n-grams are formed.
    """
    vectorizer = CountVectorizer(ngram_range=(n_grams, n_grams), stop_words='english')
    # Column totals of the document-term matrix: one count per n-gram.
    totals = vectorizer.fit_transform(corpus).sum(axis=0)
    ranked = sorted(
        ((ngram, totals[0, col]) for ngram, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Top word trigrams in the 'cons' text of each rating group.
# The calls use n_grams=3, so the printed labels say "trigrams". The
# *_top_bigrams variable names are kept unchanged so that any other cell
# referring to them keeps working.
low_ratings_corpus = [' '.join(row) for row in low_ratings_df['cons']]
low_ratings_top_bigrams = get_top_n_grams(low_ratings_corpus, n=10, n_grams=3)
medium_ratings_corpus = [' '.join(row) for row in medium_ratings_df['cons']]
medium_ratings_top_bigrams = get_top_n_grams(medium_ratings_corpus, n=10, n_grams=3)
high_ratings_corpus = [' '.join(row) for row in high_ratings_df['cons']]
high_ratings_top_bigrams = get_top_n_grams(high_ratings_corpus, n=10, n_grams=3)

print("Top 10 trigrams in low ratings 'cons':", low_ratings_top_bigrams)
print("Top 10 trigrams in medium ratings 'cons':", medium_ratings_top_bigrams)
print("Top 10 trigrams in high ratings 'cons':", high_ratings_top_bigrams)


Top 10 bigrams in low ratings 'cons': [('work life balance', 69), ('high staff turnover', 25), ('poor work life', 15), ('work long hours', 13), ('high turnover staff', 11), ('staff turnover high', 9), ('poor management poor', 9), ('poor management lack', 8), ('good place work', 8), ('poor senior management', 8)]
Top 10 bigrams in medium ratings 'cons': [('work life balance', 5500), ('long working hours', 776), ('poor work life', 353), ('work long hours', 274), ('hours busy season', 216), ('long work hours', 186), ('long hours busy', 169), ('long hours work', 160), ('great place work', 154), ('hours work life', 130)]
Top 10 bigrams in high ratings 'cons': [('work life balance', 577), ('long working hours', 52), ('great place work', 32), ('work long hours', 31), ('poor work life', 28), ('long work hours', 21), ('fast paced environment', 19), ('life balance difficult', 17), ('life balance challenge', 17), ('bureaucracy bureaucracy bureaucracy', 16)]


In [None]:
# Peek at the first few tokenized 'cons' entries in the low-rating group.
print(low_ratings_df["cons"].head())

204     [poor, management, ., no, growth, opportunitie...
466     [they, expected, so, much, ,, but, give, very,...
482     [rigid, managerial, structure, ., strong, deli...
1099                      [can, be, stressful, at, times]
1104    [constant, reorganisations, ., no, apparent, s...
Name: cons, dtype: object


In [None]:
# test vocab/GloVe alignment
#
# Walk the whole vocabulary and compare each word's row in embedding_matrix
# with its GloVe vector: "absent" words have no GloVe entry, "matching" rows
# equal the GloVe vector, anything else is "misaligned".

absent = matches = notmatches = 0

for word, idx in vocab.get_stoi().items():
    glove_embedding = embeddings_dict.get(word)

    if glove_embedding is None:
        absent += 1
    elif np.allclose(embedding_matrix[idx], glove_embedding):
        matches += 1
    else:
        notmatches += 1

print("absent words: ", absent)
print("matching words: ", matches)
print("misaligned words: ", notmatches)


absent words:  577
matching words:  9422
misaligned words:  0


In [None]:
# Create train + test datasets and their DataLoaders

# Fixed random_state so the split (and every metric computed on it) is
# reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

# create tensors from 0-indexed labels (ratings 1-5 -> classes 0-4)
train_labels = torch.tensor(train_df['overall_rating'].values - 1)
test_labels = torch.tensor(test_df['overall_rating'].values - 1)
full_labels = torch.tensor(df['overall_rating'].values - 1)

train_dataset = ReviewDataset(train_df['text'].tolist(), train_labels.tolist(), vocab)
test_dataset = ReviewDataset(test_df['text'].tolist(), test_labels.tolist(), vocab)
full_dataset = ReviewDataset(df['text'].tolist(), full_labels.tolist(), vocab)

batch_size = 16

# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runtimes.
pin_memory = torch.cuda.is_available()

# DataLoader with collate_fn
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn, pin_memory=pin_memory)
# The test loader must NOT shuffle: predictions collected from it are later
# written back onto test_df row by row, which requires the original row order.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_dataset.collate_fn, pin_memory=pin_memory)
full_dataloader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, collate_fn=full_dataset.collate_fn, pin_memory=pin_memory)


In [None]:
# Share of reviews, in percent, at each overall_rating value.
print(df.groupby("overall_rating").size()/len(df) * 100)

overall_rating
1     7.769419
2    10.176196
3    24.703556
4    33.190944
5    24.159886
dtype: float64


In [None]:
# Model hyperparameters
embeddings = embedding_matrix
d_model = embedding_dim  # Embedding dimension
nhead = 6  # Number of heads in MultiHeadAttention
num_encoder_layers = 4  # Number of TransformerEncoder layers
num_classes = 5  # Number of output classes

# Seed before construction so parameter initialisation is repeatable,
# then build the classifier and move it to the selected device.
torch.manual_seed(0)
model = TransformerClassifier(
    embeddings=embeddings,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_classes=num_classes,
).to(device)

# Compile only on the GPUs flagged as supported.
if gpu_ok:
    model = torch.compile(model)



epochs 3, class_weights, no dropout, lr 6e-5, heads 6, layers 4, large dataset => accuracy 0.453

epochs 3, class_weights, no dropout, lr 6e-5, heads 6, layers 4 => accuracy 0.378

epochs 3, class_weights, no dropout, lr 5e-5, heads 6, layers 4 => accuracy 0.374

epochs 3, class_weights, no dropout, lr 5e-5, heads 6, layers 3 => accuracy 0.373

epochs 3, class_weights, no dropout, lr 4e-5, heads 6, layers 4 => accuracy 0.368

epochs 3, class_weights, no dropout, lr 5e-5, heads 6, layers 6 => accuracy 0.367

epochs 3, class_weights, dropout 0.1, lr 5e-5, heads 6, layers 4 => accuracy 0.367

epochs 3, class_weights, no dropout, lr 5e-5, heads 3, layers 3 => accuracy 0.367

epochs 3, class_weights, no dropout, lr 5e-5, heads 2, layers 2 => accuracy 0.36

epochs 3, no class_weights, dropout 0.25, lr 1e-5 => accuracy 0.36 (didn't predict label=1)

epochs 3, no class_weights, dropout 0.25, lr 1e-6 => accuracy 0.33 (didn't predict label=1)

epochs 3, no class_weights, dropout 0.3, lr 1e-5 => accuracy 0.33 (didn't predict label=1,4)

epochs 3, no class_weights, dropout 0.2, lr 1e-5 => accuracy 0.33 (didn't predict label=4)

epochs 3, class_weights, no dropout, lr 1e-5, heads 6, layers 6 => accuracy 0.29

epochs 3, class_weights, dropout 0.2, lr 1e-5, heads 6, layers 6 => accuracy 0.29

epochs 3, class_weights, dropout 0.2, lr 1e-6, heads 6, layers 6 => accuracy 0.25

epochs 3, class_weights, dropout 0.4, lr 1e-5, heads 6, layers 6 => accuracy 0.24

In [None]:
# Training

# 'balanced' class weights computed from the training-split labels.
# np.asarray accepts train_labels whether it is a tensor or a plain list.
train_label_array = np.asarray(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_label_array), y=train_label_array)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

step_size = 3
num_epochs = 6 * step_size # Number of training epochs
optimizer = torch.optim.AdamW(model.parameters(), lr=6e-5)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# "eval"   -> fit on the training split only, keeping the test split held out.
# any other value (e.g. "predict") -> fit on the full dataset, which includes
# the test split, so a later evaluation on that split is no longer held-out.
mode = "predict"
dl = train_dataloader if mode == "eval" else full_dataloader

model.train()

# Predicted class ids and labels of the most recent epoch
all_predictions = []
all_labels = []

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    # Start fresh each epoch so the final arrays describe the last epoch only,
    # rather than mixing predictions from every earlier, less-trained epoch.
    all_predictions = []
    all_labels = []

    progress_bar = tqdm(dl, desc=f'Epoch {epoch + 1}/{num_epochs}')
    for batch_idx, (inputs, labels) in enumerate(progress_bar, start=1):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass, backward pass, and optimize
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        # Calculate loss and accuracy
        total_loss += loss.item()
        predicted = output.detach().argmax(dim=1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        # loss.item() is already a per-batch mean, so the running average
        # divides by batches seen (same unit as the end-of-epoch figure).
        avg_loss = total_loss / batch_idx
        accuracy = correct_predictions / total_predictions * 100
        progress_bar.set_postfix(loss=avg_loss, accuracy=f'{accuracy:.2f}%')

        # Keep detached class ids on the CPU: no autograd history is retained
        # and GPU memory does not grow with the number of batches.
        all_predictions.append(predicted.cpu())
        all_labels.append(labels.cpu())

    # Print average loss and accuracy for this epoch
    avg_loss = total_loss / len(dl)
    accuracy = correct_predictions / total_predictions * 100
    print(f'End of Epoch {epoch+1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Concatenate the last epoch's predicted class ids and labels
all_predictions = torch.cat(all_predictions)
all_labels = torch.cat(all_labels)

# Convert tensors to numpy arrays for metric calculation
all_predictions_np = all_predictions.numpy()
all_labels_np = all_labels.numpy()



  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,
Epoch 1/18: 100%|██████████| 6921/6921 [01:32<00:00, 74.66it/s, accuracy=29.27%, loss=0.0944]


End of Epoch 1, Loss: 1.5101, Accuracy: 29.27%


Epoch 2/18: 100%|██████████| 6921/6921 [01:08<00:00, 101.27it/s, accuracy=35.78%, loss=0.0877]


End of Epoch 2, Loss: 1.4027, Accuracy: 35.78%


Epoch 3/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.44it/s, accuracy=38.54%, loss=0.0843] 


End of Epoch 3, Loss: 1.3492, Accuracy: 38.54%


Epoch 4/18: 100%|██████████| 6921/6921 [01:08<00:00, 101.02it/s, accuracy=39.93%, loss=0.0822]


End of Epoch 4, Loss: 1.3151, Accuracy: 39.93%


Epoch 5/18: 100%|██████████| 6921/6921 [01:09<00:00, 100.24it/s, accuracy=41.01%, loss=0.0805]


End of Epoch 5, Loss: 1.2884, Accuracy: 41.01%


Epoch 6/18: 100%|██████████| 6921/6921 [01:09<00:00, 98.88it/s, accuracy=42.06%, loss=0.0792]


End of Epoch 6, Loss: 1.2664, Accuracy: 42.06%


Epoch 7/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.38it/s, accuracy=42.66%, loss=0.078]


End of Epoch 7, Loss: 1.2485, Accuracy: 42.66%


Epoch 8/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.42it/s, accuracy=43.35%, loss=0.0769] 


End of Epoch 8, Loss: 1.2296, Accuracy: 43.35%


Epoch 9/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.56it/s, accuracy=43.74%, loss=0.0759]


End of Epoch 9, Loss: 1.2136, Accuracy: 43.74%


Epoch 10/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.63it/s, accuracy=44.22%, loss=0.0751] 


End of Epoch 10, Loss: 1.2013, Accuracy: 44.22%


Epoch 11/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.90it/s, accuracy=44.79%, loss=0.0742] 


End of Epoch 11, Loss: 1.1873, Accuracy: 44.79%


Epoch 12/18: 100%|██████████| 6921/6921 [01:08<00:00, 100.40it/s, accuracy=45.10%, loss=0.0735]


End of Epoch 12, Loss: 1.1759, Accuracy: 45.10%


Epoch 13/18: 100%|██████████| 6921/6921 [01:10<00:00, 98.49it/s, accuracy=45.57%, loss=0.0727] 


End of Epoch 13, Loss: 1.1625, Accuracy: 45.57%


Epoch 14/18: 100%|██████████| 6921/6921 [01:09<00:00, 100.10it/s, accuracy=46.00%, loss=0.0719]


End of Epoch 14, Loss: 1.1498, Accuracy: 46.00%


Epoch 15/18: 100%|██████████| 6921/6921 [01:09<00:00, 100.28it/s, accuracy=46.46%, loss=0.0712]


End of Epoch 15, Loss: 1.1396, Accuracy: 46.46%


Epoch 16/18: 100%|██████████| 6921/6921 [01:10<00:00, 98.49it/s, accuracy=46.87%, loss=0.0704] 


End of Epoch 16, Loss: 1.1262, Accuracy: 46.87%


Epoch 17/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.32it/s, accuracy=47.26%, loss=0.0697] 


End of Epoch 17, Loss: 1.1147, Accuracy: 47.26%


Epoch 18/18: 100%|██████████| 6921/6921 [01:09<00:00, 99.15it/s, accuracy=47.40%, loss=0.0689]


End of Epoch 18, Loss: 1.1028, Accuracy: 47.40%


In [None]:
# Confusion matrix over the five 0-indexed classes (true labels passed first).
print(confusion_matrix(all_labels_np, all_predictions_np, labels=list(range(5))))

[[ 99497  28494  15681   4216   6966]
 [ 52796  66999  51719  17267  14043]
 [ 44662  78390 164935 122948  81437]
 [ 23009  40990 148983 226247 222307]
 [ 13223  11993  50804 110579 294937]]


In [None]:
# Load model from disk
#
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can still be restored on a CPU-only one.
checkpoint_path = "model_state_dict.pth"
if os.path.isfile(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))


In [None]:
# Evaluation

# Switch the model to evaluation mode before scoring the test split.
model.eval()

# Per-batch logits and labels, concatenated after the loop
batch_logits = []
batch_targets = []

# No gradients are needed for scoring
with torch.no_grad():
    for inputs, labels in tqdm(test_dataloader, desc='Evaluating model'):
        # Move the batch to the same device as the model
        inputs = inputs.to(device)
        labels = labels.to(device)

        batch_logits.append(model(inputs))
        batch_targets.append(labels)

# Join the batches and bring everything back to the CPU
all_labels = torch.cat(batch_targets).cpu()
# Logits -> predicted class index per row
all_predictions = torch.max(torch.cat(batch_logits).cpu(), 1).indices

# Numpy views for the sklearn / pandas code that follows
all_predictions_np = all_predictions.numpy()
all_labels_np = all_labels.numpy()


In [None]:
# Compare feature distributions of correctly vs incorrectly predicted test rows.
#
# Precondition: all_predictions_np must be in the same row order as test_df,
# i.e. it was produced by a non-shuffled pass over the test split.
assert len(all_predictions_np) == len(test_df), "predictions do not cover test_df row-for-row"

# The model predicts class indices 0-4 while overall_rating is 1-5; shift the
# predictions onto the rating scale before comparing. assign() returns a new
# frame, so no value is written into a slice of df.
test_df = test_df.assign(prediction=all_predictions_np + 1)
is_correct = test_df["overall_rating"] == test_df["prediction"]
correct_df = test_df[is_correct]
incorrect_df = test_df[~is_correct]

features = ['pros_tokens', 'cons_tokens', 'text_tokens', 'headline_tokens', 'total_tokens']

plt.figure(figsize=(15, 10))

for i, feature in enumerate(features, 1):
    plt.subplot(2, 3, i)
    sns.histplot(correct_df[feature], color="blue", label="Correct Predictions", kde=True, stat="density", linewidth=0)
    sns.histplot(incorrect_df[feature], color="red", label="Incorrect Predictions", kde=True, stat="density", linewidth=0)
    plt.title(f"Distribution of {feature}")
    plt.legend()

plt.tight_layout()
plt.savefig('graph5.png')
plt.close()

In [None]:
# One box plot per token-count feature, grouped by overall rating (graph 6).
fig = plt.figure(figsize=(15, 10))

token_features = ['pros_tokens', 'cons_tokens', 'text_tokens', 'headline_tokens', 'total_tokens']
for i, feature in enumerate(token_features, 1):
    ax = fig.add_subplot(2, 3, i)
    sns.boxplot(x='overall_rating', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} by Overall Rating')
    ax.set_xlabel('Overall Rating')
    ax.set_ylabel(feature)

fig.tight_layout()
fig.savefig('graph6.png')
plt.close(fig)

In [None]:
# Save model to disk
# Only the parameter state dict is written (not the module object), to the
# same file name the load cell checks for.
torch.save(model.state_dict(), 'model_state_dict.pth')

In [None]:
# Sanity checks: every class was predicted at least once, and the prediction
# and label arrays line up one-to-one.
assert set(all_predictions_np) == {0, 1, 2, 3, 4}
assert len(all_predictions_np) == len(all_labels_np)

# Accuracy = matching positions / total positions
correct = sum(1 for pred, actual in zip(all_predictions_np, all_labels_np) if pred == actual)
print(f'Accuracy: {correct/len(all_labels_np)}')

Accuracy: 0.4277786307110152


In [None]:
pred_df = pd.read_csv("/content/424_F2023_Final_PC_glassdoor_test_without_response_v1.csv", low_memory=False)

# Same preprocessing as the training data: tokenize, cap each column at 100
# tokens, concatenate, then pad the result out to 200 tokens.
for column in ("pros", "cons"):
    pred_df[column] = pred_df[column].apply(lambda x: tokenizer(str(x))[:100])
pred_df["text"] = (pred_df["pros"] + pred_df["cons"]).apply(
    lambda tokens: tokens + [specials["padding"]] * (200 - len(tokens))
)

# This file has no response column; the dataset still needs one label per
# row, so zeros are supplied and ignored during prediction.
pred_labels = torch.zeros(len(pred_df))  # Dummy labels

pred_dataset = ReviewDataset(pred_df['text'].tolist(), pred_labels.tolist(), vocab)
pred_dataloader = DataLoader(pred_dataset, batch_size=batch_size, shuffle=False, collate_fn=pred_dataset.collate_fn, pin_memory=True)

# Set the model to evaluation mode
model.eval()

# Predicted class index for every row, in file order
predictions = []

with torch.no_grad():
    for inputs, _ in tqdm(pred_dataloader, desc='Evaluating model'):
        outputs = model(inputs.to(device))
        # Highest-scoring class per row
        predictions.extend(torch.argmax(outputs.data, dim=1).cpu().numpy())

# Account for the 0-indexed labels from earlier: classes 0-4 -> ratings 1-5
predictions = [pred + 1 for pred in predictions]
preds_df = pd.DataFrame(predictions)
preds_df.to_csv("predictions.csv", index=False, header=False)

Evaluating model: 100%|██████████| 3561/3561 [00:10<00:00, 327.91it/s]


In [None]:
features = ['pros_tokens', 'cons_tokens', 'text_tokens', 'headline_tokens', 'total_tokens']

# graph 2: pairwise correlation of the token-count features on the training split
plt.figure(figsize=(10, 8))
corr_matrix = train_df[features].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap Among Features")
plt.savefig('graph2.png')  # Specify your save path here
plt.close()

# graph 3: feature distributions, training split vs test split
plt.figure(figsize=(15, 10))

for i, feature in enumerate(features, 1):
    plt.subplot(2, 3, i)
    sns.histplot(train_df[feature], color="blue", label="Training", kde=True, stat="density", linewidth=0)
    sns.histplot(test_df[feature], color="red", label="Test", kde=True, stat="density", linewidth=0)
    plt.title(f"Distribution of {feature}")
    plt.legend()

plt.tight_layout()
plt.savefig('graph3.png')  # Specify your save path here
plt.close()

# graph 4: predicted vs actual rating distributions
#
# all_predictions / all_labels hold 0-indexed class ids, while `predictions`
# already holds 1-5 ratings; shift the former by one so all three series share
# the same 1-5 axis. Separate local names are used so the `train_labels`
# tensor needed by the training cell is not overwritten with a list.
# NOTE(review): all_predictions / all_labels come from whichever cell assigned
# them last (training or evaluation) - confirm the legend labels still match.
train_preds = (all_predictions + 1).tolist()
train_actuals = (all_labels + 1).tolist()

plt.figure(figsize=(10, 4))
sns.histplot(train_preds, color="green", label="Train Predictions", kde=True, stat="density", linewidth=0)
sns.histplot(train_actuals, color="blue", label="Actual Train Values", kde=True, stat="density", linewidth=0)
sns.histplot(predictions, color="red", label="Test Predictions", kde=True, stat="density", linewidth=0)
plt.title("Distribution of Predictions and Actual Values")
plt.legend()
plt.savefig('graph4.png')  # Specify your save path here
plt.close()

