### MSDS 453 - Final Project
Summer 2024
Akhilesh Nair

Import Necessary Libraries

In [53]:
import tensorflow as tf
import pandas as pd
import numpy
import sklearn
import transformers
import torch
import re
import os

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification, AdamW

Load in google dataset: GeoEmotions

In [16]:
dataset1 = pd.read_csv('https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv')
dataset2 = pd.read_csv('https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv')
dataset3 = pd.read_csv('https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv')

frame = [dataset1, dataset2, dataset3]
full_data = pd.concat(frame)

In [17]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211225 entries, 0 to 71224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-null  int64  
 15  confus

Cleaning the data

In [19]:
# function to convert text to lowercase while also removing special char, nums and punctuation
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

full_data['cleaned_text'] = full_data['text'].apply(clean_text)

In [20]:
full_data.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,cleaned_text
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,that game hurt
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,sexuality shouldn t be a grouping category it...
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,you do right if you don t care then fuck em
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,man i love reddit
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,name was nowhere near them he was by the falcon


Now, let's drop all the useless columns, like the author, time created, rater approval, link_id, parent_id, rater_id etc... I will keep the subreddit for reference though.

In [22]:
full_data.drop(['id', 'author', 'link_id', 'parent_id', 'created_utc', 'rater_id'], axis=1, inplace=True)

In [24]:
full_data.head(5)

Unnamed: 0,text,subreddit,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,cleaned_text
0,That game hurt.,nrl,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,that game hurt
1,>sexuality shouldn’t be a grouping category I...,unpopularopinion,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sexuality shouldn t be a grouping category it...
2,"You do right, if you don't care then fuck 'em!",confessions,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,you do right if you don t care then fuck em
3,Man I love reddit.,facepalm,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,man i love reddit
4,"[NAME] was nowhere near them, he was by the Fa...",starwarsspeculation,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,name was nowhere near them he was by the falcon


As the data is already encoded via one hot encoding, we can move on to splitting the test and train set.

In [29]:
emotion_columns = [col for col in full_data.columns[3:31]]

In [32]:
X = full_data['cleaned_text']
y = full_data[emotion_columns]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 168980 samples
Testing set size: 42245 samples


Now, let us tokenize the data.

In [37]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)


train_encodings = {key: torch.tensor(val) for key, val in train_encodings.items()}
test_encodings = {key: torch.tensor(val) for key, val in test_encodings.items()}

train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [40]:
# Create a TensorDataset directly
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Now that our data is good and ready, let's start training 

In [42]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y_train.shape[1])

# This will move us to the gpu --- THIS DOES NOT MOVE US TO THE GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [43]:
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()



In [None]:
# from tensorflow.keras import layers, regularizers

# k.clear_session()
# inputs = tf.keras.Input(shape=(None,), dtype="int64")
# embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
# x = layers.LSTM(
#     32,
#     kernel_regularizer=regularizers.l2(0.01)
# )(embedded)
# x = layers.Dropout(0.5)(x)  #
# outputs = layers.Dense(4, activation="softmax")(x)
# model = tf.keras.Model(inputs, outputs)
# model.compile(optimizer="rmsprop",
#               loss="SparseCategoricalCrossentropy",
#               metrics=["accuracy"])
# model.summary()

# callbacks = [
#     tf.keras.callbacks.ModelCheckpoint("LSTM_L2_Dropout.keras", save_best_only=True),
#     tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)
# ]
# history = model.fit(int_train_ds, validation_data=int_val_ds, epochs=200, callbacks=callbacks)
# model = keras.models.load_model("LSTM_L2_Dropout.keras")
# print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

In [57]:
import torch.nn as nn

# Ensure you're using the right loss function for multi-label classification
loss_fn = nn.BCEWithLogitsLoss()

# During training
with tf.device('/device:GPU:0'):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    print(torch.cuda.is_available())  # This should return True if GPU is accessible
    print(torch.version.cuda)  # This should return the CUDA version PyTorch is using
    print('in the while loop')
    for epoch in range(3):
        print('enter loop 1')
        model.train()
        total_loss = 0
        print('exit loop 1')
        for batch in train_loader:
            print('enter loop 2')
            # Move batch to device
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Calculate loss
            loss = loss_fn(logits, labels.float())  # Make sure labels are in the correct format (float)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            print('exit loop 2')
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")
print('out of the while loop')

Using device: cpu
False
None
in the while loop
enter loop 1
exit loop 1
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
enter loop 2
exit loop 2
ent

KeyboardInterrupt: 

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

For some reason, I couldn't switch the processing device for the tensor stuff to the GPU. The fitting time was running upwards of 5 hours when I decided to stop it. I think it would have gone for many 100+ hours. I have way too many data points, will have to take a subset to train the model. Another option is to do a smaller scale version of BERT, known as distillBERT.