In [None]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import pickle as pkl
import pandas as pd
import os

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [None]:
# !pip install -U torchtext==0.12.0

Collecting torchtext==0.12.0
  Downloading torchtext-0.12.0-cp310-cp310-manylinux1_x86_64.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting torch==1.11.0 (from torchtext==0.12.0)
  Downloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 2.2.1+cu121
    Uninstalling torch-2.2.1+cu121:
      Successfully uninstalled torch-2.2.1+cu121
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.17.1
    Uninstalling torchtext-0.17.1:
      Successfully uninstalled torchtext-0.17.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

# Accessing Data

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the dataset
# stored there can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the emotions CSV from the mounted Drive into a DataFrame.
transformers_df = pd.read_csv('/content/drive/MyDrive/new_emotions_df.csv')

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index that was saved with the CSV).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
transformers_df = transformers_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display the DataFrame after the column drop (pandas elides the middle rows when rendering).
transformers_df

Unnamed: 0,text,label
0,im sick with allergies and feeling horrible,0
1,i feel the music hit me in a vain attempt to k...,0
2,i feel terribly helpless and thus i am putting...,0
3,im feeling like ive missed you all this time s...,0
4,im finding it harder and harder every day to c...,0
...,...,...
8995,I thought I was the only one! I’m currently go...,8
8996,"Sometimes it's a survival skill, health wise I...",8
8997,Quite dudes i hang shit on..usually it be shut...,8
8998,I could say I’ve been in similar situations wh...,8


# Train Test Split on sample of data

In [None]:
# Draw a fixed-seed random subset of 7000 rows and renumber the index from 0.
data_sample = transformers_df.sample(
    n=7000,
    ignore_index=True,
    random_state=19104,
)

In [None]:
# Labels of the sampled rows as a NumPy array, and their texts as a plain Python list.
label_sample = np.array(data_sample['label'])
text_sample = data_sample['text'].tolist()

In [None]:
# Hold out 30% of the sample for testing. Fixing random_state makes the shuffled
# split -- and every score computed from it further down -- reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data_sample['text'],
    data_sample['label'],
    test_size=0.3,
    shuffle=True,
    random_state=19104,
)

In [None]:
# Texts of both splits as plain Python lists.
x_train_list, x_test_list = x_train.tolist(), x_test.tolist()

# Labels of both splits as NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

# **BERT**

In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer, BertModel, TFBertModel

In [None]:
# Load the BertTokenizer published with the 'bhadresh-savani/bert-base-uncased-emotion' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bhadresh-savani/bert-base-uncased-emotion')

In [None]:
# Maximum number of tokens kept per example; longer texts are truncated to this length.
max_seq_length = 512

# Tokenize both splits. Texts are still truncated at max_seq_length, but each split is
# padded only up to its longest example (padding='longest') instead of always to 512
# tokens. Padded positions are excluded via the attention mask, so padding every row
# to max_length only adds compute and memory to each forward pass.
tokenized_text_train = tokenizer(
    x_train_list,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=max_seq_length,
)
tokenized_text_test = tokenizer(
    x_test_list,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=max_seq_length,
)

### Train BERT

In [None]:
# Load the BERT encoder; it is only used for gradient-free forward passes in this cell.
model = BertModel.from_pretrained('bhadresh-savani/bert-base-uncased-emotion')

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Number of examples sent through the model per forward pass.
gpu_batch_size = 64

train_input_ids = tokenized_text_train['input_ids']
train_attention_mask = tokenized_text_train['attention_mask']

all_pooled_representations = []
for start in range(0, len(x_train_list), gpu_batch_size):
    rows = slice(start, start + gpu_batch_size)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs_train_gpu = model(
            input_ids=train_input_ids[rows].to(device),
            attention_mask=train_attention_mask[rows].to(device),
        )

    # Keep the model's pooled output for this batch.
    all_pooled_representations.append(outputs_train_gpu.pooler_output)

# Join the per-batch outputs along dimension 0 into one tensor for the training split.
final_pooled_representation_train = torch.cat(all_pooled_representations, dim=0)

### Test BERT

In [None]:
# Reload the BERT encoder so this cell does not depend on what `model` currently holds.
model = BertModel.from_pretrained('bhadresh-savani/bert-base-uncased-emotion')

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Number of examples sent through the model per forward pass.
gpu_batch_size = 64

test_input_ids = tokenized_text_test['input_ids']
test_attention_mask = tokenized_text_test['attention_mask']

all_pooled_representations = []
for start in range(0, len(x_test_list), gpu_batch_size):
    rows = slice(start, start + gpu_batch_size)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs_test_gpu = model(
            input_ids=test_input_ids[rows].to(device),
            attention_mask=test_attention_mask[rows].to(device),
        )

    # Keep the model's pooled output for this batch.
    all_pooled_representations.append(outputs_test_gpu.pooler_output)

# Join the per-batch outputs along dimension 0 into one tensor for the test split.
final_pooled_representation_test = torch.cat(all_pooled_representations, dim=0)

In [None]:
# Bring the train and test BERT embeddings to the CPU and convert them to NumPy arrays.
final_pooled_representation_train_cpu, final_pooled_representation_test_cpu = (
    embeddings.cpu().numpy()
    for embeddings in (final_pooled_representation_train, final_pooled_representation_test)
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the BERT embeddings of the training split.
# The estimator is created once; a default-constructed instance that was immediately
# overwritten has been removed. max_iter is raised to 5000, presumably so the solver
# has room to converge on these features.
logistic_reg = LogisticRegression(max_iter=5000)
logistic_reg.fit(final_pooled_representation_train_cpu, y_train)

# Predict labels for the test-split embeddings.
predictions_LR = logistic_reg.predict(final_pooled_representation_test_cpu)

In [None]:
# LR accuracy and weighted F1 on the test split, as percentages.
# scikit-learn metrics take (y_true, y_pred). Weighted F1 weights each class by its
# support in y_true, so the ground-truth labels must be the first argument; passing
# the predictions first weights classes by how often they were predicted.
LR_accuracy = accuracy_score(y_test, predictions_LR)*100
print("Logistic Regression accuracy score: ", LR_accuracy)

LR_F1 = f1_score(y_test, predictions_LR, average = 'weighted')*100
print("Logistic Regression F1 Score: ", LR_F1)

Logistic Regression accuracy score:  85.33333333333334
Logistic Regression F1 Score:  85.5847924385294


## SVM

In [None]:
from sklearn.svm import SVC

# Default-parameter SVC trained on the BERT training embeddings (fit() returns the estimator).
svm_model = SVC().fit(final_pooled_representation_train_cpu, y_train)

# Labels predicted for the BERT test embeddings.
predictions_SVM = svm_model.predict(final_pooled_representation_test_cpu)

In [None]:
# SVM accuracy and weighted F1 on the test split, as percentages.
# scikit-learn metrics take (y_true, y_pred). Weighted F1 weights each class by its
# support in y_true, so the ground-truth labels must be the first argument.
SVM_accuracy = accuracy_score(y_test, predictions_SVM)*100
SVM_f1 = f1_score(y_test, predictions_SVM, average = 'weighted')*100
print("SVM accuracy score: ", SVM_accuracy)
print("SVM F1 Score: ", SVM_f1)

SVM accuracy score:  78.61904761904762
SVM F1 Score:  79.63745892300811


# **ELECTRA**

In [None]:
from transformers import ElectraModel, ElectraTokenizer

# Tokenizer and encoder both come from the same Hub checkpoint.
# NOTE: this rebinds `tokenizer` and `model`, which held the BERT objects until now.
electra_checkpoint = 'mudogruer/electra-emotion'
tokenizer = ElectraTokenizer.from_pretrained(electra_checkpoint)
model = ElectraModel.from_pretrained(electra_checkpoint)

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Maximum number of tokens kept per example; longer texts are truncated to this length.
max_seq_length = 512

# Tokenize both splits with the ELECTRA tokenizer. Texts are still truncated at
# max_seq_length, but each split is padded only up to its longest example
# (padding='longest') instead of always to 512 tokens. Padded positions are excluded
# via the attention mask, so padding every row to max_length only adds compute and
# memory to each forward pass.
tokenized_text_train_ELECTRA = tokenizer(
    x_train_list,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=max_seq_length,
)
tokenized_text_test_ELECTRA = tokenizer(
    x_test_list,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=max_seq_length,
)

### Train Electra

In [None]:
# Number of examples sent through the model per forward pass.
gpu_batch_size = 64

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

electra_train_ids = tokenized_text_train_ELECTRA['input_ids']
electra_train_mask = tokenized_text_train_ELECTRA['attention_mask']

all_pooled_representations = []
for start in range(0, len(x_train_list), gpu_batch_size):
    rows = slice(start, start + gpu_batch_size)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs_train_gpu = model(
            input_ids=electra_train_ids[rows].to(device),
            attention_mask=electra_train_mask[rows].to(device),
        )

    # Use the hidden state at sequence position 0 (presumably the [CLS] token) as the
    # per-example representation.
    all_pooled_representations.append(outputs_train_gpu.last_hidden_state[:, 0, :])

# Join the per-batch outputs along dimension 0 into one tensor for the training split.
final_pooled_representation_train_ELECTRA = torch.cat(all_pooled_representations, dim=0)

### Test Electra

In [None]:
# Number of examples sent through the model per forward pass.
gpu_batch_size = 64

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

electra_test_ids = tokenized_text_test_ELECTRA['input_ids']
electra_test_mask = tokenized_text_test_ELECTRA['attention_mask']

all_pooled_representations = []
for start in range(0, len(x_test_list), gpu_batch_size):
    rows = slice(start, start + gpu_batch_size)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs_test_gpu = model(
            input_ids=electra_test_ids[rows].to(device),
            attention_mask=electra_test_mask[rows].to(device),
        )

    # Use the hidden state at sequence position 0 (presumably the [CLS] token) as the
    # per-example representation.
    all_pooled_representations.append(outputs_test_gpu.last_hidden_state[:, 0, :])

# Join the per-batch outputs along dimension 0 into one tensor for the test split.
final_pooled_representation_test_ELECTRA = torch.cat(all_pooled_representations, dim=0)

In [None]:
# Bring the train and test ELECTRA embeddings to the CPU and convert them to NumPy arrays.
final_pooled_representation_train_ELECTRA_cpu, final_pooled_representation_test_ELECTRA_cpu = (
    embeddings.cpu().numpy()
    for embeddings in (
        final_pooled_representation_train_ELECTRA,
        final_pooled_representation_test_ELECTRA,
    )
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the ELECTRA embeddings of the training split.
# The estimator is created once; a default-constructed instance that was immediately
# overwritten has been removed. max_iter is raised to 5000, presumably so the solver
# has room to converge on these features.
logistic_reg = LogisticRegression(max_iter=5000)
logistic_reg.fit(final_pooled_representation_train_ELECTRA_cpu, y_train)

# Predict labels for the test-split embeddings.
predictions_LR = logistic_reg.predict(final_pooled_representation_test_ELECTRA_cpu)

In [None]:
# LR accuracy and weighted F1 on the test split (ELECTRA features), as percentages.
# scikit-learn metrics take (y_true, y_pred). Weighted F1 weights each class by its
# support in y_true, so the ground-truth labels must be the first argument; passing
# the predictions first weights classes by how often they were predicted.
LR_accuracy = accuracy_score(y_test, predictions_LR)*100
print("Logistic Regression accuracy score: ", LR_accuracy)

LR_F1 = f1_score(y_test, predictions_LR, average = 'weighted')*100
print("Logistic Regression F1 Score: ", LR_F1)

Logistic Regression accuracy score:  85.09523809523809
Logistic Regression F1 Score:  85.20923812063636


## SVM

In [None]:
from sklearn.svm import SVC

# Default-parameter SVC trained on the ELECTRA training embeddings (fit() returns the estimator).
svm_model = SVC().fit(final_pooled_representation_train_ELECTRA_cpu, y_train)

# Labels predicted for the ELECTRA test embeddings.
predictions_SVM = svm_model.predict(final_pooled_representation_test_ELECTRA_cpu)

In [None]:
# SVM accuracy and weighted F1 on the test split (ELECTRA features), as percentages.
# scikit-learn metrics take (y_true, y_pred). Weighted F1 weights each class by its
# support in y_true, so the ground-truth labels must be the first argument.
SVM_accuracy = accuracy_score(y_test, predictions_SVM)*100
SVM_f1 = f1_score(y_test, predictions_SVM, average = 'weighted')*100
print("SVM accuracy score: ", SVM_accuracy)
print("SVM F1 Score: ", SVM_f1)

SVM accuracy score:  79.95238095238095
SVM F1 Score:  80.43381069884346


# **DistilBERT**

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer

# Tokenizer and encoder both come from the same Hub checkpoint.
distilbert_checkpoint = 'Rahmat82/DistilBERT-finetuned-on-emotion'
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)
distilbert_model = DistilBertModel.from_pretrained(distilbert_checkpoint)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/862 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Maximum number of tokens kept per example; longer texts are truncated to this length.
max_seq_length = 512

# Tokenize both splits with the DistilBERT tokenizer. Texts are still truncated at
# max_seq_length, but each split is padded only up to its longest example
# (padding='longest') instead of always to 512 tokens. Padded positions are excluded
# via the attention mask, so padding every row to max_length only adds compute and
# memory to each forward pass.
tokenized_text_train_distilBERT = distilbert_tokenizer(
    x_train_list,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=max_seq_length,
)
tokenized_text_test_distilBERT = distilbert_tokenizer(
    x_test_list,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=max_seq_length,
)

## Train DistilBERT

In [None]:
# Number of examples sent through the model per forward pass.
gpu_batch_size = 64

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
distilbert_model.to(device)

distilbert_train_ids = tokenized_text_train_distilBERT['input_ids']
distilbert_train_mask = tokenized_text_train_distilBERT['attention_mask']

all_pooled_representations = []
for start in range(0, len(x_train_list), gpu_batch_size):
    rows = slice(start, start + gpu_batch_size)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs_train_gpu = distilbert_model(
            input_ids=distilbert_train_ids[rows].to(device),
            attention_mask=distilbert_train_mask[rows].to(device),
        )

    # Use the hidden state at sequence position 0 (presumably the [CLS] token) as the
    # per-example representation.
    all_pooled_representations.append(outputs_train_gpu.last_hidden_state[:, 0, :])

# Join the per-batch outputs along dimension 0 into one tensor for the training split.
final_pooled_representation_train_distilBERT = torch.cat(all_pooled_representations, dim=0)

## Test DistilBERT

In [None]:
# Number of examples sent through the model per forward pass.
gpu_batch_size = 64

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
distilbert_model.to(device)

distilbert_test_ids = tokenized_text_test_distilBERT['input_ids']
distilbert_test_mask = tokenized_text_test_distilBERT['attention_mask']

all_pooled_representations = []
for start in range(0, len(x_test_list), gpu_batch_size):
    rows = slice(start, start + gpu_batch_size)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs_test_gpu = distilbert_model(
            input_ids=distilbert_test_ids[rows].to(device),
            attention_mask=distilbert_test_mask[rows].to(device),
        )

    # Use the hidden state at sequence position 0 (presumably the [CLS] token) as the
    # per-example representation.
    all_pooled_representations.append(outputs_test_gpu.last_hidden_state[:, 0, :])

# Join the per-batch outputs along dimension 0 into one tensor for the test split.
final_pooled_representation_test_distilBERT = torch.cat(all_pooled_representations, dim=0)

In [None]:
# Bring the train and test DistilBERT embeddings to the CPU and convert them to NumPy arrays.
final_pooled_representation_train_distilBERT_cpu, final_pooled_representation_test_distilBERT_cpu = (
    embeddings.cpu().numpy()
    for embeddings in (
        final_pooled_representation_train_distilBERT,
        final_pooled_representation_test_distilBERT,
    )
)

## Logistic Regression

In [None]:
# Fit logistic regression on the DistilBERT embeddings of the training split.
# The estimator is created once; a default-constructed instance that was immediately
# overwritten has been removed. max_iter is raised to 5000, presumably so the solver
# has room to converge on these features.
logistic_reg = LogisticRegression(max_iter=5000)
logistic_reg.fit(final_pooled_representation_train_distilBERT_cpu, y_train)

# Predict labels for the test-split embeddings.
predictions_LR = logistic_reg.predict(final_pooled_representation_test_distilBERT_cpu)

In [None]:
# LR accuracy and weighted F1 on the test split (DistilBERT features), as percentages.
# scikit-learn metrics take (y_true, y_pred). Weighted F1 weights each class by its
# support in y_true, so the ground-truth labels must be the first argument; passing
# the predictions first weights classes by how often they were predicted.
LR_accuracy = accuracy_score(y_test, predictions_LR)*100
print("Logistic Regression accuracy score: ", LR_accuracy)

LR_F1 = f1_score(y_test, predictions_LR, average = 'weighted')*100
print("Logistic Regression F1 Score: ", LR_F1)

Logistic Regression accuracy score:  86.33333333333333
Logistic Regression F1 Score:  86.37040279106559


## SVM

In [None]:
# Default-parameter SVC trained on the DistilBERT training embeddings (fit() returns the estimator).
svm_model = SVC().fit(final_pooled_representation_train_distilBERT_cpu, y_train)

# Labels predicted for the DistilBERT test embeddings.
predictions_SVM = svm_model.predict(final_pooled_representation_test_distilBERT_cpu)

In [None]:
# SVM accuracy and weighted F1 on the test split (DistilBERT features), as percentages.
# scikit-learn metrics take (y_true, y_pred). Weighted F1 weights each class by its
# support in y_true, so the ground-truth labels must be the first argument.
SVM_accuracy = accuracy_score(y_test, predictions_SVM)*100
SVM_f1 = f1_score(y_test, predictions_SVM, average = 'weighted')*100
print("SVM accuracy score: ", SVM_accuracy)
print("SVM F1 Score: ", SVM_f1)

SVM accuracy score:  82.85714285714286
SVM F1 Score:  83.21012835747877
