In [77]:
!pip install torch transformers datasets scikit-learn pandas numpy tqdm




In [78]:
import numpy as np
import torch
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [65]:
# Load the IMDB dataset and materialise its training split as a DataFrame.
# Dataset.to_pandas() converts the underlying table in one step instead of
# having pandas iterate over every example as a Python dict.
dataset = load_dataset("imdb")
df = dataset['train'].to_pandas()

In [66]:
# Build a balanced 200-review sample: 100 rows with label 1 followed by
# 100 rows with label 0, each drawn with a fixed seed for reproducibility.
df_positive = df.loc[df['label'].eq(1)].sample(n=100, random_state=42)
df_negative = df.loc[df['label'].eq(0)].sample(n=100, random_state=42)
balanced_df = pd.concat(objs=[df_positive, df_negative])

In [67]:
# Pull the review texts and their labels out of the sample as plain Python lists
texts = balanced_df.loc[:, 'text'].to_list()
labels = balanced_df.loc[:, 'label'].to_list()

## Use a Pre-trained RoBERTa Sentiment Model for Prediction

In [68]:
# Load the tokenizer, configuration and sequence-classification weights of the
# pre-trained sentiment checkpoint named by MODEL (a RoBERTa model, despite the
# `bert_model` variable name).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
bert_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
def return_sentiment(txt):
    """Return 1 if the sentiment model scores ``txt`` as positive, otherwise 0.

    The text is tokenized (truncated to at most 510 tokens), passed through the
    classifier, and the resulting logits are turned into probabilities with a
    softmax. The score at index 2 is compared with the score at index 0; the
    score at index 1 is ignored.
    NOTE(review): indices 0/1/2 are assumed to be negative/neutral/positive for
    this checkpoint -- confirm against ``config.id2label``.

    This function reads the module-level ``tokenizer`` and ``bert_model`` at
    call time, so both names must still refer to the sentiment classifier and
    its tokenizer when it is called.

    Args:
        txt: The text to classify.

    Returns:
        int: 1 when the index-2 score is strictly greater than the index-0
        score, 0 otherwise (ties yield 0).
    """
    encoded_input = tokenizer(txt, return_tensors='pt', padding=True, truncation=True, max_length=510)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = bert_model(**encoded_input)

    # First (and only) row of the batch of logits.
    score = output.logits[0].numpy()
    scores = softmax(score)

    return int(scores[2] > scores[0])

In [79]:
# Run the sentiment classifier over every sampled review, showing a progress bar
predictions = [return_sentiment(review) for review in tqdm(texts)]


100%|██████████| 200/200 [05:24<00:00,  1.62s/it]


In [80]:
# Compare the classifier's predictions with the gold labels:
# confusion matrix first, then per-class precision/recall/F1.
print("Confusion Matrix for BERT model:")
bert_conf_matrix = confusion_matrix(y_true=labels, y_pred=predictions)
print(bert_conf_matrix)
bert_report = classification_report(y_true=labels, y_pred=predictions)
print(bert_report)

Confusion Matrix for BERT model:
[[79 21]
 [24 76]]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       100
           1       0.78      0.76      0.77       100

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.77       200
weighted avg       0.78      0.78      0.77       200



## Extract BERT Embeddings and Use Logistic Regression

In [84]:
# Extract one fixed-size embedding per review from bert-base-uncased, to be used
# as features for Logistic Regression.
lrtokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# The encoder gets its own name so that `bert_model` keeps pointing at the
# sentiment classifier that return_sentiment() reads at call time.
embedding_model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize all texts once, padded to a common length and truncated to 510 tokens.
inputs = lrtokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=510)

# Inference only; keep the hidden state at sequence position 0 (the [CLS] token).
with torch.no_grad():
    embeddings = embedding_model(**inputs).last_hidden_state[:, 0, :].numpy()

In [85]:
# Hold out 20% of the embeddings (with their labels) as the test set for the
# logistic regression; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    random_state=42,
    test_size=0.2,
)


In [86]:
# Fit a default-configured logistic regression on the training embeddings
lr_model = LogisticRegression()
lr_model.fit(X=X_train, y=y_train)

In [87]:
# Label the held-out embeddings with the fitted logistic regression
lr_predictions = lr_model.predict(X=X_test)

In [88]:
# Compare the logistic regression's predictions with the held-out labels:
# confusion matrix first, then per-class precision/recall/F1.
print("Confusion Matrix for Logistic Regression model:")
lr_conf_matrix = confusion_matrix(y_true=y_test, y_pred=lr_predictions)
print(lr_conf_matrix)
lr_report = classification_report(y_true=y_test, y_pred=lr_predictions)
print(lr_report)

Confusion Matrix for Logistic Regression model:
[[14  5]
 [ 3 18]]
              precision    recall  f1-score   support

           0       0.82      0.74      0.78        19
           1       0.78      0.86      0.82        21

    accuracy                           0.80        40
   macro avg       0.80      0.80      0.80        40
weighted avg       0.80      0.80      0.80        40

