In [3]:
import re
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from heapq import *
#from BertExtractors import *
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torchtext.datasets import IMDB
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

%load_ext autoreload
%autoreload 2

In [6]:
#Load the review table once; both splits below are positional row slices of it.
df = pd.read_csv("../data/IMDBDataset_CleanHTML.csv")
#Rows 0-1999 are the training slice, rows 25000-25999 the evaluation slice.
train, test = df.iloc[:2000], df.iloc[25000:26000]

In [8]:
def assess_model(true, pred, prob):
    """Print accuracy, precision, recall and ROC AUC for one classifier.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels; scored by accuracy, precision and recall.
    prob : array-like
        Scores handed to ``roc_auc_score`` (callers in this notebook pass
        the predicted probability of the positive class).

    Returns
    -------
    None
        Each metric is printed on its own line as ``<name>: <value>``.
    """
    #(label, metric function, values it is scored on), in reporting order.
    report = (
        ("Accuracy", accuracy_score, pred),
        ("Precision", precision_score, pred),
        ("Recall", recall_score, pred),
        ("ROC AUC", roc_auc_score, prob),
    )
    #Compute and print one metric at a time so earlier lines still appear
    #if a later metric raises.
    for label, metric, scored in report:
        print(f"{label}: {metric(true, scored)}")

# Contiguous 150

In [4]:
#Feature tables written by the contiguous 150 extractor: train split first, then test split.
contiguous150_train, contiguous150_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/contiguous150_train_0000-2000.csv",
        "../data/contiguous150_test_1000.csv",
    )
)

In [5]:
#Logistic regression without a penalty term; columns 0-767 of the table are the
#feature matrix and the `sentiment` column is the target.
contiguous150 = LogisticRegression(
    penalty=None,
    random_state=32110,
    max_iter=5000,
)
contiguous150.fit(
    X=contiguous150_train.iloc[:, :768],
    y=contiguous150_train["sentiment"],
)

In [6]:
#Ground-truth labels are taken from the raw test slice.
#NOTE(review): assumes the rows of the test feature CSV are in the same order as `test` - confirm.
true = test["sentiment"]
#Hard class predictions on the test features (columns 0-767).
contiguous150_pred = contiguous150.predict(contiguous150_test.iloc[:, :768])
#Probability of positive review (second column of predict_proba)
contiguous150_prob = contiguous150.predict_proba(contiguous150_test.iloc[:, :768])[:, 1]

In [7]:
#Report the four metrics for the contiguous 150 classifier.
assess_model(true=true, pred=contiguous150_pred, prob=contiguous150_prob)

Accuracy: 0.759
Precision: 0.8061224489795918
Recall: 0.7301293900184843
ROC AUC: 0.8008549486748899


In [15]:
#Gather predictions, labels and positive-class probabilities in one table.
#`true` is a Series, so the resulting frame carries its index.
contiguous150_results = pd.DataFrame(
    data={
        'pred': contiguous150_pred,
        'true': true,
        'prob': contiguous150_prob,
    }
)

# Top 150 Tokens

In [16]:
#Feature tables written by the Top 150 extractor: train split first, then test split.
top150_train, top150_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/top150_train_0000-2000.csv",
        "../data/top150_test_1000.csv",
    )
)

In [18]:
#Logistic regression without a penalty term; columns 0-767 of the table are the
#feature matrix and the `sentiment` column is the target.
top150 = LogisticRegression(
    penalty=None,
    random_state=42155,
    max_iter=5000,
)
top150.fit(
    X=top150_train.iloc[:, :768],
    y=top150_train["sentiment"],
)

In [19]:
#Ground-truth labels are taken from the raw test slice.
#NOTE(review): assumes the rows of the test feature CSV are in the same order as `test` - confirm.
true = test["sentiment"]
#Hard class predictions on the test features (columns 0-767).
top150_pred = top150.predict(top150_test.iloc[:, :768])
#Probability of positive review (second column of predict_proba)
top150_prob = top150.predict_proba(top150_test.iloc[:, :768])[:, 1]

In [20]:
#Report the four metrics for the Top 150 classifier.
assess_model(true=true, pred=top150_pred, prob=top150_prob)

Accuracy: 0.748
Precision: 0.7752380952380953
Recall: 0.7523105360443623
ROC AUC: 0.7999911404282395


# Contiguous 50

In [21]:
#Feature tables written by the contiguous 50 extractor: train split first, then test split.
contiguous050_train, contiguous050_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/contiguous050_train_0000-2000.csv",
        "../data/contiguous050_test_1000.csv",
    )
)

In [22]:
#Logistic regression without a penalty term; columns 0-767 of the table are the
#feature matrix and the `sentiment` column is the target. This model is given a
#larger iteration budget (20000) than the 150-token models.
contiguous050 = LogisticRegression(
    penalty=None,
    random_state=41854,
    max_iter=20000,
)
contiguous050.fit(
    X=contiguous050_train.iloc[:, :768],
    y=contiguous050_train["sentiment"],
)

In [23]:
#Ground-truth labels are taken from the raw test slice.
#NOTE(review): assumes the rows of the test feature CSV are in the same order as `test` - confirm.
true = test["sentiment"]
#Hard class predictions on the test features (columns 0-767).
contiguous050_pred = contiguous050.predict(contiguous050_test.iloc[:, :768])
#Probability of positive review (second column of predict_proba)
contiguous050_prob = contiguous050.predict_proba(contiguous050_test.iloc[:, :768])[:, 1]

In [24]:
#Report the four metrics for the contiguous 50 classifier.
assess_model(true=true, pred=contiguous050_pred, prob=contiguous050_prob)

Accuracy: 0.677
Precision: 0.7104247104247104
Recall: 0.6802218114602587
ROC AUC: 0.6998739524563165


# Top 50 Tokens

In [25]:
#Feature tables written by the Top 50 extractor: train split first, then test split.
top050_train, top050_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/top050_train_0000-2000.csv",
        "../data/top050_test_1000.csv",
    )
)

In [26]:
#Logistic regression without a penalty term; columns 0-767 of the table are the
#feature matrix and the `sentiment` column is the target.
#NOTE(review): the recorded run of this cell stopped at the solver's evaluation
#limit even with max_iter=50000, so the metrics reported for this model come from
#a fit that did not converge. Scaling the features or adding a penalty would
#likely help, but would change the comparison with the other models - decide deliberately.
top050 = LogisticRegression(
    penalty=None,
    random_state=31254,
    max_iter=50000,
)
top050.fit(
    X=top050_train.iloc[:, :768],
    y=top050_train["sentiment"],
)

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
#Ground-truth labels are taken from the raw test slice.
#NOTE(review): assumes the rows of the test feature CSV are in the same order as `test` - confirm.
true = test["sentiment"]
#Hard class predictions on the test features (columns 0-767).
top050_pred = top050.predict(top050_test.iloc[:, :768])
#Probability of positive review (second column of predict_proba)
top050_prob = top050.predict_proba(top050_test.iloc[:, :768])[:, 1]

In [28]:
#Report the four metrics for the Top 50 classifier.
assess_model(true=true, pred=top050_pred, prob=top050_prob)

Accuracy: 0.66
Precision: 0.6899810964083176
Recall: 0.6746765249537893
ROC AUC: 0.6794465989312135


# Pretrained BERT

Model from https://huggingface.co/textattack/bert-base-uncased-imdb

In [19]:
#Tokenizer and sequence-classification model are both loaded from the same
#checkpoint name, so name it once and let each Auto* class load its part.
pretrain_tokenizer, pretrain_model = (
    loader.from_pretrained("textattack/bert-base-uncased-imdb")
    for loader in (AutoTokenizer, AutoModelForSequenceClassification)
)

In [51]:
#Thanks to https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
#Predicted class and probabilities of positive class, one entry per review in `test`
pred, prob = [], []
#Walk the review column directly instead of a hard-coded range(0, 1000): the loop
#then always covers exactly the rows in `test`, so it neither runs past a shorter
#slice nor silently skips rows of a longer one.
for i, review in enumerate(test.review):
    #Progress marker every 50 reviews
    if i % 50 == 0:
        print(f"Location: {i}")
    #tokenize the review, truncating to at most 512 tokens
    inputs = pretrain_tokenizer(
        review,
        return_tensors='pt',
        max_length = 512,
        truncation = 'longest_first',
    )
    #Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = pretrain_model(**inputs).logits
    #One review per forward pass, so the argmax over the logits is the predicted class index
    pred.append(logits.argmax().item())
    #Probability of positive class
    #https://stackoverflow.com/questions/46416984/how-to-convert-logits-to-probability-in-binary-classification-in-tensorflow
    prob.append(torch.nn.functional.softmax(logits, dim = 1)[0, 1].item())
#Persist the predictions next to the other data files
pretrain_output_df = pd.DataFrame({'pred': pred, 'prob': prob})
pretrain_output_df.to_csv("../data/pretrain_test_pred_1000.csv", index = False)
print("File saved successfully.")

Location: 0
Location: 50
Location: 100
Location: 150
Location: 200
Location: 250
Location: 300
Location: 350
Location: 400
Location: 450
Location: 500
Location: 550
Location: 600
Location: 650
Location: 700
Location: 750
Location: 800
Location: 850
Location: 900
Location: 950
File saved successfully.


In [52]:
#Score the pretrained classifier against the labels of the raw test slice.
true = test["sentiment"]
assess_model(true=true, pred=pred, prob=prob)

Accuracy: 0.951
Precision: 0.9641509433962264
Recall: 0.944547134935305
ROC AUC: 0.9862596096150515


# Vanilla BERT

In [9]:
#Feature tables extracted with the vanilla BERT: train split first, then test split.
vanilla_train, vanilla_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/vanilla_train_0000-2000.csv",
        "../data/vanilla_test_1000.csv",
    )
)

#Logistic regression without a penalty term; columns 0-767 of the table are the
#feature matrix and the `sentiment` column is the target.
vanilla = LogisticRegression(
    penalty=None,
    random_state=34153,
    max_iter=5000,
)
vanilla.fit(
    X=vanilla_train.iloc[:, :768],
    y=vanilla_train["sentiment"],
)

#Ground-truth labels are taken from the raw test slice.
#NOTE(review): assumes the rows of the test feature CSV are in the same order as `test` - confirm.
true = test["sentiment"]
vanilla_pred = vanilla.predict(vanilla_test.iloc[:, :768])
#Probability of positive review (second column of predict_proba)
vanilla_prob = vanilla.predict_proba(vanilla_test.iloc[:, :768])[:, 1]

assess_model(true=true, pred=vanilla_pred, prob=vanilla_prob)

Accuracy: 0.78
Precision: 0.8034026465028355
Recall: 0.7855822550831792
ROC AUC: 0.8274517858077715


In [15]:
#Inspect the log-probabilities for a single test row (positional row 2), reshaped
#to the (1, n_features) layout the estimator expects.
#NOTE(review): the recorded output is [-inf, 0.], i.e. the class-0 probability
#came back as exactly 0 for this row - the unpenalised fit looks saturated here.
vanilla.predict_log_proba(vanilla_test.iloc[2, :768].to_numpy()[np.newaxis, :])

  return np.log(self.predict_proba(X))


array([[-inf,   0.]])

# Output

In [31]:
#Side-by-side table of the true labels and every feature-based model's predictions.
#`true` is a Series, so the resulting frame carries its index.
output = pd.DataFrame(
    data={
        'true': true,
        'vanilla_pred': vanilla_pred,
        'cont150': contiguous150_pred,
        'top150': top150_pred,
        'cont50': contiguous050_pred,
        'top50': top050_pred,
    }
)

In [33]:
#Persist the per-model predictions. The target directory is created first so the
#write does not fail when ../results does not exist yet (e.g. on a fresh checkout).
os.makedirs("../results", exist_ok=True)
output.to_csv("../results/results.csv")