In [None]:
# Adam Yang
# CS 525 Natural Language Processing
# Assignment 2

import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# from nltk import pos_tag_sents, pos_tag
import gensim
import gensim.downloader as gensim_api

import torch
!pip install transformers
from transformers import pipeline
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
# OBTAIN THE REVIEWS DATASET
# Read the raw review table from the working directory, report its
# (rows, columns) shape and preview the first five rows.
reviews_df = pd.read_csv(filepath_or_buffer='Reviews.csv')
print("The size of the dataset:", reviews_df.shape)
reviews_df.head(n=5)

# DATA PREPROCESSING

In [None]:
def data_preprocessing(df):
    """Reduce the raw reviews table to its remaining columns plus a binary label.

    The input frame is left untouched (the previous in-place drops made the
    calling cell fail with a KeyError when it was run a second time).

    Args:
        df: DataFrame containing the columns 'Id', 'ProductId', 'UserId',
            'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator',
            'Time', 'Summary' and 'Score'.

    Returns:
        A new DataFrame without the columns listed above and with an added
        integer 'Label' column: 1 where Score > 3, otherwise 0.

    Raises:
        KeyError: if any of the expected columns is missing.
    """
    unused_columns = ['Id', 'ProductId', 'UserId', 'ProfileName',
                      'HelpfulnessNumerator', 'HelpfulnessDenominator',
                      'Time', 'Summary']

    # drop() returns a new frame, so the caller's data is never modified
    labelled = df.drop(columns=unused_columns)

    # Scores of 4 and 5 are labelled 1; everything else (including a missing
    # score, which compares False) is labelled 0
    labelled['Label'] = (labelled['Score'] > 3).astype('int64')

    return labelled.drop(columns='Score')

def imbalance_resampling(df, num, random_state=None):
    """Draw a class-balanced sample of roughly `num` rows.

    int(num / 2) rows are drawn from each class (Label == 1 and Label == 0),
    so an odd `num` yields num - 1 rows. Rows are drawn without replacement
    whenever the class is large enough; replacement is only used when a class
    has fewer rows than requested. Sampling with replacement unconditionally
    duplicated rows even when downsampling, and those duplicates could land on
    both sides of a later train/test split.

    Args:
        df: DataFrame with a binary 'Label' column.
        num: total number of rows wanted in the balanced sample.
        random_state: optional seed forwarded to DataFrame.sample so the draw
            is reproducible; None keeps the draw random.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the Label == 1 rows
        followed by the Label == 0 rows.

    Raises:
        ValueError: presumably raised by pandas when a class has no rows but
            rows are requested from it - TODO confirm.
    """
    n_per_class = int(num / 2)

    def _draw(class_rows):
        # Fall back to replacement only when the class is too small
        needs_replacement = n_per_class > len(class_rows)
        return class_rows.sample(n=n_per_class,
                                 replace=needs_replacement,
                                 random_state=random_state)

    positive_sample = _draw(df[df['Label'] == 1])
    negative_sample = _draw(df[df['Label'] == 0])

    # Positive rows first, then negative rows, as before
    reviews_df_resampled = pd.concat([positive_sample, negative_sample])
    return reviews_df_resampled.reset_index(drop=True)






In [None]:
# Reduce the raw table to text + label, then draw the balanced 50,000-row
# working sample used by the following tasks.
reviews_df_prep = data_preprocessing(df=reviews_df)
reviews_df_resampled = imbalance_resampling(df=reviews_df_prep, num=50000)

# Show how many rows each class contributes
reviews_df_resampled.loc[:, 'Label'].value_counts()

# TASK 1

In [None]:
# PERFORM TF-IDF ANALYSIS OF DATASET
# Features are the raw review texts; targets are the binary sentiment labels.
y = reviews_df_resampled['Label']
X = reviews_df_resampled['Text']

# PERFORM 70-30 SPLIT
# Stratified on the label; random_state is fixed so the same rows end up in
# train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

print("Train data:",  X_train.shape, y_train.shape)
print("Test data:",  X_test.shape, y_test.shape)

In [None]:
# Learn the vocabulary and IDF weights from the training texts only, then
# reuse the fitted vectorizer to encode the test texts.
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

for caption, matrix in (('X_train_review_tfidf shape: ', X_train_tfidf),
                        ('X_test_review_tfidf shape: ', X_test_tfidf)):
    print(caption, matrix.shape)

In [None]:
# RANDOM FOREST CLASSIFIER
# Fit on the TF-IDF training matrix and score on the held-out test matrix.
# random_state is fixed so repeated runs grow the same forest.
random_forest_clf = RandomForestClassifier(max_depth=100, min_samples_leaf=2, verbose=2, random_state=42)
random_forest_clf.fit(X_train_tfidf, y_train)

y_pred = random_forest_clf.predict(X_test_tfidf)

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

In [None]:
# ADABOOST CLASSIFIER
# Fit on the TF-IDF training matrix and score on the held-out test matrix.
# random_state is fixed so repeated runs build the same ensemble.
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_clf.fit(X_train_tfidf, y_train)

y_pred = adaboost_clf.predict(X_test_tfidf)

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

In [None]:
# DECISION TREE CLASSIFIER
# Fit on the TF-IDF training matrix and score on the held-out test matrix.
# random_state is fixed so repeated runs grow the same tree.
dt_clf = DecisionTreeClassifier(max_depth=100, min_samples_leaf=2, random_state=42)
dt_clf.fit(X_train_tfidf, y_train)

y_pred = dt_clf.predict(X_test_tfidf)

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

# TASK 2

In [None]:
# LOAD THE WORD2VEC EMBEDDINGS
# Fetches the "word2vec-google-news-300" vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use that is cached
# locally afterwards - confirm before running on a metered connection.
embeddings = gensim_api.load("word2vec-google-news-300")

In [None]:
# CONVERT EACH DOCUMENT TO THE LATENT REPRESENTATION
# Every review becomes the mean of the word2vec vectors of its tokens, skipping
# English stop words and tokens the embedding model does not know.

# Fetch the stop-word list, downloading the corpus on first use
try:
    stop_words = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(nltk.corpus.stopwords.words('english'))

# Lower-case and keep only letters and spaces. regex=True is required: newer
# pandas versions treat the pattern as a literal string by default.
cleaned_docs = (reviews_df_resampled['Text']
                .fillna('')
                .str.lower()
                .str.replace('[^a-z ]', '', regex=True))

n_docs = len(cleaned_docs)
progress_step = max(1, n_docs // 10)  # report roughly every 10%

# Stays an empty frame if the conversion is interrupted, so later cells can
# still run and load a saved copy instead.
docs_vectors = pd.DataFrame()

try:
    # One preallocated row per document replaces the row-by-row
    # DataFrame.append calls (quadratic, and removed in pandas 2.0).
    # Documents without any known word keep an all-zero vector instead of NaN.
    doc_matrix = np.zeros((n_docs, embeddings.vector_size), dtype=np.float32)
    for row, doc in enumerate(cleaned_docs):
        word_vecs = [embeddings[word] for word in doc.split()
                     if word not in stop_words and word in embeddings]
        if word_vecs:
            doc_matrix[row] = np.mean(word_vecs, axis=0)
        if (row + 1) % progress_step == 0:
            print(f"{(row + 1) * 100 // n_docs}% Done")
    docs_vectors = pd.DataFrame(doc_matrix)
    print("Successful Processing,", docs_vectors.shape)
except KeyboardInterrupt:
    # Interrupting the kernel on purpose is the supported way to skip this cell
    print("This either didn't work, or took too long that I got annoyed.")
except Exception as exc:
    # Keep the notebook running, but say what actually went wrong
    print("Document embedding failed:", repr(exc))


In [None]:
# Further processing
# Give both frames a fresh 0..n-1 index so the label column lines up with the
# document vectors row by row.
docs_vectors, reviews_df_resampled = (
    frame.reset_index(drop=True)
    for frame in (docs_vectors, reviews_df_resampled)
)
docs_vectors['Label'] = reviews_df_resampled.loc[:, 'Label']

In [None]:
# SAVE THE DOCS_VECTOR DF TO LOCAL DRIVE
# Uncomment line below to save the document vectors
# docs_vectors.to_csv('docs_vectors_embedding.csv', index = False)

# LOAD THE DOCS_VECTOR DF FROM LOCAL DRIVE
# Reuse a previously saved copy when one exists. Without a saved file, keep the
# in-memory vectors instead of failing on the missing path.
if os.path.exists('docs_vectors_embedding.csv'):
    docs_vectors = pd.read_csv('docs_vectors_embedding.csv')
else:
    print("docs_vectors_embedding.csv not found; keeping the in-memory document vectors.")

In [None]:
# Extract the labels
y = docs_vectors['Label']
X = docs_vectors.drop('Label', axis=1)

# PERFORM 70-30 SPLIT
# Stratified on the label; random_state is fixed so the same rows end up in
# train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

print("Train data:",  X_train.shape, y_train.shape)
print("Test data:",  X_test.shape, y_test.shape)

In [None]:
# RANDOM FOREST CLASSIFIER
# Fit on the document-embedding features and score on the held-out split.
# random_state is fixed so repeated runs grow the same forest.
random_forest_clf = RandomForestClassifier(max_depth=100, min_samples_leaf=2, verbose=2, random_state=42)
random_forest_clf.fit(X_train, y_train)

y_pred = random_forest_clf.predict(X_test)

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

In [None]:
# ADABOOST CLASSIFIER
# Fit on the document-embedding features and score on the held-out split.
# random_state is fixed so repeated runs build the same ensemble.
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_clf.fit(X_train, y_train)

y_pred = adaboost_clf.predict(X_test)

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

In [None]:
# DECISION TREE CLASSIFIER
# Fit on the document-embedding features and score on the held-out split.
# random_state is fixed so repeated runs grow the same tree.
dt_clf = DecisionTreeClassifier(max_depth=100, min_samples_leaf=2, random_state=42)
dt_clf.fit(X_train, y_train)

y_pred = dt_clf.predict(X_test)

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

# TASK 3

In [None]:
# IMPORT THE PRETRAINED SENTIMENT CLASSIFIER
# Name the checkpoint explicitly instead of relying on the pipeline's default,
# which can change between transformers releases. This is the checkpoint the
# "sentiment-analysis" task falls back to when no model is given; it is a
# DistilBERT model fine-tuned on SST-2 that emits 'POSITIVE' / 'NEGATIVE'.
bert_clf = pipeline("sentiment-analysis",
                    model="distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
# Keep only the first 512 characters of every review.
# NOTE(review): this limit counts characters, whereas the model's input limit
# is counted in tokens - confirm the cut-off is intended.
MAX_LENGTH = 512
reviews_df_resampled['Text_trunc'] = reviews_df_resampled['Text'].str[:MAX_LENGTH]


In [None]:
# PERFORM SENTIMENT CLASSIFICATION ON EACH REVIEW
# truncation=True asks the tokenizer to cut any input that still exceeds the
# model's token limit after the character-level truncation.
try:
    bert_results = bert_clf(reviews_df_resampled['Text_trunc'].tolist(), truncation=True)
    bert_results = pd.DataFrame(bert_results)
except KeyboardInterrupt:
    # Interrupting the kernel on purpose is the supported way to skip this cell
    print("This either didn't work, or took too long that I got annoyed.")
except Exception as exc:
    # Keep the notebook running, but say what actually went wrong
    print("Pretrained sentiment classification failed:", repr(exc))

In [None]:
# Put the ground-truth label next to each prediction (both frames are expected
# to share the same 0..n-1 row index).
bert_results['true_labels'] = reviews_df_resampled.loc[:, 'Label']

In [None]:
# CONVERT 'POSITIVE' AND 'NEGATIVE' LABELS TO 1 AND 0 RESPECTIVELY
label_mapping = {'POSITIVE': 1, 'NEGATIVE': 0}
# Only map while the column still holds the string labels: running map() again
# on an already-converted column would turn every value into NaN.
if bert_results['label'].isin(list(label_mapping)).any():
    bert_results['label'] = bert_results['label'].map(label_mapping)

In [None]:
# SAVE THE BERT CLASSIFICATION RESULTS TO LOCAL DRIVE
# Uncomment line below to save the results
# bert_results.to_csv('bert_pretrained_results.csv', index = False)

# LOAD THE BERT CLASSIFICATION RESULTS FROM LOCAL DRIVE
# Reuse previously saved results when they exist. Without a saved file, keep
# the in-memory results instead of failing on the missing path.
if os.path.exists('bert_pretrained_results.csv'):
    bert_results = pd.read_csv('bert_pretrained_results.csv')
else:
    print("bert_pretrained_results.csv not found; keeping the in-memory results.")

In [None]:
# DETERMINE PERFORMANCE METRICS OF THE PRETRAINED BERT MODEL
y_pred = bert_results['label']
y_true = bert_results['true_labels']

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_true, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)

# TASK 4

In [None]:
# The model we will train: base uncased BERT
model_name = "bert-base-uncased"

# Load the tokenizer for the same checkpoint as the model. Using model_name
# (instead of repeating the literal) keeps the two in step if it is changed.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [None]:
# FOR SAKE OF SIMPLICITY, USE ANOTHER RESAMPLE FROM SAME REVIEW DATASET
# A smaller balanced draw (20,000 rows requested) used only for fine-tuning.
bert_finetuning_df = imbalance_resampling(df=reviews_df_prep, num=20000)

In [None]:
# PERFORM 70-30 SPLIT
texts = bert_finetuning_df['Text']
labels = bert_finetuning_df['Label']
test_size = 0.3
# Stratify on the label like the other splits in this notebook, and fix
# random_state so the same rows end up in train and validation on every run.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    texts, labels, test_size=test_size, stratify=labels, random_state=42)
# Fresh 0..n-1 indices: the torch Dataset looks labels up by integer position
train_texts = train_texts.reset_index(drop=True)
valid_texts = valid_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
valid_labels = valid_labels.reset_index(drop=True)

In [None]:
MAX_LENGTH = 512 # maximum sequence length (in tokens) passed to the tokenizer
# Encode both splits the same way: sequences longer than `max_length` are
# truncated and shorter ones are padded.
train_encodings, valid_encodings = (
    tokenizer(split.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH)
    for split in (train_texts, valid_texts)
)

In [None]:
class ReviewGroupsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence, and
    `labels` is indexable by integer position.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn every encoded field of example `idx` into a tensor
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so it becomes a one-element tensor
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap the tokenized splits and their labels as torch Datasets
train_dataset = ReviewGroupsDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = ReviewGroupsDataset(encodings=valid_encodings, labels=valid_labels)

In [None]:
# load the BERT classification model
# Starts from the checkpoint named by `model_name` with a two-class output
# (num_labels=2), matching the binary 0/1 sentiment labels.
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Function for generating performance metrics
def compute_metrics(pred):
    """Return the accuracy of a prediction object.

    Args:
        pred: object exposing `label_ids` (the true labels) and `predictions`
            (per-class scores; the highest score along the last axis is taken
            as the predicted class).

    Returns:
        dict with a single 'accuracy' entry computed by sklearn's
        accuracy_score.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# CREATE THE TRAINING ARGUMENTS INSTANCE
training_args = TrainingArguments(
    # Where outputs and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Training schedule
    num_train_epochs=1,
    warmup_steps=100,                # warmup steps for the learning rate scheduler
    # Batch sizes per device
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    # Logging and checkpoint saving both use a 200-step interval; evaluation
    # runs on a step schedule as well
    logging_steps=200,
    save_steps=200,
    evaluation_strategy="steps",
    # Reload the best checkpoint when training finishes (set
    # `metric_for_best_model` to pick a metric other than the default)
    load_best_model_at_end=True,
)

In [None]:
# CREATE THE TRAINER INSTANCE
# Ties together the model, its training arguments, both datasets and the
# metrics callback.
trainer = Trainer(
    args=training_args,
    model=bert_model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# PERFORM FINE-TUNING OF THE BERT MODEL
try:
    # train the model
    trainer.train()
except KeyboardInterrupt:
    # Interrupting the kernel on purpose is the supported way to skip this cell
    print("This either didn't work, or took too long that I got annoyed.")
except Exception as exc:
    # Keep the notebook running, but say what actually went wrong
    print("Fine-tuning failed:", repr(exc))

In [None]:
# The path where the fine-tuned model will be saved
model_path = 'sentiment-analysis-bert-base-uncased'

# SAVE THE FINE-TUNED BERT MODEL AND TOKENIZER TO LOCAL DRIVE
# Uncomment the two lines below to save model and tokenizer
# bert_model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)

# LOAD THE FINE-TUNED BERT MODEL AND TOKENIZER FROM LOCAL DRIVE
# Reuse a previously saved model when the directory exists. Without one, keep
# the in-memory model and tokenizer instead of failing on the missing path.
if os.path.isdir(model_path):
    bert_model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
else:
    print("No saved model found at", model_path, "- keeping the in-memory model and tokenizer.")

In [None]:
# Function to get sentiment prediction
MAX_LENGTH = 512
def get_prediction(text, convert_to_label=False):
    """Classify one review with the global `bert_model` and `tokenizer`.

    Args:
        text: the review text to classify.
        convert_to_label: when True return "negative"/"positive" instead of
            the integer class id.

    Returns:
        0 or 1 (or the matching string label) for the highest-probability
        class.
    """
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    # The model may sit on a GPU after training; inputs must be on its device
    inputs = inputs.to(bert_model.device)
    # Inference only: switch off dropout and skip gradient bookkeeping
    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # get output probabilities by doing softmax over the class dimension
    probs = outputs[0].softmax(1)
    predicted_class = int(probs.argmax())
    if convert_to_label:
        return {0: "negative", 1: "positive"}[predicted_class]
    return predicted_class

In [None]:
# PERFORM SENTIMENT CLASSIFICATION ON EACH REVIEW IN THE DATASET
# NOTE(review): this draws a fresh sample from the same pool the fine-tuning
# data came from, so some of these reviews may have been seen during training
# - confirm whether a held-out set should be evaluated instead.
bert_finetuning_df = imbalance_resampling(reviews_df_prep, 15000)

n_reviews = len(bert_finetuning_df)
progress_step = max(1, n_reviews // 20)  # report roughly every 5%

# Created before the loop so the list exists even if the first call fails
bert_predictions = []
try:
    for curr_num, doc in enumerate(bert_finetuning_df['Text'], start=1):
        bert_predictions.append(get_prediction(doc, convert_to_label=False))
        # Progress is derived from the real row count instead of a hard-coded
        # ladder of thresholds
        if curr_num % progress_step == 0:
            print(f"{curr_num * 100 // n_reviews}% Done")
    print("Successful Processing")
except KeyboardInterrupt:
    # Interrupting the kernel on purpose is the supported way to skip this cell
    print("This either didn't work, or took too long that I got annoyed.")
except Exception as exc:
    # Keep the notebook running, but say what actually went wrong
    print("Fine-tuned sentiment classification failed:", repr(exc))



In [None]:
# Collect the predicted classes in a frame and place the true label beside
# each one (both are expected to share the same 0..n-1 row index).
bert_predictions = pd.DataFrame({'prediction': bert_predictions})
bert_predictions['truth'] = bert_finetuning_df.loc[:, 'Label']

In [None]:
# DETERMINE PERFORMANCE METRICS OF THE FINE-TUNED BERT MODEL
# The two columns were previously assigned the wrong way round, which
# transposed the confusion matrix and swapped precision with recall.
y_pred = bert_predictions['prediction']
y_true = bert_predictions['truth']

# labels=[0, 1] keeps the matrix 2x2 so ravel() always yields tn, fp, fn, tp
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

tn, fp, fn, tp = cm.ravel()
# Report 0.0 instead of nan when a denominator is zero, e.g. when the model
# never predicts the positive class.
p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('Test Accuracy: ', accuracy_score(y_true, y_pred))
print("Precision Score:", p)
print("Recall Score:", r)
print("F1 Score:", f1)