<a href="https://colab.research.google.com/github/carla-garcia-medina/nlp_final_project/blob/main/final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-Processing

## Dataset cleaning & pre-process

In [202]:
import numpy as np
import pandas as pd
import nltk.corpus
from sklearn.model_selection import train_test_split

# Colab-only: mount Google Drive at /content/drive.
# Delete these two lines when running outside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# Working directory on the mounted drive; change this to your own location.
# NOTE(review): `path` is not referenced anywhere else in the visible part of
# this notebook -- confirm whether it is still needed.
path="/content/drive/My Drive/2022NLP/project/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [203]:
# Closed-class (function) words that `lemma` filters out of the token stream.
# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals ('your' 'those' becomes the single entry
# 'yourthose'), which would drop both words from the list.
# Duplicate entries are harmless because the list is only used for
# membership tests.
closed_class_stop_words = [
    'a','the','an','and','or','but','about','above','after','along','amid','among',
    'as','at','by','for','from','in','into','like','minus','near','of','off','on',
    'onto','out','over','past','per','plus','since','till','to','under','until','up',
    'via','vs','with','that','can','cannot','could','may','might','must',
    'need','ought','shall','should','will','would','have','had','has','having','be',
    'is','am','are','was','were','being','been','get','gets','got','gotten',
    'getting','seem','seeming','seems','seemed',
    'enough', 'both', 'all', 'your', 'those', 'this', 'these',
    'their', 'the', 'that', 'some', 'our', 'no', 'neither', 'my',
    'its', 'his', 'her', 'every', 'either', 'each', 'any', 'another',
    'an', 'a', 'just', 'mere', 'such', 'merely', 'right', 'no', 'not',
    'only', 'sheer', 'even', 'especially', 'namely', 'as', 'more',
    'most', 'less', 'least', 'so', 'enough', 'too', 'pretty', 'quite',
    'rather', 'somewhat', 'sufficiently', 'same', 'different', 'such',
    'when', 'why', 'where', 'how', 'what', 'who', 'whom', 'which',
    'whether', 'why', 'whose', 'if', 'anybody', 'anyone', 'anyplace',
    'anything', 'anytime', 'anywhere', 'everybody', 'everyday',
    'everyone', 'everyplace', 'everything', 'everywhere', 'whatever',
    # 'whereever' (misspelling) is kept for backward compatibility; the
    # correctly spelled 'wherever' is listed next to it.
    'whenever', 'whereever', 'wherever', 'whichever', 'whoever', 'whomever', 'he',
    'him', 'his', 'her', 'she', 'it', 'they', 'them', 'its', 'their','theirs',
    'you','your','yours','me','my','mine','I','we','us','much','and/or'
]

In [204]:
# Start pre-processing

# Fetch the NLTK resources used below; `nltk` is bound by the earlier
# `import nltk.corpus`.
# 'punkt' is presumably the tokenizer data `word_tokenize` relies on, and
# 'wordnet' the corpus behind `WordNetLemmatizer` -- confirm against the
# installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# Shared lemmatizer instance used by `lemma`.
wordnet_lemmatizer = WordNetLemmatizer()

def process_text(text):
    """Strip punctuation from ``text`` and return its alphabetic tokens.

    Every character of ``string.punctuation`` except the full stop is replaced
    by a space, the result is split with ``word_tokenize`` and only tokens
    consisting entirely of letters (``str.isalpha``) are kept.

    Args:
        text: raw document string.

    Returns:
        list of str: the alphabetic tokens, in their original order and casing.
    """
    import string

    # One translation table replaces all unwanted punctuation in a single pass.
    # Calling `text.replace` on the untouched input once per character would
    # keep only the last replacement, leaving the other punctuation in place.
    punctuation_to_space = {
        ord(char): ' ' for char in string.punctuation if char != "."
    }
    clean_text = text.translate(punctuation_to_space)

    # tokenizing
    tokens = word_tokenize(clean_text)

    # Drop numbers, leftover full stops and any other non-alphabetic token.
    return [token for token in tokens if token.isalpha()]

# Lemmatization
# `WordNetLemmatizer` is already imported, the 'wordnet' corpus downloaded and
# the shared `wordnet_lemmatizer` instance created at the top of this cell, so
# that setup is not repeated here.

def lemma(term):
  """Lemmatize every token of ``term`` and drop closed-class stop words.

  Each token is passed through the module-level ``wordnet_lemmatizer``; a
  lemma is kept only if it does not appear in ``closed_class_stop_words``
  (the comparison is case-sensitive).

  Args:
    term: iterable of token strings.

  Returns:
    list of str: the surviving lemmas, in input order.
  """
  lemmatized = (wordnet_lemmatizer.lemmatize(tok) for tok in term)
  return [word for word in lemmatized if word not in closed_class_stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [205]:
# Sampling configuration shared by the dataset cells below.
train_size = 30000    # rows sampled from each training split (before the validation split)
val_percentage = 0.2  # fraction of the sampled training rows held out for validation
# Rows sampled from each test split. Converted to int because the value is
# passed as the `n` argument of `DataFrame.sample`, which expects an integer
# row count; `0.2 * train_size` on its own is a float.
test_size = int(round(0.2 * train_size))

### Dataset 1: Yelp Review Rating Labeling Dataset

In [206]:
!pip install datasets
from datasets import load_dataset



In [207]:
yelp_rating_labled_dataset = load_dataset("yelp_review_full")

Reusing dataset yelp_review_full (/root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
yelp_rating_train = pd.DataFrame(yelp_rating_labled_dataset['train'])
yelp_rating_test = pd.DataFrame(yelp_rating_labled_dataset['test'])

In [None]:
# Draw `train_size` random reviews from the full training split, then hold out
# part of that sample for validation.
# NOTE(review): the validation fraction here is the literal 0.1, whereas the
# other three datasets split with `val_percentage` (0.2) -- confirm whether
# the difference is intentional.
# NOTE(review): neither `.sample` nor `train_test_split` receives a
# random_state, so the rows selected differ from run to run.
yelp_rating_train_sample = yelp_rating_train.sample(train_size)
yelp_rating_train_sample, yelp_rating_val_sample = train_test_split(yelp_rating_train_sample, test_size = 0.1)
yelp_rating_test_sample = yelp_rating_test.sample(test_size)

In [None]:
#yelp_rating_train_sample['cleaned_tokens'] = yelp_rating_train_sample['text'].apply(process_text).apply(lemma)
#yelp_rating_test_sample['cleaned_tokens'] = yelp_rating_test_sample['text'].apply(process_text).apply(lemma)

In [None]:
yelp_rating_train_sample.head()

In [None]:
yelp_rating_test_sample.head()

### Dataset 2: Yelp Reviews Polarity-labeled Dataset

The data file for this dataset is 4 GB, which is too large to work with here. Therefore, I decided to convert the rating-labeled data above into polarity data by mapping rating scores of 1 and 2 to negative, 3 to neutral, and 4 and 5 to positive.

In [None]:
# Draw a fresh random sample (independent of the one taken for Dataset 1) and
# keep the original rating under the name 'score' so that 'label' can hold the
# polarity class instead.
yelp_rating_train_polarized = yelp_rating_train.sample(train_size).rename(columns={'label':'score'})
yelp_rating_test_polarized = yelp_rating_test.sample(test_size).rename(columns={'label':'score'})
# Map each rating score (keys 0-4) to a polarity class.
# negative:0, neutral:1, positive:2
# Presumably score k corresponds to a (k + 1)-star review -- TODO confirm
# against the dataset card.
polarization = {0: 0,
         1: 0,
         2: 1,
         3: 2,
         4: 2}
# Any score missing from `polarization` would become NaN under `.map`.
yelp_rating_train_polarized["label"] = yelp_rating_train_polarized["score"].map(polarization)
yelp_rating_test_polarized["label"] = yelp_rating_test_polarized["score"].map(polarization)

In [None]:
#yelp_rating_train_polarized['cleaned_tokens'] = yelp_rating_train_polarized['text'].apply(process_text).apply(lemma)
yelp_rating_train_polarized, yelp_rating_val_polarized = train_test_split(yelp_rating_train_polarized, test_size = val_percentage)
#yelp_rating_test_polarized['cleaned_tokens'] = yelp_rating_test_polarized['text'].apply(process_text).apply(lemma)

In [None]:
yelp_rating_train_polarized.head()
#yelp_rating_test_polarized.groupby("label").size()

### Dataset 3: Rotten Tomatoes Review Polarity Labeling

In [None]:
rotten_tomatoes_dataset = load_dataset("rotten_tomatoes")

In [None]:
rotten_tomatoes_dataset_train = pd.DataFrame(rotten_tomatoes_dataset['train'])#.sample(train_size)
rotten_tomatoes_dataset_test = pd.DataFrame(rotten_tomatoes_dataset['test'])#.sample(test_size)

In [None]:
#rotten_tomatoes_dataset_train['cleaned_tokens'] = rotten_tomatoes_dataset_train['text'].apply(process_text).apply(lemma)
rotten_tomatoes_dataset_train, rotten_tomatoes_dataset_val = train_test_split(rotten_tomatoes_dataset_train, test_size = val_percentage)
#rotten_tomatoes_dataset_test['cleaned_tokens'] = rotten_tomatoes_dataset_test['text'].apply(process_text).apply(lemma)

In [None]:
rotten_tomatoes_dataset_train.head()
rotten_tomatoes_dataset_test.head()

### Dataset 4: Tweet Emoji Labeling

In [None]:
tweet_emoji_dataset = load_dataset("tweet_eval", "emoji")

In [None]:
tweet_emoji_dataset_train = pd.DataFrame(tweet_emoji_dataset['train']).sample(train_size)
tweet_emoji_dataset_test = pd.DataFrame(tweet_emoji_dataset['test']).sample(test_size)

In [None]:
#tweet_emoji_dataset_train['cleaned_tokens'] = tweet_emoji_dataset_train['text'].apply(process_text).apply(lemma)
tweet_emoji_dataset_train, tweet_emoji_dataset_val = train_test_split(tweet_emoji_dataset_train, test_size = val_percentage)
#tweet_emoji_dataset_test['cleaned_tokens'] = tweet_emoji_dataset_test['text'].apply(process_text).apply(lemma)

In [None]:
tweet_emoji_dataset_train.head()
tweet_emoji_dataset_test.head()
#len(sorted(tweet_emoji_dataset_train['label'].unique()))

### List all datasets

In [None]:
# One entry per experiment: (train frame, validation frame, test frame,
# number of target classes). The training loop reads the 'text' and 'label'
# columns of each frame and sizes the classifier output with the last item.
datasets_lst = [(yelp_rating_train_sample, yelp_rating_val_sample, yelp_rating_test_sample, 5),
            (yelp_rating_train_polarized, yelp_rating_val_polarized, yelp_rating_test_polarized, 3),
            (rotten_tomatoes_dataset_train, rotten_tomatoes_dataset_val, rotten_tomatoes_dataset_test, 2),
            (tweet_emoji_dataset_train, tweet_emoji_dataset_val, tweet_emoji_dataset_test, 20)]

# Building BiLSTM Model

## Import packages

In [None]:
!pip install sacremoses

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
import pandas as pd
import sacremoses
from torch.utils.data import dataloader, Dataset
from tqdm.auto import tqdm

## Download and Load GloVe Embeddings
We will use GloVe embedding parameters to initialize our layer of word representations / embedding layer.


In [None]:
# === Download GloVe word embeddings
# !wget http://nlp.stanford.edu/data/glove.6B.zip

# === Unzip word embeddings and use only the top 50000 word embeddings for speed
# !unzip glove.6B.zip
# !head -n 50000 glove.6B.300d.txt > glove.6B.300d__50k.txt

# === Download Preprocessed version
!wget https://docs.google.com/uc?id=1KMJTagaVD9hFHXFTPtNk0u2JjvNlyCAu -O glove_split.aa
!wget https://docs.google.com/uc?id=1LF2yD2jToXriyD-lsYA5hj03f7J3ZKaY -O glove_split.ab
!wget https://docs.google.com/uc?id=1N1xnxkRyM5Gar7sv4d41alyTL92Iip3f -O glove_split.ac
!cat glove_split.?? > 'glove.6B.300d__50k.txt'

In [None]:
def load_glove(glove_path, embedding_dim):
    """Read GloVe vectors from a text file into a vocabulary and a matrix.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components. Two special entries are placed in
    front of the file's tokens: ``PAD_TOKEN`` (index 0, all-zero vector) and
    ``UNK_TOKEN`` (index 1, random vector drawn with ``np.random.rand``).

    Args:
        glove_path: path of the GloVe text file.
        embedding_dim: length of the two special vectors; it should match the
            dimensionality of the vectors stored in the file.

    Returns:
        tuple: ``(token_ls, embeddings)`` where ``token_ls`` is the list of
        tokens and ``embeddings`` is the array whose i-th row is the vector of
        ``token_ls[i]``.

    Side effects:
        Prints the size of the last vector that was read.
    """
    token_ls = [PAD_TOKEN, UNK_TOKEN]
    embedding_ls = [np.zeros(embedding_dim), np.random.rand(embedding_dim)]
    # GloVe vocabularies contain non-ASCII tokens, so the encoding is set
    # explicitly instead of relying on the platform default.
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline) has nothing to unpack.
            if not line.strip():
                continue
            token, raw_embedding = line.split(maxsplit=1)
            token_ls.append(token)
            embedding = np.array([float(x) for x in raw_embedding.split()])
            embedding_ls.append(embedding)
    embeddings = np.array(embedding_ls)
    print(embedding_ls[-1].size)
    return token_ls, embeddings

# Special vocabulary entries; `load_glove` places them at indices 0 and 1.
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
EMBEDDING_DIM=300 # dimension of Glove embeddings
# File assembled from the downloaded split parts in the previous cell.
glove_path = "glove.6B.300d__50k.txt"
# `vocab` is the token list, `embeddings` the matching matrix of vectors.
vocab, embeddings = load_glove(glove_path, EMBEDDING_DIM)

## Convert text data into sequence of indices

In [None]:
def tokenize(data, labels, tokenizer, vocab, max_seq_length=128):
    """Convert raw texts into lists of vocabulary indices.

    Each text is lower-cased, split with ``tokenizer.tokenize`` and every
    token is replaced by its position in ``vocab``; tokens that are not in
    ``vocab`` are mapped to index 1.

    Args:
        data: iterable of text strings.
        labels: labels aligned with ``data``; returned unchanged.
        tokenizer: object exposing a ``tokenize(str)`` method.
        vocab: sequence of tokens whose positions define the indices.
        max_seq_length: accepted but not used by this function.

    Returns:
        tuple: ``(text_data, labels)`` where ``text_data`` is a list with one
        list of indices per input text.
    """
    unk_idx = 1  # position of UNK_TOKEN in the vocabulary built by load_glove
    vocab_to_idx = {word: i for i, word in enumerate(vocab)}
    text_data = [
        [vocab_to_idx.get(token, unk_idx) for token in tokenizer.tokenize(ex.lower())]
        for ex in tqdm(data)
    ]
    return text_data, labels

## Create DataLoaders
Create Pytorch DataLoaders for our train, val, and test data.

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """
    PyTorch-readable train/validation/test dataset of token-index sequences
    paired with their targets. Inherits from torch.utils.data.Dataset.
    """

    def __init__(self, data_list, target_list, max_sent_length=128):
        """
        @param data_list: list of token-index sequences, one per example
        @param target_list: list of targets aligned with data_list
        @param max_sent_length: maximum number of tokens kept per example
        """
        self.data_list = data_list
        self.target_list = target_list
        self.max_sent_length = max_sent_length
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key, max_sent_length=None):
        """
        Triggered when calling dataset[i].

        Returns [token_idx, label], with the token sequence cut to
        max_sent_length (self.max_sent_length when the argument is None).
        """
        limit = self.max_sent_length if max_sent_length is None else max_sent_length
        return [self.data_list[key][:limit], self.target_list[key]]

    def collate_func(self, batch):
        """
        Collate function for DataLoader: pads (with index 0) or trims every
        sequence of the batch to a common length and converts the sequences
        and the labels to tensors.

        The common length is the length of the longest sequence in the batch,
        capped at self.max_sent_length.

        @param batch: list of [token_idx, label] items
        @return: [data tensor, label tensor]
        """
        labels = [item[1] for item in batch]

        longest = max(len(item[0]) for item in batch)
        width = min(longest, self.max_sent_length)

        rows = []
        for item in batch:
            tokens = item[0]
            missing = width - len(tokens)
            # Short sequences are right-padded with 0; the rest are cut to width.
            rows.append(tokens + [0] * missing if missing > 0 else tokens[:width])

        return [torch.tensor(rows), torch.tensor(labels)]

## BiLSTM Classifier

In [None]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    """
    (Bi)LSTM text classifier: pretrained embeddings -> dropout -> LSTM ->
    mean over the sequence dimension -> ReLU -> linear layer.
    """
    def __init__(self, embeddings, hidden_size, num_layers, num_classes, bidirectional, dropout_prob=0.3):
        """
        @param embeddings: 2-D array (vocab size x embedding dim) used to
                           initialise the embedding layer
        @param hidden_size: hidden size of the LSTM (per direction)
        @param num_layers: number of stacked LSTM layers
        @param num_classes: number of output classes
        @param bidirectional: whether the LSTM runs in both directions
        @param dropout_prob: dropout probability after the embedding layer
                             and between stacked LSTM layers
        """
        super().__init__()
        self.embedding_layer = self.load_pretrained_embeddings(embeddings)
        self.dropout = nn.Dropout(p=dropout_prob)
        # nn.LSTM applies its own dropout only between stacked layers; with a
        # single layer the argument has no effect and only triggers a
        # warning, so it is passed only when there is more than one layer.
        self.lstm = nn.LSTM(
            input_size=embeddings.shape[1], hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_prob if num_layers > 1 else 0.0,
            batch_first=True, bidirectional=bidirectional)
        self.non_linearity = nn.ReLU() # For example, ReLU
        # The LSTM output concatenates both directions when bidirectional, so
        # the classifier input is 2 * hidden_size only in that case.
        num_directions = 2 if bidirectional else 1
        self.clf = nn.Linear(hidden_size * num_directions, num_classes) # classifier layer


    def load_pretrained_embeddings(self, embeddings):
        """
        Build an nn.Embedding (padding_idx=0) whose weights are a float copy
        of `embeddings`.
        """
        embedding_layer = nn.Embedding(embeddings.shape[0], embeddings.shape[1], padding_idx=0)
        embedding_layer.weight.data = torch.Tensor(embeddings).float()
        return embedding_layer


    def forward(self, inputs):
        """
        @param inputs: integer tensor of token indices, batch dimension first
        @return: tensor of unnormalised class scores, one row per example
        """
        v_embedded = self.embedding_layer(inputs)
        v_dropout = self.dropout(v_embedded)
        v_bilstm, _ = self.lstm(v_dropout)
        # Average the LSTM outputs over the sequence dimension (dim 1);
        # padded positions are included in the mean.
        v_avg_pool = torch.mean(v_bilstm, 1)
        v_nonlinear = self.non_linearity(v_avg_pool)
        v_classify = self.clf(v_nonlinear)

        return v_classify

## Train model with early stopping

Train the model for `NUM_EPOCHS`. 
Keep track of training loss.  
Compute the validation accuracy after each epoch. Keep track of the best validation accuracy and save the model with the best validation accuracy.  

If the validation accuracy does not improve for more than `early_stop_patience` number of epochs in a row, stop training. 


In [None]:
def evaluate(model, dataloader, device):
    """Compute the classification accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode (and left in it) and run without
    gradient tracking. The predicted class of each example is the index of
    the highest score along dimension 1 of the model output.

    Args:
        model: callable module returning one row of class scores per example.
        dataloader: iterable yielding ``(data_batch, batch_labels)`` tensors.
        device: device the input batches are moved to before the forward pass.

    Returns:
        float: fraction of correctly classified examples, or ``0.0`` when the
        dataloader yields no examples.
    """
    n_correct = n_total = 0
    model.eval()
    with torch.no_grad():
        for (data_batch, batch_labels) in dataloader:
            out = model(data_batch.to(device))
            _, preds = out.max(dim=1)
            # Compare on the CPU so labels may live on any device.
            n_correct += (preds.cpu() == batch_labels.cpu()).sum().item()
            n_total += out.shape[0]
    # An empty dataloader would otherwise raise ZeroDivisionError.
    if n_total == 0:
        return 0.0
    return n_correct * 1.0 / n_total

In [None]:
def train_with_early_stopping(device, criterion, optimizer, num_epochs=10, early_stop_patience=2):
  """Train the global ``model`` with early stopping on validation accuracy.

  The function reads ``model``, ``train_loader`` and ``val_loader`` from the
  notebook's global namespace. After every epoch the validation accuracy is
  computed with ``evaluate``; the model is written to 'best_model.pt' only
  when that accuracy is the best seen so far (the first epoch is always
  saved so that a checkpoint exists). Training stops once the accuracy has
  failed to improve for ``early_stop_patience`` consecutive epochs.

  Args:
    device: device the batches are moved to.
    criterion: loss function called as ``criterion(preds, labels)``.
    optimizer: optimizer that updates the parameters of ``model``.
    num_epochs: maximum number of training epochs.
    early_stop_patience: number of consecutive non-improving epochs that
      ends training.

  Returns:
    tuple: ``(train_loss_history, val_accuracy_history)`` -- the loss of
    every training batch and the validation accuracy of every epoch.
  """
  train_loss_history = []
  val_accuracy_history = []
  best_val_accuracy = 0
  checkpoint_saved = False
  n_no_improve = 0

  for epoch in tqdm(range(num_epochs)):
      model.train()  # this enables dropout/regularization
      for data_batch, batch_labels in train_loader:
          preds = model(data_batch.to(device))
          loss = criterion(preds, batch_labels.to(device))
          loss.backward()
          optimizer.step()
          optimizer.zero_grad()
          train_loss_history.append(loss.item())

      accuracy = evaluate(model, val_loader, device)
      val_accuracy_history.append(accuracy)

      improved = accuracy > best_val_accuracy
      # Overwrite the checkpoint only with a better model, so the file really
      # holds the best epoch rather than the most recent one.
      if improved or not checkpoint_saved:
        torch.save(model, 'best_model.pt')
        checkpoint_saved = True
      if improved:
        best_val_accuracy = accuracy
        n_no_improve = 0  # the streak of non-improving epochs is broken
      else:
        n_no_improve += 1
      if n_no_improve >= early_stop_patience:
        break

  print("Best validation accuracy is: ", best_val_accuracy)
  return train_loss_history, val_accuracy_history

To avoid overfitting, we use early stopping. Particularly when training a large model, early stopping lets us halt training at the point where the model stops generalizing from the data and begins learning statistical noise, which would cause it to overfit. An overfitted model is less useful and performs worse when tested on new data or datasets.

# Final Evaluation

In [None]:
def score(keyFileName, responseFileName):
    """Compare a response file with a key file line by line and print metrics.

    A line counts as correct when it is identical in both files after the
    trailing newline is removed (two blank lines therefore match). Precision
    is the number of correct lines over the number of non-blank response
    lines, recall the number of correct lines over the number of non-blank
    key lines. Any ratio whose denominator is zero is reported as 0.

    Args:
        keyFileName: path of the file holding the expected tags.
        responseFileName: path of the file holding the predicted tags.

    Returns:
        None. The results are printed. If the two files differ in their
        number of lines, a message is printed and nothing is scored.
    """
    with open(keyFileName, 'r') as keyFile:
        key = [line.rstrip('\n') for line in keyFile.readlines()]
    with open(responseFileName, 'r') as responseFile:
        response = [line.rstrip('\n') for line in responseFile.readlines()]

    if len(key) != len(response):
        print("length mismatch between key and submitted file")
        # Return instead of calling exit(), which would terminate the whole
        # interpreter (or notebook kernel).
        return

    total = len(key)
    correct = sum(1 for expected, given in zip(key, response) if expected == given)
    response_total = sum(1 for given in response if given)
    key_total = sum(1 for expected in key if expected)

    print(correct, "out of", str(total) + " tags correct")
    accuracy = 100.0 * correct / total if total else 0.0
    print("  accuracy: %5.2f" % accuracy)

    precision = 100.0 * correct / response_total if response_total else 0.0
    recall = 100.0 * correct / key_total if key_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    print("  precision: %5.2f" % precision)
    print("  recall:    %5.2f" % recall)
    print("  F1:        %5.2f" % f1)

# Put it all together

In [None]:
# Train and evaluate one classifier per entry of `datasets_lst`.
# Each entry is a (train frame, val frame, test frame, num_classes) tuple.
import matplotlib.pyplot as plt
for datasets in datasets_lst:
  # Pull the raw texts and labels out of the three data frames.
  train_labels = datasets[0]['label'].tolist()
  train_texts = datasets[0]['text'].tolist()
  val_labels = datasets[1]['label'].tolist()
  val_texts = datasets[1]['text'].tolist()
  test_labels = datasets[2]['label'].tolist()
  test_texts = datasets[2]['text'].tolist()
  num_classes = datasets[3]

  # Convert every text into a list of vocabulary indices.
  tokenizer = sacremoses.MosesTokenizer()
  train_data_indices, train_labels = tokenize(train_texts, train_labels, tokenizer, vocab)
  val_data_indices, val_labels = tokenize(val_texts, val_labels, tokenizer, vocab)
  test_data_indices, test_labels = tokenize(test_texts, test_labels, tokenizer, vocab)

  BATCH_SIZE = 64
  max_sent_length=128

  # Only the training loader shuffles. The validation and test loaders reuse
  # `train_dataset.collate_func`; all three datasets are built with the same
  # max_sent_length, so the collate function pads and trims identically.
  train_dataset = SpamDataset(train_data_indices, train_labels, max_sent_length)
  train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                            batch_size=BATCH_SIZE,
                                            collate_fn=train_dataset.collate_func,
                                            shuffle=True)

  val_dataset = SpamDataset(val_data_indices, val_labels, train_dataset.max_sent_length)
  val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                            batch_size=BATCH_SIZE,
                                            collate_fn=train_dataset.collate_func,
                                            shuffle=False)

  test_dataset = SpamDataset(test_data_indices, test_labels, train_dataset.max_sent_length)
  test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                            batch_size=BATCH_SIZE,
                                            collate_fn=train_dataset.collate_func,
                                            shuffle=False)

  # Fetches one batch; `data_batch` and `labels` are not used again below.
  data_batch, labels = next(iter(train_loader))
  # BiLSTM hyperparameters
  hidden_size = 32
  num_layers = 1
  num_classes = num_classes  # no-op self-assignment
  print(num_classes)
  bidirectional=True
  # Seed torch before the model is created and trained.
  torch.manual_seed(1234)

  # if cuda exists, use cuda, else run on cpu
  if torch.cuda.is_available():
      device = torch.device("cuda:0")
  else:
      device=torch.device('cpu')

  model = LSTMClassifier(embeddings, hidden_size, num_layers, num_classes, bidirectional)
  model.to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.01)
  # `train_with_early_stopping` reads `model`, `train_loader` and `val_loader`
  # from the notebook globals assigned above.
  train_loss_history, val_accuracy_history = train_with_early_stopping(device, criterion, optimizer)

  # Training loss, one point per batch.
  pd.Series(train_loss_history).plot()
  plt.show()

  #X-axis: Epochs, Y-axis: validation accuracy
  pd.Series(val_accuracy_history).plot()
  plt.show()

  # Reload the checkpoint written by `train_with_early_stopping` and compute
  # the test accuracy with it.
  # NOTE(review): the checkpoint is a fully pickled module; newer PyTorch
  # releases default `torch.load` to weights_only=True, which rejects such
  # files -- confirm against the installed version.
  # device = "cuda:0"
  model = torch.load('best_model.pt')
  test_accuracy = evaluate(model, test_loader, device)
  print(test_accuracy)

  #score(key_file, response_file)