<a href="https://colab.research.google.com/github/andrewdge/CSE354-Final-Project/blob/main/CSE354_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Document-Level Predictions Using Paragraph-Level Sentiment

Using the PerSent dataset, we will be training our model based on paragraph-level sentiments. These will be aggregated to produce document-level sentiments.

In [2]:
# Install the Hugging Face transformers package into the Colab runtime
# (the recorded output of this cell resolved to transformers 4.18.0).
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 3.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 3.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 44.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.3 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacre

In [3]:
# Mount Google Drive at /content/drive; the CSV paths defined in the constants
# cell point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import math
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import Dataset, TensorDataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer
# Seed the PyTorch and NumPy random number generators with a fixed value.
torch.manual_seed(42)
np.random.seed(42)

# Constants

Constants we will use in our experiments. These may be subject to change, as they are hyperparameters.

In [7]:
# Dropout probabilities handed to DistilBertConfig when the model is built.
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
# Mini-batch size used by DatasetLoader.get_data_loaders.
BATCH_SIZE = 16
EPOCHS = 3

# Andrew PATH
# NOTE(review): absolute Google Drive paths — they only resolve once the drive
# is mounted at /content/drive and the account has this folder layout.
TEST_PATH = '/content/drive/MyDrive/CSE354/random_test.csv'
TRAIN_PATH = '/content/drive/MyDrive/CSE354/train.csv'

# Read both CSV files into dataframes.
test = pd.read_csv(TEST_PATH)
train = pd.read_csv(TRAIN_PATH)



# Initializing Our Model

Here is where we set up our DistilBERT model.

In [8]:
class DistillBERT():
  """Loads the pretrained ``distilbert-base-uncased`` tokenizer and model.

  The model is configured with the notebook-level dropout constants
  (DISTILBERT_DROPOUT, DISTILBERT_ATT_DROPOUT) and returns hidden states.
  """

  def __init__(self):
    # TODO(students): start
    self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    config = DistilBertConfig(dropout=DISTILBERT_DROPOUT,
                              attention_dropout=DISTILBERT_ATT_DROPOUT,
                              output_hidden_states=True)
    # Keep the model on the instance: get_tokenizer_and_model() reads
    # self.model, which was previously never assigned (AttributeError).
    # NOTE(review): Trainer calls the model with `labels=` and reads
    # `.logits` / `.loss`; the bare DistilBertModel encoder provides neither.
    # A classification head (e.g. DistilBertForSequenceClassification with
    # num_labels=3) looks like what is needed — confirm before training.
    self.model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
    # TODO(students): end

  def get_tokenizer_and_model(self):
    """Return ``(model, tokenizer)`` — note the order is model first."""
    return self.model, self.tokenizer

# DataLoader

This class handles loading, preprocessing, and tokenizing the data.

Each row in the dataframe contains text made up of some number of paragraphs, as well as a number of per-paragraph labels. We add another column to the dataframe: the number of paragraphs per document. This will be used later to evaluate our predictions, comparing paragraph-level predictions to paragraph labels as well as document-level predictions to document labels. We also remove rows without paragraph-level labels.

The paragraph labels of each document are combined into a new column.

This format largely takes inspiration from Assignment 3.

In [9]:
class DatasetLoader(Dataset):
  """Turns a dataframe of documents into a paragraph-level tensor dataset.

  Each row supplies a newline-separated ``DOCUMENT`` and per-paragraph
  sentiment columns starting at column index 6 (``Paragraph0`` onwards).
  Every labelled paragraph becomes one ``(token_ids, label_id)`` example.

  Args:
    data: the uncleaned data, as a dataframe.
    tokenizer: callable tokenizer; called as
      ``tokenizer(text=..., truncation=..., max_length=..., return_tensors='pt')``
      and expected to return an object with ``input_ids``.
  """

  # Sentiment strings found in the paragraph label columns -> class ids.
  LABEL_TO_ID = {'Negative': 0,
                 'Neutral': 1,
                 'Positive': 2}

  def __init__(self, data, tokenizer):
    # Data is the uncleaned data, as a dataframe.
    self.data = data
    self.tokenizer = tokenizer

  @staticmethod
  def _strip_trailing_missing(values):
    """Return ``values`` without the run of missing entries at its end."""
    end = len(values)
    while end > 0 and pd.isna(values[end - 1]):
      end -= 1
    return values[:end]

  def preprocess_data(self):
    """Drop unlabelled rows and gather each row's labels into 'Paragraph Labels'.

    Rows whose ``Paragraph0`` is missing are removed. The remaining label
    columns are combined into one list per row with the trailing missing
    entries stripped; interior positions are kept so that label ``i`` still
    lines up with paragraph ``i``. The input dataframe is not modified and
    calling this method more than once gives the same result.
    """
    # .copy() so the new column is written to an independent frame rather
    # than to a filtered view of the caller's dataframe.
    df = self.data[self.data['Paragraph0'].notna()].copy()
    # Skip the derived column so a second call does not fold it into itself.
    label_columns = [column for column in df.columns[6:] if column != 'Paragraph Labels']
    rows = df[label_columns].values.tolist()
    label_lists = [self._strip_trailing_missing(row) for row in rows]
    df['Paragraph Labels'] = pd.Series(label_lists, index=df.index, dtype=object)
    self.data = df

  def tokenize_data(self):
    """Tokenize every labelled paragraph and return a ``TensorDataset``.

    Returns:
      TensorDataset of ``(tokens, labels)`` where ``tokens`` is the padded
      id matrix and ``labels`` holds the ids from ``LABEL_TO_ID``. Tensors
      stay on the CPU.

    Raises:
      ValueError: if no paragraph carries a recognised label.
    """
    # Make sure the combined label column exists even when the caller did
    # not run preprocess_data() explicitly.
    if 'Paragraph Labels' not in self.data.columns:
      self.preprocess_data()

    tokens = []
    label_ids = []
    document_list = self.data['DOCUMENT']
    label_list = self.data['Paragraph Labels']

    for (document, paragraph_labels) in tqdm(zip(document_list, label_list), total=len(document_list)):
      paragraphs = document.split('\n')
      for paragraph, label in zip(paragraphs, paragraph_labels):
        label_id = self.LABEL_TO_ID.get(label)
        if label_id is None:
          # Missing or unknown label: nothing to train on for this paragraph.
          continue
        encoding = self.tokenizer(text=paragraph, truncation='longest_first', max_length=512, return_tensors='pt')
        tokens.append(encoding.input_ids[0])
        label_ids.append(label_id)

    if not tokens:
      raise ValueError("No labelled paragraphs found; nothing to tokenize.")

    # Pad with the tokenizer's pad id when it exposes one, otherwise 0
    # (pad_sequence's default).
    pad_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_id is None:
      pad_id = 0
    tokens = pad_sequence(tokens, batch_first=True, padding_value=pad_id)
    labels = torch.tensor(label_ids, dtype=torch.long)
    return TensorDataset(tokens, labels)

  def get_data_loaders(self, shuffle=True, batch_size=None):
    """Build a ``DataLoader`` over the tokenized dataset.

    Args:
      shuffle: whether to reshuffle the examples each epoch.
      batch_size: mini-batch size; defaults to the notebook-level BATCH_SIZE.
    """
    if batch_size is None:
      batch_size = BATCH_SIZE
    processed_dataset = self.tokenize_data()
    data_loader = DataLoader(
        processed_dataset,
        shuffle=bool(shuffle),
        batch_size=batch_size
    )
    return data_loader

In [None]:
# Sanity check: print how many newline-separated paragraphs the first
# training document contains.
p = (train['DOCUMENT'].values[0])
x = p.split('\n')
print(len(x))


5


In [10]:
class Trainer():
  """Fine-tunes the DistillBERT wrapper's model on paragraph-level labels.

  ``args`` is a dict-like object with the keys:
    train_data, val_data: dataframes handed to DatasetLoader.
    batch_size: mini-batch size for both data loaders.
    epochs: number of passes over the training data.
    save_path: directory used by save_transformer().
    training_type (optional): 'frozen_embeddings', 'top_2_training' or
      'top_4_training'; any other value (default 'all_training') leaves
      every parameter trainable.
  """

  def __init__(self, args):
    self.train_data = args['train_data']
    self.val_data = args['val_data']
    self.batch_size = args['batch_size']
    self.epochs = args['epochs']
    self.save_path = args['save_path']
    # Read by set_training_parameters(); optional so existing args dicts keep working.
    self.training_type = args['training_type'] if 'training_type' in args else 'all_training'
    # Must exist before the model is moved and before train()/eval() run.
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    transformer = DistillBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  @staticmethod
  def _to_numpy(values):
    """Convert a tensor (possibly on GPU / tracking gradients) or array-like to ndarray."""
    if isinstance(values, torch.Tensor):
      return values.detach().cpu().numpy()
    return np.asarray(values)

  def get_performance_metrics(self, preds, labels):
    """Return macro-averaged ``(precision, recall, f1)`` for one batch.

    Args:
      preds: per-class scores, indexed ``[example, class]``.
      labels: true class ids, one per example.
    """
    pred_flat = np.argmax(self._to_numpy(preds), axis=1).flatten()
    labels_flat = self._to_numpy(labels).flatten()
    # average='macro': the default 'binary' raises on targets with more than
    # two classes.
    precision = precision_score(labels_flat, pred_flat, average='macro', zero_division=0)
    recall = recall_score(labels_flat, pred_flat, average='macro', zero_division=0)
    f1 = f1_score(labels_flat, pred_flat, average='macro', zero_division=0)
    return precision, recall, f1

  def set_training_parameters(self):
    """Freeze parameters according to ``self.training_type``.

    - 'frozen_embeddings': freeze the embedding parameters.
    - 'top_2_training': freeze transformer layers other than 4 and 5.
    - 'top_4_training': freeze transformer layers 0 and 1.
    - anything else: train everything (no change).
    """
    # TODO(students): start
    def in_layers(name, indices):
      # Trailing dot so that e.g. index 1 can never match a 'layer.10' name.
      return any(f"transformer.layer.{i}." in name for i in indices)

    tt = self.training_type
    if tt == "frozen_embeddings":
      should_freeze = lambda name: "embeddings." in name
    elif tt == "top_2_training":
      # A layer parameter is frozen only when it belongs to NEITHER kept
      # layer; testing each kept layer on its own froze all of them.
      should_freeze = lambda name: "transformer.layer." in name and not in_layers(name, (4, 5))
    elif tt == "top_4_training":
      should_freeze = lambda name: in_layers(name, (0, 1))
    else: # all training
      return # since all layers are trained, do nothing

    for name, param in self.model.named_parameters():
      if should_freeze(name):
        param.requires_grad = False
    # TODO(students): end

  def _forward(self, reviews, labels):
    """Run the model on one batch, masking out padding positions."""
    input_ids = reviews.to(self.device)
    pad_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_id is None:
      pad_id = 0
    # Without a mask the model attends to the padding added when batching.
    attention_mask = (input_ids != pad_id).long()
    # NOTE(review): `labels=` and the `.logits` / `.loss` fields read by the
    # callers are only available on a model with a classification head.
    return self.model(input_ids, attention_mask=attention_mask, labels=labels.to(self.device))

  def _run_epoch(self, data_loader, optimizer=None):
    """One pass over ``data_loader``; updates weights only when an optimizer is given.

    Returns the per-batch averages ``(precision, recall, f1, loss)`` as floats.
    """
    training = optimizer is not None
    self.model.train(training)
    sums = [0.0, 0.0, 0.0, 0.0]

    with torch.set_grad_enabled(training):
      for (reviews, labels) in tqdm(data_loader):
        if training:
          self.model.zero_grad()
        output = self._forward(reviews, labels)
        if training:
          output.loss.backward()
          optimizer.step()
        prec, rec, f1 = self.get_performance_metrics(output.logits, labels)
        # .item() so the running total does not keep each batch's graph alive.
        batch_values = (prec, rec, f1, output.loss.item())
        sums = [total + value for total, value in zip(sums, batch_values)]

    # Guard against an empty loader instead of dividing by zero.
    num_batches = max(len(data_loader), 1)
    return tuple(total / num_batches for total in sums)

  def train(self, data_loader, optimizer):
    """Train for one epoch; returns ``(precision, recall, f1, loss)`` batch averages."""
    return self._run_epoch(data_loader, optimizer)

  def eval(self, data_loader):
    """Evaluate without gradient tracking; returns ``(precision, recall, f1, loss)``."""
    return self._run_epoch(data_loader)

  def save_transformer(self):
    """Write the model and tokenizer to ``self.save_path``."""
    self.model.save_pretrained(self.save_path)
    self.tokenizer.save_pretrained(self.save_path)

  def _make_loader(self, frame, shuffle):
    """Preprocess and tokenize ``frame`` and wrap it in a DataLoader.

    The loader is built here so that ``self.batch_size`` is honoured;
    passing it positionally to ``get_data_loaders`` bound it to ``shuffle``.
    """
    dataset = DatasetLoader(frame, self.tokenizer)
    dataset.preprocess_data()
    return DataLoader(dataset.tokenize_data(), shuffle=shuffle, batch_size=self.batch_size)

  def execute(self):
    """Run the full train/validate loop for ``self.epochs`` epochs."""
    last_best = 0
    train_data_loader = self._make_loader(self.train_data, shuffle=True)
    # Validation order does not matter, so keep it fixed.
    val_data_loader = self._make_loader(self.val_data, shuffle=False)
    # Freeze first, then hand only the trainable parameters to the optimizer.
    self.set_training_parameters()
    trainable = [param for param in self.model.parameters() if param.requires_grad]
    optimizer = AdamW(trainable, lr = 3e-5, eps = 1e-8)
    for epoch_i in range(0, self.epochs):
      train_precision, train_recall, train_f1, train_loss = self.train(train_data_loader, optimizer)
      print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_f1: {train_f1:.4f}')
      val_precision, val_recall, val_f1, val_loss = self.eval(val_data_loader)
      print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_f1: {val_f1:.4f}')

      if val_f1 > last_best:
        print("Saving model..")
        # NOTE(review): the save call is disabled, so nothing is written even
        # though "Model saved." is printed — re-enable once save_path is set up.
        # self.save_transformer()
        last_best = val_f1
        print("Model saved.")