In [1]:
# Mount Google Drive into the Colab runtime; later cells read and write
# paths that start with 'drive/MyDrive/...'.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
%%bash
pip install torch
pip install datasets
pip install nltk
pip install transformers[torch]

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 472.7/472.7 kB 15.7 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 10.6 MB/s eta 0:00:00
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 10.7 MB/s eta 0:00:00
Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 

In [3]:
import os
import random
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
import pickle
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoModel, DistilBertModel, Trainer, TrainingArguments, AutoTokenizer, DistilBertForSequenceClassification, DistilBertTokenizer, EarlyStoppingCallback


# Fetch the 'punkt' resource for nltk (word_tokenize is imported above).
nltk.download('punkt')

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid -- confirm
# it is meant to stay enabled for the training runs below.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Value passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Request deterministic cuDNN kernels and disable the autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

In [5]:
# Load the Rotten Tomatoes dataset and keep a handle on each split.
dataset = load_dataset('rotten_tomatoes')
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Pandas copies of the three splits.
train = train_dataset.to_pandas()
val = validation_dataset.to_pandas()
test = test_dataset.to_pandas()

# Longest review across all splits, plus a margin of 5.
# NOTE(review): len() on the raw string counts characters, not tokens, and none
# of the tokenizer calls visible in this notebook pass max_length -- confirm
# whether this value is still needed.
max_length = 0
for frame in (train, val, test):
    max_length = max(max_length, frame['text'].apply(lambda x: len(x)).max())
max_length += 5
max_length

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

272

# Without Augmentation

In [6]:
# Pretrained DistilBERT tokenizer plus a sequence-classification model with two
# output labels, moved onto the selected device.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [7]:
def tokenize_fn(batch):
  """Tokenize the 'text' column of a batch, for use with `Dataset.map(batched=True)`.

  Sequences are truncated but deliberately left unpadded. Padding here would
  stretch every example to the longest sequence of its whole map batch; the
  Trainer cells pass `tokenizer=`, so each training/eval batch is padded to
  its own longest sequence when it is collated.

  Args:
    batch: Mapping with a 'text' entry holding the strings to tokenize.

  Returns:
    The tokenizer output for the batch (variable-length sequences).
  """
  return tokenizer(batch['text'], truncation = True)

In [8]:
def compute_metrics(logits_and_labels):
  """Return accuracy for a (logits, labels) pair.

  Args:
    logits_and_labels: Two-element iterable; the first item holds per-class
      scores with classes on the last axis, the second the reference labels.

  Returns:
    Dict with a single 'accuracy' entry: the fraction of rows whose
    highest-scoring class equals the label.
  """
  scores, gold = logits_and_labels
  hits = np.argmax(scores, axis = -1) == gold
  return {'accuracy': np.mean(hits)}

In [9]:
# Tokenize the splits consumed by the Trainer.
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_val = validation_dataset.map(tokenize_fn, batched=True)

# Checkpointing, evaluation and logging share one step interval; training and
# evaluation share one per-device batch size.
STEP_INTERVAL = 100
BATCH_SIZE = 128

training_args = TrainingArguments(
    output_dir='drive/MyDrive/SC4002/DistilBERT',
    eval_strategy='steps',
    save_steps=STEP_INTERVAL,
    eval_steps=STEP_INTERVAL,
    logging_steps=STEP_INTERVAL,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=15,
    learning_rate=2e-5,
    # Keep the checkpoint with the highest validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    report_to='none',
)

# NOTE(review): the threshold is 0.01 in accuracy units; in the recorded run
# below every evaluation after step 100 improved by less than that and
# training stopped at step 600 -- confirm this value is intended.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[early_stopping_callback],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy
100,0.4539,0.375131,0.834897
200,0.266,0.379919,0.844278
300,0.1534,0.462377,0.841463
400,0.0995,0.522847,0.841463
500,0.0617,0.582295,0.842402
600,0.0384,0.655102,0.844278


TrainOutput(global_step=600, training_loss=0.17880892872810364, metrics={'train_runtime': 508.1176, 'train_samples_per_second': 251.812, 'train_steps_per_second': 1.978, 'total_flos': 1542442110738048.0, 'train_loss': 0.17880892872810364, 'epoch': 8.955223880597014})

In [10]:
# Reload the step-200 checkpoint and predict the test split. Neither the
# model nor the encodings are moved to `device` in this cell.
model_dir = 'drive/MyDrive/SC4002/DistilBERT/checkpoint-200'
saved_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)  # rebinds the notebook-wide `tokenizer`

test_texts = test['text'].to_list()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

with torch.no_grad():
    # `logits` holds the model's output object; the raw scores are in `.logits`.
    logits = saved_model(**test_encodings)
    probabilities = logits.logits.softmax(dim=1)
predicted_labels = probabilities.argmax(dim=1)
predicted_labels

tensor([1, 1, 0,  ..., 0, 0, 0])

In [11]:
# Score the `predicted_labels` tensor left by the previous cell against the
# labels of the test split.
predictions = predicted_labels.numpy().tolist()
ground_truth_labels = test['label'].to_list()

correct_predictions = sum(int(p == g) for p, g in zip(predictions, ground_truth_labels))
accuracy = correct_predictions / len(ground_truth_labels)
print("accuracy = ", accuracy)

accuracy =  0.8405253283302064


In [12]:
# Reload the step-600 checkpoint, predict the test split and report accuracy.
# Neither the model nor the encodings are moved to `device` in this cell.
model_dir = 'drive/MyDrive/SC4002/DistilBERT/checkpoint-600'
saved_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)  # rebinds the notebook-wide `tokenizer`

test_texts = test['text'].to_list()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

with torch.no_grad():
    # `logits` holds the model's output object; the raw scores are in `.logits`.
    logits = saved_model(**test_encodings)
    probabilities = logits.logits.softmax(dim=1)
predicted_labels = probabilities.argmax(dim=1)

predictions = predicted_labels.numpy().tolist()
ground_truth_labels = test['label'].to_list()

correct_predictions = sum(int(p == g) for p, g in zip(predictions, ground_truth_labels))
accuracy = correct_predictions / len(ground_truth_labels)
print("accuracy = ", accuracy)

accuracy =  0.8424015009380863


In [15]:
# Evaluate the in-memory `model` (the object handed to the Trainer above) on
# the test split, on `device`.
test_encodings = tokenizer(test['text'].to_list(), truncation = True, padding = True, return_tensors = 'pt').to(device)

# Put the model in eval mode explicitly so dropout is off no matter which
# mode the Trainer left it in; otherwise test predictions can be stochastic.
model.eval()
with torch.no_grad():
    # `logits` holds the model's output object; the raw scores are in `.logits`.
    logits = model(**test_encodings)
    probabilities = torch.softmax(logits.logits, dim = 1)
predicted_labels = torch.argmax(probabilities, dim = 1)

# Bring predictions back to the CPU before converting them to a Python list.
predictions = predicted_labels.cpu().numpy()
predictions = predictions.tolist()
ground_truth_labels = test['label'].to_list()

correct_predictions = sum(1 for p, g in zip(predictions, ground_truth_labels) if p == g)
accuracy = correct_predictions / len(ground_truth_labels)
print("accuracy = ", accuracy)

accuracy =  0.8405253283302064


# With Augmentation

In [16]:
# Augmented training set, read from the CSV on Drive.
augmented_train = pd.read_csv('drive/MyDrive/SC4002/augmented_combined_train_dataset.csv')

# Longest text across the augmented train, validation and test frames, plus
# a margin of 5.
# NOTE(review): len() on the raw string counts characters, not tokens, and none
# of the tokenizer calls visible in this notebook pass max_length.
max_length = 0
for frame in (augmented_train, val, test):
    max_length = max(max_length, frame['text'].apply(lambda x: len(x)).max())
max_length += 5
max_length

286

In [17]:
# Start the augmented experiment from the pretrained DistilBERT tokenizer and a
# newly loaded two-label classification model, moved onto the selected device.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [20]:
# Import the Hugging Face class under an alias: a bare `Dataset` would replace
# the torch.utils.data.Dataset imported at the top of the notebook.
from datasets import Dataset as HFDataset

# Wrap the augmented DataFrame as a Hugging Face dataset and tokenize the
# splits consumed by the Trainer.
augmented_train_dataset = HFDataset.from_pandas(augmented_train)
tokenized_train = augmented_train_dataset.map(tokenize_fn, batched = True)
tokenized_val = validation_dataset.map(tokenize_fn, batched = True)

# Same hyperparameters as the non-augmented run; only the output directory differs.
training_args = TrainingArguments(
    output_dir = 'drive/MyDrive/SC4002/DistilBERT_augmented',
    eval_strategy = 'steps',
    save_steps = 100,
    eval_steps = 100,
    logging_steps = 100,
    per_device_train_batch_size = 128,
    per_device_eval_batch_size = 128,
    num_train_epochs = 15,
    learning_rate = 2e-5,
    # Keep the checkpoint with the highest validation accuracy.
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy',
    greater_is_better = True,
    report_to = 'none',
)

# NOTE(review): the threshold is 0.01 in accuracy units; in the recorded run
# below training stopped at step 600, the evaluation with the highest accuracy
# so far -- confirm this value is intended.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience = 5,
    early_stopping_threshold = 0.01
)

trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = tokenized_train,
        eval_dataset = tokenized_val,
        callbacks = [early_stopping_callback],
        tokenizer = tokenizer,
        compute_metrics = compute_metrics,
    )

trainer.train()

Map:   0%|          | 0/12935 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy
100,0.4566,0.3557,0.838649
200,0.2699,0.350576,0.847092
300,0.1884,0.413228,0.848968
400,0.1232,0.473385,0.847092
500,0.0791,0.537274,0.84803
600,0.0577,0.601555,0.85272


TrainOutput(global_step=600, training_loss=0.1958197776476542, metrics={'train_runtime': 504.7582, 'train_samples_per_second': 384.392, 'train_steps_per_second': 3.031, 'total_flos': 1537650257826900.0, 'train_loss': 0.1958197776476542, 'epoch': 5.882352941176471})

In [21]:
# Reload the step-600 checkpoint of the augmented run, predict the test split
# and report accuracy. Neither the model nor the encodings are moved to `device`.
augmented_model_dir = 'drive/MyDrive/SC4002/DistilBERT_augmented/checkpoint-600'
saved_model = AutoModelForSequenceClassification.from_pretrained(augmented_model_dir)
tokenizer = AutoTokenizer.from_pretrained(augmented_model_dir)  # rebinds the notebook-wide `tokenizer`

test_texts = test['text'].to_list()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

with torch.no_grad():
    # `logits` holds the model's output object; the raw scores are in `.logits`.
    logits = saved_model(**test_encodings)
    probabilities = logits.logits.softmax(dim=1)
predicted_labels = probabilities.argmax(dim=1)

predictions = predicted_labels.numpy().tolist()
ground_truth_labels = test['label'].to_list()

correct_predictions = sum(int(p == g) for p, g in zip(predictions, ground_truth_labels))
accuracy = correct_predictions / len(ground_truth_labels)
print("accuracy = ", accuracy)

accuracy =  0.8395872420262664


In [22]:
# Reload the step-200 checkpoint of the augmented run, predict the test split
# and report accuracy. Neither the model nor the encodings are moved to `device`.
augmented_model_dir = 'drive/MyDrive/SC4002/DistilBERT_augmented/checkpoint-200'
saved_model = AutoModelForSequenceClassification.from_pretrained(augmented_model_dir)
tokenizer = AutoTokenizer.from_pretrained(augmented_model_dir)  # rebinds the notebook-wide `tokenizer`

test_texts = test['text'].to_list()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

with torch.no_grad():
    # `logits` holds the model's output object; the raw scores are in `.logits`.
    logits = saved_model(**test_encodings)
    probabilities = logits.logits.softmax(dim=1)
predicted_labels = probabilities.argmax(dim=1)

predictions = predicted_labels.numpy().tolist()
ground_truth_labels = test['label'].to_list()

correct_predictions = sum(int(p == g) for p, g in zip(predictions, ground_truth_labels))
accuracy = correct_predictions / len(ground_truth_labels)
print("accuracy = ", accuracy)

accuracy =  0.8461538461538461


In [23]:
# Evaluate the in-memory `model` (the object handed to the augmented-run
# Trainer above) on the test split, on `device`.
test_encodings = tokenizer(test['text'].to_list(), truncation = True, padding = True, return_tensors = 'pt').to(device)

# Put the model in eval mode explicitly so dropout is off no matter which
# mode the Trainer left it in; otherwise test predictions can be stochastic.
model.eval()
with torch.no_grad():
    # `logits` holds the model's output object; the raw scores are in `.logits`.
    logits = model(**test_encodings)
    probabilities = torch.softmax(logits.logits, dim = 1)
predicted_labels = torch.argmax(probabilities, dim = 1)

# Bring predictions back to the CPU before converting them to a Python list.
predictions = predicted_labels.cpu().numpy()
predictions = predictions.tolist()
ground_truth_labels = test['label'].to_list()

correct_predictions = sum(1 for p, g in zip(predictions, ground_truth_labels) if p == g)
accuracy = correct_predictions / len(ground_truth_labels)
print("accuracy = ", accuracy)

accuracy =  0.8395872420262664
