In [1]:
# COLAB
from google.colab import drive
drive.mount('/content/drive')

# COMMON
import numpy as np
import torch

# STDLIB (seeding, console progress output, result pickling)
import os
import pickle
import random
import sys

# DATA
import pandas as pd

# PREPROCESS DATA
import re
!pip install contractions
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

nltk.download('punkt')
nltk.download('stopwords')

# DATASET/LOADER
!pip install datasets
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# MODEL
from transformers import AutoModelForSequenceClassification

# TRAIN
from tqdm.auto import tqdm
import torch.optim as optim
import torch.nn as nn

# METRICS
import sklearn.metrics as sm

Mounted at /content/drive
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [2]:
# Run configuration shared by the cells below.
# Hugging Face checkpoint name handed to the tokenizer and model loaders.
MODEL_CHECKPOINT = "bert-base-cased"
# Examples per batch for both the train and dev DataLoaders.
BATCH_SIZE = 32
# Passed to the tokenizer as max_length; also the word-window size used by sample_text.
MAX_LENGTH = 256
# Number of passes of the training loop.
NUM_EPOCHS = 3
# Learning rate given to the Adam optimizer.
LEARNING_RATE = 5e-5

# Select the GPU when CUDA is available, otherwise the CPU.
# The model and every batch are moved to this device later on.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# set seed
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used in this notebook.

    Seeds NumPy, Python's `random`, and PyTorch (CPU and CUDA), switches cuDNN
    to deterministic mode with benchmarking disabled, and exports
    PYTHONHASHSEED.

    Args:
        seed: Value applied to all generators.
    """
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN: force deterministic kernels and turn off the auto-tuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): presumably this only influences interpreters started after
    # this point, not the already-running kernel — confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

    print(f"Random seed set as {seed}")

set_seed()

Random seed set as 42


# Prepare Data

In [4]:
# locations of the raw train / development splits on the mounted Drive
train_data_path = "/content/drive/MyDrive/nlu/data/train.csv"
dev_data_path = "/content/drive/MyDrive/nlu/data/dev.csv"

# read both splits into DataFrames (train first, then dev)
train_data, dev_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, dev_data_path)
)

In [None]:
# display the raw training frame as a quick sanity check of the loaded columns
train_data

Unnamed: 0,text_1,text_2,label
0,Nick ( Kevin Anderson ) goes back to his homet...,Bank clerk Miles Cullen ( Elliott Gould ) is r...,1
1,"Kate Nelligan , always a forthright and grippi...",Anthony Perkins reportedly felt threatened as ...,1
2,"Patrick, Please, contact Zimin Lu, 713 853 638...","Corey, Paula West Trilium Court 107 The Woodla...",1
3,"wow, ok so my mom was saying how when she gets...","dude, i'm cold",1
4,"Flat broke again , Stan borrows from ' Basher ...",I had been underwhelmed by my first viewing of...,0
...,...,...,...
29995,"John and Krishna, I am sending you an outline ...",FYI Vince,1
29996,You want to know about rabies ? Try this : One...,Wow what a great idea for a movie getting a bu...,1
29997,Plz see email. Sent you info for u & Rick. Bes...,Here's Paul's memo with Mona's edits and a cou...,1
29998,Pretty blonde Paris Hilton ( as Victoria Engli...,Crippled violinist David Miles ( as Filippo ) ...,1


# Preprocessing

In [None]:
# Optional shortcut: reuse the preprocessed CSVs if a previous session saved them.
# The files are only written by the "save preprocessed data" cell further down,
# so on a fresh Drive they do not exist yet; in that case keep the raw frames
# loaded above and let the following cells preprocess them.
preprocessed_train_path = "/content/drive/MyDrive/nlu/data/preprocess_train.csv"
preprocessed_dev_path = "/content/drive/MyDrive/nlu/data/preprocess_dev.csv"

if os.path.exists(preprocessed_train_path) and os.path.exists(preprocessed_dev_path):
    train_data = pd.read_csv(preprocessed_train_path)
    dev_data = pd.read_csv(preprocessed_dev_path)

In [5]:
# Drop, in place, every row that has a missing value in any column.
# TODO: dev rows containing NaN are removed as well, so no prediction is ever
# produced for them — confirm that this is acceptable.
for frame in (train_data, dev_data):
    frame.dropna(axis=0, how="any", inplace=True)

In [6]:
# spaCy pipeline and NLTK English stop-word set. In the cells visible here they
# are referenced only by the commented-out lemmatization / stop-word steps
# inside preprocess_text.
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# define preprocessing function
def preprocess_text(text):
    """Normalize one raw text string before tokenization.

    Active steps, in order: lower-casing, contraction expansion
    (e.g. "don't" -> "do not"), and collapsing every whitespace run into a
    single space. Punctuation removal, stop-word removal and lemmatization
    are kept below as disabled experiments.

    Args:
        text: The raw string to clean.

    Returns:
        The normalized string.
    """
    # NOTE(review): the text is lower-cased although MODEL_CHECKPOINT is a
    # cased checkpoint — confirm this is intended.
    lowered = text.lower()
    expanded = contractions.fix(lowered)

    # disabled: remove punctuations
    # expanded = re.sub(r'[^\w\s]', '', expanded)

    # collapse runs of whitespace (spaces, tabs, newlines) into one space
    normalized = re.sub(r'\s+', ' ', expanded, flags=re.I)

    # disabled: remove stopwords
    # tokens = word_tokenize(normalized)
    # tokens = [token for token in tokens if token not in stop_words]
    # normalized = ' '.join(tokens)

    # disabled: lemmatize
    # tokens = [token.lemma_ for token in nlp(normalized)]
    # normalized = " ".join(tokens)

    return normalized

In [7]:
# run preprocess_text over both text columns of the train and development frames
for frame in (train_data, dev_data):
    for column in ('text_1', 'text_2'):
        frame[column] = frame[column].apply(preprocess_text)

In [None]:
# write the preprocessed frames back to Drive (without the index) so a later
# session can reload them
preprocessed_outputs = (
    (train_data, "/content/drive/MyDrive/nlu/data/preprocess_train.csv"),
    (dev_data, "/content/drive/MyDrive/nlu/data/preprocess_dev.csv"),
)
for frame, csv_path in preprocessed_outputs:
    frame.to_csv(csv_path, index=False)

In [None]:
# Longest and mean text length per column, to inform the tokenization settings.
# Lengths are measured in characters (len of the string), not in tokens.
lengths_1 = [len(text) for text in train_data['text_1']]
lengths_2 = [len(text) for text in train_data['text_2']]

max_len_1 = max(lengths_1, default=0)
max_len_2 = max(lengths_2, default=0)

# despite their names these hold the summed lengths; the mean is formed below
avg_len_1 = sum(lengths_1)
avg_len_2 = sum(lengths_2)

max_len_1, max_len_2, avg_len_1 / len(train_data['text_1']), avg_len_2 / len(train_data['text_2'])

# Data Loader

In [8]:
# sample the text by MAX_LENGTH
def sample_text(texts, max_length=None):
    """Cut each over-long text down to a random contiguous window of words.

    Words are the pieces obtained by splitting on a single space, so the
    window is counted in whitespace-separated words, not tokenizer tokens.
    Texts with at most `max_length` words are returned unchanged.

    Args:
        texts: Iterable of strings.
        max_length: Window size in words. Defaults to the notebook-level
            MAX_LENGTH when omitted.

    Returns:
        A list with one string per input text, in the same order.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    res = []

    for text in texts:
        # split once and reuse the word list for the check and the slice
        words = text.split(" ")

        if len(words) > max_length:
            # randint is inclusive on both ends, so the last valid start is allowed
            start_index = random.randint(0, len(words) - max_length)
            res.append(" ".join(words[start_index: start_index + max_length]))
        else:
            res.append(text)

    return res

# tokenization with random sampling
def random_sample_preprocess_function(records):
    """Tokenize a batch of text pairs after random window sampling.

    Each of the 'text_1' and 'text_2' columns is passed through sample_text
    (text_1 first), then the two lists are tokenized together as sentence
    pairs with truncation to MAX_LENGTH.

    Args:
        records: Batched mapping with 'text_1' and 'text_2' entries.

    Returns:
        The tokenizer output for the sampled pairs.
    """
    first_texts, second_texts = (
        sample_text(records[column]) for column in ('text_1', 'text_2')
    )

    return tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        return_token_type_ids=True,
        max_length=MAX_LENGTH,
    )

In [9]:
# tokenizer matching the model checkpoint, plus a collator that pads each batch
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(records):
    """Tokenize a batch of text pairs deterministically (no random sampling).

    Args:
        records: Batched mapping with 'text_1' and 'text_2' entries.

    Returns:
        The tokenizer output for the pairs, truncated to MAX_LENGTH.
    """
    return tokenizer(
        records['text_1'],
        records['text_2'],
        truncation=True,
        return_token_type_ids=True,
        max_length=MAX_LENGTH,
    )

# own dataset
class PairwiseDataset(torch.utils.data.Dataset):
    """Torch dataset holding tokenized text pairs and their labels.

    The DataFrame is converted to a Hugging Face Dataset and tokenized once in
    the constructor: with random window sampling when `train` is true,
    deterministically otherwise.
    """

    def __init__(self, data, train=True):
        """Tokenize `data` and keep the encoded columns.

        Args:
            data: DataFrame converted with Dataset.from_pandas; the map
                functions read its 'text_1' / 'text_2' columns and 'label' is
                read back afterwards.
            train: Choose random-sampling tokenization (True) or the plain
                deterministic tokenization (False).
        """
        tokenize_fn = random_sample_preprocess_function if train else preprocess_function
        encoded = Dataset.from_pandas(data).map(tokenize_fn, batched=True)

        self.input_ids = encoded["input_ids"]
        self.token_type_ids = encoded["token_type_ids"]
        self.attention_mask = encoded["attention_mask"]
        self.labels = encoded["label"]

    def __len__(self):
        """Number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        The label is emitted under the key 'label'; the training code reads
        batch['labels'], so the collator is presumably what renames it —
        confirm.
        """
        fields = (
            ('input_ids', self.input_ids),
            ('token_type_ids', self.token_type_ids),
            ('attention_mask', self.attention_mask),
            ('label', self.labels),
        )
        return {key: torch.tensor(values[idx]) for key, values in fields}

# call this function in every epoch to get randomly sampled train dataset
def get_train_dataloader():
    """Build a fresh, shuffled training DataLoader.

    A new PairwiseDataset is created on every call, so the random window
    sampling of the training texts is redone each time.

    Returns:
        A DataLoader over the training data with BATCH_SIZE examples per
        batch, padded by data_collator.
    """
    return torch.utils.data.DataLoader(
        PairwiseDataset(train_data, train=True),
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=data_collator,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [10]:
# The development split is tokenized once, without random sampling, so every
# epoch is evaluated on identical inputs and the metrics stay comparable.
# (The training loader is not built here; it is recreated by get_train_dataloader().)
dev_dataset = PairwiseDataset(dev_data, train=False)

# fixed iteration order for evaluation
dev_dataloader = torch.utils.data.DataLoader(
    dev_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

Map:   0%|          | 0/5984 [00:00<?, ? examples/s]

# Model

In [11]:
# Load the pretrained checkpoint with a sequence-classification head.
# num_labels=1 gives one logit per example, which the training code flattens
# and feeds to BCEWithLogitsLoss.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [12]:
# return metrics
def metrics(y_true, y_pred):
    """Compute the classification metrics reported for an evaluation pass.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Dict with accuracy, macro-averaged precision / recall / F1, the same
        three with average='weighted' (stored under the "W Macro-*" keys),
        and the Matthews correlation coefficient.
    """
    scores = {"Accuracy": sm.accuracy_score(y_true, y_pred)}

    # same three scores for each averaging mode; the prefix sets the key name
    for key_prefix, average in (("Macro", "macro"), ("W Macro", "weighted")):
        scores[f"{key_prefix}-P"] = sm.precision_score(y_true, y_pred, average=average)
        scores[f"{key_prefix}-R"] = sm.recall_score(y_true, y_pred, average=average)
        scores[f"{key_prefix}-F1"] = sm.f1_score(y_true, y_pred, average=average)

    scores["MCC"] = sm.matthews_corrcoef(y_true, y_pred)

    return scores

In [13]:
# train function
def _run_batch(model, batch, criterion, device):
    """Move one collated batch to `device`, run the model and compute the loss.

    Args:
        model: Called with input_ids / token_type_ids / attention_mask; its
            output must expose `.logits`.
        batch: Mapping with 'input_ids', 'token_type_ids', 'attention_mask'
            and 'labels' tensors.
        criterion: Loss applied to (flattened logits, float labels).
        device: Target device for the batch tensors.

    Returns:
        Tuple (logits, labels, loss); logits hold one value per example.
    """
    input_ids = batch['input_ids'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

    # flatten to one logit per example; the reshape fails unless the model
    # emits exactly one logit per example
    logits = outputs.logits
    logits = logits.reshape(logits.shape[0])

    loss = criterion(logits, labels.float())

    return logits, labels, loss

def train(model, dataloader, optimizer, criterion, device):
    """Train `model` for one pass over `dataloader`.

    After every batch the current batch loss and the running accuracy are
    written to stdout on a single, carriage-return-refreshed line.

    Args:
        model: Model to optimize (put into train mode here).
        dataloader: Yields collated training batches.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss function on (logits, float labels).
        device: Device the batches are moved to.

    Returns:
        Tuple (mean batch loss, accuracy over all seen examples).
    """
    model.train()
    train_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm(dataloader):
        # zero the gradients
        optimizer.zero_grad()

        # forward pass + loss
        logits, labels, loss = _run_batch(model, batch, criterion, device)

        # backward pass and weight update
        loss.backward()
        optimizer.step()

        # read the scalar loss once and reuse it for the sum and the log line
        batch_loss = loss.item()
        train_loss += batch_loss

        # threshold the sigmoid probability at 0.5 to get hard predictions
        predicted = (torch.sigmoid(logits) > 0.5).float()
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # print the loss and accuracy
        history = f"train loss: {batch_loss:.2f} | train accuracy: {(correct_predictions / total_predictions):.2f}"
        sys.stdout.write('\r' + history)

    # calculate epoch loss and accuracy
    train_loss /= len(dataloader)
    train_accuracy = correct_predictions / total_predictions

    return train_loss, train_accuracy

# test/validation function
def test(model, dataloader, criterion, device):
    """Evaluate `model` on `dataloader` without updating it.

    Args:
        model: Model to evaluate (put into eval mode here).
        dataloader: Yields collated evaluation batches.
        criterion: Loss function on (logits, float labels).
        device: Device the batches are moved to.

    Returns:
        Tuple (mean batch loss, dict produced by metrics()).
    """
    model.eval()
    test_loss = 0.0
    y_true = []
    y_pred = []

    with torch.inference_mode():
        for batch in tqdm(dataloader):
            # forward pass + loss
            logits, labels, loss = _run_batch(model, batch, criterion, device)

            # compute running loss and collect predictions
            test_loss += loss.item()
            predicted = (torch.sigmoid(logits) > 0.5).float()

            y_true.extend(labels.detach().cpu().tolist())
            y_pred.extend(predicted.detach().cpu().tolist())

    # calculate epoch loss and metrics
    test_loss /= len(dataloader)

    test_metrics = metrics(y_true, y_pred)

    return test_loss, test_metrics

In [14]:
# move model to device if GPU is available
model.to(device)

# define optimizer
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# define loss function (takes raw logits, matching the single-logit model head)
criterion = nn.BCEWithLogitsLoss()

# per-epoch history: one entry is appended to every list after each epoch
res = {"train_loss": [], "train_accuracy": [], "dev_loss": [], "dev_metrics": []}

# training loop
for epoch in range(NUM_EPOCHS):
    # train on a freshly sampled training set (rebuilt every epoch)
    train_dataloader = get_train_dataloader()
    train_loss, train_accuracy = train(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )

    # validation on the fixed development loader
    dev_loss, dev_metrics = test(
        model=model,
        dataloader=dev_dataloader,
        criterion=criterion,
        device=device,
    )

    # save results
    epoch_results = (
        ("train_loss", train_loss),
        ("train_accuracy", train_accuracy),
        ("dev_loss", dev_loss),
        ("dev_metrics", dev_metrics),
    )
    for key, value in epoch_results:
        res[key].append(value)

    # print result
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | train loss: {train_loss:.4f}, train acc: {train_accuracy:.4f} | dev loss: {dev_loss:.4f}, dev acc: {dev_metrics['Accuracy']:.4f}")

Map:   0%|          | 0/29941 [00:00<?, ? examples/s]

  0%|          | 0/936 [00:00<?, ?it/s]

train loss: 0.75 | train accuracy: 0.72

  0%|          | 0/187 [00:00<?, ?it/s]

Epoch 1/3 | train loss: 0.5159, train acc: 0.7169 | dev loss: 0.4843, dev acc: 0.7276


Map:   0%|          | 0/29941 [00:00<?, ? examples/s]

  0%|          | 0/936 [00:00<?, ?it/s]

train loss: 0.43 | train accuracy: 0.82

  0%|          | 0/187 [00:00<?, ?it/s]

Epoch 2/3 | train loss: 0.3707, train acc: 0.8192 | dev loss: 0.4611, dev acc: 0.7627


Map:   0%|          | 0/29941 [00:00<?, ? examples/s]

  0%|          | 0/936 [00:00<?, ?it/s]

train loss: 0.09 | train accuracy: 0.91

  0%|          | 0/187 [00:00<?, ?it/s]

Epoch 3/3 | train loss: 0.2199, train acc: 0.9066 | dev loss: 0.5111, dev acc: 0.7786


In [15]:
# inspect the per-epoch losses and development metrics collected by the training loop
res

{'train_loss': [0.5158769919767848, 0.3707482801256781, 0.21994948874299342],
 'train_accuracy': [0.7168765238301994,
  0.8191777161751445,
  0.9066497444975118],
 'dev_loss': [0.48432519673344926, 0.4611045286298436, 0.5110803079955718],
 'dev_metrics': [{'Accuracy': 0.7276069518716578,
   'Macro-P': 0.7736080191133062,
   'Macro-R': 0.728493464232992,
   'Macro-F1': 0.7161542443407607,
   'W Macro-P': 0.7741002116023116,
   'W Macro-R': 0.7276069518716578,
   'W Macro-F1': 0.7159065152261157,
   'MCC': 0.5000705715361624},
  {'Accuracy': 0.7627005347593583,
   'Macro-P': 0.7630871038326537,
   'Macro-R': 0.7627802828339698,
   'Macro-F1': 0.7626468443321868,
   'W Macro-P': 0.7631358941040405,
   'W Macro-R': 0.7627005347593583,
   'W Macro-F1': 0.7626313337643371,
   'MCC': 0.5258672971581888},
  {'Accuracy': 0.7785762032085561,
   'Macro-P': 0.7810409617900482,
   'Macro-R': 0.7783694402134966,
   'Macro-F1': 0.7779994759098715,
   'W Macro-P': 0.7809312293874476,
   'W Macro-R': 0

In [18]:
# persist the fine-tuned weights (state dict only) to Drive
torch.save(model.state_dict(), "/content/drive/MyDrive/nlu/result/model/model.pt")

# persist the per-epoch training / development history next to the model
with open("/content/drive/MyDrive/nlu/result/performance/result.pkl", "wb") as f:
    pickle.dump(res, f)

# Test