In [None]:
# some auxiliary code from https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

In [1]:
import os
import torch
import torch.nn as nn
import pandas as pd
from collections import defaultdict
from transformers import (
    DistilBertTokenizer,
    DataCollatorWithPadding,
    DistilBertModel,
    get_linear_schedule_with_warmup
)
from load_data import create_data_loader
from train_eval import train_epoch, eval_model
from sklearn.model_selection import train_test_split
from config import (
    DIR_, EPOCHS, PRE_TRAINED_MODEL_NAME, RANDOM_STATE,
    TEST_SIZE, DATA_PATH, TARGET_COL, TEXT_COL, LR
    )

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make sure the DIR_ directory (from config) exists. makedirs with
# exist_ok=True also creates missing parent directories and avoids the
# check-then-create race of an os.path.exists() guard.
os.makedirs(DIR_, exist_ok=True)

In [3]:
# Read the dataset, keeping only the text column and the target column
# (in that order), then build the tokenizer and the padding collator.
selected_columns = [TEXT_COL, TARGET_COL]
df = pd.read_csv(DATA_PATH)[selected_columns]

tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
# Split the frame in two stages: first hold out TEST_SIZE of the rows, then
# split that hold-out again into the test and validation sets.
tr_data, holdout_data = train_test_split(
    df, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
# NOTE(review): the validation share is TEST_SIZE * 0.5 *of the hold-out*, not
# half of it -- confirm this is the intended proportion.
te_data, val_data = train_test_split(
    holdout_data, test_size=TEST_SIZE * 0.5, random_state=RANDOM_STATE
)

# One data loader per split, built in train / test / validation order.
tr_data_loader = create_data_loader(tr_data, tokenizer)
te_data_loader = create_data_loader(te_data, tokenizer)
val_data_loader = create_data_loader(val_data, tokenizer)

In [6]:
# using this as a starting point:
# https://www.kaggle.com/code/samson22/distilbert-in-pytorch
class Classifier(nn.Module):
    """DistilBERT encoder topped with a small feed-forward classification head.

    The head reads the encoder hidden state at sequence position 0, applies a
    linear layer followed by ReLU and dropout, and projects the result to
    ``config.num_labels`` logits. Layer sizes and the dropout rate are taken
    from the pretrained model's config.
    """

    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        encoder_config = self.model.config
        width = encoder_config.hidden_size
        # Sub-modules are registered in the same order as before so that
        # parameter ordering and state_dict keys stay unchanged.
        self.pre_classifier = nn.Linear(width, width)
        self.classifier = nn.Linear(width, encoder_config.num_labels)
        self.dropout = nn.Dropout(encoder_config.dropout)

    def forward(self, input_ids=None, attention_mask=None):
        """Run the encoder and the head; return the raw (unnormalised) logits.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            The output of the final linear layer (no softmax applied).
        """
        encoder_outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Element 0 of the encoder output holds the hidden states; index 0
        # along the second axis selects the first position of every sequence
        # (presumably the [CLS] token -- confirm against the tokenizer).
        first_position = encoder_outputs[0][:, 0]
        features = torch.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(features))

In [7]:
# Build the classifier and move it to the selected device.
model = Classifier().to(device)

# AdamW over every model parameter (encoder and head), learning rate from config.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# Linear learning-rate decay with no warmup over the whole run. The step count
# is batches-per-epoch times epochs, which assumes train_epoch steps the
# scheduler once per batch -- TODO confirm in train_eval.
total_steps = EPOCHS * len(tr_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

criterion = nn.CrossEntropyLoss().to(device)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Show which device (cuda or cpu) was selected for this run.
print(device)

cpu


In [None]:
%%time

# Per-epoch metrics keyed by name: train_acc, train_loss, val_acc, val_loss.
history = defaultdict(list)
# Highest validation accuracy seen so far; only ever raised on improvement.
best_accuracy = 0

for e in range(EPOCHS):
    print(f"Epoch {e+1}/{EPOCHS}")
    print('-'*10)

    # One pass over the training loader; the last argument is the number of
    # training examples.
    train_acc, train_loss = train_epoch(
        model, tr_data_loader, criterion, optimizer, device, scheduler, len(tr_data)
    )

    print(f"Train Loss: {train_loss} // Train Accuracy: {train_acc}")

    # Evaluate on the validation split after every epoch.
    val_acc, val_loss = eval_model(
        model, val_data_loader, criterion, device, len(val_data)
    )

    print(f"Val Loss: {val_loss} // Val Accuracy: {val_acc}")

    # Record the metrics so the learning curves can be inspected afterwards.
    history["train_acc"].append(train_acc)
    history["train_loss"].append(train_loss)
    history["val_acc"].append(val_acc)
    history["val_loss"].append(val_loss)

    # Checkpoint only when validation accuracy improves, and update the best
    # score inside the same branch. Updating it unconditionally would let a
    # later, worse epoch overwrite the checkpoint of a better one.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), "best_model_state.bin")
        best_accuracy = val_acc