# import libs

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler, DataCollatorWithPadding
from datasets import Dataset, load_metric, ClassLabel

from tqdm.auto import tqdm

from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
import numpy as np
import pandas as pd
import os

In [3]:
# !jupyter nbextension enable --py widgetsnbextension

# setup configuration

In [2]:
class Config:
    """Central place for every tunable value used by the notebook cells."""

    # ---- hardware ----
    # Gaudi (HPU) switch; no visible cell reads this flag yet
    TRAIN_ON_HPU = False

    # ---- data ----
    # CSV files holding the training and evaluation splits
    TRAIN_DS_PATH = '../dataset/unbalanced_train.csv'
    EVAL_DS_PATH = '../dataset/unbalanced_eval.csv'

    # ---- pretrained checkpoint ----
    # the same name is used to load both the tokenizer and the model
    CHECKPOINT = 'distilbert-base-uncased'

    # ---- tokenization ----
    # passed as max_length when truncating tokenized descriptions
    MAX_SEQ_LENGTH = 256

    # ---- optimisation ----
    EPOCHS = 5
    LR = 3e-5
    BATCH_SIZE = 16

    # ---- output ----
    # directory the trained model and tokenizer are written to
    MODEL_DIR = './model/distilbert_uncased_github_issue_tagger/'


# single shared settings object referenced by the cells below
cfg = Config()

# load the datasets

In [3]:
# read each CSV split with pandas and wrap the frame in a datasets.Dataset
train_ds, eval_ds = [
    Dataset.from_pandas(pd.read_csv(csv_path))
    for csv_path in (cfg.TRAIN_DS_PATH, cfg.EVAL_DS_PATH)
]

In [4]:
train_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 20999
})

In [5]:
eval_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 9000
})

# additional preprocessing

In [6]:
# tokenizer that matches the configured checkpoint
tokenizer = AutoTokenizer.from_pretrained(cfg.CHECKPOINT)

# collator handed to the DataLoaders as collate_fn; it is built around the same tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize(batch):
    """Tokenize the ``descriptions`` field of ``batch``.

    Truncation is enabled with ``cfg.MAX_SEQ_LENGTH`` as the maximum length.
    Returns whatever the tokenizer call returns for that input.
    """
    texts = batch['descriptions']
    return tokenizer(texts, max_length=cfg.MAX_SEQ_LENGTH, truncation=True)

In [7]:
# run the tokenize helper over both splits in batched mode
train_ds, eval_ds = [
    split.map(tokenize, batched=True)
    for split in (train_ds, eval_ds)
]


100%|██████████| 21/21 [00:02<00:00,  7.03ba/s]
100%|██████████| 9/9 [00:01<00:00,  6.38ba/s]


In [8]:
## convert dataset labels from str to ClassLabel
# sorted() makes the label -> id assignment independent of the row order in the CSV
label_names = sorted(train_ds.unique("labels"))
label_feature = ClassLabel(names=label_names)


def encode_labels(ds):
    """Return ``ds`` with its string ``labels`` column re-encoded as ``label_feature`` ids.

    The schema passed to ``map`` is a copy of ``ds.features`` with only the
    ``labels`` entry replaced, so the input dataset's own schema is left untouched.
    """
    features = ds.features.copy()
    features["labels"] = label_feature
    return ds.map(
        lambda x: {"labels": label_feature.str2int(x["labels"])},
        features=features,
    )


# Update dataset (both splits share the label vocabulary derived from the training split)
train_ds = encode_labels(train_ds)
eval_ds = encode_labels(eval_ds)

# look at example for validation
train_ds.features

20999ex [00:03, 5252.18ex/s]
9000ex [00:01, 5416.76ex/s]


{'descriptions': Value(dtype='string', id=None),
 'labels': ClassLabel(num_classes=3, names=['__label__bug', '__label__enhancement', '__label__question'], names_file=None, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [9]:
# the raw text column is no longer needed once it has been tokenized;
# after dropping it, switch each split to the "torch" output format

# training split
train_ds = train_ds.remove_columns(['descriptions'])
train_ds.set_format("torch")

# evaluation split
eval_ds = eval_ds.remove_columns(['descriptions'])
eval_ds.set_format("torch")

In [10]:
# only the training split is shuffled; both loaders share batch size and collator
train_dataloader = DataLoader(
    train_ds,
    collate_fn=data_collator,
    batch_size=cfg.BATCH_SIZE,
    shuffle=True,
)
eval_dataloader = DataLoader(
    eval_ds,
    collate_fn=data_collator,
    batch_size=cfg.BATCH_SIZE,
)

# Train and evaluate model

In [11]:
# setup metrics
metric = load_metric("accuracy")

# class names in ClassLabel order: position in this list == integer class id
labels = train_ds.features["labels"].names
num_labels = len(labels)

# label maps for the pretrained model's config; ids are kept as ints
# (id2label: int -> str, label2id: str -> int)
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [12]:
# load the checkpoint with a sequence-classification head sized to our label set;
# the label maps are passed along with the class count
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.CHECKPOINT,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

Downloading: 100%|██████████| 268M/268M [00:04<00:00, 55.1MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly init

In [13]:
# total number of optimizer updates: batches per epoch times number of epochs
num_training_steps = len(train_dataloader) * cfg.EPOCHS

# AdamW over all model parameters with the configured learning rate
optimizer = AdamW(model.parameters(), lr=cfg.LR)

# "linear" schedule with no warmup, spanning the whole run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

print(f"Training steps: {num_training_steps}")

Training steps: 6565


In [15]:
# use CUDA when torch reports it as available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# place the model on the selected device
model = model.to(device)

cpu


# train and eval helper functions

In [16]:
# train & eval helpers
def train_epoch(train_dataloader, model, optimizer, lr_scheduler, global_progress_bar=None,
                device=None, log_every=100):
    """Run one training pass over ``train_dataloader``.

    Args:
        train_dataloader: sized iterable of batches; each batch is a mapping whose
            values expose ``.to(device)`` and whose items are passed to the model
            as keyword arguments.
        model: module whose forward call returns an object with a ``.loss`` attribute.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, right after the optimizer.
        global_progress_bar: optional object with ``update(n)``; advanced by one
            per batch when provided.
        device: device the batch values are moved to. When omitted, the device of
            the model's first parameter is used (CPU if the model has no parameters).
        log_every: print the current loss every ``log_every`` batches; a falsy
            value disables the printout.

    Returns:
        None. The model, optimizer and scheduler are updated in place.
    """
    if device is None:
        # follow the model instead of relying on a notebook-level ``device`` variable
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    size = len(train_dataloader)
    for batch_idx, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward propagation
        outputs = model(**batch)
        loss = outputs.loss

        # backpropagation (gradients are cleared first so they do not accumulate)
        optimizer.zero_grad()
        loss.backward()

        # step grad and lr
        optimizer.step()
        lr_scheduler.step()

        # print out training progress every `log_every` steps
        if log_every and batch_idx % log_every == 0:
            print(f"loss: {loss.item():>7f}  [{batch_idx:>5d}/{size:>5d}]")

        # update global progress bar; compare against None because a tqdm bar's
        # truthiness depends on its total/iterable rather than on its presence
        if global_progress_bar is not None:
            global_progress_bar.update(1)

def eval_epoch(eval_dataloader, model, device=None, eval_metric=None):
    """Evaluate ``model`` on every batch of ``eval_dataloader``.

    Args:
        eval_dataloader: iterable of batches; each batch is a mapping that contains
            a ``"labels"`` entry and whose values expose ``.to(device)``.
        model: module whose forward call returns an object with a ``.logits`` attribute.
        device: device the batch values are moved to. When omitted, the device of
            the model's first parameter is used (CPU if the model has no parameters).
        eval_metric: object with ``add_batch(predictions=..., references=...)`` and
            ``compute()``. When omitted, the notebook-level ``metric`` is used.

    Returns:
        The value returned by ``compute()`` on the metric; it is also printed.
    """
    if eval_metric is None:
        eval_metric = metric
    if device is None:
        # follow the model instead of relying on a notebook-level ``device`` variable
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # no gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(**batch)

        # post-process pred: predicted class = index of the largest logit
        predictions = torch.argmax(outputs.logits, dim=-1)

        # accumulate all batches' metrics
        eval_metric.add_batch(predictions=predictions, references=batch["labels"])

    results = eval_metric.compute()
    print(results)
    return results

# training and evaluation loop

In [17]:
# a single progress bar spans every training step of every epoch
global_progress_bar = tqdm(range(num_training_steps))

# each epoch: one training pass, then one evaluation pass
for epoch in range(cfg.EPOCHS):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_epoch(
        train_dataloader=train_dataloader,
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        global_progress_bar=global_progress_bar,
    )

    print("Evaluation:")
    eval_epoch(eval_dataloader=eval_dataloader, model=model)
    print("\n")

  0%|          | 0/6565 [00:00<?, ?it/s]

Epoch 1
-------------------------------


  0%|          | 1/6565 [00:12<23:05:33, 12.67s/it]

loss: 1.077723  [    0/ 1313]


  1%|          | 70/6565 [14:24<22:50:06, 12.66s/it]

In [None]:
# best-effort save: a failure is reported but does not raise out of the cell
try:
    # model first, then tokenizer, both into the same directory
    for artifact in (model, tokenizer):
        artifact.save_pretrained(cfg.MODEL_DIR)
    print(f"Trained model and its tokenizer are saved to {cfg.MODEL_DIR}")
except Exception as err:
    print(err)
    print("Saving model failed")
            