<a href="https://colab.research.google.com/github/btm1837/github_issue_tagger/blob/main/distilibert_collab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only step: attach Google Drive to this runtime. The dataset CSVs and
# the model output directory configured further down live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler, DataCollatorWithPadding
from datasets import Dataset, load_metric, ClassLabel

from tqdm.auto import tqdm

from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
import numpy as np
import pandas as pd
import os

# import libs

In [20]:
# !jupyter nbextension enable --py widgetsnbextension

# setup configuration

In [21]:
class Config:
    """Single place for every tunable value used by this notebook.

    All settings are class-level constants; the ``cfg`` instance created right
    after the class is what the other cells read from.
    """

    # flag to target on Gaudi (HPU); no cell visible in this notebook reads it
    TRAIN_ON_HPU = False

    # seed applied to the torch and numpy random number generators below
    SEED = 42

    # dataset: CSV files on the mounted Google Drive
    TRAIN_DS_PATH = '/content/drive/MyDrive/04_projects/unbalanced_train.csv'
    EVAL_DS_PATH = '/content/drive/MyDrive/04_projects/unbalanced_eval.csv'

    # checkpoint used in preprocessing and modelling
    CHECKPOINT = 'distilbert-base-uncased'

    # HF params: passed to the tokenizer as max_length (with truncation enabled)
    MAX_SEQ_LENGTH = 256

    # training hyperparams
    EPOCHS = 5
    LR = 3e-5
    BATCH_SIZE = 16

    # saved model path
    MODEL_DIR = '/content/drive/MyDrive/04_projects/distilbert_uncased_github_issue_tagger/'


cfg = Config()

# Seed the RNGs once, before the classification head is initialised and before
# the shuffling DataLoader is iterated, so that Restart & Run All starts from
# the same random state every time (GPU kernels may still add some noise).
torch.manual_seed(cfg.SEED)
np.random.seed(cfg.SEED)

# load the datasets

In [29]:
# Read each CSV split with pandas, then wrap the frames as Hugging Face Datasets.
train_df = pd.read_csv(cfg.TRAIN_DS_PATH)
eval_df = pd.read_csv(cfg.EVAL_DS_PATH)

train_ds = Dataset.from_pandas(train_df)
eval_ds = Dataset.from_pandas(eval_df)

In [30]:
# Display the training split's summary (column names and row count).
train_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 20603
})

In [31]:
# Display the evaluation split's summary (column names and row count).
eval_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 8831
})

# additional preprocessing

In [32]:
# Tokenizer belonging to the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(cfg.CHECKPOINT)

# Padding collator built on that tokenizer; the DataLoaders use it as collate_fn.
data_collator = DataCollatorWithPadding(tokenizer)

# tokenizer helper function
def tokenize(batch):
    """Tokenize the ``descriptions`` field of a batch.

    The texts are passed to the notebook-level ``tokenizer`` with truncation
    enabled and ``cfg.MAX_SEQ_LENGTH`` as the maximum length; whatever the
    tokenizer returns is handed back unchanged.
    """
    texts = batch['descriptions']
    encoded = tokenizer(texts, truncation=True, max_length=cfg.MAX_SEQ_LENGTH)
    return encoded

In [33]:
# tokenize dataset: run the tokenize helper over both splits, batch by batch
train_ds, eval_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, eval_ds)
)

# show the schema after tokenization
train_ds.features

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'descriptions': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': Value(dtype='string', id=None)}

In [34]:
## convert dataset labels from str to ClassLabel
# Class names come from the training split, in the order `unique` returns them;
# that order defines the integer id of each class.
label_names = train_ds.unique("labels")
label_feature = ClassLabel(names=label_names)


def encode_labels(batch):
    """Replace the string labels of a batch with their ClassLabel integer ids."""
    return {"labels": label_feature.str2int(batch["labels"])}


# Update default features
train_features = train_ds.features
train_features["labels"] = label_feature
eval_features = eval_ds.features
eval_features["labels"] = label_feature

# Update dataset (batched: str2int converts a whole list of labels per call
# instead of being invoked once per row)
train_ds = train_ds.map(encode_labels, batched=True, features=train_features)
eval_ds = eval_ds.map(encode_labels, batched=True, features=eval_features)

# look at example for validation
train_ds.features

  0%|          | 0/20603 [00:00<?, ?ex/s]

  0%|          | 0/8831 [00:00<?, ?ex/s]

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'descriptions': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': ClassLabel(num_classes=3, names=['enhancement', 'bug', 'question'], id=None)}

In [35]:
# remove redundant column and have both splits return torch tensors
train_ds = train_ds.remove_columns(['descriptions']).with_format("torch")
eval_ds = eval_ds.remove_columns(['descriptions']).with_format("torch")

In [36]:
# Both loaders share the batch size and the padding collator; only the
# training loader shuffles its examples.
loader_kwargs = dict(batch_size=cfg.BATCH_SIZE, collate_fn=data_collator)
train_dataloader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_ds, **loader_kwargs)

# Train and evaluate model

In [37]:
# setup  metrics
# NOTE(review): `load_metric` is deprecated upstream in favour of the separate
# `evaluate` package; kept here because that package is not a dependency of
# this notebook.
metric = load_metric("accuracy")
labels = train_ds.features["labels"].names
num_labels = len(labels)

# The model config expects label2id as str -> int and id2label as int -> str,
# so the ids are kept as integers rather than being stringified.
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [38]:
# download model from model hub, configured with this task's label set
classification_head = dict(
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
model = AutoModelForSequenceClassification.from_pretrained(cfg.CHECKPOINT, **classification_head)

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [39]:
# optimizer
optimizer = AdamW(params=model.parameters(), lr=cfg.LR)

# lr scheduler: "linear" schedule without warm-up, spanning every optimizer
# step of the run (one step per training batch)
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * cfg.EPOCHS
scheduler_kwargs = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": num_training_steps,
}
lr_scheduler = get_scheduler("linear", **scheduler_kwargs)

print(f"Training steps: {num_training_steps}")

Training steps: 6440


In [40]:
# Use the GPU when torch can see one, otherwise stay on the CPU, and move the
# model there before training starts.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = model.to(device)
print(device)

cuda


# train and eval helper functions

In [41]:
# train & eval helpers
def train_epoch(train_dataloader, model, optimizer, lr_scheduler, global_progress_bar=None, log_every=100):
    """Run one pass over ``train_dataloader``, updating ``model`` in place.

    Args:
        train_dataloader: sized iterable of batches; each batch is a dict of
            tensors that is moved to ``device`` and unpacked into ``model(**batch)``.
        model: module whose forward output exposes the training loss as ``.loss``.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, right after the optimizer.
        global_progress_bar: optional progress bar; ``update(1)`` is called
            after every batch when one is given.
        log_every: print the current batch loss every ``log_every`` batches
            (a falsy value disables the printout).

    Returns:
        None.

    Note:
        Reads the notebook-level ``device`` to decide where batches are placed.
    """
    model.train()
    size = len(train_dataloader)
    for batch_idx, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward propagation
        outputs = model(**batch)
        loss = outputs.loss

        # backpropagation (clear gradients left over from the previous step first)
        optimizer.zero_grad()
        loss.backward()

        # step grad and lr
        optimizer.step()
        lr_scheduler.step()

        # print out training progress every `log_every` steps
        if log_every and batch_idx % log_every == 0:
            print(f"loss: {loss.item():>7f}  [{batch_idx:>5d}/{size:>5d}]")

        # update global bar progress; compare against None because the
        # truthiness of a tqdm bar depends on its total and may even raise
        if global_progress_bar is not None:
            global_progress_bar.update(1)

def eval_epoch(eval_dataloader, model):
    """Evaluate ``model`` on every batch of ``eval_dataloader``.

    Each batch (a dict of tensors that includes ``"labels"``) is moved to
    ``device`` and run through the model without gradient tracking. The
    arg-max of the logits is taken as the prediction and accumulated, together
    with the reference labels, into the notebook-level ``metric``.

    Args:
        eval_dataloader: iterable of batches to evaluate on.
        model: module whose forward output exposes ``.logits``.

    Returns:
        Whatever ``metric.compute()`` returns for the accumulated batches.
        The same value is also printed, so callers that ignore the return
        value see the same output as before.

    Note:
        Reads the notebook-level ``device`` and ``metric``.
    """
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # post-process pred: highest-scoring class per example
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # accumulate all batches' metrics
        metric.add_batch(predictions=predictions, references=batch["labels"])

    scores = metric.compute()
    print(scores)
    return scores

# training and evaluation looping

In [42]:
# training and eval loops
# One shared bar counts every optimizer step across all epochs.
global_progress_bar = tqdm(range(num_training_steps))

for epoch_number in range(1, cfg.EPOCHS + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train_epoch(
        train_dataloader,
        model,
        optimizer,
        lr_scheduler,
        global_progress_bar=global_progress_bar,
    )

    print("Evaluation:")
    eval_epoch(eval_dataloader, model)
    print("\n")

  0%|          | 0/6440 [00:00<?, ?it/s]

Epoch 1
-------------------------------
loss: 1.106176  [    0/ 1288]
loss: 0.639922  [  100/ 1288]
loss: 0.851625  [  200/ 1288]
loss: 0.680983  [  300/ 1288]
loss: 0.763715  [  400/ 1288]
loss: 0.582402  [  500/ 1288]
loss: 0.459204  [  600/ 1288]
loss: 0.684624  [  700/ 1288]
loss: 0.418438  [  800/ 1288]
loss: 0.430136  [  900/ 1288]
loss: 0.224446  [ 1000/ 1288]
loss: 0.565316  [ 1100/ 1288]
loss: 0.272857  [ 1200/ 1288]
Evaluation:
{'accuracy': 0.7863209149586683}


Epoch 2
-------------------------------
loss: 0.391043  [    0/ 1288]
loss: 0.261442  [  100/ 1288]
loss: 0.211126  [  200/ 1288]
loss: 0.514121  [  300/ 1288]
loss: 0.422373  [  400/ 1288]
loss: 0.422729  [  500/ 1288]
loss: 0.139743  [  600/ 1288]
loss: 0.413406  [  700/ 1288]
loss: 0.479091  [  800/ 1288]
loss: 0.718650  [  900/ 1288]
loss: 0.381761  [ 1000/ 1288]
loss: 0.294562  [ 1100/ 1288]
loss: 0.602771  [ 1200/ 1288]
Evaluation:
{'accuracy': 0.7843958781564941}


Epoch 3
-------------------------------
loss: 

In [43]:
# Persist the fine-tuned model and its tokenizer side by side in MODEL_DIR.
# A failure is reported (exception text plus a short notice) instead of raised.
try:
    model.save_pretrained(cfg.MODEL_DIR)
    tokenizer.save_pretrained(cfg.MODEL_DIR)
except Exception as save_error:
    print(save_error)
    print("Saving model failed")
else:
    print(f"Trained model and its tokenizer are saved to {cfg.MODEL_DIR}")
            

Trained model and its tokenizer are saved to /content/drive/MyDrive/04_projects/distilbert_uncased_github_issue_tagger/
