### Finetune BERT for text classification using pytorch and huggingface

In [1]:
import os
import math
from functools import partial

import numpy as np
import pandas as pd
import torch
import re
import nltk
import torch.nn as nn
import pytorch_lightning as pl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import DataCollatorWithPadding, AdamW, get_scheduler, set_seed, get_linear_schedule_with_warmup
from datasets import load_metric, Dataset
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Configuration for training

In [3]:
class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:
    MODEL_SAVE_DIR="./hf_results/"
    MAX_LENGTH=512
    GRADIENT_ACCUMULATION_STEPS = 1
    TWEET_COL = "processed_text"
    RANDOM_STATE = 42
    BATCH_SIZE = 16
    OUT_SIZE = 2
    NUM_FOLDS = 5
    NUM_EPOCHS = 3
    NUM_WORKERS = 8
    TRANSFORMER_CHECKPOINT = "bert-base-uncased"
    # The hidden_size of the output of the last layer of the transformer model used
    TRANSFORMER_OUT_SIZE = 768
    PAD_TOKEN_ID = 0
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy
    FAST_DEV_RUN = False    
    PATIENCE = 5        
    # model hyperparameters
    MODEL_HPARAMS = {
        "learning_rate": 2e-5,
        "adam_epsilon": 1e-8,
        "weight_decay": 0.0,
        "warmup_steps": 0
    }

DATA_PATH = "./data/"

# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
set_seed(Config.RANDOM_STATE)

### Load the data

In [4]:
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


### K Fold CV
Split the training dataframe into kfolds for cross validation. We do this before any processing is done
on the data. We use stratified kfold if the target distribution is unbalanced

In [5]:
def strat_kfold_dataframe(df, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df["target"].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

df_train = strat_kfold_dataframe(df_train, num_folds=Config.NUM_FOLDS)            

### Tweet preprocessing

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text

def process_tweet(df, text, keyword):
    lemmatizer = WordNetLemmatizer()    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)    
    processed_text = []
    stop = stopwords.words("english")
    for tweet, keyword in zip(df[text], df[keyword]):
        tweets_clean = []        
        # remove stock market tickers like $GE        
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'http\S+', '', tweet)
        # remove hashtags
        # only removing the hash #, @, ... sign from the word
        tweet = re.sub(r'\.{3}|@|#', '', tweet)    
        tweet = clean_special_chars(tweet, punct)
        # remove junk characters which don't have an ascii code
        tweet = tweet.encode("ascii", "ignore").decode("utf-8", "ignore")
        # tokenize tweets        
        tweet_tokens = tokenizer.tokenize(tweet)
        for word in tweet_tokens:
            # remove stopwords and punctuation
            #if (word.isalpha() and len(word) > 2 and word not in stop and word not in string.punctuation):
                #stem_word = stemmer.stem(word)  # stemming word            
                #lem_word = lemmatizer.lemmatize(word)
                #tweets_clean.append(lem_word) 
                tweets_clean.append(word)
        processed_text.append(" ".join(tweets_clean))        
    df['processed_text'] = np.array(processed_text)

In [8]:
# Fill in missing values
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
process_tweet(df_train, 'text', "keyword")
process_tweet(df_test, 'text', "keyword")
# length of the processed tweet
df_train["prcsd_tweet_len"] = df_train["processed_text"].apply(lambda row: len(row.split()))
df_test["prcsd_tweet_len"] = df_test["processed_text"].apply(lambda row: len(row.split()))
df_train.iloc[50:52, :]

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len
50,73,ablaze,"Sheffield Township, Ohio",Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k,1,2,deputies man shot before brighton home set ablaze,8
51,74,ablaze,India,Man wife get six years jail for setting ablaze niece\nhttp://t.co/eV1ahOUCZA,1,0,man wife get six years jail for setting ablaze niece,10


## Dataset for transformer model
We use hugging face Dataset library to create a custom dataset from pandas dataframe

In [9]:
tokenizer = BertTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Process each row of the huggingface Dataset 
def tokenize_tweets(tokenizer, with_labels, row):
    result = tokenizer(row[Config.TWEET_COL], padding=False, truncation=True)
    if with_labels:
        result["labels"] = row["target"]
    return result

preprocess_train_data = partial(tokenize_tweets, tokenizer, True)  
preprocess_test_data = partial(tokenize_tweets, tokenizer, False)  

### Get train and validation dataset and dataloaders for a fold

In [11]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)
    raw_ds_col_names = ds_train_raw.column_names    
    ds_train = ds_train_raw.map(preprocess_train_data, batched=True, remove_columns=raw_ds_col_names)
    ds_valid = ds_valid_raw.map(preprocess_train_data, batched=True, remove_columns=raw_ds_col_names)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=data_collator, num_workers=Config.NUM_WORKERS)
    dl_valid = DataLoader(ds_valid, batch_size=Config.BATCH_SIZE, collate_fn=data_collator, num_workers=Config.NUM_WORKERS)
    return dl_train, dl_valid, ds_train, ds_valid

In [12]:
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
def get_optimizer(model):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": Config.MODEL_HPARAMS["weight_decay"],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    return AdamW(optimizer_grouped_parameters, lr=Config.MODEL_HPARAMS["learning_rate"])

In [13]:
# lr_scheduler = get_scheduler(
#         name="linear",
#         optimizer=optimizer,
#         num_warmup_steps=Config.MODEL_HPARAMS["warmup_steps"],
#         num_training_steps=num_train_steps
#     )

def get_lr_scheduler(optimizer, dl_train):
    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(dl_train) / Config.GRADIENT_ACCUMULATION_STEPS)
    num_train_steps = Config.NUM_EPOCHS * num_update_steps_per_epoch
    print(f"num_update_steps_per_epoch = {num_update_steps_per_epoch}")
    print(f"num_train_steps = {num_train_steps}")
    lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=Config.MODEL_HPARAMS["warmup_steps"],
            num_training_steps=num_train_steps,
        )
    return lr_scheduler        

### The training function

In [14]:
def train_fn(epoch, model, optimizer, lr_scheduler, train_dataloader):
    progress_bar = tqdm(range(len(train_dataloader)))
    train_loss_epoch = []
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(Config.DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss      
        train_loss_epoch.append(loss.item())  
        loss.backward()        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)        
    train_loss_mean = np.mean(train_loss_epoch)
    return train_loss_mean    

### The validation function 

In [15]:
def eval_fn(epoch, model, val_dataloader, val_metric):
    progress_bar = tqdm(range(len(val_dataloader)))
    val_loss_epoch = []
    model.eval()    
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):    
            batch = {k: v.to(Config.DEVICE) for k, v in batch.items()}           
            outputs = model(**batch)
            val_loss_epoch.append(outputs.loss.item())
            predictions = outputs.logits.argmax(dim=-1) 
            val_metric.add_batch(predictions=predictions, references=batch["labels"])
            progress_bar.update(1)

    val_metric_epoch = val_metric.compute()
    val_metric_epoch = round(val_metric_epoch['accuracy'], 4)
    val_loss_mean = np.mean(val_loss_epoch)    
    return val_loss_mean, val_metric_epoch

### The training loop

In [16]:
def fold_train_evaluate(fold):    
    fold_best_model_path = ""
    fold_str = f"fold_{fold}"
    print(f"Running fold {fold}")
    # we use validation loss as the criteria to save best model for a CV fold
    fold_val_loss_min = np.inf
    dl_train, dl_valid, ds_train, ds_valid = get_fold_dls(fold, df_train)
    print(f"Created data loaders for {fold_str}")
    config = BertConfig.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.OUT_SIZE)
    model = BertForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT,config=config)
    model.to(Config.DEVICE)  
    optimizer = get_optimizer(model)
    lr_scheduler = get_lr_scheduler(optimizer, dl_train)
    val_metric = load_metric(Config.MODEL_EVAL_METRIC)    
    for epoch in range(Config.NUM_EPOCHS):
        print(f"Running training for epoch {epoch+1}")
        epoch_train_loss = train_fn(epoch+1, model, optimizer, lr_scheduler, dl_train)        
        print(f"Running validation for epoch {epoch+1}")
        epoch_val_loss, epoch_val_metric = eval_fn(epoch+1, model, dl_valid, val_metric)
        print(f"EPOCH {epoch+1}: ")
        print(f"train_loss: {round(epoch_train_loss, 4)}")            
        print(f"val_loss: {round(epoch_val_loss, 4)}")
        print(f"{Config.MODEL_EVAL_METRIC}: {epoch_val_metric}")
        if epoch_val_loss < fold_val_loss_min:
            print(f"Validation loss decreased from " +
                  f"{round(fold_val_loss_min, 6)} --> {round(epoch_val_loss, 6)}. Saving model...")
            fold_best_model_path = Config.MODEL_SAVE_DIR + fold_str                  
            model.save_pretrained(fold_best_model_path)
            fold_val_loss_min = epoch_val_loss            

    del optimizer, lr_scheduler, model   
    return fold_val_loss_min, fold_best_model_path      

In [17]:
fold_results = []
for fold in range(Config.NUM_FOLDS):
    fold_val_loss_min, fold_best_model_path = fold_train_evaluate(fold)
    fold_results.append((fold_val_loss_min, fold_best_model_path))
    break

Running fold 0


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Created data loaders for fold_0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

num_update_steps_per_epoch = 381
num_train_steps = 1143
Running training for epoch 1


  0%|          | 0/381 [00:00<?, ?it/s]

Running validation for epoch 1


  0%|          | 0/96 [00:00<?, ?it/s]

EPOCH 1: 
train_loss: 0.4456
val_loss: 0.3691
accuracy: 0.8345
Validation loss decreased from inf --> 0.369067. Saving model...
Running training for epoch 2


  0%|          | 0/381 [00:00<?, ?it/s]

Running validation for epoch 2


  0%|          | 0/96 [00:00<?, ?it/s]

EPOCH 2: 
train_loss: 0.3114
val_loss: 0.4295
accuracy: 0.8188
Running training for epoch 3


  0%|          | 0/381 [00:00<?, ?it/s]

Running validation for epoch 3


  0%|          | 0/96 [00:00<?, ?it/s]

EPOCH 3: 
train_loss: 0.2266
val_loss: 0.4414
accuracy: 0.828


### Inference on the test set using best model

In [20]:
# Load the best performing model 
fold_results_sorted = sorted(fold_results, key=lambda x:x[0])
best_model_across_folds = fold_results_sorted[0][1]
best_model = BertForSequenceClassification.from_pretrained(best_model_across_folds)

In [21]:
# Create data loader for test data
ds_test_raw = Dataset.from_pandas(df_test)
ds_test_raw_col_names = ds_test_raw.column_names
ds_test = ds_test_raw.map(preprocess_test_data, batched=True, remove_columns=ds_test_raw_col_names)
dl_test = DataLoader(ds_test, batch_size=Config.BATCH_SIZE, collate_fn=data_collator, num_workers=Config.NUM_WORKERS)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [24]:
# perform predictions on test data
test_preds = []
best_model.to(Config.DEVICE)
with torch.no_grad():
    for step, batch in tqdm(enumerate(dl_test)):
        batch = {k: v.to(Config.DEVICE) for k, v in batch.items()}           
        outputs = best_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)         
        test_preds.extend(predictions.cpu().detach().numpy().tolist())

print(f"Completed prediction for {len(test_preds)} test rows")

3263


In [25]:
# Create the submission file
df_submission = pd.read_csv(DATA_PATH + 'submission.csv')
df_submission['target']= test_preds
df_submission.to_csv('my_submission.csv',index=False)