In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install -qq transformers

In [None]:
# Clone the checkpoint only when the directory is missing, so re-running this cell does not fail.
![ -d prunebert-base-uncased-6-finepruned-w-distil-squad ] || git clone https://huggingface.co/huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad

Cloning into 'prunebert-base-uncased-6-finepruned-w-distil-squad'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 24 (delta 7), reused 0 (delta 0)[K
Unpacking objects: 100% (24/24), done.


In [None]:
import transformers

# Maximum number of tokens per encoded log entry.
MAX_LEN = 512

# Batch sizes are kept small because the model is large.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

# Upper bound on the number of training epochs.
EPOCHS = 10

# Local directory of the cloned checkpoint; the tokenizer files are read from here.
BERT_PATH = "prunebert-base-uncased-6-finepruned-w-distil-squad/"
# File the best model weights are written to.
MODEL_PATH = "prune_model.bin"
# CSV file holding the training data.
TRAINING_FILE = "augment_daily.csv"

# Shared Hugging Face tokenizer, loaded from BERT_PATH with lower-casing enabled.
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

In [None]:
#dataset.py
import torch

class BertDataset:
  """
  Map-style dataset that tokenizes log texts for a BERT classifier.

  Every item is a dict of ``torch.long`` tensors with the keys
  ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
  """

  def __init__(self,log,target):
    """
    :param log: indexable collection of log texts
    :param target: indexable collection of labels aligned with ``log``
    """
    self.log = log
    self.target = target
    self.tokenizer = TOKENIZER
    self.max_len = MAX_LEN

  def __len__(self):
    """Return the number of log entries."""
    return len(self.log)

  def __getitem__(self, item):
    """
    Tokenize the log at position ``item`` and return it with its target.
    :param item: index of the sample
    :return: dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets``
    """
    #collapse every run of whitespace into a single space
    log = " ".join(str(self.log[item]).split())

    #encode_plus converts the string to token ids, attention mask and
    #token type ids. Padding and truncation are requested explicitly:
    #`pad_to_max_length` is deprecated and relying on implicit truncation
    #makes the tokenizer emit a warning for every sample
    inputs = self.tokenizer.encode_plus(
        log,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        padding="max_length",
        truncation=True,
    )

    return {
        #ids of the generated tokens
        "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
        #mask is 1 where we have input and 0 where we have padding
        "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
        "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        "targets": torch.tensor(self.target[item], dtype=torch.long),
    }

In [None]:
#model.py
import transformers
from transformers import BertForSequenceClassification
import torch.nn as nn

class BERTBaseUncased(nn.Module):
  """Sequence classifier built on the pruned BERT checkpoint from the Hugging Face hub."""

  def __init__(self, num_labels=5):
    """
    :param num_labels: number of classes of the classification head
    """
    super(BERTBaseUncased, self).__init__()
    #the pretrained weights are fetched by their hub identifier;
    #the classification head is not part of the checkpoint, so it is
    #newly initialised and has to be trained
    self.bert = BertForSequenceClassification.from_pretrained(
        "huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad",
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )

  def forward(self, ids, mask, token_type_ids):
    """
    Run the wrapped classifier on one batch.
    :param ids: token ids
    :param mask: attention mask
    :param token_type_ids: segment ids
    :return: the unmodified output of the wrapped model; the callers in
             this notebook read the logits as element ``[0]`` of it
    """
    return self.bert(
        ids,
        attention_mask=mask,
        token_type_ids=token_type_ids
    )

In [None]:
#engine.py
import torch
import torch.nn as nn 

def loss_fn(outputs, targets):
  """
  Cross-entropy loss between model outputs and target labels
  :param outputs: unnormalised class scores produced by the model
  :param targets: target class indices
  :return: the loss computed by nn.CrossEntropyLoss with its default settings
  """
  criterion = nn.CrossEntropyLoss()
  return criterion(outputs, targets)

def train_fn(data_loader,model,optimizer,device,scheduler):
  """
  Train the model for one epoch
  :param data_loader: torch dataloader yielding dicts with the keys
                      'ids', 'token_type_ids', 'mask' and 'targets'
  :param model: bert base model; element [0] of its output is used as logits
  :param optimizer: adam, sgd..etc
  :param device: can be cpu or gpu
  :param scheduler: learning rate scheduler, stepped once per batch
  :return: mean of the per-batch losses as a float (nan when the
           loader yields no batches)
  """
  model.train()
  size = len(data_loader)
  #accumulated as a detached value so that no graph is kept alive
  running_loss = 0.0
  n_batches = 0
  #loop over all batches
  for d in tqdm(data_loader,total=size):
    #extract ids, token type ids, mask and targets and move them to device
    ids = d['ids'].to(device, dtype=torch.long)
    token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
    mask = d['mask'].to(device, dtype=torch.long)
    targets = d['targets'].to(device, dtype=torch.long)

    #zero-grad optimizers
    optimizer.zero_grad()
    outputs = model(
        ids=ids,
        mask=mask,
        token_type_ids=token_type_ids
    )[0]
    loss = loss_fn(outputs, targets)
    loss.backward()
    optimizer.step()
    scheduler.step()

    running_loss = running_loss + loss.detach()
    n_batches += 1

  if n_batches == 0:
    return float("nan")
  return float(running_loss) / n_batches

def eval_fn(data_loader,model,device):
  """
  Evaluate the model on every batch of a data loader
  :param data_loader: torch dataloader yielding dicts with the keys
                      'ids', 'token_type_ids', 'mask' and 'targets'
  :param model: bert base model; element [0] of its output is used as logits
  :param device: can be cpu or gpu
  :return: tuple (val_loss, fin_outputs, fin_targets) where val_loss is
           the mean loss over all samples as a float (nan when the loader
           is empty), fin_outputs the logits as nested lists and
           fin_targets the targets as a list
  """
  model.eval()
  fin_targets = []
  fin_outputs = []
  loss_sum = 0.0
  n_samples = 0
  #no gradients are needed for evaluation, this also saves memory
  with torch.no_grad():
    for d in data_loader:
      ids = d["ids"].to(device, dtype=torch.long)
      token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
      mask = d["mask"].to(device, dtype=torch.long)
      targets = d["targets"].to(device, dtype=torch.long)

      outputs = model(
          ids,
          mask=mask,
          token_type_ids=token_type_ids,
      )[0]

      #loss_fn averages over the batch, so weight it by the batch size
      #to obtain a mean over samples instead of the loss of the last batch
      batch_size = targets.size(0)
      loss_sum += loss_fn(outputs, targets).item() * batch_size
      n_samples += batch_size

      fin_targets.extend(targets.cpu().tolist())
      fin_outputs.extend(outputs.cpu().tolist())

  val_loss = loss_sum / n_samples if n_samples else float("nan")
  return val_loss, fin_outputs, fin_targets

In [None]:
#train.py
import pandas as pd 
import numpy as np
import torch.nn as nn 

from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

def train(seed=42):
  """
  Fine-tune the classifier on TRAINING_FILE and save the best weights.

  The CSV is read, shuffled and split 90/10 (stratified on the category)
  into a training and a validation part. After every epoch the model is
  evaluated and its state dict is written to MODEL_PATH whenever the
  validation accuracy improves.

  :param seed: seed for the frame shuffle, the train/validation split and
               torch's global random generator
  :return: best validation accuracy reached
  :raises ValueError: if the ``cat`` column holds a value that is not one
                      of the known categories
  """
  torch.manual_seed(seed)

  dfx = pd.read_csv(TRAINING_FILE).fillna('none')
  dfx = dfx.sample(frac=1, random_state=seed).reset_index(drop=True)
  dfx_mapper = {
      'food':0, 'transport':1, 'shopping':2, 'bills':3, 'credit':4
  }
  #unmapped categories would silently turn into NaN targets, so fail early
  mapped = dfx["cat"].map(dfx_mapper)
  if mapped.isna().any():
    unknown = sorted(set(dfx.loc[mapped.isna(), "cat"].astype(str)))
    raise ValueError(f"Unknown categories in {TRAINING_FILE}: {unknown}")
  dfx["cat"] = mapped.astype(int)

  df_train,df_valid = model_selection.train_test_split(
      dfx,
      test_size=0.1,
      random_state=seed,
      stratify=dfx["cat"].values
  )

  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  train_dataset = BertDataset(
      log=df_train["logs"].values,
      target=df_train["cat"].values
  )

  #reshuffle the training samples every epoch
  train_dataloader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=TRAIN_BATCH_SIZE,
      shuffle=True,
      num_workers=3
  )

  valid_dataset = BertDataset(
      log=df_valid["logs"].values,
      target=df_valid["cat"].values
  )

  valid_dataloader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=VALID_BATCH_SIZE,
      num_workers=1
  )

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = BERTBaseUncased()
  model.to(device)

  #create parameters we want to optimize
  #no weight decay is applied to biases and LayerNorm parameters
  param_optimizer = list(model.named_parameters())
  no_decay = ["bias","LayerNorm.bias","LayerNorm.weight"]

  def _skips_decay(name):
    return any(nd in name for nd in no_decay)

  optimizer_parameters = [
      {
          "params": [p for n,p in param_optimizer if not _skips_decay(n)],
          "weight_decay": 0.001,
      },
      {
          "params": [p for n,p in param_optimizer if _skips_decay(n)],
          "weight_decay": 0.0,
      },
  ]

  #the scheduler is stepped once per batch, so the number of training
  #steps is the number of batches per epoch times the number of epochs
  num_train_steps = len(train_dataloader) * EPOCHS

  #AdamW is a widely used optimizer for transformer based models
  optimizer = AdamW(optimizer_parameters, lr=3e-5)

  #linear decay of the learning rate without warmup
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=num_train_steps
  )

  best_accuracy = 0
  for epoch in range(EPOCHS):
    print(f"Epoch: {epoch}")
    print("Model training..")
    print(len(train_dataloader))
    train_fn(train_dataloader, model,
             optimizer,device,scheduler)
    print("Model Evalutaion..")
    val_loss, outputs, targets = eval_fn(valid_dataloader,model,device)

    accuracy = metrics.accuracy_score(targets, np.argmax(outputs,axis=1))
    print(f"epoch: {epoch}, val acc: {accuracy}, val_loss: {val_loss}")

    #keep only the weights of the best epoch so far
    if accuracy > best_accuracy:
      torch.save(model.state_dict(),MODEL_PATH)
      best_accuracy = accuracy

  return best_accuracy

    


In [None]:
# Start fine-tuning: runs up to EPOCHS epochs and saves the best weights to MODEL_PATH.
train()

Some weights of the model checkpoint at huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad were not used when initializing BertForSequenceClassification: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to 

Epoch: 0
Model training..
837


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Model Evalutaion..



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


epoch: 0, val acc: 0.9623655913978495, val_loss: 0.008109736256301403


  0%|          | 0/837 [00:00<?, ?it/s]

Epoch: 1
Model training..
837


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Model Evalutaion..



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


epoch: 1, val acc: 0.978494623655914, val_loss: 0.00481443339958787


  0%|          | 0/837 [00:00<?, ?it/s]

Epoch: 2
Model training..
837


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Model Evalutaion..



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|          | 0/837 [00:00<?, ?it/s]

epoch: 2, val acc: 0.9771505376344086, val_loss: 0.0033930083736777306
Epoch: 3
Model training..
837


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Model Evalutaion..



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|          | 0/837 [00:00<?, ?it/s]

epoch: 3, val acc: 0.9758064516129032, val_loss: 0.002098868601024151
Epoch: 4
Model training..
837


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Model Evalutaion..



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|          | 0/837 [00:00<?, ?it/s]

epoch: 4, val acc: 0.9758064516129032, val_loss: 0.0017521880799904466
Epoch: 5
Model training..
837


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

KeyboardInterrupt: ignored

In [None]:
def sentence_prediction(sentence):
  """
  Encode a single sentence into model-ready tensors.

  Despite its name this function only prepares the inputs; it does not
  run a model. The result uses the same keys as the items of BertDataset
  (without the targets) and carries a leading batch dimension of size 1.

  :param sentence: text to encode, converted with ``str``
  :return: dict with the keys ``ids``, ``mask`` and ``token_type_ids``,
           each a ``torch.long`` tensor of shape (1, MAX_LEN)
  """
  tokenizer = TOKENIZER
  max_len = MAX_LEN

  #collapse every run of whitespace into a single space
  log = " ".join(str(sentence).split())

  #let the tokenizer pad and truncate to exactly max_len tokens; manual
  #padding would produce a wrong length for inputs longer than max_len
  inputs = tokenizer.encode_plus(
      log,
      None,
      add_special_tokens=True,
      max_length=max_len,
      padding="max_length",
      truncation=True,
  )

  return {
      "ids": torch.tensor([inputs["input_ids"]], dtype=torch.long),
      "mask": torch.tensor([inputs["attention_mask"]], dtype=torch.long),
      "token_type_ids": torch.tensor([inputs["token_type_ids"]], dtype=torch.long),
  }
  
  

In [None]:
# Encode one example sentence by hand; the padding is applied in the next cell.
sentence = "i have purchased a new shoe"
tokenizer = TOKENIZER
max_len = MAX_LEN

# collapse every run of whitespace into a single space
log = str(sentence)
log = " ".join(log.split())

# truncation is requested explicitly so the tokenizer does not warn about
# `max_length` being given without a truncation strategy
inputs = tokenizer.encode_plus(
      log,
      None,
      add_special_tokens=True,
      max_length=max_len,
      truncation=True
  )

ids = inputs["input_ids"]
mask = inputs['attention_mask']
token_type_ids = inputs['token_type_ids']

In [None]:
  #right-pad the three sequences with zeros up to max_len
  padding_length = max_len - len(ids)
  zero_pad = [0] * padding_length
  ids, mask, token_type_ids = (
      seq + zero_pad for seq in (ids, mask, token_type_ids)
  )