In [1]:
import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import unidecode
import contractions
import sys
sys.path.append('..')

In [2]:
from PrepareDf import *

In [3]:
train_pt = '/mnt/transcriber/Call_Scoring/transcriptions/csr_ch/train/'
test_pt = '/mnt/transcriber/Call_Scoring/transcriptions/csr_ch/test/'

In [4]:
path_to_handscored_p ='../ScoringDetail_viw_all_subscore.p'

In [5]:
sub_score_categories = ['Cross Selling', 'Creates Incentive', 'Education', 'Processes', 'Product Knowledge', 'Greeting', 'Professionalism', 'Confidence',  'Retention',
                        'Documentation']
scoring_criteria = sub_score_categories[:4]

In [6]:
score_df, q_text = prepare_score_df(
    path_to_handscored_p, workgroup='all')
train_df = prepare_trancript_score_df(score_df, q_text, train_pt, None)
test_df = prepare_trancript_score_df(score_df, q_text, test_pt, None)

Dataframe creation done


100%|██████████| 12684/12684 [00:31<00:00, 400.04it/s]


Number of Calls = 12497


100%|██████████| 2407/2407 [00:06<00:00, 373.02it/s]

Number of Calls = 2404





In [10]:
def post_process(arr):
    arr = [txt.capitalize() for txt in arr]
    return '. '.join(arr)

In [11]:
train_df.text = train_df.text.apply(lambda x: post_process(x))
test_df.text = test_df.text.apply(lambda x: post_process(x))

In [12]:
from datasets import Dataset

In [13]:
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [14]:
# checkpoint='allenai/longformer-base-4096'
# checkpoint = 'distilbert-base-uncased'
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification
checkpoint = 'google/bigbird-roberta-base'
tokenizer = BigBirdTokenizer.from_pretrained(checkpoint)
model = BigBirdForSequenceClassification.from_pretrained(checkpoint, num_labels=8)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

In [15]:
def preprocess_function(example):
    new_dict = tokenizer(example["text"], truncation=True)
    new_dict['labels'] = [example[criteria] for criteria in scoring_criteria]
    return new_dict

In [16]:
cols = train_ds.column_names
tokenized_ds_train = train_ds.map(preprocess_function, remove_columns=cols)
tokenized_ds_test = test_ds.map(preprocess_function, remove_columns=cols)
tokenized_ds_train.set_format('torch')
tokenized_ds_test.set_format('torch')


  0%|          | 0/12497 [00:00<?, ?ex/s]

  0%|          | 0/2404 [00:00<?, ?ex/s]

In [17]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
data_collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

In [18]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss()
        for i in range(self.model.config.num_labels//2):
            loss = loss_fn(outputs[0][:, 2*i:2*(i+1)], labels[:, i])

        return (loss, outputs) if return_outputs else loss


In [19]:
from sklearn.metrics import roc_auc_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    pos_proba = logits[:, [i for i in range(1, logits.shape[1], 2)]]
    auc_scores = roc_auc_score(labels, pos_proba, average=None)
    result = {}
    for i in range(len(auc_scores)):
        result[scoring_criteria[i]] = auc_scores[i]
    return result


In [20]:
training_args = TrainingArguments(
    output_dir="../logs/bigbird",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='wandb',
    eval_steps=10,
    evaluation_strategy='epoch',
    run_name='bigbird'
)

In [21]:
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_test,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)

In [22]:
trainer.train()

***** Running training *****
  Num examples = 12497
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4689
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mkekuda95[0m. Use [1m`wandb login --relogin`[0m to force relogin


Input ids are automatically padded from 2090 to 2112 to be a multiple of `config.block_size`: 64
  * num_indices_to_pick_from


: 

: 

## Native Pytorch Implementation

In [7]:
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, )

In [8]:
from transformers import AutoModel
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
import torch.nn as nn
from tqdm.auto import tqdm
import numpy as np
from datasets import Dataset

In [9]:
class BERT_CLF_HEAD(nn.Module):
    def __init__(self, dim_size, num_classes, dropout=0.2):
        super(BERT_CLF_HEAD, self).__init__()
        self.pre_classifier = nn.Linear(dim_size, dim_size)
        self.classifier = nn.Linear(dim_size, num_classes)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        out = self.pre_classifier(x)
        logits = self.dropout(self.classifier(out))
        return logits

In [10]:
model = AutoModel.from_pretrained(checkpoint)
num_labels = 8
bert_head = BERT_CLF_HEAD(dim_size=768, num_classes=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
bert_head.to(device)

BERT_CLF_HEAD(
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=8, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [12]:
class MyDataset(Dataset):
 
  def __init__(self,df):
        self.df = df
 
  def __len__(self):
    return len(self.df)
   
  def __getitem__(self,idx):
    return self.df.iloc[idx]

In [19]:
class Collate:
    def __init__(self, tokenizer, window_size, stride):
        self.tokenizer = tokenizer
        self.window_size = window_size
        self.stride = stride
    def collate_fn(self, batch):
        tokenized_texts = []
        labels = []
        call_ids = []
        for sample in batch:
            text_arr = sample.text
            text_len = len(text_arr)
            start = 0
            slice_list =[]
            while(start<text_len):
                end = min(text_len, start+self.window_size)
                text_slice = text_arr[start:end]
                start+=self.stride
                processed_text = '. '.join(text_slice)
                slice_list.append(processed_text)
            
            tokenized_slices = self.tokenizer(slice_list, truncation=True, padding=True, return_tensors='pt')
            tokenized_texts.append(tokenized_slices)
            
            labels.append(([sample[criteria] for criteria in scoring_criteria]))
            call_ids.append(sample['InteractionIdKey'])
        labels = torch.tensor(labels)
        return {"tokenized_text": tokenized_texts, 'labels': labels, "call_id": call_ids} 

In [20]:
class Trainer:
    def __init__(self, train_ds, test_ds, epochs=3, lr=5e-5, bs=8, embed_size=768):
        self.train_dataloader = DataLoader(train_ds, batch_size=bs, collate_fn=data_collator)
        self.test_dataloader = DataLoader(test_ds, batch_size=bs, collate_fn=data_collator)
        self.num_epochs = epochs
        self.lr = lr
        self.bs = bs
        self.embed_size = embed_size
        
    def compute_loss(self, logits, labels):
        loss_fn = torch.nn.CrossEntropyLoss()
        for i in range(num_labels//2):
            loss = loss_fn(logits[:, 2*i:2*(i+1)], labels[:, i])
        return loss
    
    def compute_metrics(self, eval_pred):
        logits, labels = eval_pred
        pos_proba = logits[:, [i for i in range(1, logits.shape[1], 2)]]
        auc_scores = roc_auc_score(labels, pos_proba, average=None)
        result = {}
        for i in range(len(auc_scores)):
            result[scoring_criteria[i]] = auc_scores[i]
        return result
    
    def compute_logits(self, batch):
        input_id_chunks = torch.split(batch['input_ids'], 512, -1)
        attention_mask_chunks = torch.split(batch['attention_mask'], 512, -1)
        cls_arr = []
        for input_ids, attention_mask in zip(input_id_chunks, attention_mask_chunks):
            chunk_batch = {'input_ids': input_ids, 'attention_mask': attention_mask}
            outputs = model(**chunk_batch, output_hidden_states=True)
            last_hidden_states = outputs.hidden_states[-1]
            cls_token_embed = last_hidden_states[:,0,:]
            cls_arr.append(cls_token_embed)        
        cls_embed_mean = torch.mean(torch.stack(cls_arr), dim=0)
        logits = bert_head(cls_embed_mean)
        return logits

    def compute_sliding_logits(self, batch):
        cls_embeds = torch.empty((self.bs, self.embed_size), device=device)
        for idx , sample in enumerate(batch['tokenized_text']):            
            chunk_batch = {'input_ids': sample['input_ids'].to(device), 'attention_mask': sample['attention_mask'].to(device)}
            outputs = model(**chunk_batch, output_hidden_states=True)
            last_hidden_states = outputs.hidden_states[-1]
            cls_token_embed = last_hidden_states[:,0,:]
            # print("cls_token_embed", cls_token_embed.shape)
            cls_embed_mean = torch.mean(cls_token_embed, dim=0)
            # print("cls_embed_mean", cls_embed_mean.shape)
            # print("cls_embeds", cls_embeds)
            cls_embeds[idx, :] = cls_embed_mean
        logits = bert_head(cls_embeds)
        return logits

    def get_optimizer(self):
        return AdamW(model.parameters(), self.lr)
    
    def get_scheduler(self):
        self.num_training_steps = self.num_epochs * len(self.train_dataloader)
        lr_scheduler = get_scheduler(
            name="linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=self.num_training_steps
        )
        return lr_scheduler
        
    def train(self):
        self.optimizer = self.get_optimizer()
        self.lr_scheduler  = self.get_scheduler()
        model.train()
        bert_head.train()
        progress_bar = tqdm(range(self.num_training_steps))
        for epoch in range(self.num_epochs):
            for batch in self.train_dataloader:
                logits = self.compute_sliding_logits(batch)
                labels = batch['labels'].to(device)
                loss = self.compute_loss(logits, labels)
                loss.backward()
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.zero_grad()
                progress_bar.update(1)
            self.evaluate()
            
    def evaluate(self):
        print("Running Eval")
        all_logits = []
        all_labels = []
        val_loss = 0
        for batch in self.test_dataloader:
            with torch.no_grad():
                logits = self.compute_sliding_logits(batch)
                all_logits.append(logits.numpy())
            labels = batch['labels'].to(device)
            val_loss += self.compute_loss(logits, labels).item()
            all_labels.append(labels.numpy())
        all_logits = np.vstack(all_logits)
        all_labels = np.vstack(all_labels)
        eval_pred = (all_logits, all_labels)
            
        metrics = self.compute_metrics(eval_pred)
        metrics["val_loss"] = val_loss/len(self.test_dataloader)
        print(metrics)
        return

In [59]:
trainer = Trainer(lr = 2e-5, bs=8, train_ds=tokenized_ds_train, test_ds=tokenized_ds_test, epochs=3)

In [60]:
trainer.train()

  0%|          | 0/4689 [00:00<?, ?it/s]

Running Eval
{'Cross Selling': 0.4735731094309147, 'Creates Incentive': 0.46843593793421245, 'Education': 0.537360536806054, 'Processes': 0.5867165075103398}
Running Eval
{'Cross Selling': 0.44059693387359544, 'Creates Incentive': 0.45295607484196865, 'Education': 0.533454142952336, 'Processes': 0.633187638925238}
Running Eval
{'Cross Selling': 0.4636144645418216, 'Creates Incentive': 0.43130762384840243, 'Education': 0.5574542846998184, 'Processes': 0.6299546621387591}


## Custom Collator

In [21]:
collator = Collate(tokenizer, window_size=5, stride=3)
data_collator = collator.collate_fn

In [22]:
train_ds = MyDataset(train_df.reset_index())
test_ds = MyDataset(test_df.reset_index())

In [23]:
trainer = Trainer(train_ds=train_ds, test_ds=test_ds)

In [24]:
trainer.train()

  0%|          | 0/4689 [00:00<?, ?it/s]

KeyboardInterrupt: 