In [1]:
import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import unidecode
import contractions
import sys
sys.path.append('..')

In [2]:
from PrepareDf import *

In [3]:
train_pt = '/mnt/transcriber/Call_Scoring/transcriptions/csr_ch/train/'
test_pt = '/mnt/transcriber/Call_Scoring/transcriptions/csr_ch/test/'

In [4]:
path_to_handscored_p ='../ScoringDetail_viw_all_subscore.p'

In [5]:
sub_score_categories = ['Cross Selling', 'Creates Incentive', 'Education', 'Processes', 'Product Knowledge', 'Greeting', 'Professionalism', 'Confidence',  'Retention',
                        'Documentation']
scoring_criteria = sub_score_categories[:4]

In [6]:
score_df, q_text = prepare_score_df(
    path_to_handscored_p, workgroup='all')
train_df = prepare_trancript_score_df(score_df, q_text, train_pt, None)
test_df = prepare_trancript_score_df(score_df, q_text, test_pt, None)

Dataframe creation done


100%|██████████| 12684/12684 [00:29<00:00, 428.25it/s]


Number of Calls = 12497


100%|██████████| 2407/2407 [01:25<00:00, 28.11it/s]

Number of Calls = 2404





In [7]:
def post_process(arr):
    arr = [txt.capitalize() for txt in arr]
    return '. '.join(arr)

In [8]:
train_df.text = train_df.text.apply(lambda x: post_process(x))
test_df.text = test_df.text.apply(lambda x: post_process(x))


In [9]:
from datasets import Dataset

In [10]:
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [11]:
# checkpoint='allenai/longformer-base-4096'
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, )
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=8)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [12]:
def preprocess_function(example):
    new_dict = tokenizer(example["text"], truncation=True)
    new_dict['labels'] = [example[criteria] for criteria in scoring_criteria]
    return new_dict

In [13]:
cols = train_ds.column_names
tokenized_ds_train = train_ds.map(preprocess_function, remove_columns=cols)
tokenized_ds_test = test_ds.map(preprocess_function, remove_columns=cols)
tokenized_ds_train.set_format('torch')
tokenized_ds_test.set_format('torch')


  0%|          | 0/12497 [00:00<?, ?ex/s]

  0%|          | 0/2404 [00:00<?, ?ex/s]

In [14]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
data_collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

In [15]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss()
        for i in range(self.model.config.num_labels//2):
            loss = loss_fn(outputs[0][:, 2*i:2*(i+1)], labels[:, i])

        return (loss, outputs) if return_outputs else loss


In [16]:
from sklearn.metrics import roc_auc_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    pos_proba = logits[:, [i for i in range(1, logits.shape[1], 2)]]
    auc_scores = roc_auc_score(labels, pos_proba, average=None)
    result = {}
    for i in range(len(auc_scores)):
        result[scoring_criteria[i]] = auc_scores[i]
    return result


In [19]:
training_args = TrainingArguments(
    output_dir="../logs/distil-bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='wandb',
    eval_steps=10,
    evaluation_strategy='epoch',
    run_name='distil-bert'
)

PyTorch: setting up devices


In [20]:
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_test,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 12497
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2346
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mkekuda95[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Cross selling,Creates incentive,Education,Processes
1,0.6748,0.663249,0.56597,0.570815,0.437485,0.637043
2,0.6351,0.682303,0.569068,0.51378,0.411018,0.650449
3,0.6116,0.652303,0.573646,0.53058,0.401733,0.663233


Saving model checkpoint to ../logs/distil-bert/checkpoint-500
Configuration saved in ../logs/distil-bert/checkpoint-500/config.json
Model weights saved in ../logs/distil-bert/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../logs/distil-bert/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../logs/distil-bert/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2404
  Batch size = 16
Saving model checkpoint to ../logs/distil-bert/checkpoint-1000
Configuration saved in ../logs/distil-bert/checkpoint-1000/config.json
Model weights saved in ../logs/distil-bert/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../logs/distil-bert/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../logs/distil-bert/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ../logs/distil-bert/checkpoint-1500
Configuration saved in ../logs/distil-bert/checkpoint-1500/config.json
Model weights saved in

TrainOutput(global_step=2346, training_loss=0.6373150525495525, metrics={'train_runtime': 13896.5669, 'train_samples_per_second': 2.698, 'train_steps_per_second': 0.169, 'total_flos': 4966866648244224.0, 'train_loss': 0.6373150525495525, 'epoch': 3.0})