# Training PiPP finder models

In [1]:
__author__ = "Christopher Potts"

## Set-up

In [2]:
# For Colab:

!pip install scikits.bootstrap
!pip install openai
!pip install transformers[torch]
!pip install evaluate
!pip install datasets

In [3]:
import numpy as np
import pandas as pd
import evaluate
from sklearn.metrics import classification_report
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

import utils

In [4]:
# Hugging Face checkpoint name; used below for both the tokenizer and the classifiers.
weights_name = "bert-base-cased"

In [5]:
# Load the tokenizer that matches the `weights_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(weights_name)

In [6]:
# From https://huggingface.co/docs/transformers/main_classes/trainer

class CustomTrainer(Trainer):
    """Trainer that optimizes a class-weighted cross-entropy loss.

    The loss weights are indexed by label id and read from ``class_weights``,
    so a subclass (or an assignment on the class) can change them without
    touching ``compute_loss``.
    """

    # Per-class loss weights, indexed by label id; CP set the weights here.
    class_weights = (1.0, 2.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Parameters
        ----------
        model
            The model to run; it is called as ``model(**inputs)``.
        inputs
            Mapping of model inputs; must contain a "labels" entry.
        return_outputs
            If true, return ``(loss, outputs)`` instead of just ``loss``.
        num_items_in_batch
            Accepted for compatibility with newer ``Trainer`` versions, which
            pass this keyword to ``compute_loss``. It is not used here.

        Returns
        -------
        The loss tensor, or a ``(loss, outputs)`` tuple when
        ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Place the weights on the logits' device instead of reading
        # `model.device`: the model passed in may be a wrapper (for example
        # nn.DataParallel) that does not expose a `.device` attribute.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [7]:
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Sequences are padded with `padding="max_length"` and over-long inputs
    are truncated, so the encoded sequences share one length.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )

In [8]:
# Load the annotated PiPP label file into a DataFrame.
df = pd.read_csv("annotated/pipp-labels.csv")

In [9]:
# "text" is the field tokenize_function reads; "label" is the column used
# for the class counts and the evaluation below.
df = df.rename(columns={"sentence": "text", "PiPP": "label"})

In [10]:
# Shuffle every row (frac=1.0) with a fixed seed so the positional
# train/test split below is reproducible.
df = df.sample(frac=1.0, random_state=1234)

In [11]:
# Class balance over the full annotated set: raw counts plus each class's share.
full_dist = df["label"].value_counts().to_frame()
full_dist["per"] = full_dist / full_dist.sum()
full_dist

Unnamed: 0,label,per
0,6598,0.936817
1,445,0.063183


## Train with a train-test split for evaluation

In [12]:
# The first 80% of the shuffled rows are used for training; the rest are held out.
train_count = int(len(df) * 0.80)

train_df, test_df = df.iloc[:train_count], df.iloc[train_count:]

In [13]:
# Class balance of the held-out rows: raw counts plus each class's share.
test_dist = test_df["label"].value_counts().to_frame()
test_dist["per"] = test_dist / test_dist.sum()
test_dist

Unnamed: 0,label,per
0,1314,0.932576
1,95,0.067424


In [14]:
# Class balance of the training rows: raw counts plus each class's share.
train_dist = train_df["label"].value_counts().to_frame()
train_dist["per"] = train_dist / train_dist.sum()
train_dist

Unnamed: 0,label,per
0,5284,0.937877
1,350,0.062123


In [15]:
# Build a Dataset from the training rows via a column -> list-of-values dict.
train = Dataset.from_dict(train_df.to_dict(orient='list'))

In [16]:
# Wrap in a DatasetDict with a single "train" split; the held-out rows stay
# in `test_df` and are scored separately below.
dataset = DatasetDict({"train": train})

In [17]:
# Tokenize in batches; see tokenize_function for the padding/truncation settings.
dataset_tokenized = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5634 [00:00<?, ? examples/s]

In [18]:
# Pretrained checkpoint with a 2-label classification head; the head weights
# are newly initialized (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(weights_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# No TrainingArguments are passed, so the Trainer runs with its defaults
# (the log below shows 3 epochs); the loss comes from CustomTrainer.compute_loss.
trainer = CustomTrainer(model=model, train_dataset=dataset_tokenized['train'])

In [20]:
# Fine-tune on the training split only.
trainer.train()



Step,Training Loss
500,0.2336
1000,0.0717
1500,0.054
2000,0.0213


TrainOutput(global_step=2115, training_loss=0.09024120721411198, metrics={'train_runtime': 446.1311, 'train_samples_per_second': 37.886, 'train_steps_per_second': 4.741, 'total_flos': 4447103057694720.0, 'train_loss': 0.09024120721411198, 'epoch': 3.0})

In [21]:
# Save the evaluation model (trained on the training split) to disk.
trainer.save_model("models/pipp_evaluation_model")

In [22]:
# Reload the checkpoint just written to disk; the predictions below use
# this reloaded copy.
model = AutoModelForSequenceClassification.from_pretrained("models/pipp_evaluation_model")

In [23]:
# Spot-check on a single sentence.
# NOTE(review): presumably `utils.model_predict` returns predicted label ids
# (the output below is array([1])) — confirm against utils.
utils.model_predict(["Happy though we were with the idea, we rejected it."], tokenizer, model)

array([1])

In [24]:
# Predict over the held-out sentences in fixed-size batches.
batch_size = 20
test_exs = list(test_df["text"].values)
preds = []
for i in range(0, len(test_exs), batch_size):
    batch = test_exs[i: i + batch_size]
    preds.extend(utils.model_predict(batch, tokenizer, model))

In [25]:
# Per-class precision/recall/F1 on the held-out rows.
print(
    classification_report(
        y_true=test_df["label"].values,
        y_pred=preds,
    )
)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1314
           1       0.85      0.99      0.92        95

    accuracy                           0.99      1409
   macro avg       0.93      0.99      0.96      1409
weighted avg       0.99      0.99      0.99      1409



## Train on all examples to produce the model used to find new examples

In [26]:
# All annotated rows, wrapped as a DatasetDict with a single "train" split.
full_dataset = DatasetDict(
    {"train": Dataset.from_dict(df.to_dict(orient="list"))}
)

In [27]:
# Tokenize in batches; see tokenize_function for the padding/truncation settings.
full_dataset_tokenized = full_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7043 [00:00<?, ? examples/s]

In [28]:
# Start again from the pretrained checkpoint rather than from the evaluation model.
full_model = AutoModelForSequenceClassification.from_pretrained(weights_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Same set-up as the evaluation run: no TrainingArguments are passed, and
# the loss comes from CustomTrainer.compute_loss.
full_trainer = CustomTrainer(model=full_model, train_dataset=full_dataset_tokenized['train'])

In [30]:
# Fine-tune on every annotated example; no rows are held out in this run.
full_trainer.train()



Step,Training Loss
500,0.1547
1000,0.0838
1500,0.0613
2000,0.0378
2500,0.0232


TrainOutput(global_step=2643, training_loss=0.06883506029787904, metrics={'train_runtime': 557.0899, 'train_samples_per_second': 37.927, 'train_steps_per_second': 4.744, 'total_flos': 5559273488701440.0, 'train_loss': 0.06883506029787904, 'epoch': 3.0})

In [31]:
# Save the full-data model; the "Add to Hub" section loads it from this path.
full_trainer.save_model("models/pipp_finder")

## Add to Hub

In [2]:
!pip install huggingface_hub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from huggingface_hub import notebook_login

In [4]:
# Log in to the Hugging Face Hub interactively (renders a widget) ahead of
# the push_to_hub calls below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [6]:
# Load the full-data model from the local path it was saved to earlier.
model = AutoModelForSequenceClassification.from_pretrained("models/pipp_finder")

In [7]:
# Stock bert-base-cased tokenizer; pushed to the same Hub repo as the model below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [9]:
# Upload the model to the Hub repo "pipp-finder-bert-base-cased".
model.push_to_hub("pipp-finder-bert-base-cased")



pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cgpotts/pipp-finder-bert-base-cased/commit/a37a618cd85fa39ea29060b6d1834ec18cebc554', commit_message='Upload BertForSequenceClassification', commit_description='', oid='a37a618cd85fa39ea29060b6d1834ec18cebc554', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
# Upload the tokenizer to the same Hub repo as the model.
tokenizer.push_to_hub("pipp-finder-bert-base-cased")



CommitInfo(commit_url='https://huggingface.co/cgpotts/pipp-finder-bert-base-cased/commit/99e5013dcd2b4865b9d80bd92c0fcfc8a9bcdc2c', commit_message='Upload tokenizer', commit_description='', oid='99e5013dcd2b4865b9d80bd92c0fcfc8a9bcdc2c', pr_url=None, pr_revision=None, pr_num=None)