In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [2]:
model_args = ClassificationArgs()
model_args.max_seq_length = 512
model_args.train_batch_size = 12
model_args.eval_batch_size = 12
model_args.num_train_epochs = 5
model_args.evaluate_during_training = False
model_args.learning_rate = 1e-5
model_args.use_multiprocessing = False
model_args.fp16 = False
model_args.save_steps = -1
model_args.save_eval_checkpoints = False
model_args.no_cache = True
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True

model = ClassificationModel(
    "roberta", 
    "roberta-large", 
    num_labels=2,
    use_cuda=True,
    args=model_args
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

In [3]:
! nvidia-smi

Tue Dec  6 23:05:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:1A:00.0 Off |                    0 |
| N/A   35C    P0    43W / 300W |      3MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from datasets import load_dataset

boolq = load_dataset("boolq")
boolq

Found cached dataset boolq (/home/avpugachev/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'passage'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'answer', 'passage'],
        num_rows: 3270
    })
})

In [5]:
boolq['train'][0]

{'question': 'do iran and afghanistan speak the same language',
 'answer': True,
 'passage': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.'}

## Train

In [6]:
import pandas as pd

In [7]:
train_df = (
    pd.DataFrame(boolq["train"])
    .assign(text_a=lambda df: [x for x in df['question']])
    .assign(text_b=lambda df: [x for x in df['passage']])
    .drop(["question", "passage"], axis=1)
    .rename(columns={"answer": "labels"})
    .astype({"labels": int})
)
print(len(train_df))
train_df.head(3)

9427


Unnamed: 0,labels,text_a,text_b
0,1,do iran and afghanistan speak the same language,"Persian (/ˈpɜːrʒən, -ʃən/), also known by its ..."
1,1,do good samaritan laws protect those who help ...,Good Samaritan laws offer legal protection to ...
2,1,is windows movie maker part of windows essentials,Windows Movie Maker (formerly known as Windows...


In [8]:
model.train_model(
    train_df, 
    output_dir="./finetuned_roberta_large/",
    show_running_loss=True
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/786 [00:00<?, ?it/s]

(3930, 0.2791172950514047)

## Eval

In [9]:
eval_df = (
    pd.DataFrame(boolq["validation"])
    .assign(text_a=lambda df: [x for x in df['question']])
    .assign(text_b=lambda df: [x for x in df['passage']])
    .drop(["question", "passage"], axis=1)
    .rename(columns={"answer": "labels"})
    .astype({"labels": int})
)
print(len(eval_df))
eval_df.head(3)

3270


Unnamed: 0,labels,text_a,text_b
0,0,does ethanol take more energy make that produces,All biomass goes through at least some of thes...
1,1,is house tax and property tax are same,Property tax or 'house tax' is a local tax on ...
2,1,is pain experienced in a missing body part or ...,Phantom pain sensations are described as perce...


In [10]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

  0%|          | 0/3270 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/273 [00:00<?, ?it/s]

In [11]:
result

{'mcc': 0.6919630040353948,
 'tp': 1810,
 'tn': 988,
 'fp': 249,
 'fn': 223,
 'auroc': 0.9168473620985351,
 'auprc': 0.9403341913102592,
 'eval_loss': 0.9410035829155193}

In [12]:
eval_df_w_preds = (
    eval_df
    .assign(pred=model_outputs.argmax(-1))
)
eval_df_w_preds

Unnamed: 0,labels,text_a,text_b,pred
0,0,does ethanol take more energy make that produces,All biomass goes through at least some of thes...,0
1,1,is house tax and property tax are same,Property tax or 'house tax' is a local tax on ...,1
2,1,is pain experienced in a missing body part or ...,Phantom pain sensations are described as perce...,0
3,1,is harry potter and the escape from gringotts ...,Harry Potter and the Escape from Gringotts is ...,1
4,1,is there a difference between hydroxyzine hcl ...,Hydroxyzine preparations require a doctor's pr...,0
...,...,...,...,...
3265,1,is manic depression the same as bi polar,"Bipolar disorder, previously known as manic de...",1
3266,1,was whiskey galore based on a true story,SS Politician was an 8000-ton cargo ship owned...,1
3267,1,are there plants on the international space st...,Plant research continued on the International ...,1
3268,1,does the hockey puck have to cross the line to...,"In ice hockey, a goal is scored when the puck ...",1


In [13]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

In [14]:
roc_auc_score(eval_df_w_preds.labels, eval_df_w_preds.pred)

0.8445082174834709

In [15]:
accuracy_score(eval_df_w_preds.labels, eval_df_w_preds.pred)

0.8556574923547401

In [16]:
print(classification_report(eval_df_w_preds.labels, eval_df_w_preds.pred))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1237
           1       0.88      0.89      0.88      2033

    accuracy                           0.86      3270
   macro avg       0.85      0.84      0.85      3270
weighted avg       0.86      0.86      0.86      3270

