## imports

In [9]:
!pip install datasets transformers huggingface_hub

[0m

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [8]:
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [7]:
!pip install wandb  

[0m

In [1]:
import torch
import numpy as np
import pandas as pd

In [5]:
from transformers import DistilBertTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer

from datasets import load_dataset, load_metric

import wandb
from huggingface_hub import notebook_login

from sklearn.metrics import accuracy_score, f1_score

In [4]:
import evaluate

In [6]:
torch.cuda.is_available()

True

## login

In [None]:
# notebook_login()

In [11]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# drive.mount('/content/drive')

## dataset

In [12]:
from datasets import interleave_datasets, Features, Value, ClassLabel

In [17]:
train_path_1 = '/kaggle/input/under-multi/IMDB_train_1.csv'
train_path_2 = '/kaggle/input/under-multi/IMDB_train_2.csv'
test_path = '/kaggle/input/under-multi/IMDB_test.csv'

In [18]:
imdb1 = load_dataset("csv", data_files={"train": train_path_1, "test": test_path})
imdb2 = load_dataset("csv", data_files={"train": train_path_2, "test": test_path})

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ada634dca6432f17/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ada634dca6432f17/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b294b6be0d7837e0/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b294b6be0d7837e0/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
imdb1['train'].features.keys() 

dict_keys(['review_text', 'star_label', 'binary_label', 'over_1', 'over_2', 'over_3', 'over_4', 'over_5', 'over_6', 'over_7', 'over_8', 'over_9'])

In [20]:
labels = [label for label in imdb1['train'].features.keys() if label not in ["binary_label", "star_label", "review_text"]]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['over_1',
 'over_2',
 'over_3',
 'over_4',
 'over_5',
 'over_6',
 'over_7',
 'over_8',
 'over_9']

In [21]:
imdb1, imdb2 = imdb1.remove_columns(["binary_label"]), imdb2.remove_columns(["binary_label"]),
imdb1, imdb2 = imdb1.remove_columns(["star_label"]), imdb2.remove_columns(["star_label"])
imdb1, imdb2 = imdb1.rename_column("review_text", "text"), imdb2.rename_column("review_text", "text")

In [23]:
train1, train2 = imdb1["train"].shuffle(seed=41), imdb2["train"].shuffle(seed=41)
val_test_dataset = imdb1["test"].shuffle(seed=41).select(range(1000))

train_dataset = interleave_datasets([train1, train2]) # , stopping_strategy="all_exhausted"
val_dataset = val_test_dataset.select(range(500))
test_dataset = val_test_dataset.select(range(500, 1000))

In [24]:
print(train_dataset[0])
print(val_dataset[0])
print(test_dataset[0])

{'text': "What is supposed to be a simple generic mystery plot involving a dead philanthropist is, in fact, a head-ache inducing tale about a bunch of characters (the only big actor being Ginger Rogers, in a very early role) all trying to find the murderer among a small cast of residents in a posh apartment building. These characters range from utterly stupid to downright mean. As a cheap, low budget production, most of the action revolves around Rogers and her lead man (some guy, I don't care who he is 'cause he really sucked) talking about their various possibilities of solving the crime, while being constantly cut off by an absurd detective with his head in his butt. Honestly, I've never had a worse time watching an old b-rate movie of this type, and I've seen some real head-slappers.<br /><br />Oh, and the butler didn't do it, because there wasn't a butler. But pay attention to the guy who's closest to a butler. There ya go.<br /><br />--PolarisDiB", 'over_1': 0, 'over_2': 0, 'over

## tokenizer

In [25]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # distilbert-base-uncased

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [26]:
def preprocess_function(examples):

  text = examples["text"]
  encoding = tokenizer(text, padding=True, truncation=True)

  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  labels_matrix = np.zeros((len(text), len(labels)))

  for idx, label in enumerate(labels):
      labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  return  encoding

In [27]:
encoded_train = train_dataset.map(preprocess_function, batched=True, batch_size=None)
encoded_val = val_dataset.map(preprocess_function, batched=True, batch_size=None)
encoded_test = test_dataset.map(preprocess_function, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## test

In [28]:
example = encoded_train[0]
print(example.keys())

dict_keys(['text', 'over_1', 'over_2', 'over_3', 'over_4', 'over_5', 'over_6', 'over_7', 'over_8', 'over_9', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [29]:
tokenizer.decode(example['input_ids'])

"[CLS] what is supposed to be a simple generic mystery plot involving a dead philanthropist is, in fact, a head - ache inducing tale about a bunch of characters ( the only big actor being ginger rogers, in a very early role ) all trying to find the murderer among a small cast of residents in a posh apartment building. these characters range from utterly stupid to downright mean. as a cheap, low budget production, most of the action revolves around rogers and her lead man ( some guy, i don't care who he is'cause he really sucked ) talking about their various possibilities of solving the crime, while being constantly cut off by an absurd detective with his head in his butt. honestly, i've never had a worse time watching an old b - rate movie of this type, and i've seen some real head - slappers. < br / > < br / > oh, and the butler didn't do it, because there wasn't a butler. but pay attention to the guy who's closest to a butler. there ya go. < br / > < br / > - - polarisdib [SEP] [PAD]

In [30]:
example['labels']

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [31]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

[]

## set format torch

In [32]:
encoded_train.set_format("torch")
encoded_val.set_format("torch")
encoded_test.set_format("torch")

## metrics

In [33]:
from numpy import cumprod

In [34]:
def get_pred_3(pred_10):
    if 0 <= pred_10 <= 3:
        return 0
    if 6 <= pred_10 <= 9:
        return 1
    return 2

In [35]:
def compute_metrics(pred):
    labels_9 = pred.label_ids
    probs_9 = pred.predictions

    pred_10 = (probs_9 > 0.5).cumprod(axis=1).sum(axis=1) # - 1 
    pred_3 = np.vectorize(get_pred_3)(pred_10)

    labels_10 = (labels_9 > 0.5).cumprod(axis=1).sum(axis=1)
    labels_3 = np.vectorize(get_pred_3)(labels_10)

    f1_10 = f1_score(labels_10, pred_10, average="weighted")
    acc_10 = accuracy_score(labels_10, pred_10)
    f1_3 = f1_score(labels_3, pred_3, average="weighted")
    acc_3 = accuracy_score(labels_3, pred_3)


    return {"accuracy_10": acc_10, 
            "f1_10": f1_10, 
            "accuracy_3": acc_3, 
            "f1_3": f1_3,
            }

## model

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [37]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           label2id=label2id,
                                                           id2label=id2label,
                                                           num_labels=9).to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## more tests

In [38]:
encoded_train[0]

TypeError: new(): invalid data type 'numpy.str_'

In [None]:
encoded_train[0]['labels'].type()

In [None]:
encoded_train['input_ids'][0]

In [None]:
#forward pass
# outputs = model(input_ids=encoded_train['input_ids'][0].unsqueeze(0), labels=encoded_train[0]['labels'].unsqueeze(0))
# outputs

In [None]:
encoded_train.features

## trainer

In [39]:
wandb.init(project="bert_sentiment")

[34m[1mwandb[0m: Currently logged in as: [33mc-nemo[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
3000 // 15 // 20

10

In [41]:
batch_size = 16
steps = 50 # 50 | len(encoded_train) // batch_size // ?

training_args = TrainingArguments(
    
    output_dir="results", 
    report_to="wandb",

    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    num_train_epochs=5, # 5

    evaluation_strategy="steps", # "epoch"
    eval_steps=steps,

    logging_steps=steps,

    save_strategy="steps", # steps,
    save_steps=steps,
    save_total_limit=1,

    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1_10"
)

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=encoded_train, eval_dataset=encoded_test,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [43]:
results = trainer.evaluate()
results

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: over_9, over_5, over_2, over_8, over_4, over_7, over_1, over_3, over_6, text. If over_9, over_5, over_2, over_8, over_4, over_7, over_1, over_3, over_6, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16


{'eval_loss': 0.2981860339641571,
 'eval_accuracy_10': 0.438,
 'eval_f1_10': 0.4570097648969687,
 'eval_accuracy_3': 0.942,
 'eval_f1_3': 0.9419993039749431}

In [44]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.123 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.008811…

0,1
eval/accuracy_10,▁▄▅▅▅▆▆▅▅▇▆▇▇▆▇▆▆▆███▆█▇███▇▇██▇██▇████▇
eval/accuracy_3,▁▇██▇█▇█▇███▇██▇████████████████████████
eval/f1_10,▁▄▅▅▅▆▆▅▅▇▆▇▇▆▇▆▆▆███▆█▇███▇▇█▇▇██▇████▇
eval/f1_3,▁███▇█▇█████████████████████████████████
eval/loss,█▃▂▂▃▂▃▂▂▁▁▁▂▂▁▁▁▁▁▁▁▂▁▂▂▁▁▂▁▁▂▂▁▁▁▂▂▂▂▂
eval/runtime,▁▁▁▁▂▁▂▂▂▁▄▃▃▃▂▅▂▃▄▃▃▅▄▅▄▄█▆▅▄▅▆▇▆▅▅▆█▇▆
eval/samples_per_second,████▇█▇▇▇█▅▆▆▆▇▄▇▆▅▆▆▄▅▄▅▅▁▃▄▅▄▃▂▃▃▄▃▁▂▃
eval/steps_per_second,█▇██▇█▇▇▇█▅▆▆▆▆▄▆▆▅▆▆▄▅▄▅▅▁▃▄▅▄▃▂▃▃▄▃▁▂▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy_10,0.438
eval/accuracy_3,0.942
eval/f1_10,0.45701
eval/f1_3,0.942
eval/loss,0.29819
eval/runtime,8.6776
eval/samples_per_second,57.62
eval/steps_per_second,3.688
train/epoch,2.09
train/global_step,5237.0


In [45]:
preds_output = trainer.predict(encoded_test) # test !- val, test the old models on new test
preds_output.metrics

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: over_9, over_5, over_2, over_8, over_4, over_7, over_1, over_3, over_6, text. If over_9, over_5, over_2, over_8, over_4, over_7, over_1, over_3, over_6, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 500
  Batch size = 16


{'test_loss': 0.2981860339641571,
 'test_accuracy_10': 0.438,
 'test_f1_10': 0.4570097648969687,
 'test_accuracy_3': 0.942,
 'test_f1_3': 0.9419993039749431,
 'test_runtime': 8.7193,
 'test_samples_per_second': 57.344,
 'test_steps_per_second': 3.67}

## save

In [46]:
trainer.save_model('bert-multilabel-2/')

Saving model checkpoint to bert-multilabel-2/
Configuration saved in bert-multilabel-2/config.json
Model weights saved in bert-multilabel-2/pytorch_model.bin


In [47]:
model.save_pretrained('reserve/bert-multilabel-2/model/')
tokenizer.save_pretrained('reserve/bert-multilabel-2/tokenizer/')

Configuration saved in reserve/bert-multilabel-2/model/config.json
Model weights saved in reserve/bert-multilabel-2/model/pytorch_model.bin
tokenizer config file saved in reserve/bert-multilabel-2/tokenizer/tokenizer_config.json
Special tokens file saved in reserve/bert-multilabel-2/tokenizer/special_tokens_map.json


('reserve/bert-multilabel-2/tokenizer/tokenizer_config.json',
 'reserve/bert-multilabel-2/tokenizer/special_tokens_map.json',
 'reserve/bert-multilabel-2/tokenizer/vocab.txt',
 'reserve/bert-multilabel-2/tokenizer/added_tokens.json',
 'reserve/bert-multilabel-2/tokenizer/tokenizer.json')

In [None]:
# trainer.push_to_hub()

In [50]:
labels_9 = preds_output.label_ids
probs_9 = preds_output.predictions

pred_10 = (probs_9 > 0.5).cumprod(axis=1).sum(axis=1) # - 1 
pred_3 = np.vectorize(get_pred_3)(pred_10)

labels_10 = (labels_9 > 0.5).cumprod(axis=1).sum(axis=1)
labels_3 = np.vectorize(get_pred_3)(labels_10)

In [51]:
new_df = pd.DataFrame( )
new_df['true_10'] = labels_10
new_df['true_3'] = labels_3
new_df['pred_10'] = pred_10
new_df['pred_3'] = pred_3
new_df

Unnamed: 0,true_10,true_3,pred_10,pred_3
0,6,1,6,1
1,7,1,2,0
2,6,1,7,1
3,9,1,8,1
4,7,1,8,1
...,...,...,...,...
495,3,0,7,1
496,9,1,8,1
497,2,0,1,0
498,6,1,7,1


In [52]:
new_df.to_csv('preds_multilabel_.csv', header=True, index=False)