In [1]:
from IPython.display import clear_output

In [2]:
!pip install transformers==4.28.0
!pip install jsonlines
!pip install wandb 
!pip install evaluate
!pip install git+https://github.com/huggingface/accelerate
clear_output()

In [3]:
# Mount Google Drive; the trained model and the predictions are saved under
# /content/drive further down in the notebook.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Чтение исходных файлов и создание датасета

In [4]:
import jsonlines

def _read_jsonl(path):
    """Return all records of the JSON Lines file at `path` as a list."""
    with jsonlines.open(path, "r") as reader:
        return list(reader)


# The three dataset splits used by the notebook.
train_data = _read_jsonl("/content/enlarged_full_p_tuning_train.jsonl")
dev_data = _read_jsonl("/content/val.jsonl")
test_data = _read_jsonl("/content/test.jsonl")

In [5]:
# Class id -> label name, as passed to the model config.
id2label = {0: "no", 1: "yes"}
# Inverse lookup (label name -> class id), derived so the two mappings cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier.
checkpoint_name = "DeepPavlov/rubert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
# Two-class sequence classifier whose config carries the label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [7]:
from torch.utils.data.dataset import Dataset

class BinaryQuestionsDataset(Dataset):
    """Dataset that tokenizes (question, passage) pairs on access.

    Each record is a mapping with "question" and "passage" entries and an
    optional "label" entry.
    """

    def __init__(self, data, tokenizer, pos_label=True):
        """Store the records, the tokenizer and the label value treated as positive."""
        self.data = data
        self.tokenizer = tokenizer
        self.pos_label = pos_label

    def __len__(self):
        """Number of stored records."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize record `index`; add "labels" when the record is labelled.

        "labels" is 1 when the record's label equals `pos_label`, else 0.
        """
        example = self.data[index]
        encoding = self.tokenizer(
            example["question"],
            example["passage"],
            padding=True,
            truncation=True,
            max_length=512,
        )
        # Unlabelled records are returned without a "labels" entry.
        if "label" not in example:
            return encoding
        encoding["labels"] = int(example["label"] == self.pos_label)
        return encoding

# Wrap the train and dev records with the shared tokenizer.
train_dataset, dev_dataset = (
    BinaryQuestionsDataset(records, tokenizer) for records in (train_data, dev_data)
)

In [None]:
# Sanity check: number of training examples.
len(train_dataset)

1749

### Дообучение

In [8]:
# Authenticate with Weights & Biases; the Trainer is configured to report to wandb.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33malexandra-fedorova1499[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Optional: force a fresh wandb login (prompts for an API key).
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [9]:
# Set the wandb project name for this run via the environment.
%env WANDB_PROJECT=rubert_deeppavlov_fine-tuning_5_epochs_with_add_data_p_tune

env: WANDB_PROJECT=rubert_deeppavlov_fine-tuning_5_epochs_with_add_data_p_tune


In [10]:
from transformers import set_seed

# Fix the random seed so repeated runs are comparable
# (the same value is also passed as `seed` to TrainingArguments).
set_seed(1)

In [11]:
import evaluate
import numpy as np  # imported here so the function does not depend on a later cell

# Load every metric once, when this cell runs, instead of on each evaluation
# pass (the Trainer calls `compute_metrics` at the end of every epoch).
_ACCURACY_METRIC = evaluate.load("accuracy")
_F1_METRIC = evaluate.load("f1")
_PRECISION_METRIC = evaluate.load("precision")
_RECALL_METRIC = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Compute accuracy and per-class precision / recall / F1.

    Args:
        eval_pred: pair ``(logits, labels)`` - the model logits and the
            correct labels.

    Returns:
        dict with the keys "Accuracy", "Precision Pos", "Precision Neg",
        "Recall Pos", "Recall Neg", "F1 Pos" and "F1 Neg"; every value is
        rounded to two decimals.

    NOTE(review): because the values are rounded before being returned,
    best-model selection on "Accuracy" compares rounded numbers, so close
    checkpoints can tie - confirm this is intended.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit.
    pred_labels = np.argmax(logits, axis=-1)

    accuracy = _ACCURACY_METRIC.compute(references=labels, predictions=pred_labels)['accuracy']

    # `labels=[1, 0]` with `average=None` yields one score per class,
    # ordered as [positive (1), negative (0)].
    per_class_kwargs = dict(predictions=pred_labels, references=labels, labels=[1, 0], average=None)
    results_precision = _PRECISION_METRIC.compute(**per_class_kwargs)['precision']
    results_recall = _RECALL_METRIC.compute(**per_class_kwargs)['recall']
    results_f1 = _F1_METRIC.compute(**per_class_kwargs)['f1']

    precision_positive, precision_negative = results_precision[0], results_precision[1]
    recall_positive, recall_negative = results_recall[0], results_recall[1]
    f1_positive, f1_negative = results_f1[0], results_f1[1]

    return {"Accuracy": round(accuracy, 2),
            "Precision Pos": round(precision_positive, 2),
            "Precision Neg": round(precision_negative, 2),
            "Recall Pos": round(recall_positive, 2),
            "Recall Neg": round(recall_negative, 2),
            "F1 Pos": round(f1_positive, 2),
            "F1 Neg": round(f1_negative, 2)}

In [12]:
import math

from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
import numpy as np

num_epochs = 5
batch_size = 8
# Optimizer steps per epoch. Use a ceiling division: the previous
# `len // batch_size + 1` counted one batch too many whenever the dataset
# size was an exact multiple of the batch size, which made the linear decay
# longer than the actual training run.
num_batches = math.ceil(len(train_dataset) / batch_size)
num_gradient_updates = num_epochs * num_batches
# NOTE(review): this assumes the Trainer really uses `batch_size` examples per
# step (single device, no gradient accumulation) and trains for `num_epochs`
# epochs - confirm against the TrainingArguments cell.

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
# Linear warm-up for the first 100 steps, then linear decay until the last step.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=num_gradient_updates,
)



In [13]:
# Training configuration: evaluate, log and checkpoint once per epoch and
# reload the checkpoint with the best dev "Accuracy" when training ends.
training_args = TrainingArguments(
    output_dir="rubert_deeppavlov",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): a custom (optimizer, scheduler) pair is passed to the
    # Trainer, so `learning_rate` and `weight_decay` here presumably only
    # mirror the optimizer's settings - verify.
    learning_rate=1e-5,
    # Reuse the values the LR scheduler was sized with, so the number of
    # optimizer steps the Trainer performs cannot drift from the schedule.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="Accuracy",
    seed=1,
    report_to="wandb",
)

In [14]:
# Pads the shorter elements of each batch.
batch_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=batch_collator,
    compute_metrics=compute_metrics,
    # Use the optimizer and LR schedule built above instead of the Trainer's own.
    optimizers=(optimizer, scheduler),
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision pos,Precision neg,Recall pos,Recall neg,F1 pos,F1 neg
1,0.5224,0.760493,0.64,0.58,0.91,0.97,0.3,0.73,0.45
2,0.3394,0.990246,0.63,0.58,0.9,0.97,0.28,0.72,0.43
3,0.2571,1.177747,0.67,0.61,0.89,0.95,0.39,0.74,0.54
4,0.1994,1.432623,0.66,0.61,0.85,0.93,0.39,0.73,0.54
5,0.1264,1.316939,0.7,0.64,0.83,0.9,0.5,0.75,0.62


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

TrainOutput(global_step=4290, training_loss=0.2889642284188793, metrics={'train_runtime': 1460.7196, 'train_samples_per_second': 23.475, 'train_steps_per_second': 2.937, 'total_flos': 3294194607542280.0, 'train_loss': 0.2889642284188793, 'epoch': 5.0})

In [15]:
# Evaluate on the dev set (the `eval_dataset` given to the Trainer).
trainer.evaluate()

{'eval_loss': 1.3169386386871338,
 'eval_Accuracy': 0.7,
 'eval_Precision Pos': 0.64,
 'eval_Precision Neg': 0.83,
 'eval_Recall Pos': 0.9,
 'eval_Recall Neg': 0.5,
 'eval_F1 Pos': 0.75,
 'eval_F1 Neg': 0.62,
 'eval_runtime': 11.6538,
 'eval_samples_per_second': 70.449,
 'eval_steps_per_second': 8.838,
 'epoch': 5.0}

In [None]:
# NOTE(review): "path/to/model" looks like a placeholder and this cell has no
# execution count - presumably a leftover; the model is saved to Drive with
# `trainer.save_model` below. Confirm and remove if unused.
model.save_pretrained("path/to/model")

In [16]:
# Persist the fine-tuned model to Google Drive.
trainer.save_model("/content/drive/MyDrive/Diploma 2.0/rubert_deeppavlov_with_data_after_p_tune")

In [19]:
# Obtaining results for the test set: reload the fine-tuned checkpoint from Drive.
# NOTE(review): this rebinds `model`, but the pipeline below is built from the
# checkpoint path rather than from this object - this reload may be unnecessary.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Diploma 2.0/rubert_deeppavlov_with_data_after_p_tune", num_labels=2)

In [20]:
from transformers import pipeline

In [21]:
# Text-classification pipeline built from the saved checkpoint, reusing the tokenizer loaded earlier.
pipe = pipeline("text-classification", model="/content/drive/MyDrive/Diploma 2.0/rubert_deeppavlov_with_data_after_p_tune", tokenizer=tokenizer)

In [34]:
# Split the test records into two parallel lists: questions and passages.
questions, passages = [], []
for elem in test_data:
    questions.append(elem['question'])
    passages.append(elem['passage'])

In [42]:
# Accumulator for the test-set predictions: one {"idx", "label"} dict per example.
test_result = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions.
test_result = []
for i, (question, passage) in enumerate(zip(questions, passages)):
    # A text pair must be passed as ONE dict input. `pipe(question, passage)`
    # treats the passage as an extra positional argument, which the pipeline
    # ignores, so the model would classify the question alone.
    # Truncation mirrors the settings used when tokenizing the training data.
    res = pipe({"text": question, "text_pair": passage}, truncation=True, max_length=512)
    # Depending on the input type the pipeline returns either a single
    # prediction dict or a one-element list; accept both.
    prediction = res[0] if isinstance(res, list) else res
    # Output format: the "yes" class is written as "true", anything else as "false".
    test_result.append({
        'idx': i,
        'label': "true" if prediction['label'] == 'yes' else "false",
    })

In [50]:
import jsonlines

# Dump the predictions, one JSON object per line, into the working directory.
with jsonlines.open('DaNetQA.jsonl', mode='w') as writer:
    for record in test_result:
        writer.write(record)

In [51]:
# Keep a copy of the same predictions on Google Drive.
with jsonlines.open('/content/drive/MyDrive/Diploma 2.0/DaNetQA/test_DaNetQA_fine_tune_add_full_p_tuned.jsonl', mode='w') as writer:
    for record in test_result:
        writer.write(record)

In [52]:
from zipfile import ZipFile
 
# Pack the predictions file into the zip archive.
with ZipFile("superglue_danetquestions.zip", "w") as archive:
    archive.write('DaNetQA.jsonl')