In [1]:
import torch
import numpy as np
import pandas as pd
import evaluate
import gc

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BertForTokenClassification
)

In [2]:
# Setting seed for reproducibility and checking whether cuda is on

torch.manual_seed(16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# We will use a distilled version of ruBert

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModelForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny2", num_labels=6
)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [4]:
df = pd.read_csv('C:/repos/gosling-sight/data/cat_phones.csv', sep=';')
df = df.rename(columns={'автономность': 0, 'камеры': 1, 'экран': 2, 'дизайн': 3, 'железо': 4, 'звук': 5})
df = pd.melt(df, value_vars=[0, 1, 2, 3, 4, 5], var_name='labels', value_name='text')
df = df[['text', 'labels']]
dataset = Dataset.from_pandas(df)

In [5]:
df

Unnamed: 0,text,labels
0,При аккумуляторе ёмкостью 4300 мАч особых чуде...,0
1,В тонкий корпус Honor 70 производитель умудрил...,0
2,Ёмкость батареи Mate 50 Pro — 4700 мАч. Значен...,0
3,Смартфон Google Pixel 6 получил встроенную бат...,0
4,Смартфон заряжается с помощью комплектного зар...,0
...,...,...
85,У смартфона два динамика с поддержкой Dolby At...,5
86,"Pixel 7 Pro оснащён стереодинамиками, но стере...",5
87,На торцах расположены стереодинамики. Сохранил...,5
88,"У смартфона есть стерео, звук негромкий и дово...",5


In [7]:
# Defining preprocessing for text to get embedding lookup table and attention


def preprocess_function(examples):
    return tokenizer(
        examples["text"], padding=True, truncation=True, return_tensors="pt"
    )

In [8]:
dataset = dataset.map(preprocess_function, batched=True)  # processing dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# dataset = dataset.train_test_split()
# train_dataset = dataset["train"].shuffle(seed=16).remove_columns(["text"])
# test_dataset = dataset["test"].remove_columns(["text"])

In [11]:
# We will max the precision-macro

metric = evaluate.load("f1")

In [12]:
# Function for trainer evaluation


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [13]:
# Configuring Trainer

training_args = TrainingArguments(
    output_dir="../models/tiny_rubert_fine_tune/",
    evaluation_strategy="steps",
    learning_rate=1.5e-3,
    num_train_epochs=8,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    weight_decay=0.24,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset.shuffle(seed=16),
    # eval_dataset=test_dataset,
    # compute_metrics=compute_metrics,
)

In [14]:
torch.cuda.empty_cache()
gc.collect()

22

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 90
  Num Epochs = 8
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 64
  Number of trainable parameters = 29195646


  0%|          | 0/64 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 486.0471, 'train_samples_per_second': 1.481, 'train_steps_per_second': 0.132, 'train_loss': 0.45278510451316833, 'epoch': 8.0}


TrainOutput(global_step=64, training_loss=0.45278510451316833, metrics={'train_runtime': 486.0471, 'train_samples_per_second': 1.481, 'train_steps_per_second': 0.132, 'train_loss': 0.45278510451316833, 'epoch': 8.0})

In [16]:
trainer.save_model("../models/tiny_rubert_fine_tune/")

Saving model checkpoint to ../models/tiny_rubert_fine_tune/
Configuration saved in ../models/tiny_rubert_fine_tune/config.json
Model weights saved in ../models/tiny_rubert_fine_tune/pytorch_model.bin


In [16]:
# trainer.evaluate(test_dataset)
