In [None]:
import os
import random
import evaluate
import mlflow

import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from transformers import (
    AutoTokenizer,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)


In [45]:

# disable WandB defaults
os.environ['WANDB_DISABLED'] = 'true'


# a seed for reproducibility
SEED = 42
# set seed
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

# check for GPU device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device available:', device) 



Device available: cpu


In [46]:
df = pd.read_csv('../data/ml_data.csv')

In [47]:
df['price_class'] = pd.qcut(df['price'], q=10, labels=range(0, 10))

In [48]:
df_texts = df[['text', 'price_class']]

In [49]:
df_texts['text'] = df_texts['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_texts['text'] = df_texts['text'].astype(str)


In [50]:
df_texts

Unnamed: 0,text,price_class
0,"В продаже Skoda Kodiaq, полный привод.Оригинал...",9
1,Шевроле Круз 2010г 1.6 механика.Двигатель рабо...,3
2,"На ходу, на учёте, 1.6 8клоп, колеса зима шипы...",0
3,"Ваз-2107, в отличном техническом состоянии, вс...",3
4,Майбах S400 4matic в полном рестайлинге внаруж...,9
...,...,...
40446,"Автомобиль в идеальном состоянии, пробег, кузо...",9
40447,Продам ваз 2115.(Родной пробег 135.000).Птс ор...,1
40448,Машина на полном ходу все расходники поменяны....,4
40449,Только продажа цена снижена за срочность.,6


In [51]:
dataset = Dataset.from_pandas(df_texts)

In [52]:
splited_dataset = dataset.train_test_split(0.2)

In [53]:
train_dataset = splited_dataset['train']

In [54]:
test_dataset = splited_dataset['test'].train_test_split(0.5)['train']
val_dataset = splited_dataset['test'].train_test_split(0.5)['test']

In [55]:
full_dataset = DatasetDict({'train': train_dataset, 'test': test_dataset, 'val': val_dataset})

In [56]:
full_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'price_class'],
        num_rows: 32360
    })
    test: Dataset({
        features: ['text', 'price_class'],
        num_rows: 4045
    })
    val: Dataset({
        features: ['text', 'price_class'],
        num_rows: 4046
    })
})

In [57]:
checkpoint = 'cointegrated/rubert-tiny2'
# bert tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# data collator for dynamic padding as per batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [58]:
# cache a pre-trained BERT model for two-class classification
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=10)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
# define a tokenize function
def Tokenize_function(example):
    return tokenizer(example['text'], truncation=True)

In [60]:
full_dataset['train'][0]

{'text': 'Была проведена полная диагностика, в результате которой нарушений не выявлено.Техническое состояние отличное.Птс оригинал.',
 'price_class': 7}

In [61]:
# tokenize entire data
tokenized_data = full_dataset.map(Tokenize_function, batched=True)

Map:   0%|          | 0/32360 [00:00<?, ? examples/s]

Map: 100%|██████████| 32360/32360 [00:03<00:00, 10316.36 examples/s]
Map: 100%|██████████| 4045/4045 [00:00<00:00, 10491.64 examples/s]
Map: 100%|██████████| 4046/4046 [00:00<00:00, 11125.18 examples/s]


In [62]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'price_class', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 32360
    })
    test: Dataset({
        features: ['text', 'price_class', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4045
    })
    val: Dataset({
        features: ['text', 'price_class', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4046
    })
})

In [63]:
tokenized_data = tokenized_data.remove_columns(['text'])
tokenized_data = tokenized_data.rename_column('price_class','labels')
tokenized_data.with_format('pt')

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 32360
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4045
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4046
    })
})

In [64]:
# use the pre-built metrics 
def compute_metrics(eval_preds):
    f1_metric = evaluate.load('f1')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return f1_metric.compute(predictions=predictions, references=labels, average='macro')

In [65]:
def calculate_metrics(y_pred, y_test, average):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)
    f1 = f1_score(y_test, y_pred, average=average)

    return accuracy, precision, recall, f1

In [66]:
y_test = test_dataset['price_class']

In [71]:
mlflow.set_tracking_uri('../mlruns')
mlflow.set_experiment('Text classificator (10)')

with mlflow.start_run():
    training_params = {
        'output_dir': 'bert-finetuning',
        'eval_strategy': 'epoch',
        'num_train_epochs': 3,
        'learning_rate': 5e-5,
        'weight_decay': 0.005,
        'per_device_train_batch_size': 8,
        'per_device_eval_batch_size': 8,
        'report_to': 'none',
    }

    model_config = {'batch_size': 8}

    training_args = TrainingArguments(**training_params)
    
    mlflow.log_params(training_params)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_data['train'],
        eval_dataset=tokenized_data['val'],
        data_collator=data_collator,
        processing_class =tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    tuned_pipeline = pipeline(
        task='text-classification',
        model=trainer.model,
        batch_size=8,
        tokenizer=tokenizer,
        device='cpu',
    )
    predictions = trainer.predict(tokenized_data['test'])
    y_pred = np.argmax(predictions.predictions, axis=1) 

    accuracy, precision, recall, f1 = calculate_metrics(y_pred, y_test, 'macro')
    
    print('Rubert tiny2 model')
    print(f'  Accuracy: {accuracy}')
    print(f'  Precision: {precision}')
    print(f'  Recall: {recall}')
    print(f'  F1: {f1}')

    mlflow.log_metric('Accuracy', accuracy)
    mlflow.log_metric('Precision', precision)
    mlflow.log_metric('Recall', recall)
    mlflow.log_metric('F1', f1)

    
    # Логирование модели с примером входных данных
    mlflow.transformers.log_model(
        transformers_model=tuned_pipeline,
        artifact_path='bert-finetuning',
        model_config=model_config,
    )

  0%|          | 2/12135 [02:02<205:55:25, 61.10s/it]
  4%|▍         | 500/12135 [03:27<1:16:24,  2.54it/s]
  4%|▍         | 500/12135 [03:27<1:16:24,  2.54it/s]

{'loss': 1.5946, 'grad_norm': 17.380924224853516, 'learning_rate': 4.793984342810054e-05, 'epoch': 0.12}


  8%|▊         | 1000/12135 [06:37<1:11:59,  2.58it/s]
  8%|▊         | 1000/12135 [06:37<1:11:59,  2.58it/s]

{'loss': 1.5732, 'grad_norm': 30.89073944091797, 'learning_rate': 4.587968685620108e-05, 'epoch': 0.25}


 12%|█▏        | 1500/12135 [09:42<1:00:08,  2.95it/s]
 12%|█▏        | 1500/12135 [09:42<1:00:08,  2.95it/s]

{'loss': 1.54, 'grad_norm': 31.366422653198242, 'learning_rate': 4.3819530284301606e-05, 'epoch': 0.37}


 16%|█▋        | 2000/12135 [12:47<1:00:34,  2.79it/s]
 16%|█▋        | 2000/12135 [12:47<1:00:34,  2.79it/s]

{'loss': 1.5303, 'grad_norm': 14.59927749633789, 'learning_rate': 4.175937371240215e-05, 'epoch': 0.49}


 21%|██        | 2500/12135 [15:49<58:21,  2.75it/s]  
 21%|██        | 2500/12135 [15:49<58:21,  2.75it/s]

{'loss': 1.5565, 'grad_norm': 17.10332489013672, 'learning_rate': 3.969921714050268e-05, 'epoch': 0.62}


 25%|██▍       | 3000/12135 [18:59<52:31,  2.90it/s]  
 25%|██▍       | 3000/12135 [18:59<52:31,  2.90it/s]

{'loss': 1.5283, 'grad_norm': 30.031999588012695, 'learning_rate': 3.7639060568603216e-05, 'epoch': 0.74}


 29%|██▉       | 3500/12135 [22:02<51:17,  2.81it/s]  
 29%|██▉       | 3500/12135 [22:02<51:17,  2.81it/s]

{'loss': 1.5357, 'grad_norm': 23.741024017333984, 'learning_rate': 3.557890399670375e-05, 'epoch': 0.87}


 33%|███▎      | 4000/12135 [25:04<48:04,  2.82it/s]  
 33%|███▎      | 4000/12135 [25:04<48:04,  2.82it/s]

{'loss': 1.5103, 'grad_norm': 12.482612609863281, 'learning_rate': 3.3518747424804286e-05, 'epoch': 0.99}


 33%|███▎      | 4045/12135 [25:21<46:55,  2.87it/s]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 33%|███▎      | 4045/12135 [25:48<46:55,  2.87it/s]
[A

{'eval_loss': 1.8397983312606812, 'eval_f1': 0.30073667410344446, 'eval_runtime': 26.4125, 'eval_samples_per_second': 153.185, 'eval_steps_per_second': 19.158, 'epoch': 1.0}


 37%|███▋      | 4500/12135 [28:33<45:45,  2.78it/s]   
 37%|███▋      | 4500/12135 [28:33<45:45,  2.78it/s]

{'loss': 1.3789, 'grad_norm': 22.06755828857422, 'learning_rate': 3.145859085290482e-05, 'epoch': 1.11}


 41%|████      | 5000/12135 [31:36<42:43,  2.78it/s]  
 41%|████      | 5000/12135 [31:36<42:43,  2.78it/s]

{'loss': 1.3721, 'grad_norm': 20.23018455505371, 'learning_rate': 2.939843428100536e-05, 'epoch': 1.24}


 45%|████▌     | 5500/12135 [34:45<40:12,  2.75it/s]  
 45%|████▌     | 5500/12135 [34:45<40:12,  2.75it/s]

{'loss': 1.3935, 'grad_norm': 29.19651222229004, 'learning_rate': 2.7338277709105893e-05, 'epoch': 1.36}


 49%|████▉     | 6000/12135 [37:49<37:34,  2.72it/s]  
 49%|████▉     | 6000/12135 [37:49<37:34,  2.72it/s]

{'loss': 1.3869, 'grad_norm': 32.12706756591797, 'learning_rate': 2.5278121137206427e-05, 'epoch': 1.48}


 54%|█████▎    | 6500/12135 [40:52<32:29,  2.89it/s]  
 54%|█████▎    | 6500/12135 [40:52<32:29,  2.89it/s]

{'loss': 1.4156, 'grad_norm': 36.275516510009766, 'learning_rate': 2.3217964565306965e-05, 'epoch': 1.61}


 58%|█████▊    | 7000/12135 [44:00<32:43,  2.62it/s]  
 58%|█████▊    | 7000/12135 [44:00<32:43,  2.62it/s]

{'loss': 1.4064, 'grad_norm': 29.725627899169922, 'learning_rate': 2.11578079934075e-05, 'epoch': 1.73}


 62%|██████▏   | 7500/12135 [47:07<26:35,  2.91it/s]
 62%|██████▏   | 7500/12135 [47:07<26:35,  2.91it/s]

{'loss': 1.4396, 'grad_norm': 26.674325942993164, 'learning_rate': 1.9097651421508038e-05, 'epoch': 1.85}


 66%|██████▌   | 8000/12135 [51:50<27:43,  2.49it/s]  
 66%|██████▌   | 8000/12135 [51:50<27:43,  2.49it/s]

{'loss': 1.4187, 'grad_norm': 14.549647331237793, 'learning_rate': 1.7037494849608572e-05, 'epoch': 1.98}


 67%|██████▋   | 8090/12135 [52:33<30:01,  2.25it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                   

{'eval_loss': 1.889063835144043, 'eval_f1': 0.30628994013309774, 'eval_runtime': 32.998, 'eval_samples_per_second': 122.614, 'eval_steps_per_second': 15.334, 'epoch': 2.0}


 70%|███████   | 8500/12135 [56:11<23:37,  2.56it/s]   
 70%|███████   | 8500/12135 [56:11<23:37,  2.56it/s]

{'loss': 1.3074, 'grad_norm': 19.466066360473633, 'learning_rate': 1.4977338277709107e-05, 'epoch': 2.1}


 74%|███████▍  | 9000/12135 [59:21<25:53,  2.02it/s]
 74%|███████▍  | 9000/12135 [59:21<25:53,  2.02it/s]

{'loss': 1.2498, 'grad_norm': 32.61720275878906, 'learning_rate': 1.2917181705809641e-05, 'epoch': 2.22}


 78%|███████▊  | 9500/12135 [1:03:37<17:53,  2.45it/s]
 78%|███████▊  | 9500/12135 [1:03:37<17:53,  2.45it/s]

{'loss': 1.2945, 'grad_norm': 22.139362335205078, 'learning_rate': 1.0857025133910178e-05, 'epoch': 2.35}


 82%|████████▏ | 10000/12135 [1:22:34<36:36:55, 61.74s/it] 
 82%|████████▏ | 10000/12135 [1:22:34<36:36:55, 61.74s/it]

{'loss': 1.2877, 'grad_norm': 21.19995880126953, 'learning_rate': 8.796868562010712e-06, 'epoch': 2.47}


 87%|████████▋ | 10500/12135 [1:26:16<10:17,  2.65it/s]   
 87%|████████▋ | 10500/12135 [1:26:16<10:17,  2.65it/s]

{'loss': 1.3296, 'grad_norm': 18.96541404724121, 'learning_rate': 6.736711990111248e-06, 'epoch': 2.6}


 91%|█████████ | 11000/12135 [1:29:29<13:43,  1.38it/s]
 91%|█████████ | 11000/12135 [1:29:29<13:43,  1.38it/s]

{'loss': 1.339, 'grad_norm': 34.179893493652344, 'learning_rate': 4.676555418211785e-06, 'epoch': 2.72}


 95%|█████████▍| 11500/12135 [1:32:46<05:07,  2.07it/s]
 95%|█████████▍| 11500/12135 [1:32:46<05:07,  2.07it/s]

{'loss': 1.35, 'grad_norm': 24.613218307495117, 'learning_rate': 2.61639884631232e-06, 'epoch': 2.84}


 99%|█████████▉| 12000/12135 [1:36:47<01:09,  1.95it/s]
 99%|█████████▉| 12000/12135 [1:36:47<01:09,  1.95it/s]

{'loss': 1.3581, 'grad_norm': 43.470489501953125, 'learning_rate': 5.562422744128554e-07, 'epoch': 2.97}


100%|██████████| 12135/12135 [1:37:42<00:00,  2.62it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                       

100%|██████████| 12135/12135 [1:38:09<00:00,  2.62it/s]
[A
[A
100%|██████████| 12135/12135 [1:38:09<00:00,  2.06it/s]


{'eval_loss': 1.933677315711975, 'eval_f1': 0.3184332617611575, 'eval_runtime': 26.017, 'eval_samples_per_second': 155.514, 'eval_steps_per_second': 19.449, 'epoch': 3.0}
{'train_runtime': 5889.7328, 'train_samples_per_second': 16.483, 'train_steps_per_second': 2.06, 'train_loss': 1.4205874008851431, 'epoch': 3.0}


100%|██████████| 506/506 [00:24<00:00, 20.42it/s]


Rubert tiny2 model
  Accuracy: 0.3100123609394314
  Precision: 0.310259412496028
  Recall: 0.3113448643376934
  F1: 0.30994959600077765


