# Обучение NLP-модели классификации тональности русских текстов с использованием хранилища FINDATALAKE

## Подготовка окружения

In [None]:
! git clone https://github.com/balezz/bert4classification.git
! pip install -r bert4classification/requirements.txt

In [2]:
# Work from inside the cloned repository; presumably this is what lets the later
# `bert_dataset` / `bert_classifier` imports resolve — confirm against the repo layout.
%cd bert4classification

/content/bert4classification


## Получение данных из S3-хранилища

In [3]:
import os

from minio import Minio

BUCKET_NAME = 'findatalake'
S3_FILE_NAME = 'rutwits.zip'
TMP_FILE = 'tmp.zip'
MINIO_URL = 'datalake.website:9000'

# Credentials are read from the environment when set, so real keys never have to be
# saved in the notebook; the fallbacks are the tester account values.
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY', 'tester-1')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY', 'testerpass')

# NOTE(review): secure=False means the connection is not TLS-protected — confirm this
# is acceptable for the data and credentials being transferred.
client = Minio(MINIO_URL,
               access_key=MINIO_ACCESS_KEY,
               secret_key=MINIO_SECRET_KEY,
               secure=False)

# Download the archive from the bucket into a local file next to the notebook.
client.fget_object(BUCKET_NAME, S3_FILE_NAME, TMP_FILE)

<minio.datatypes.Object at 0x7f1852d2f050>

In [4]:
! unzip -a {TMP_FILE}

Archive:  tmp.zip
  inflating: test.csv                [binary]
  inflating: train.csv               [binary]
  inflating: valid.csv               [binary]


## Подготовка данных для обучения классификатора тональности русских твитов

In [5]:
import pandas as pd

In [6]:
# Load the three extracted splits in one pass: train, validation, test.
train_data, valid_data, test_data = map(
    pd.read_csv,
    ('train.csv', 'valid.csv', 'test.csv'),
)

In [7]:
from bert_dataset import CustomDataset
from bert_classifier import BertClassifier

## Инициализация классификатора BERT

In [8]:
# The same Hugging Face checkpoint provides both the model weights and the tokenizer.
RUBERT_CHECKPOINT = 'cointegrated/rubert-tiny'

# Two output classes, trained for two epochs.
classifier = BertClassifier(
    model_path=RUBERT_CHECKPOINT,
    tokenizer_path=RUBERT_CHECKPOINT,
    n_classes=2,
    epochs=2,
)

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [9]:
# Convert the 'text' / 'label' columns to plain Python lists before handing them over.
train_texts, train_labels = list(train_data['text']), list(train_data['label'])
valid_texts, valid_labels = list(valid_data['text']), list(valid_data['label'])

# Register the training and validation splits with the classifier.
classifier.preparation(
    X_train=train_texts,
    y_train=train_labels,
    X_valid=valid_texts,
    y_valid=valid_labels,
)



## Обучение модели

In [10]:
# Run training; per the captured log it reports train and validation
# loss/accuracy after each epoch.
classifier.train()

Epoch 1/2
Train loss 0.7884642243042616 accuracy 0.6813131214851427
Val loss 0.7892908318045208 accuracy 0.7134851138353766
----------
Epoch 2/2
Train loss 0.7968849054986611 accuracy 0.7416178559613925
Val loss 0.9095830540785811 accuracy 0.7257443082311734
----------


## Проверка на тестовых данных

In [11]:
# Held-out test split: raw texts and their reference labels.
texts = list(test_data['text'])
labels = list(test_data['label'])

# Predict one text at a time, keeping the results in the same order as `texts`.
predictions = list(map(classifier.predict, texts))

In [12]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged scores on the test split; the fourth returned element (support) is not used.
precision, recall, f1score, _support = precision_recall_fscore_support(
    labels, predictions, average='macro'
)

print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: 0.7237253367339127, recall: 0.7202497823366434, f1score: 0.72133812021524
