In [1]:
!pip install transformers sklearn pandas matplotlib

Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-win_amd64.whl (3.3 MB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=07942321bdd60c4ad4ad47acaa5e1c3694c9c28b51c0a61e8a4e7af7dea444ea
  Stored in directory: c:\users\howto\appdata\local\pip\cache\wheels\22\0b\40\fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: tokenizers, transformers, sklearn
Successfully installed sklearn-0.0 tokenizers-0.12.1 transformers-4.20.0


In [4]:
import os, re
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import Counter
from string import punctuation
import matplotlib.pyplot as plt
%matplotlib inline


import tensorflow as tf
from transformers import TFAutoModel
from transformers import AutoTokenizer

In [5]:
data = pd.read_csv('lenta_sample.csv')
data.dropna(subset=['topic', 'text'], inplace=True)

#### bert-base-multilingual

In [8]:
bert = TFAutoModel.from_pretrained('bert-base-multilingual-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

In [9]:
X = []

for text in data.title:
    ids = tokenizer.encode(text)
    X.append(ids[:512])

id2label = {i:label for i, label in enumerate(set(data.topic.values))}
label2id = {l:i for i, l in id2label.items()}   
    
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post', maxlen=512)
y = tf.keras.utils.to_categorical([label2id[label] for label in data.topic.values])

train_index, valid_index = train_test_split(list(range(len(X))), test_size=0.05, stratify=data.topic)

X_train, y_train = X[train_index], y[train_index]
X_valid, y_valid = X[valid_index], y[valid_index]

In [18]:
input_word_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32,
                                     name="input_ids")

output = bert({"input_ids":input_word_ids})
drop = tf.keras.layers.Dropout(0.3)(output[0][:, 0])
dense = tf.keras.layers.Dense(y.shape[1], activation='softmax')(drop)

bert_clf = tf.keras.Model(inputs=input_word_ids, outputs=dense)

bert_clf.compile(tf.optimizers.Adam(learning_rate=2e-6,
                                    ), 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy', 
                           tf.keras.metrics.RecallAtPrecision(0.80)])

In [None]:
bert_clf.fit(X_train, y_train, 
          validation_data=(X_valid, y_valid),
          batch_size=2,
          epochs=20)

In [13]:
from sklearn.metrics import classification_report

In [20]:
pred = bert_clf.predict(X_valid, batch_size=5).argmax(1)
print(classification_report(y_valid.argmax(1), pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

         Культура       0.50      0.33      0.40         3
           Россия       0.00      0.00      0.00         2
           Бизнес       0.00      0.00      0.00         2
        Экономика       0.00      0.00      0.00         2
  Наука и техника       0.10      0.67      0.17         3
   69-я параллель       0.00      0.00      0.00         1
      Бывший СССР       0.00      0.00      0.00         3
             Крым       0.00      0.00      0.00         0
          Легпром       0.00      0.00      0.00         1
              Дом       0.00      0.00      0.00         2
   Интернет и СМИ       0.00      0.00      0.00         2
         Из жизни       0.22      0.67      0.33         3
            Спорт       0.00      0.00      0.00         2
       Библиотека       0.00      0.00      0.00         0
         Ценности       0.00      0.00      0.00         2
              Мир       0.00      0.00      0.00       

##### roberta-base

In [21]:
tokenizer_roberta = AutoTokenizer.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [22]:
X = []

for text in data.title:
    ids = tokenizer_roberta.encode(text)
    
    X.append(ids[:512])
    
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post', maxlen=512)

id2label = {i:label for i, label in enumerate(set(data.topic.values))}
label2id = {l:i for i, l in id2label.items()}

y = tf.keras.utils.to_categorical([label2id[label] for label in data.topic.values])

train_index, valid_index = train_test_split(list(range(len(X))), test_size=0.05, stratify=data.topic)

X_train, y_train = X[train_index], y[train_index]
X_valid, y_valid = X[valid_index], y[valid_index]

In [23]:
roberta = TFAutoModel.from_pretrained('roberta-base', trainable=True)

Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [25]:
input_word_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32,
                                     name="input_ids")

output = roberta({"input_ids":input_word_ids})
drop = tf.keras.layers.Dropout(0.3)(output[0][:, 0]) 
dense = tf.keras.layers.Dense(y.shape[1], activation='softmax')(drop)

roberta_clf = tf.keras.Model(inputs=input_word_ids, outputs=dense)


roberta_clf.compile(tf.optimizers.Adam(learning_rate=2e-6,
                                    ), 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy', 
                           tf.keras.metrics.RecallAtPrecision(0.80, name='rec_prec')])

In [None]:
roberta_clf.fit(X_train, y_train, 
          validation_data=(X_valid, y_valid),
          batch_size=2,
          epochs=20)

In [28]:
pred = roberta_clf.predict(X_valid, batch_size=5).argmax(1)
print(classification_report(y_valid.argmax(1), pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

         Культура       0.00      0.00      0.00         3
           Россия       0.00      0.00      0.00         2
           Бизнес       0.00      0.00      0.00         2
        Экономика       0.00      0.00      0.00         2
  Наука и техника       0.10      1.00      0.18         3
   69-я параллель       0.00      0.00      0.00         1
      Бывший СССР       0.00      0.00      0.00         3
             Крым       0.00      0.00      0.00         0
          Легпром       0.00      0.00      0.00         1
              Дом       0.00      0.00      0.00         2
   Интернет и СМИ       0.00      0.00      0.00         2
         Из жизни       0.00      0.00      0.00         3
            Спорт       0.00      0.00      0.00         2
       Библиотека       0.00      0.00      0.00         0
         Ценности       0.00      0.00      0.00         2
              Мир       0.00      0.00      0.00       

#### camembert-base

In [7]:
tokenizer_camembert = AutoTokenizer.from_pretrained("camembert-base")
camembert = TFAutoModel.from_pretrained('camembert-base', trainable=True)

Some layers from the model checkpoint at camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFCamembertModel were not initialized from the model checkpoint at camembert-base and are newly initialized: ['roberta/pooler/dense/kernel:0', 'roberta/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
X = []

for text in data.title:
    ids = tokenizer_camembert.encode(text)
    X.append(ids[:512])

id2label = {i:label for i, label in enumerate(set(data.topic.values))}
label2id = {l:i for i, l in id2label.items()}   
    
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post', maxlen=512)
y = tf.keras.utils.to_categorical([label2id[label] for label in data.topic.values])

train_index, valid_index = train_test_split(list(range(len(X))), test_size=0.05, stratify=data.topic)

X_train, y_train = X[train_index], y[train_index]
X_valid, y_valid = X[valid_index], y[valid_index]

In [10]:
input_word_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32,
                                     name="input_ids")

output = camembert({"input_ids":input_word_ids})
drop = tf.keras.layers.Dropout(0.3)(output[0][:, 0])
dense = tf.keras.layers.Dense(y.shape[1], activation='softmax')(drop)

camembert_clf = tf.keras.Model(inputs=input_word_ids, outputs=dense)

camembert_clf.compile(tf.optimizers.Adam(learning_rate=2e-6,
                                    ), 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy', 
                           tf.keras.metrics.RecallAtPrecision(0.80)])

In [None]:
camembert_clf.fit(X_train, y_train, 
          validation_data=(X_valid, y_valid),
          batch_size=3,
          epochs=25)

In [14]:
pred = camembert_clf.predict(X_valid, batch_size=5).argmax(1)
print(classification_report(y_valid.argmax(1), pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

   69-я параллель       0.00      0.00      0.00         1
        Экономика       0.00      0.00      0.00         2
       Библиотека       0.00      0.00      0.00         0
            Спорт       0.00      0.00      0.00         2
           Бизнес       0.00      0.00      0.00         2
  Наука и техника       0.00      0.00      0.00         3
      Бывший СССР       0.00      0.00      0.00         3
         Культура       0.10      1.00      0.18         3
             Крым       0.00      0.00      0.00         0
         Из жизни       0.00      0.00      0.00         3
Силовые структуры       0.00      0.00      0.00         2
              Дом       0.00      0.00      0.00         2
              Мир       0.00      0.00      0.00         1
   Интернет и СМИ       0.00      0.00      0.00         2
         Ценности       0.00      0.00      0.00         2
           Россия       0.00      0.00      0.00       

**результаты на всех моделях почти одинаково плохи (возможно, недообучены из-за неполного количества эпох);**

**из полученных результатов лучше справилась модель bert**

# Задание 2

1) Модель BERT обучается с помощью двух разных задач: модели маскированного языка (MLM) и прогнозирования следующего предложения (NSP). 

RoBERTa является расширением модели Bert (оптимизированная и надежная версия BERT).

Модель RoBERTa обучается с большими партиями и более длинными последовательностями; это повышает точность конечной задачи по сравнению с BERT.

Когда данные передаются в модель RoBERTa, каждый раз выполняются разные стратегии маскирования. Модель BERT, с другой стороны, использует стратегию статического маскирования и выполняет маскирование только во время предварительной обработки данных .

Исходя из этого, делается вывод, что roberta лучше bert (хотя обучается дольше)

ссылки: https://arxiv.org/pdf/1907.11692.pdf;
 https://www.sciencedirect.com/science/article/pii/S0306457321002375
        
2) Модель T5 – это уже обученная многозадачная нейросеть: может хорошо понимать и генерировать текст. 
Модель можно дообучить на собственную задачу (например, генерация ответа, суммаризация текстов и даже перевод).
В отличие от bert t5 можно декодировать на свой вкус, поэтому эта модель многозадачнее.