**Получаем эмбеддинги из предобученных моделей (BERT, DistilBERT), после используем их для классификации с помощью LR, SVM**

# 1. Установка библиотек - transformers, torch

In [None]:
!pip install transformers torch



In [None]:
import torch

In [None]:
from transformers import BertTokenizer, BertModel

# 2. Инициализация BERT и DistilBERT

**Загрузка модели и токенизатора BERT**

In [None]:
# Checkpoint name used for both the BERT tokenizer and the BERT encoder
model_name_bert = 'bert-base-uncased'
# Tokenizer and model are loaded from the same pretrained checkpoint
tokenizer_bert = BertTokenizer.from_pretrained(model_name_bert)
model_bert = BertModel.from_pretrained(model_name_bert)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Helper that extracts BERT embeddings for a collection of texts

def get_bert_embeddings(texts):
    """Return one embedding vector per input text, computed with BERT.

    Every text is tokenized on its own (truncated to at most 512 tokens) and
    run through ``model_bert`` with gradient tracking disabled.  The last
    hidden state at token position 0 (the ``[CLS]`` position for BERT-style
    tokenizers) is converted to NumPy and flattened to a 1-D array.

    Args:
        texts: iterable of strings to embed.

    Returns:
        A list of 1-D NumPy arrays, in the same order as ``texts``.
    """
    def _embed_one(text):
        # Tokenize a single text and convert it to PyTorch tensors
        encoded = tokenizer_bert(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Inference only: no gradients are needed
        with torch.no_grad():
            hidden = model_bert(**encoded).last_hidden_state

        # Keep only the vector at the first token position
        first_token = hidden[:, 0, :]
        return first_token.numpy().flatten()

    return [_embed_one(text) for text in texts]

**Загрузка модели и токенизатора DistilBERT**

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Checkpoint name used for both the DistilBERT tokenizer and encoder
model_name_distilbert = 'distilbert-base-uncased'
# Tokenizer and model are loaded from the same pretrained checkpoint
tokenizer_distilbert = DistilBertTokenizer.from_pretrained(model_name_distilbert)
model_distilbert = DistilBertModel.from_pretrained(model_name_distilbert)

In [None]:
# Helper that extracts DistilBERT embeddings for a collection of texts

def get_distilbert_embeddings(texts):
    """Return one embedding vector per input text, computed with DistilBERT.

    Every text is tokenized on its own (truncated to at most 512 tokens) and
    run through ``model_distilbert`` with gradient tracking disabled.  The
    last hidden state at token position 0 is converted to NumPy and flattened
    to a 1-D array.

    Args:
        texts: iterable of strings to embed.

    Returns:
        A list of 1-D NumPy arrays, in the same order as ``texts``.
    """
    def _embed_one(text):
        encoded = tokenizer_distilbert(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no gradients are needed
        with torch.no_grad():
            hidden = model_distilbert(**encoded).last_hidden_state
        # Keep only the vector at the first token position
        first_token = hidden[:, 0, :]
        return first_token.numpy().flatten()

    return [_embed_one(text) for text in texts]

# 3. Загрузка датасетов

In [None]:
!pip install datasets



In [None]:
import datasets

In [None]:
# Load the AG News dataset; it is later indexed by split name ('train') and
# by the 'text' / 'label' fields of each example
dataset_news = datasets.load_dataset("ag_news")

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
# Load the IMDB dataset; it is later indexed by split name ('train') and
# by the 'text' / 'label' fields of each example
dataset_imdb = datasets.load_dataset("imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
import random

# Build a small class-balanced subset of the AG News training split: keep the
# first `news_per_class` examples (in dataset order) of each label 0-3.
news_per_class = 2000
news_counts = {0: 0, 1: 0, 2: 0, 3: 0}
dataset_short_news = []

# Iterate over the rows directly so each row is fetched once, instead of
# looking the same row up by position several times per iteration.
for row in dataset_news['train']:
    label = row['label']
    # Skip labels outside 0-3 and classes that already reached their quota
    if news_counts.get(label, news_per_class) >= news_per_class:
        continue
    dataset_short_news.append({'news': row['text'], 'label': label})
    news_counts[label] += 1
    # Nothing more can be added once every class is full, so stop scanning
    if all(count >= news_per_class for count in news_counts.values()):
        break

# Shuffle so the classes are mixed, then cut into 6400 train / remaining test.
# NOTE: this rebinds `dataset_news` to plain lists of {'news', 'label'} dicts,
# so the cell cannot be re-run without reloading the original dataset first.
random.shuffle(dataset_short_news)
dataset_news = {'train': dataset_short_news[:6400], 'test': dataset_short_news[6400:]}

In [None]:
# Collect the raw news texts: training examples first, then test examples
news_X = [sample['news'] for sample in dataset_news['train']]
news_X.extend(sample['news'] for sample in dataset_news['test'])

In [None]:
# Collect the news labels in the same order as news_X (train, then test)
news_y = [sample['label'] for sample in dataset_news['train']]
news_y.extend(sample['label'] for sample in dataset_news['test'])

In [None]:
# Build a small class-balanced subset of the IMDB training split: keep the
# first `imdb_per_class` examples (in dataset order) of each label 0 and 1.
imdb_per_class = 4000
imdb_counts = {0: 0, 1: 0}
dataset_short_imdb = []

# Iterate over the rows directly so each row is fetched once, instead of
# looking the same row up by position several times per iteration.
for row in dataset_imdb['train']:
    label = row['label']
    # Skip unexpected labels and classes that already reached their quota
    if imdb_counts.get(label, imdb_per_class) >= imdb_per_class:
        continue
    dataset_short_imdb.append({'text': row['text'], 'label': label})
    imdb_counts[label] += 1
    # Nothing more can be added once both classes are full, so stop scanning
    if all(count >= imdb_per_class for count in imdb_counts.values()):
        break

# Shuffle so the classes are mixed, then cut into 6400 train / remaining test.
# NOTE: this rebinds `dataset_imdb` to plain lists of {'text', 'label'} dicts,
# so the cell cannot be re-run without reloading the original dataset first.
random.shuffle(dataset_short_imdb)
dataset_imdb = {'train': dataset_short_imdb[:6400], 'test': dataset_short_imdb[6400:]}

In [None]:
# Collect the raw review texts: training examples first, then test examples
imdb_X = [sample['text'] for sample in dataset_imdb['train']]
imdb_X.extend(sample['text'] for sample in dataset_imdb['test'])

In [None]:
# Collect the review labels in the same order as imdb_X (train, then test)
imdb_y = [sample['label'] for sample in dataset_imdb['train']]
imdb_y.extend(sample['label'] for sample in dataset_imdb['test'])

# 4. Получаем эмбеддинги для текстов

In [None]:
# Compute BERT embeddings for every news text and every IMDB review.
# get_bert_embeddings runs one forward pass per text, so this cell is slow.
bert_embeddings_news = get_bert_embeddings(news_X)
bert_embeddings_imdb = get_bert_embeddings(imdb_X)

In [None]:
# Compute DistilBERT embeddings for the same texts, in the same order as the
# BERT embeddings (both are built from news_X / imdb_X).
distil_embeddings_news = get_distilbert_embeddings(news_X)
distil_embeddings_imdb = get_distilbert_embeddings(imdb_X)

# 5. Обучаем LR на эмбеддингах

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

**news - LR**

In [None]:
# Split both embedding sets and the labels in a SINGLE call so that all three
# are partitioned with the same shuffled indices.  Two separate unseeded
# train_test_split calls shuffle independently, which pairs the DistilBERT
# features with labels belonging to different samples and produces
# chance-level scores.
(
    X_train_bert_news, X_test_bert_news,
    X_train_distil_news, X_test_distil_news,
    y_train_news, y_test_news,
) = train_test_split(bert_embeddings_news, distil_embeddings_news, news_y, test_size=0.2)

In [None]:
# The default iteration limit is too low for these unscaled embeddings (the
# solver stops with "TOTAL NO. OF ITERATIONS REACHED LIMIT"), so raise it.
lr_news_bert = LogisticRegression(max_iter=1000)

In [None]:
# The default iteration limit is too low for these unscaled embeddings (the
# solver stops with "TOTAL NO. OF ITERATIONS REACHED LIMIT"), so raise it.
lr_news_distil = LogisticRegression(max_iter=1000)

In [None]:
# Fit logistic regression on the BERT embeddings of the news training part,
# predict on the held-out part and print per-class precision/recall/F1.
lr_news_bert.fit(X_train_bert_news, y_train_news)
y_pred_news_bert = lr_news_bert.predict(X_test_bert_news)
print("LR + BERT:\n", classification_report(y_test_news, y_pred_news_bert))

LR + BERT:
               precision    recall  f1-score   support

           0       0.85      0.86      0.86       376
           1       0.95      0.93      0.94       406
           2       0.82      0.86      0.84       396
           3       0.86      0.83      0.84       422

    accuracy                           0.87      1600
   macro avg       0.87      0.87      0.87      1600
weighted avg       0.87      0.87      0.87      1600



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Fit logistic regression on the DistilBERT embeddings of the news training
# part, predict on the held-out part and print the classification report.
# NOTE(review): y_train_news / y_test_news must come from the same split as
# X_train_distil_news / X_test_distil_news for these scores to be meaningful.
lr_news_distil.fit(X_train_distil_news, y_train_news)
y_pred_news_distil = lr_news_distil.predict(X_test_distil_news)
print("LR + DistilBERT:\n", classification_report(y_test_news, y_pred_news_distil))

LR + DistilBERT:
               precision    recall  f1-score   support

           0       0.25      0.27      0.26       376
           1       0.27      0.25      0.26       406
           2       0.25      0.25      0.25       396
           3       0.26      0.27      0.27       422

    accuracy                           0.26      1600
   macro avg       0.26      0.26      0.26      1600
weighted avg       0.26      0.26      0.26      1600



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**news - SVM**

In [None]:
from sklearn.svm import SVC

In [None]:
# Two independent SVM classifiers with default settings, one per embedding model
svm_news_bert = SVC()
svm_news_distil = SVC()

In [None]:
# Fit the SVM on the BERT embeddings of the news training part, predict on
# the held-out part and print the classification report.
svm_news_bert.fit(X_train_bert_news, y_train_news)
y_pred_news_bert_svm = svm_news_bert.predict(X_test_bert_news)
print("SVM + BERT:\n", classification_report(y_test_news, y_pred_news_bert_svm))

SVM + BERT:
               precision    recall  f1-score   support

           0       0.90      0.86      0.88       376
           1       0.94      0.96      0.95       406
           2       0.84      0.87      0.85       396
           3       0.88      0.87      0.88       422

    accuracy                           0.89      1600
   macro avg       0.89      0.89      0.89      1600
weighted avg       0.89      0.89      0.89      1600



In [None]:
# Fit the SVM on the DistilBERT embeddings of the news training part, predict
# on the held-out part and print the classification report.
# NOTE(review): y_train_news / y_test_news must come from the same split as
# X_train_distil_news / X_test_distil_news for these scores to be meaningful.
svm_news_distil.fit(X_train_distil_news, y_train_news)
y_pred_news_distil_svm = svm_news_distil.predict(X_test_distil_news)
print("SVM + DistilBERT:\n", classification_report(y_test_news, y_pred_news_distil_svm))

SVM + DistilBERT:
               precision    recall  f1-score   support

           0       0.25      0.37      0.30       376
           1       0.23      0.23      0.23       406
           2       0.25      0.26      0.25       396
           3       0.30      0.17      0.21       422

    accuracy                           0.25      1600
   macro avg       0.26      0.25      0.25      1600
weighted avg       0.26      0.25      0.25      1600



**imdb - LR**

In [None]:
# Split both embedding sets and the labels in a SINGLE call so that all three
# are partitioned with the same shuffled indices.  Two separate unseeded
# train_test_split calls shuffle independently, which pairs the DistilBERT
# features with labels belonging to different samples and produces
# chance-level scores.
(
    X_train_bert_imdb, X_test_bert_imdb,
    X_train_distil_imdb, X_test_distil_imdb,
    y_train_imdb, y_test_imdb,
) = train_test_split(bert_embeddings_imdb, distil_embeddings_imdb, imdb_y, test_size=0.2)

In [None]:
# The default iteration limit is too low for these unscaled embeddings (the
# solver stops with "TOTAL NO. OF ITERATIONS REACHED LIMIT"), so raise it.
lr_imdb_bert = LogisticRegression(max_iter=1000)
lr_imdb_distil = LogisticRegression(max_iter=1000)

In [None]:
# Fit logistic regression on the BERT embeddings of the IMDB training part,
# predict on the held-out part and print the classification report.
lr_imdb_bert.fit(X_train_bert_imdb, y_train_imdb)
y_pred_bert = lr_imdb_bert.predict(X_test_bert_imdb)
print("LR + BERT:\n", classification_report(y_test_imdb, y_pred_bert))

LR + BERT:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87       808
           1       0.87      0.85      0.86       792

    accuracy                           0.86      1600
   macro avg       0.86      0.86      0.86      1600
weighted avg       0.86      0.86      0.86      1600



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Fit logistic regression on the DistilBERT embeddings of the IMDB training
# part, predict on the held-out part and print the classification report.
# NOTE(review): y_train_imdb / y_test_imdb must come from the same split as
# X_train_distil_imdb / X_test_distil_imdb for these scores to be meaningful.
lr_imdb_distil.fit(X_train_distil_imdb, y_train_imdb)
y_pred_distil = lr_imdb_distil.predict(X_test_distil_imdb)
# Report label corrected: it previously read "DsitilBERT"
print("LR + DistilBERT:\n", classification_report(y_test_imdb, y_pred_distil))

LR + DsitilBERT:
               precision    recall  f1-score   support

           0       0.51      0.50      0.51       808
           1       0.50      0.51      0.51       792

    accuracy                           0.51      1600
   macro avg       0.51      0.51      0.51      1600
weighted avg       0.51      0.51      0.51      1600



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**imdb - SVM**

In [None]:
# Two independent SVM classifiers with default settings, one per embedding model
svm_imdb_bert = SVC()
svm_imdb_distil = SVC()

In [None]:
# Fit the SVM on the BERT embeddings of the IMDB training part, predict on
# the held-out part and print the classification report.
# Note: y_pred_bert overwrites the logistic-regression predictions of the same name.
svm_imdb_bert.fit(X_train_bert_imdb, y_train_imdb)
y_pred_bert = svm_imdb_bert.predict(X_test_bert_imdb)
print("SVM + BERT:\n", classification_report(y_test_imdb, y_pred_bert))

SVM + BERT:
               precision    recall  f1-score   support

           0       0.86      0.89      0.87       808
           1       0.88      0.85      0.86       792

    accuracy                           0.87      1600
   macro avg       0.87      0.87      0.87      1600
weighted avg       0.87      0.87      0.87      1600



In [None]:
# Fit the SVM on the DistilBERT embeddings of the IMDB training part, predict
# on the held-out part and print the classification report.
# NOTE(review): y_train_imdb / y_test_imdb must come from the same split as
# X_train_distil_imdb / X_test_distil_imdb for these scores to be meaningful.
# Note: y_pred_distil overwrites the logistic-regression predictions of the same name.
svm_imdb_distil.fit(X_train_distil_imdb, y_train_imdb)
y_pred_distil = svm_imdb_distil.predict(X_test_distil_imdb)
print("SVM + DistilBERT:\n", classification_report(y_test_imdb, y_pred_distil))

SVM + DistilBERT:
               precision    recall  f1-score   support

           0       0.51      0.42      0.46       808
           1       0.50      0.59      0.54       792

    accuracy                           0.51      1600
   macro avg       0.51      0.51      0.50      1600
weighted avg       0.51      0.51      0.50      1600



# Вывод

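Лучшие результаты при классификации получены на эмбеддингах BERT; на эмбеддингах DistilBERT результаты хуже на обоих датасетах. Однако в показанном запуске качество на DistilBERT соответствует случайному угадыванию (accuracy ≈ 0.25 для 4 классов и ≈ 0.51 для 2 классов): эмбеддинги DistilBERT разбивались отдельным вызовом `train_test_split` без общего `random_state`, а метки брались из разбиения для BERT, поэтому признаки и метки не соответствуют друг другу. Для корректного сравнения моделей оба набора эмбеддингов и метки нужно разбивать одним вызовом `train_test_split`.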