**Получаем эмбеддинги из предобученных моделей (BERT, DistilBERT), затем используем их для классификации с помощью логистической регрессии (LR) и SVM**

# 1. Установка библиотек - transformers, torch

In [None]:
!pip install transformers torch

In [None]:
import torch

In [None]:
from transformers import BertTokenizer, BertModel

# 2. Инициализация BERT и DistilBERT

**Загрузка модели и токенизатора BERT**

In [None]:
# Checkpoint identifier passed to both from_pretrained() calls below
model_name_bert = 'bert-base-uncased'
# Tokenizer and encoder are loaded from the same checkpoint name so they stay paired
tokenizer_bert = BertTokenizer.from_pretrained(model_name_bert)
model_bert = BertModel.from_pretrained(model_name_bert)

In [None]:
# Helper that extracts one embedding vector per text from BERT

def get_bert_embeddings(texts, batch_size=32):
    """Return the [CLS] embedding of every text in ``texts``.

    Texts are tokenized and passed through ``model_bert`` in mini-batches
    rather than one at a time, so ``padding=True`` pads each batch to its
    longest member and the model runs once per batch instead of once per
    text. The attention mask produced by the tokenizer is forwarded to the
    model together with the token ids.

    Args:
        texts: A string or an iterable of strings. A bare string is treated
            as a single text instead of being iterated character by
            character.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A list with one 1-D NumPy array per text (the last hidden state at
        sequence position 0), in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators and other one-shot iterables can be sliced.
    texts = list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize the whole batch; inputs longer than 512 tokens are truncated
        inputs = tokenizer_bert(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model_bert(**inputs)

        # Position 0 of the last hidden state -> array of shape (len(batch), hidden)
        cls_batch = outputs.last_hidden_state[:, 0, :].numpy()
        # Extending with a 2-D array appends its rows one by one
        embeddings.extend(cls_batch)

    return embeddings

**Загрузка модели и токенизатора DistilBERT**

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Checkpoint identifier passed to both from_pretrained() calls below
model_name_distilbert = 'distilbert-base-uncased'
# Tokenizer and encoder are loaded from the same checkpoint name so they stay paired
tokenizer_distilbert = DistilBertTokenizer.from_pretrained(model_name_distilbert)
model_distilbert = DistilBertModel.from_pretrained(model_name_distilbert)

In [None]:
# Helper that extracts one embedding vector per text from DistilBERT

def get_distilbert_embeddings(texts, batch_size=32):
    """Return the first-token embedding of every text in ``texts``.

    Texts are tokenized and passed through ``model_distilbert`` in
    mini-batches rather than one at a time, so ``padding=True`` pads each
    batch to its longest member and the model runs once per batch instead
    of once per text. The attention mask produced by the tokenizer is
    forwarded to the model together with the token ids.

    Args:
        texts: A string or an iterable of strings. A bare string is treated
            as a single text instead of being iterated character by
            character.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A list with one 1-D NumPy array per text (the last hidden state at
        sequence position 0), in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators and other one-shot iterables can be sliced.
    texts = list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Inputs longer than 512 tokens are truncated
        inputs = tokenizer_distilbert(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model_distilbert(**inputs)
        # Position 0 of the last hidden state -> array of shape (len(batch), hidden)
        cls_batch = outputs.last_hidden_state[:, 0, :].numpy()
        # Extending with a 2-D array appends its rows one by one
        embeddings.extend(cls_batch)
    return embeddings

# 3. Загрузка датасетов

In [None]:
!pip install datasets

In [None]:
import datasets

In [None]:
# Load the "ag_news" dataset; later cells read its 'train' split ('text' and 'label' fields)
dataset_news = datasets.load_dataset("ag_news")

In [None]:
# Load the "imdb" dataset; later cells read its 'train' split ('text' and 'label' fields)
dataset_imdb = datasets.load_dataset("imdb")

In [None]:
import random

# Build a class-balanced AG News subset: the first `news_quota` training rows
# of each of the four labels, taken in dataset order.
news_quota = 2000
news_taken = {0: 0, 1: 0, 2: 0, 3: 0}  # label -> rows collected so far
dataset_short_news = []
for row in dataset_news['train']:
  label = row['label']
  # Skip unknown labels and labels whose quota is already filled
  if news_taken.get(label, news_quota) >= news_quota:
    continue
  dataset_short_news.append({'news': row['text'], 'label': label})
  news_taken[label] += 1
  # Stop scanning as soon as every class is full; the remaining rows would
  # all be skipped anyway.
  if all(taken >= news_quota for taken in news_taken.values()):
    break

# Shuffle so the fixed-position train/test cut below mixes all classes
random.shuffle(dataset_short_news)
# NOTE: dataset_news is rebound from the loaded dataset to a plain dict of
# lists whose items use the key 'news' instead of 'text', so this cell cannot
# be re-run without reloading the dataset first.
dataset_news = {'train': dataset_short_news[:6400], 'test': dataset_short_news[6400:]}

In [None]:
# All news texts in one flat list: the 'train' items first, then the 'test' items
news_X = [
  item['news']
  for split in ('train', 'test')
  for item in dataset_news[split]
]

In [None]:
# Labels in the same order as news_X: the 'train' items first, then the 'test' items
news_y = [
  item['label']
  for split in ('train', 'test')
  for item in dataset_news[split]
]

In [None]:
# Build a class-balanced IMDB subset: the first `imdb_quota` training reviews
# of each of the two labels, taken in dataset order.
imdb_quota = 4000
imdb_taken = {0: 0, 1: 0}  # label -> rows collected so far
dataset_short_imdb = []

for row in dataset_imdb['train']:
  label = row['label']
  # Skip unknown labels and labels whose quota is already filled
  if imdb_taken.get(label, imdb_quota) >= imdb_quota:
    continue
  dataset_short_imdb.append({'text': row['text'], 'label': label})
  imdb_taken[label] += 1
  # Stop scanning as soon as both classes are full; the remaining rows would
  # all be skipped anyway.
  if all(taken >= imdb_quota for taken in imdb_taken.values()):
    break

# Shuffle so the fixed-position train/test cut below mixes both classes
random.shuffle(dataset_short_imdb)
# NOTE: dataset_imdb is rebound from the loaded dataset to a plain dict of
# lists; re-running this cell would subsample the already reduced data.
dataset_imdb = {'train': dataset_short_imdb[:6400], 'test': dataset_short_imdb[6400:]}

In [None]:
# All review texts in one flat list: the 'train' items first, then the 'test' items
imdb_X = [
  item['text']
  for split in ('train', 'test')
  for item in dataset_imdb[split]
]

In [None]:
# Labels in the same order as imdb_X: the 'train' items first, then the 'test' items
imdb_y = [
  item['label']
  for split in ('train', 'test')
  for item in dataset_imdb[split]
]

# 4. Получаем эмбеддинги для текстов

In [None]:
# Embed every news text and every IMDB review with BERT; each result is a
# list of vectors in the same order as news_X / imdb_X.
bert_embeddings_news = get_bert_embeddings(news_X)
bert_embeddings_imdb = get_bert_embeddings(imdb_X)

In [None]:
# Embed the same texts with DistilBERT; each result is a list of vectors in
# the same order as news_X / imdb_X.
distil_embeddings_news = get_distilbert_embeddings(news_X)
distil_embeddings_imdb = get_distilbert_embeddings(imdb_X)

# 5. Обучаем LR на эмбеддингах

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

**news - LR**

In [None]:
# Split the BERT features, the DistilBERT features and the labels in ONE call
# so that all three share the same shuffle. Two separate unseeded calls would
# permute the DistilBERT rows differently from y_train_news / y_test_news,
# pairing every DistilBERT vector with the label of another text.
# random_state is fixed so the split (and the reported metrics) are reproducible.
(
    X_train_bert_news, X_test_bert_news,
    X_train_distil_news, X_test_distil_news,
    y_train_news, y_test_news,
) = train_test_split(
    bert_embeddings_news, distil_embeddings_news, news_y,
    test_size=0.2, random_state=42,
)

In [None]:
# Logistic regression for the BERT news embeddings. max_iter is raised above
# the library default of 100, which is often too few iterations for the
# solver to converge on unscaled high-dimensional embeddings.
lr_news_bert = LogisticRegression(max_iter=1000)

In [None]:
# Logistic regression for the DistilBERT news embeddings. max_iter is raised
# above the library default of 100, which is often too few iterations for the
# solver to converge on unscaled high-dimensional embeddings.
lr_news_distil = LogisticRegression(max_iter=1000)

In [None]:
# Fit on the BERT training embeddings, predict the held-out ones and print
# the classification report against the held-out labels.
lr_news_bert.fit(X_train_bert_news, y_train_news)
y_pred_news_bert = lr_news_bert.predict(X_test_bert_news)
print("LR + BERT:\n", classification_report(y_test_news, y_pred_news_bert))

In [None]:
# Fit on the DistilBERT training embeddings, predict the held-out ones and
# print the classification report.
# y_train_news / y_test_news must come from the same train/test partition as
# the DistilBERT features, otherwise features and labels are misaligned.
lr_news_distil.fit(X_train_distil_news, y_train_news)
y_pred_news_distil = lr_news_distil.predict(X_test_distil_news)
print("LR + DistilBERT:\n", classification_report(y_test_news, y_pred_news_distil))

**news - SVM**

In [None]:
from sklearn.svm import SVC

In [None]:
# One SVC with library-default settings per embedding model, so each is fitted independently
svm_news_bert = SVC()
svm_news_distil = SVC()

In [None]:
# Fit the SVM on the BERT training embeddings, predict the held-out ones and
# print the classification report against the held-out labels.
svm_news_bert.fit(X_train_bert_news, y_train_news)
y_pred_news_bert_svm = svm_news_bert.predict(X_test_bert_news)
print("SVM + BERT:\n", classification_report(y_test_news, y_pred_news_bert_svm))

In [None]:
# Fit the SVM on the DistilBERT training embeddings, predict the held-out
# ones and print the classification report.
# y_train_news / y_test_news must come from the same train/test partition as
# the DistilBERT features, otherwise features and labels are misaligned.
svm_news_distil.fit(X_train_distil_news, y_train_news)
y_pred_news_distil_svm = svm_news_distil.predict(X_test_distil_news)
print("SVM + DistilBERT:\n", classification_report(y_test_news, y_pred_news_distil_svm))

**imdb - LR**

In [None]:
# Split the BERT features, the DistilBERT features and the labels in ONE call
# so that all three share the same shuffle. Two separate unseeded calls would
# permute the DistilBERT rows differently from y_train_imdb / y_test_imdb,
# pairing every DistilBERT vector with the label of another review.
# random_state is fixed so the split (and the reported metrics) are reproducible.
(
    X_train_bert_imdb, X_test_bert_imdb,
    X_train_distil_imdb, X_test_distil_imdb,
    y_train_imdb, y_test_imdb,
) = train_test_split(
    bert_embeddings_imdb, distil_embeddings_imdb, imdb_y,
    test_size=0.2, random_state=42,
)

In [None]:
# One logistic regression per embedding model. max_iter is raised above the
# library default of 100, which is often too few iterations for the solver to
# converge on unscaled high-dimensional embeddings.
lr_imdb_bert = LogisticRegression(max_iter=1000)
lr_imdb_distil = LogisticRegression(max_iter=1000)

In [None]:
# Fit on the BERT training embeddings, predict the held-out ones and print
# the classification report against the held-out labels.
lr_imdb_bert.fit(X_train_bert_imdb, y_train_imdb)
y_pred_bert = lr_imdb_bert.predict(X_test_bert_imdb)
print("LR + BERT:\n", classification_report(y_test_imdb, y_pred_bert))

In [None]:
# Fit on the DistilBERT training embeddings, predict the held-out ones and
# print the classification report.
# y_train_imdb / y_test_imdb must come from the same train/test partition as
# the DistilBERT features, otherwise features and labels are misaligned.
lr_imdb_distil.fit(X_train_distil_imdb, y_train_imdb)
y_pred_distil = lr_imdb_distil.predict(X_test_distil_imdb)
# Report header fixed: the model name was misspelled in the printed output
print("LR + DistilBERT:\n", classification_report(y_test_imdb, y_pred_distil))

**imdb - SVM**

In [None]:
# One SVC with library-default settings per embedding model, so each is fitted independently
svm_imdb_bert = SVC()
svm_imdb_distil = SVC()

In [None]:
# Fit the SVM on the BERT training embeddings, predict the held-out ones and
# print the classification report.
# y_pred_bert is reused here and overwrites the logistic-regression predictions.
svm_imdb_bert.fit(X_train_bert_imdb, y_train_imdb)
y_pred_bert = svm_imdb_bert.predict(X_test_bert_imdb)
print("SVM + BERT:\n", classification_report(y_test_imdb, y_pred_bert))

In [None]:
# Fit the SVM on the DistilBERT training embeddings, predict the held-out
# ones and print the classification report.
# y_train_imdb / y_test_imdb must come from the same train/test partition as
# the DistilBERT features, otherwise features and labels are misaligned.
# y_pred_distil is reused here and overwrites the logistic-regression predictions.
svm_imdb_distil.fit(X_train_distil_imdb, y_train_imdb)
y_pred_distil = svm_imdb_distil.predict(X_test_distil_imdb)
print("SVM + DistilBERT:\n", classification_report(y_test_imdb, y_pred_distil))

# Вывод

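Лучшие результаты классификации получены на эмбеддингах BERT; на эмбеддингах DistilBERT результаты хуже на обоих датасетах. Такое сравнение корректно только при условии, что признаки обеих моделей и метки разбиты на обучающую и тестовую части одинаково.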