# Загрузка датасетов

In [None]:
!pip install datasets

In [None]:
import datasets

In [None]:
# Load the "ag_news" dataset via the `datasets` library; the cells below read
# the 'text' and 'label' fields of its 'train' split.
dataset_news = datasets.load_dataset("ag_news")

In [None]:
# Load the "imdb" dataset via the `datasets` library; the cells below read
# the 'text' and 'label' fields of its 'train' split.
dataset_imdb = datasets.load_dataset("imdb")

**Тексты и метки - news**

In [None]:
import random

# Build a class-balanced subset of the news data: the first 2000 training
# examples of each of the labels 0-3, in dataset order.
#
# NOTE: this cell rebinds `dataset_news` to a plain dict whose items use the
# key 'news' instead of 'text', so re-running it requires reloading the
# original dataset first.
PER_CLASS_NEWS = 2000
news_label_counts = {0: 0, 1: 0, 2: 0, 3: 0}
dataset_short_news = []
for row in dataset_news['train']:
  label = row['label']
  # Unknown labels and classes whose quota is already full are skipped.
  if news_label_counts.get(label, PER_CLASS_NEWS) >= PER_CLASS_NEWS:
    continue
  dataset_short_news.append({'news': row['text'], 'label': label})
  news_label_counts[label] += 1
  # Stop as soon as every class is full instead of scanning the rest of the split.
  if all(count >= PER_CLASS_NEWS for count in news_label_counts.values()):
    break

# A dedicated seeded generator keeps the shuffle reproducible (same seed as the
# train_test_split calls further down) without touching the global random state.
random.Random(42).shuffle(dataset_short_news)
dataset_news = {'train': dataset_short_news[:6400], 'test': dataset_short_news[6400:]}

In [None]:
# Collect the raw news texts into one flat list: train part first, then test.
news_X = [
  item['news']
  for part in ('train', 'test')
  for item in dataset_news[part]
]

In [None]:
# Collect the news labels in the same order as the texts: train part first, then test.
news_y = [
  item['label']
  for part in ('train', 'test')
  for item in dataset_news[part]
]

**Тексты и метки - imdb**

In [None]:
# Build a class-balanced subset of the IMDB data: the first 4000 training
# examples of each of the labels 0 and 1, in dataset order.
#
# NOTE: this cell rebinds `dataset_imdb` to a plain dict of lists, so the
# original dataset object is no longer reachable under that name afterwards.
PER_CLASS_IMDB = 4000
imdb_label_counts = {0: 0, 1: 0}
dataset_short_imdb = []

for row in dataset_imdb['train']:
  label = row['label']
  # Unknown labels and classes whose quota is already full are skipped.
  if imdb_label_counts.get(label, PER_CLASS_IMDB) >= PER_CLASS_IMDB:
    continue
  dataset_short_imdb.append({'text': row['text'], 'label': label})
  imdb_label_counts[label] += 1
  # Stop as soon as both classes are full instead of scanning the rest of the split.
  if all(count >= PER_CLASS_IMDB for count in imdb_label_counts.values()):
    break

# A dedicated seeded generator keeps the shuffle reproducible (same seed as the
# train_test_split calls further down) without touching the global random state.
random.Random(42).shuffle(dataset_short_imdb)
dataset_imdb = {'train': dataset_short_imdb[:6400], 'test': dataset_short_imdb[6400:]}

In [None]:
# Collect the raw review texts into one flat list: train part first, then test.
imdb_X = [
  review['text']
  for part in ('train', 'test')
  for review in dataset_imdb[part]
]

In [None]:
# Collect the review labels in the same order as the texts: train part first, then test.
imdb_y = [
  review['label']
  for part in ('train', 'test')
  for review in dataset_imdb[part]
]

# 1. Обучаем и тестируем LSTM

In [None]:
!pip install tensorflow keras

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

**Токенизируем тексты и подготавливаем метки - news**

In [None]:
# Keras tokenizer (default settings) used for the news texts.
token = Tokenizer()

In [None]:
# NOTE: the vocabulary is fitted on all news texts, including those that the
# later train/test split places in the test set.
token.fit_on_texts(news_X)  # build the word -> index vocabulary

In [None]:
# Turn each news text into a sequence of word indices, then bring every
# sequence to a fixed length of 50.
X_news = pad_sequences(token.texts_to_sequences(news_X), maxlen=50)

In [None]:
# Encode the news labels with a LabelEncoder and keep them as a NumPy array.
label_encoder = LabelEncoder()
y_news = np.array(label_encoder.fit_transform(news_y))

**Токенизируем тексты и подготавливаем метки - imdb**

In [None]:
# Separate Keras tokenizer (default settings) for the IMDB reviews, so the two
# corpora get independent vocabularies.
token_imdb = Tokenizer()

In [None]:
# Build the word -> index vocabulary from all IMDB review texts.
# NOTE: this includes the texts that the later train/test split places in the test set.
token_imdb.fit_on_texts(imdb_X)

In [None]:
# Turn each review into a sequence of word indices, then bring every
# sequence to a fixed length of 200.
X_imdb = pad_sequences(token_imdb.texts_to_sequences(imdb_X), maxlen=200)

In [None]:
# Encode the review labels with a LabelEncoder and keep them as a NumPy array.
label_encoder_imdb = LabelEncoder()
y_imdb = np.array(label_encoder_imdb.fit_transform(imdb_y))

**Делим на train/test**

In [None]:
# Hold out 20% of the news samples for testing; the fixed seed makes the split repeatable.
(X_train_news, X_test_news,
 y_train_news, y_test_news) = train_test_split(
    X_news, y_news, test_size=0.2, random_state=42)

In [None]:
# Hold out 20% of the IMDB samples for testing; the fixed seed makes the split repeatable.
(X_train_imdb, X_test_imdb,
 y_train_imdb, y_test_imdb) = train_test_split(
    X_imdb, y_imdb, test_size=0.2, random_state=42)

**LSTM для news**

In [None]:
# LSTM classifier for the news texts: embedding -> spatial dropout -> LSTM ->
# 4-way softmax. The input length of 50 matches the padded sequence length.
model_news = Sequential()
# Vocabulary size is the number of known words plus one
# (presumably to leave index 0 free for padding -- TODO confirm).
model_news.add(Embedding(input_dim=len(token.word_index) + 1, output_dim=100, input_shape=(50,)))
model_news.add(SpatialDropout1D(0.2))
model_news.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_news.add(Dense(4, activation='softmax'))

In [None]:
# The labels are plain integer class ids, hence the sparse categorical loss.
model_news.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the news model.
model_news.summary()

**Обучение модели**

In [None]:
# Train for 8 epochs in batches of 64, with 10% of the training data set
# aside for validation.
history_news = model_news.fit(
    X_train_news,
    y_train_news,
    batch_size=64,
    epochs=8,
    validation_split=0.1,
)

In [None]:
# Per-class softmax outputs for the held-out news texts, one row per sample.
# They are deliberately NOT thresholded at 0.5: with four classes the winning
# probability is often below 0.5, and a thresholded all-zero row would make the
# subsequent argmax fall back to class 0 regardless of the model's prediction.
y_pred_news = model_news.predict(X_test_news)

In [None]:
# Predicted class id per sample: the column index of the largest entry in each row.
y_pred_classes = np.argmax(a=y_pred_news, axis=1)

In [None]:
# Print the classification report comparing predicted news classes with the true test labels.
print(classification_report(y_test_news, y_pred_classes))

**LSTM для imdb**

In [None]:
# LSTM classifier for the IMDB reviews: embedding -> spatial dropout -> LSTM ->
# single sigmoid unit for the binary label.
model_imdb = Sequential([
    # The declared input length is taken from the padded data itself (200 after
    # pad_sequences); the previous hard-coded 100 disagreed with the arrays
    # actually passed to fit/predict.
    # Vocabulary size is the number of known words plus one
    # (presumably to leave index 0 free for padding -- TODO confirm).
    Embedding(input_dim=len(token_imdb.word_index) + 1, output_dim=100,
              input_shape=(X_train_imdb.shape[1],)),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

In [None]:
# Single sigmoid output with 0/1 labels, hence the binary cross-entropy loss.
model_imdb.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the IMDB model.
model_imdb.summary()

In [None]:
# Train for 8 epochs in batches of 64, with 10% of the training data set
# aside for validation.
history_imdb = model_imdb.fit(
    X_train_imdb,
    y_train_imdb,
    batch_size=64,
    epochs=8,
    validation_split=0.1,
)

In [None]:
# Sigmoid outputs for the held-out reviews, turned into hard 0/1 decisions
# at the 0.5 threshold.
imdb_probabilities = model_imdb.predict(X_test_imdb)
y_pred_imdb = (imdb_probabilities > 0.5).astype(int)

In [None]:
# y_pred_imdb already holds the thresholded 0/1 decisions, one column per sample
# row (the model has a single output unit). Flatten that column into a 1-D label
# vector; taking argmax over the single column would always return 0 and report
# every review as class 0.
y_pred_classes_imdb = np.ravel(y_pred_imdb)

In [None]:
# Print the classification report comparing predicted review classes with the true test labels.
print(classification_report(y_test_imdb, y_pred_classes_imdb))