# 1. Добавляем классификатор naive_bayes и подбираем alpha

**ag-news**

In [None]:
!pip install numpy==1.23.5

In [None]:
import numpy as np

In [None]:
# Display the NumPy version that is actually imported (the install cell pins 1.23.5).
np.__version__

In [None]:
!pip install datasets
from datasets import load_dataset
# Load the AG News dataset; its 'train' split is sub-sampled in the next cell.
dataset_news0 = load_dataset("ag_news")

In [None]:
# Build a class-balanced subset of the AG News train split: the first
# `per_class_limit` examples of each of the four labels, in dataset order.
per_class_limit = 2000
label_counts = {0: 0, 1: 0, 2: 0, 3: 0}
dataset_news = []
# Iterate over rows directly: each row is fetched once instead of being
# re-indexed for every label comparison.
for row in dataset_news0['train']:
  label = row['label']
  # Labels outside the expected set, or whose quota is full, are skipped.
  if label_counts.get(label, per_class_limit) >= per_class_limit:
    continue
  dataset_news.append({'news': row['text'], 'label': label})
  label_counts[label] += 1
  # Stop scanning as soon as every class has reached its quota.
  if all(count >= per_class_limit for count in label_counts.values()):
    break
# Keep the original per-class counter names available to other cells.
count0, count1, count2, count3 = (label_counts[k] for k in range(4))
len(dataset_news)

In [None]:
import random
# Shuffle the sampled records in place; no seed is set in this cell,
# so the resulting order can differ between runs.
random.shuffle(dataset_news)

In [None]:
# Split the shuffled records into parallel lists: raw texts and their labels.
dataset_news_texts = [record['news'] for record in dataset_news]
dataset_news_labels = [record['label'] for record in dataset_news]

**imdb**

In [None]:
dataset_imdb0 = load_dataset("imdb")

# Build a class-balanced subset of the IMDB train split: the first
# `per_class_limit` reviews of each of the two labels, in dataset order.
per_class_limit = 4000
label_counts = {0: 0, 1: 0}
dataset_imdb = []

# Iterate over rows directly: each row is fetched once instead of being
# re-indexed for every label comparison.
for row in dataset_imdb0['train']:
  label = row['label']
  # Labels outside the expected set, or whose quota is full, are skipped.
  if label_counts.get(label, per_class_limit) >= per_class_limit:
    continue
  dataset_imdb.append({'text': row['text'], 'label': label})
  label_counts[label] += 1
  # Stop scanning as soon as both classes have reached their quota.
  if all(count >= per_class_limit for count in label_counts.values()):
    break

# Keep the original per-class counter names available to other cells.
count0, count1 = label_counts[0], label_counts[1]
len(dataset_imdb)

In [None]:
# Shuffle the sampled reviews in place; no seed is set in this cell,
# so the resulting order can differ between runs.
random.shuffle(dataset_imdb)

In [None]:
# Split the shuffled records into parallel lists: raw texts and their labels.
dataset_imdb_texts = [record['text'] for record in dataset_imdb]
dataset_imdb_labels = [record['label'] for record in dataset_imdb]

# Базовая предобработка

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords

# Fetch the NLTK resources used by the preprocessing below:
# the 'punkt_tab' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt_tab')
nltk.download('stopwords')
# English stop words, de-duplicated via a set and stored as a list.
stop_words = list(set(stopwords.words("english")))

In [None]:
def preproccesing(text):
  """Normalize raw text into a space-separated string of cleaned tokens.

  Steps: lowercase, tokenize, drop English stop words, strip every character
  that is not a Latin/Cyrillic letter or a space, re-tokenize, and drop stop
  words once more.

  Args:
    text: Raw input string.

  Returns:
    The surviving tokens joined by single spaces ('' when nothing survives).
  """
  # The module-level `stop_words` is a list; a set gives O(1) lookups per token.
  stop_set = set(stop_words)
  tokens = [word for word in word_tokenize(text.lower()) if word not in stop_set]
  # Remove special characters, digits and punctuation.
  cleaned = re.sub(r'[^a-zA-Zа-яА-Я ]', '', ' '.join(tokens))
  # Stripping punctuation turns contraction pieces such as "'s" or "'ll" into
  # bare stop words ("s", "ll"), so the stop-word filter has to run again.
  tokens = [word for word in word_tokenize(cleaned) if word not in stop_set]
  return ' '.join(tokens)

In [None]:
# Run the shared preprocessing over every document of both corpora.
dataset_news_texts = list(map(preproccesing, dataset_news_texts))
dataset_imdb_texts = list(map(preproccesing, dataset_imdb_texts))

# Векторизация

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Обучение naive_bayes и подбор alpha

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

In [None]:
# Candidate Naive Bayes variants and the smoothing values to search over.
models = {
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB(),
    'ComplementNB': ComplementNB(),
}

param_grid = {'clf__alpha': [0.01, 0.1, 0.5, 1.0]}

vectorizers = {'Tfidf': TfidfVectorizer()}

# Run the alpha search for every (dataset, vectorizer, model) combination.
for dataset_name in ('imdb', 'ag_news'):
    print(f"Датасет: {dataset_name}")
    if dataset_name == 'imdb':
        # Binary labels: plain F1.
        X, y = dataset_imdb_texts, dataset_imdb_labels
        scoring = 'f1'
    else:
        # Multi-class labels: macro-averaged F1.
        X, y = dataset_news_texts, dataset_news_labels
        scoring = 'f1_macro'

    for vec_name, vectorizer in vectorizers.items():
        print(f"Векторизация: {vec_name}")
        for model_name, model in models.items():
            # Vectorizer and classifier are chained so that the vectorizer is
            # fitted as part of the searched estimator.
            pipe = Pipeline(steps=[('vect', vectorizer), ('clf', model)])
            grid = GridSearchCV(pipe, param_grid=param_grid, scoring=scoring, n_jobs=-1)
            grid.fit(X, y)

            best_alpha = grid.best_params_['clf__alpha']
            best_score = grid.best_score_
            print(f"{model_name}: Лучшая alpha = {best_alpha}, {scoring} = {best_score:.4f}")
            print()


# 2. Обучение Word2Vec на AG News

In [None]:
!pip install datasets gensim

In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import multiprocessing

**Подготавливаем именно для news**

In [None]:
# Turn each preprocessed AG News text into a list of tokens for Word2Vec.
# Entries that are already token lists are kept as-is, so re-running this
# cell does not fail with AttributeError on `list.split`.
dataset_news_texts = [
    text.split() if isinstance(text, str) else text
    for text in dataset_news_texts
]

In [None]:
# Sanity check: show the first document after tokenization.
print(dataset_news_texts[0])

In [None]:
# Train Word2Vec on the tokenized, preprocessed AG News documents.
model = Word2Vec(
    # Corpus: one list of tokens per document.
    sentences=dataset_news_texts,
    # Architecture: skip-gram (sg=1) with 300-dimensional vectors.
    sg=1,
    vector_size=300,
    window=5,
    # Vocabulary and sampling: rare-word cutoff, frequent-word downsampling
    # threshold and number of negative samples.
    min_count=5,
    sample=1e-3,
    negative=5,
    # Training schedule: passes over the corpus, one worker per CPU core.
    epochs=10,
    workers=multiprocessing.cpu_count(),
)

**Проверка качества модели**

In [None]:
# 10 topical probe words. The original list contained "law" twice, so only
# nine distinct words were checked; the duplicate is replaced by "football".
test_words = ["apple",  "technology", "market", "investment", "president", "law", "olympics", "covid", "hospital", "football"]

# For each probe word, print its five nearest neighbours in the embedding
# space, or report that the word is not in the trained vocabulary.
for word in test_words:
    if word in model.wv:
        print(f"Top-5 похожих на {word}:")
        for similar, score in model.wv.most_similar(word, topn=5):
            print(f"  {similar} (score: {score:.2f})")
        print()
    else:
        print(f"Слово {word} отсутствует в словаре")
        print()