# 1. Настройка гиперпараметров Tf-IDF Vectorizer



*   **stop_words** - удаление стоп-слов
*   **analyzer** ('word', 'char') - "единица для векторизации"
*  **lowercase** (True, False) - переводим ли текст в нижний регистр
*  **ngram_range** - диапазон n-грамм
* **max_df** - верхний порог частотности
* **min_df** - нижний порог частотности
* **use_idf** - используем ли IDF
* **norm** (None, 'l1', 'l2') - нормализация Tf-IDF вектора



# 2. Настройка гиперпараметров RandomForest



*   **n_estimators** - количество деревьев
*   **max_depth** - глубина дерева
* **min_samples_split** - минимальное число образцов, необходимое для разбиения узла
* **max_samples** - максимальное число образцов, на которых обучается каждое дерево (при bootstrap)
* **criterion** ('gini', 'entropy') - функция, по которой алгоритм оценивает качество разбиения узлов в деревьях



# 3. Подготавливаем датасет AG News

In [None]:
# Install the `datasets` package (IPython shell escape - works only inside a notebook).
!pip install datasets
from datasets import load_dataset
# Load the "ag_news" dataset; the cells below read its 'train' split.
dataset_news = load_dataset("ag_news")

In [None]:
import random

# Build a class-balanced subset of AG News: the first SAMPLES_PER_CLASS
# training examples of each of the labels 0..NUM_CLASSES-1, then shuffle it
# and split it into train/test parts.
SAMPLES_PER_CLASS = 2000
NUM_CLASSES = 4
TRAIN_SIZE = 6400   # the remaining 1600 examples form the test part
SHUFFLE_SEED = 42   # fixed seed so the train/test split is reproducible

class_counts = {label: 0 for label in range(NUM_CLASSES)}
dataset_short_news = []
# Iterate over the rows directly: each row is fetched once instead of being
# re-indexed for every branch of an if/elif chain.
for sample in dataset_news['train']:
  label = sample['label']
  if label not in class_counts or class_counts[label] >= SAMPLES_PER_CLASS:
    continue
  dataset_short_news.append({'news': sample['text'], 'label': label})
  class_counts[label] += 1
  # Stop scanning as soon as every class has reached its quota.
  if len(dataset_short_news) == NUM_CLASSES * SAMPLES_PER_CLASS:
    break

# A dedicated Random instance keeps the shuffle deterministic without touching
# the global random state.
random.Random(SHUFFLE_SEED).shuffle(dataset_short_news)
# NOTE: `dataset_news` is rebound to plain lists of {'news', 'label'} dicts, so
# this cell cannot be re-run without reloading the original dataset first.
dataset_news = {'train': dataset_short_news[:TRAIN_SIZE], 'test': dataset_short_news[TRAIN_SIZE:]}

In [None]:
# Split each part of the subset into parallel lists of texts and labels.
train_split = dataset_news['train']
test_split = dataset_news['test']

train_texts_news = [sample['news'] for sample in train_split]
train_labels_news = [sample['label'] for sample in train_split]

test_texts_news = [sample['news'] for sample in test_split]
test_labels_news = [sample['label'] for sample in test_split]

# Show the first example of each part together with its label.
for texts, labels in ((train_texts_news, train_labels_news),
                      (test_texts_news, test_labels_news)):
  print(texts[0], labels[0])

In [None]:
# Preprocessing is limited to tokenization and removal of special characters
# (the text is also lowercased inside preprocess()).
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt_tab' NLTK resource (presumably the data word_tokenize needs).
nltk.download('punkt_tab')
import re

def preprocess(text):
  """Lowercase *text*, tokenize it and strip special characters.

  Every token produced by ``word_tokenize`` has all characters that are
  neither word characters nor whitespace removed. Tokens that become empty
  (pure punctuation) are dropped, so the result never contains runs of
  spaces.

  Returns the cleaned tokens joined by single spaces.
  """
  tokens = word_tokenize(text.lower())
  cleaned = (re.sub(r'[^\w\s]', '', token) for token in tokens)
  return ' '.join(token for token in cleaned if token)

In [None]:
# Apply the same preprocessing to BOTH splits: the vectorizer is fitted on the
# preprocessed training texts, so the test texts must be tokenized the same way
# before they are passed to predict().
train_texts_news = [preprocess(text) for text in train_texts_news]
test_texts_news = [preprocess(text) for text in test_texts_news]

# 4. Настройка гиперпараметров RandomForest и TF-IDF полным перебором по сетке (ручной аналог GridSearchCV, оценка на тестовой выборке)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import pandas as pd

In [None]:
# Two-step model: TF-IDF features ("tfidf") feeding a random forest ("clf").
# The step names are the prefixes used by the keys of the parameter grid.
tfidf_step = ("tfidf", TfidfVectorizer())
forest_step = ("clf", RandomForestClassifier(random_state=42))
pipeline = Pipeline([tfidf_step, forest_step])

In [None]:
# Candidate values for the TF-IDF vectorizer step.
tfidf_grid = {
    "tfidf__stop_words": ["english"],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__analyzer": ["word"],
    "tfidf__lowercase": [True],
    "tfidf__max_df": [0.7, 0.8, 0.95],
    "tfidf__min_df": [1, 3],
    "tfidf__use_idf": [True, False],
    "tfidf__norm": [None, "l1", "l2"],
}

# Candidate values for the random-forest classifier step.
forest_grid = {
    "clf__n_estimators": [100, 200, 250],
    "clf__max_depth": [None, 30],
    "clf__min_samples_split": [2, 5, 7],
}

# Full search space; key order is vectorizer parameters first, then classifier.
param_grid = {**tfidf_grid, **forest_grid}

In [None]:
from itertools import product

# Exhaustive search over `param_grid` for the TF-IDF + RandomForest pipeline.
# Each row of `res_news` holds the parameter values in the order of
# `rf_param_names`, followed by the weighted F1 score.
# NOTE(review): every configuration is scored on the test split, so the best
# score is an optimistic estimate - consider a validation split or
# cross-validation for model selection.
rf_param_names = [
    "tfidf__stop_words",
    "tfidf__ngram_range",
    "tfidf__analyzer",
    "tfidf__lowercase",
    "tfidf__max_df",
    "tfidf__min_df",
    "tfidf__use_idf",
    "tfidf__norm",
    "clf__n_estimators",
    "clf__max_depth",
    "clf__min_samples_split",
]

res_news = []

# product() varies the last parameter fastest, i.e. it visits the combinations
# in the same order as nested for-loops written in `rf_param_names` order.
for values in product(*(param_grid[name] for name in rf_param_names)):
  pipeline.set_params(**dict(zip(rf_param_names, values)))

  pipeline.fit(train_texts_news, train_labels_news)
  y_pred_news = pipeline.predict(test_texts_news)
  f1 = f1_score(test_labels_news, y_pred_news, average='weighted')

  curr_news = [*values, f1]
  res_news.append(curr_news)
  # Progress indicator: report every 10th evaluated configuration.
  if len(res_news) % 10 == 0:
    print(len(res_news))

In [None]:
# Column labels, in the order the values were appended to each row of res_news.
rf_result_columns = [
    "stopwords",
    "ngram_range",
    "analyzer",
    "lowercase",
    "max_df",
    "min_df",
    "use_idf",
    "norm",
    "n_estimator",
    "max_depth",
    "min_samples_split",
    "f1-score",
]

# Collect the search results into a table and save it as an Excel sheet.
df_results = pd.DataFrame(data=res_news, columns=rf_result_columns)
df_results.to_excel(excel_writer="results_news_tfidf_rf.xlsx", index=False)

Делаем то же самое для логистической регрессии

In [None]:
# TF-IDF features ("tfidf") feeding a logistic-regression classifier ("clf").
# random_state is fixed, as in the random-forest pipeline: the 'liblinear' and
# 'saga' solvers tried in the grid shuffle the data, so without a seed the
# scores would differ from run to run.
pipeline_lr = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(random_state=42))
])

In [None]:
# Candidate values for the TF-IDF vectorizer step.
tfidf_grid_lr = {
    "tfidf__stop_words": ["english"],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__analyzer": ["word"],
    "tfidf__lowercase": [True],
    "tfidf__max_df": [0.7, 0.8, 0.95],
    "tfidf__min_df": [1, 3],
    "tfidf__use_idf": [True, False],
    "tfidf__norm": [None, "l1", "l2"],
}

# Candidate values for the logistic-regression step.
clf_grid_lr = {
    "clf__penalty": ["l1", "l2"],
    # C is the inverse of the regularization strength (smaller = stronger).
    "clf__C": [0.01, 0.001, 0.0001, 1],
    "clf__solver": ["liblinear", "saga"],
    "clf__class_weight": ["balanced", None],
}

# Full search space; key order is vectorizer parameters first, then classifier.
param_grid_lr = {**tfidf_grid_lr, **clf_grid_lr}

In [None]:
from itertools import product

# Exhaustive search over `param_grid_lr` for the TF-IDF + LogisticRegression
# pipeline. Each row of `res_news_lr` holds the parameter values in the order
# of `lr_param_names`, followed by the weighted F1 score.
# NOTE(review): every configuration is scored on the test split, so the best
# score is an optimistic estimate - consider a validation split or
# cross-validation for model selection.
lr_param_names = [
    "tfidf__stop_words",
    "tfidf__ngram_range",
    "tfidf__analyzer",
    "tfidf__lowercase",
    "tfidf__max_df",
    "tfidf__min_df",
    "tfidf__use_idf",
    "tfidf__norm",
    "clf__penalty",
    "clf__C",
    "clf__solver",
    "clf__class_weight",
]

res_news_lr = []

# product() varies the last parameter fastest, i.e. it visits the combinations
# in the same order as nested for-loops written in `lr_param_names` order.
for values in product(*(param_grid_lr[name] for name in lr_param_names)):
  pipeline_lr.set_params(**dict(zip(lr_param_names, values)))

  pipeline_lr.fit(train_texts_news, train_labels_news)
  y_pred_news = pipeline_lr.predict(test_texts_news)
  f1 = f1_score(test_labels_news, y_pred_news, average='weighted')

  curr_news = [*values, f1]
  res_news_lr.append(curr_news)
  # Progress indicator: report every 10th evaluated configuration.
  if len(res_news_lr) % 10 == 0:
    print(len(res_news_lr))

In [None]:
# Column labels, in the order the values were appended to each row of res_news_lr.
lr_result_columns = [
    "stopwords",
    "ngram_range",
    "analyzer",
    "lowercase",
    "max_df",
    "min_df",
    "use_idf",
    "norm",
    "penalty",
    "C",
    "solver",
    "class_weight",
    "f1-score",
]

# Collect the search results into a table and save it as an Excel sheet.
df_results = pd.DataFrame(data=res_news_lr, columns=lr_result_columns)
df_results.to_excel(excel_writer="results_news_tfidf_lr1.xlsx", index=False)