## Машинное обучение

Необходимые импорты

In [1]:
import os

from joblib import dump  # Для сериализации моделей
from data_load import raw_data_load  # Своя функция для загрузки данных

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Для настройки гиперпараметров
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Чтение предобработанных (в ноутбуке preprocessing) данных в датафреймы, отделение таргетов

In [2]:
data_dir = "../data"

train = pd.read_csv(os.path.join(data_dir, "prep_train.csv"))
test = pd.read_csv(os.path.join(data_dir, "prep_test.csv"))

x_train = train['prep_text']
y_train = train['sentiment']
x_test = test['prep_text']
y_test = test['sentiment']

Вычислим веса классов

In [3]:
array_class_weights = compute_class_weight(class_weight="balanced",
                                           classes=np.array([0, 1, 2]),
                                           y=y_train)
class_weights = dict(zip([0, 1, 2], array_class_weights))

sample_weights = np.array([class_weights[sentiment] for sentiment in y_train])  # Для XGB

Функция для оценки метрик

In [4]:
def evaluate_model(y_test, y_pred) -> pd.DataFrame:
    """
    Оценка метрик accuracy, precision, recall, f1-score на каждом классе с последующим усреднением
    :param y_test: тестовые таргеты
    :param y_pred: предсказанные таргеты
    :return: 
    """

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    metrics = {
        'Метрика': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
        'Значение': [accuracy, precision, recall, f1]
    }

    df_metrics = pd.DataFrame(metrics)

    return df_metrics

Были протестированы с bag of words и tf-idf такие модели как: логистическая регрессия, наивный 
байесовский классификатор, метод ближайших соседей, дерево решений, случайный лес, градиентный 
бустинг. С word2vec были протестированы логистическая регрессия и градиентный бустинг. Код с 
тестированием всех этих моделей можно найти в ветке "Andrew"(замечание: там метрики чуть хуже, 
из-за неправильных параметров векторизаторов)  

Лучше всех себя показали:  
градиентный бустинг с bag of words  
логистическая регрессия с tf-idf

Обучение векторизатора bag of words и векторизация данных

In [5]:
bow = CountVectorizer(max_features=30000, ngram_range=(1, 2))
bow.fit(x_train)

x_train_bow = bow.transform(x_train)
x_test_bow = bow.transform(x_test)

Обучение векторизатора tf-idf и векторизация данных

In [6]:
tf_idf = TfidfVectorizer(max_features=30000, ngram_range=(1, 2))
tf_idf.fit(x_train)

x_train_tf_idf = tf_idf.transform(x_train)
x_test_tf_idf = tf_idf.transform(x_test)

### Логистическая регрессия c tf-idf
f1 на тренировочной - 0.746  
f1 на тестовой - 0.751

In [10]:
logreg_tf_idf = optuna.create_study(study_name="logreg_tf_idf", direction="maximize",
                                    sampler=TPESampler(), pruner=HyperbandPruner())


def objective(trial):
    C = trial.suggest_float('C', 0.0001, 15)
    model = LogisticRegression(C=C, max_iter=5000, class_weight=class_weights, random_state=42)

    f1_scores = cross_val_score(model, x_train_tf_idf, y_train, cv=4, scoring='f1_macro')

    return f1_scores.mean()


logreg_tf_idf.optimize(objective, n_trials=10)

[I 2024-11-01 04:01:38,418] A new study created in memory with name: logreg_tf_idf
[I 2024-11-01 04:03:59,244] Trial 0 finished with value: 0.7412686085589424 and parameters: {'C': 4.100399169930521}. Best is trial 0 with value: 0.7412686085589424.
[I 2024-11-01 04:07:33,568] Trial 1 finished with value: 0.7353146315973785 and parameters: {'C': 8.288003634597544}. Best is trial 0 with value: 0.7412686085589424.
[I 2024-11-01 04:10:40,230] Trial 2 finished with value: 0.7345823687377299 and parameters: {'C': 8.916021539612306}. Best is trial 0 with value: 0.7412686085589424.
[I 2024-11-01 04:12:31,876] Trial 3 finished with value: 0.7463210953461533 and parameters: {'C': 1.1765482129376392}. Best is trial 3 with value: 0.7463210953461533.
[I 2024-11-01 04:16:19,562] Trial 4 finished with value: 0.7304162471255756 and parameters: {'C': 14.046519031259857}. Best is trial 3 with value: 0.7463210953461533.
[I 2024-11-01 04:19:10,757] Trial 5 finished with value: 0.7391852675286782 and param

In [11]:
logreg_tf_idf_model = LogisticRegression(**logreg_tf_idf.best_params, max_iter=5000,
                                         class_weight=class_weights, random_state=42)
logreg_tf_idf_model.fit(x_train_tf_idf, y_train)
evaluate_model(y_test, logreg_tf_idf_model.predict(x_test_tf_idf))

Unnamed: 0,Метрика,Значение
0,Accuracy,0.753377
1,Precision,0.749077
2,Recall,0.763997
3,F1 Score,0.750754


Сохраним модель

In [12]:
dump(logreg_tf_idf_model, '../models/logreg_tf_idf_model.joblib')

['../models/logreg_tf_idf_model.joblib']

### Градиентный бустинг с bag of words  
f1 на тренировочной - 0.75  
f1 на тестовой - 0.753

In [13]:
xgb_bow = optuna.create_study(study_name="xgb_bow", direction="maximize",
                              sampler=TPESampler(), pruner=HyperbandPruner())


def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.9)
    n_estimators = trial.suggest_int('n_estimators', 100, 400)
    max_depth = trial.suggest_int('max_depth', 3, 15)

    model = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators,
                          max_depth=max_depth, n_jobs=4, random_state=42)

    f1_scores = cross_val_score(model, x_train_bow, y_train, cv=4, scoring='f1_macro',
                                params={'sample_weight': sample_weights})

    return f1_scores.mean()


xgb_bow.optimize(objective, n_trials=10)

[I 2024-11-01 04:30:39,493] A new study created in memory with name: xgb_bow
[I 2024-11-01 04:57:21,995] Trial 0 finished with value: 0.7006584625441592 and parameters: {'learning_rate': 0.04694872274805431, 'n_estimators': 263, 'max_depth': 10}. Best is trial 0 with value: 0.7006584625441592.
[I 2024-11-01 05:06:34,410] Trial 1 finished with value: 0.6539050752610619 and parameters: {'learning_rate': 0.03350275063700577, 'n_estimators': 121, 'max_depth': 8}. Best is trial 0 with value: 0.7006584625441592.
[I 2024-11-01 05:09:28,609] Trial 2 finished with value: 0.7176965561430141 and parameters: {'learning_rate': 0.4898393345998856, 'n_estimators': 286, 'max_depth': 3}. Best is trial 2 with value: 0.7176965561430141.
[I 2024-11-01 05:35:14,079] Trial 3 finished with value: 0.7460631573772633 and parameters: {'learning_rate': 0.32992847308014456, 'n_estimators': 274, 'max_depth': 13}. Best is trial 3 with value: 0.7460631573772633.
[I 2024-11-01 05:56:57,775] Trial 4 finished with valu

In [14]:
xgb_bow_model = XGBClassifier(**xgb_bow.best_params, n_jobs=4, random_state=42)
xgb_bow_model.fit(x_train_bow, y_train)
evaluate_model(y_test, xgb_bow_model.predict(x_test_bow))

Unnamed: 0,Метрика,Значение
0,Accuracy,0.763568
1,Precision,0.756233
2,Recall,0.75092
3,F1 Score,0.753125


Сохраним модель

In [15]:
dump(xgb_bow_model, '../models/xgb_bow_model.joblib')

['../models/xgb_bow_model.joblib']