## Машинное обучение

Необходимые импорты

In [1]:
import os

from joblib import dump
from data_load import raw_data_load  # Свои функции для загрузки данных

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Чтение предобработанных (в ноутбуке preprocessing) данных в датафреймы, отделение таргетов

In [3]:
data_dir = "../data"

train = pd.read_csv(os.path.join(data_dir, "prep_train.csv"))
test = pd.read_csv(os.path.join(data_dir, "prep_test.csv"))

x_train = train['prep_text'].values
y_train = train['sentiment'].values
x_test = test['prep_text'].values
y_test = test['sentiment'].values

Вычислим веса классов

In [4]:
array_class_weights = compute_class_weight(class_weight="balanced",
                                           classes=np.array([0, 1, 2]),
                                           y=y_train)
class_weights = dict(zip([0, 1, 2], array_class_weights))

sample_weights = np.array([class_weights[sentiment] for sentiment in y_train])  # Для XGB

Функция для оценки метрик

In [5]:
def evaluate_model(y_test, y_pred) -> pd.DataFrame:
    """
    Оценка метрик accuracy, precision, recall, f1-score на каждом классе с последующим усреднением
    :param y_test: тестовые таргеты
    :param y_pred: предсказанные таргеты
    :return: 
    """

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    metrics = {
        'Метрика': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
        'Значение': [accuracy, precision, recall, f1]
    }

    df_metrics = pd.DataFrame(metrics)

    return df_metrics

Были протестированы с bag of words и tf-idf такие модели как: логистическая регрессия, наивный 
байесовский классификатор, метод ближайших соседей, дерево решений, случайный лес, градиентный 
бустинг. С word2vec были протестированы логистическая регрессия и градиентный бустинг. Код с 
тестированием всех этих моделей можно найти в ветке "Andrew".  

Лучше всех себя показали:  
градиентный бустинг с bag of words (f1-macro на тестовой - 0.720)  
логистическая регрессия с tf-idf (f1-macro на тестовой - 0.69)

Обучение векторизатора bag of words и сохранение данных или загрузка готовых данных (если они уже есть)

In [7]:
x_train_bow_path = '../data/x_train_bow.npz'
x_test_bow_path = '../data/x_test_bow.npz'

if not (os.path.exists(x_train_bow_path) and os.path.exists(x_test_bow_path)):
    bow = CountVectorizer(min_df=0.01, max_df=0.95, max_features=30000, ngram_range=(1, 2))
    bow.fit(x_train)

    x_train_bow = bow.transform(x_train)
    x_train_bow = x_train_bow.astype(np.float32)
    np.savez_compressed(x_train_bow_path, x_train_bow)

    x_test_bow = bow.transform(x_test)
    x_test_bow = x_test_bow.astype(np.float32)
    np.savez_compressed(x_test_bow_path, x_test_bow)

else:
    x_train_bow = np.load(x_train_bow_path)
    x_test_bow = np.load(x_test_bow_path)

Обучение векторизатора tf-idf и сохранение данных или загрузка готовых данных (если они уже есть)

In [8]:
x_train_tf_idf_path = '../data/x_train_tf_idf.npz'
x_test_tf_idf_path = '../data/x_test_tf_idf.npz'

if not (os.path.exists(x_train_tf_idf_path) and os.path.exists(x_test_tf_idf_path)):
    tf_idf = TfidfVectorizer(min_df=0.01, max_df=0.95, max_features=30000, ngram_range=(1, 2))
    tf_idf.fit(x_train)

    x_train_tf_idf = tf_idf.transform(x_train)
    x_train_word2vec = x_train_tf_idf.astype(np.float32)
    np.savez_compressed(x_train_tf_idf_path, x_train_tf_idf)

    x_test_tf_idf = tf_idf.transform(x_test)
    x_test_tf_idf = x_test_tf_idf.astype(np.float32)
    np.savez_compressed(x_test_tf_idf_path, x_test_tf_idf)

else:
    x_train_tf_idf = np.load(x_train_tf_idf_path)
    x_test_tf_idf = np.load(x_test_tf_idf_path)

### Логистическая регрессия c tf-idf
f1 на тренировочной - 0.69  
f1 на тестовой - 0.69

In [None]:
logreg_tf_idf = optuna.create_study(study_name="logreg_tf_idf", direction="maximize",
                                    sampler=TPESampler(), pruner=HyperbandPruner())


def objective(trial):
    C = trial.suggest_float('C', 0.0001, 15)
    # multi_class = trial.suggest_categorical('multi_class', ['multinomial', 'ovr'])

    # model = LogisticRegression(C=C, multi_class=multi_class, max_iter=2000)
    model = LogisticRegression(C=C, max_iter=5000, class_weight=class_weights)

    f1_scores = cross_val_score(model, x_train_tf_idf, y_train, cv=5, scoring='f1_macro')

    return f1_scores.mean()


logreg_tf_idf.optimize(objective, n_trials=10)

In [12]:
logreg_tf_idf_model = LogisticRegression(**logreg_tf_idf.best_params, max_iter=5000,
                                         class_weight=class_weights)
logreg_tf_idf_model.fit(x_train_tf_idf, y_train)
evaluate_model(y_test, logreg_tf_idf_model.predict(x_test_tf_idf))

Unnamed: 0,Метрика,Значение
0,Accuracy,0.693051
1,Precision,0.696387
2,Recall,0.710355
3,F1 Score,0.692172


Сохраним модель

In [15]:
dump(logreg_tf_idf_model, '../models/logreg_tf_idf_model.joblib')

['../models/logreg_tf_idf_model.joblib']

### Градиентный бустинг с bag of words  
f1 на тренировочной - 0.719  
f1 на тестовой - 0.72

In [None]:
xgb_bow = optuna.create_study(study_name="xgb_bow", direction="maximize",
                              sampler=TPESampler(), pruner=HyperbandPruner())


def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.9)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 15)

    model = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators,
                          max_depth=max_depth, n_jobs=4)

    f1_scores = cross_val_score(model, x_train_bow, y_train, cv=5, scoring='f1_macro',
                                params={'sample_weight': sample_weights})

    return f1_scores.mean()


xgb_bow.optimize(objective, n_trials=10)

In [13]:
# Взяты из логов optuna для лучшей попытки, надо было перезагрузить jupyter
best_params = {'learning_rate': 0.1506113744991435, 'n_estimators': 337, 'max_depth': 14}

xgb_bow_model = XGBClassifier(**best_params, n_jobs=4)
xgb_bow_model.fit(x_train_bow, y_train)
evaluate_model(y_test, xgb_bow_model.predict(x_test_bow))

Unnamed: 0,Метрика,Значение
0,Accuracy,0.733055
1,Precision,0.72445
2,Recall,0.718184
3,F1 Score,0.720614


Сохраним модель

In [14]:
dump(xgb_bow_model, '../models/xgb_bow_model.joblib')

['../models/xgb_bow_model.joblib']