### Создание модели

##### Для начала используем простую модель линейной регресии. В качестве метрики будем использовать MAPE.

In [1]:
import optuna
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import linear_model
from sklearn import metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_cleaned = pd.read_csv('data/data_cleaned.csv')

In [3]:
random_state = 42

In [4]:
X = df_cleaned.drop('target', axis=1)
y = df_cleaned['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

In [6]:
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

print(f'Train MAPE: {metrics.mean_absolute_percentage_error(y_train, y_train_pred)}')
print(f'Test MAPE: {metrics.mean_absolute_percentage_error(y_test, y_test_pred)}')

Train MAPE: 13.182493592160593
Test MAPE: 21.11920839491114


Полученные показатели можно считать неплохими с учетом того, что это простая модель линейной регрессии.

Теперь попробуем использовать более сложные модели и сравним результаты. Для начала используем случайный лес.

In [7]:
rfr = RandomForestRegressor(random_state=random_state)
rfr.fit(X_train, y_train)
y_train_pred = rfr.predict(X_train)
y_test_pred = rfr.predict(X_test)

print(f'Train MAPE: {metrics.mean_absolute_percentage_error(y_train, y_train_pred)}')
print(f'Test MAPE: {metrics.mean_absolute_percentage_error(y_test, y_test_pred)}')

Train MAPE: 4.035859878783376
Test MAPE: 6.290838189412009


Метрика модели случайного леса лучше модели регрессии. Далее попробуем модель градиентного бустинга.

In [8]:
gbr = GradientBoostingRegressor(random_state=random_state)
gbr.fit(X_train, y_train)
y_train_pred = gbr.predict(X_train)
y_test_pred = gbr.predict(X_test)

print(f'Train MAPE: {metrics.mean_absolute_percentage_error(y_train, y_train_pred)}')
print(f'Test MAPE: {metrics.mean_absolute_percentage_error(y_test, y_test_pred)}')

Train MAPE: 9.82586357077138
Test MAPE: 15.781459983139891


Модель градиентного бустинга показывает более плохие результаты. Попробуем проверить модели при помощи кросс валидации.

In [9]:
cv_metrics = cross_validate(
    estimator=RandomForestRegressor(random_state=random_state),
    X=X,
    y=y,
    scoring=metrics.make_scorer(metrics.mean_absolute_percentage_error),
    return_train_score=True
)
print(cv_metrics)
print(np.mean(cv_metrics['train_score']))
print(np.mean(cv_metrics['test_score']))

{'fit_time': array([407.18534565, 330.95878363, 363.71971607, 340.1687572 ,
       339.19612837]), 'score_time': array([4.74601078, 3.68507648, 3.78123617, 3.57939839, 3.72935987]), 'test_score': array([ 5.75709314,  1.87889428, 20.8186291 , 20.4056051 ,  5.16808211]), 'train_score': array([5.67928343, 5.03283769, 3.01915574, 3.19833106, 5.33103015])}
4.452127615292162
10.805660743762292


In [10]:
cv_metrics = cross_validate(
    estimator=GradientBoostingRegressor(random_state=random_state),
    X=X,
    y=y,
    scoring=metrics.make_scorer(metrics.mean_absolute_percentage_error),
    return_train_score=True
)
print(cv_metrics)
print(np.mean(cv_metrics['train_score']))
print(np.mean(cv_metrics['test_score']))

{'fit_time': array([73.66043901, 64.34974408, 66.92585111, 63.47570348, 67.84982395]), 'score_time': array([0.1290071 , 0.13039327, 0.12976503, 0.13894677, 0.14653182]), 'test_score': array([ 8.88991186,  2.39616177, 13.86414744, 17.50464808, 14.77115311]), 'train_score': array([12.98088445, 13.90692724, 11.1839986 ,  9.14558729,  9.52262707])}
11.348004930850397
11.48520445138799


При кросс валидации модель случайного леса так же показала себя лучше, чем модель градиентного бустинга. Попробуем улучшить метрику при помощи подбора гиперпараметров.

In [11]:
def optuna_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 1, 500, 1)
    max_depth = trial.suggest_int('max_depth', 10, 100, 1)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 6, 1)
    
    mdl = RandomForestRegressor(
        random_state=random_state,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf)
    
    mdl.fit(X_train, y_train)
    
    score = metrics.mean_absolute_percentage_error(y_test, mdl.predict(X_test))
    
    return score

In [12]:
%%time

study = optuna.create_study(study_name='GradientBoostingRegressor', direction='minimize')
study.optimize(optuna_rf, n_trials=20)

[32m[I 2023-04-09 11:16:35,625][0m A new study created in memory with name: GradientBoostingRegressor[0m
[32m[I 2023-04-09 11:30:59,369][0m Trial 0 finished with value: 6.966237755248876 and parameters: {'n_estimators': 330, 'max_depth': 91, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 0 with value: 6.966237755248876.[0m
[32m[I 2023-04-09 11:44:36,941][0m Trial 1 finished with value: 6.928439695889123 and parameters: {'n_estimators': 356, 'max_depth': 21, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 1 with value: 6.928439695889123.[0m
[32m[I 2023-04-09 11:48:38,112][0m Trial 2 finished with value: 7.120183720383741 and parameters: {'n_estimators': 103, 'max_depth': 73, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 1 with value: 6.928439695889123.[0m
[32m[I 2023-04-09 11:49:33,936][0m Trial 3 finished with value: 6.7627381484024855 and parameters: {'n_estimators': 22, 'max_depth': 68, 'min_samples_split': 6, 'min_samples_le

CPU times: user 3h 17min 51s, sys: 12.3 s, total: 3h 18min 3s
Wall time: 3h 18min 3s


Получилось подобрать гиперпараметры, которые улучшили качество предсказания. Используем их для обучения модели, которую потом будем использовать в продакшене.

In [13]:
final_model = RandomForestRegressor(
    random_state=random_state,
    **study.best_params)
final_model.fit(X_train, y_train)

In [14]:
y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

print(f'Train MAPE: {metrics.mean_absolute_percentage_error(y_train, y_train_pred)}')
print(f'Test MAPE: {metrics.mean_absolute_percentage_error(y_test, y_test_pred)}')

Train MAPE: 4.170049543499884
Test MAPE: 5.98430416904274


In [15]:
import pickle

with open('model/model.pkl', 'wb') as output:
    pickle.dump(final_model, output)