# Mobile Service (model train)

## 1 Подключение и настройка внешних модулей

In [1]:
# data processing
import pandas as pd
import numpy as np
import dask.dataframe as dd
import sklearn
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# отключение предупреждений
import warnings
warnings.filterwarnings('ignore')

# сериализация объектов
import pickle

## 2 Загрузка датасетов

In [2]:
# Training sample, loaded eagerly with pandas.
df_train = pd.read_csv(
    r'data/data_train.csv'
)

# Tab-separated feature table, opened through dask; it is only materialized
# later, where `.compute()` is called on the filtered subset.
df_features = dd.read_csv(
    r'data/features.csv',
    sep='\t',
    engine='python',
)

## 3 Предобработка данных

Вспомогательные структуры:

In [3]:
def feature_transformer(dataset, source_list, feature_name, transformer, prefitting=True):
    """Transform columns of ``dataset`` with ``transformer``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the columns to transform.
    source_list : list of str
        Names of the columns passed to the transformer.
    feature_name : str
        Column that receives the transformed values. Not used when
        ``transformer`` is a ``OneHotEncoder``: its output columns are named
        by ``transformer.get_feature_names_out()``.
    transformer : object
        Object exposing ``fit`` and ``transform``.
    prefitting : bool, default True
        When True, ``transformer`` is fitted on ``dataset[source_list]``
        before transforming; when False it must already be fitted.

    Returns
    -------
    pandas.DataFrame
        For a ``OneHotEncoder``: a new frame in which the source columns are
        dropped and the encoded columns are appended. For any other
        transformer: the same ``dataset`` object, with ``feature_name``
        assigned in place.
    """
    # Optional fitting step.
    if prefitting:
        transformer.fit(dataset[source_list])

    # isinstance() avoids reaching into sklearn's private ``_encoders`` module
    # and also accepts OneHotEncoder subclasses.
    if isinstance(transformer, OneHotEncoder):
        encoded = pd.DataFrame(
            transformer.transform(dataset[source_list]),
            columns=transformer.get_feature_names_out(),
            # Reuse the input index: with the default RangeIndex, concat along
            # axis=1 would misalign rows and introduce NaNs whenever `dataset`
            # does not itself carry a 0..n-1 index.
            index=dataset.index,
        )
        return pd.concat([dataset, encoded], axis=1).drop(source_list, axis=1)

    # Any other transformer writes its output into a single column.
    dataset[feature_name] = transformer.transform(dataset[source_list])
    return dataset

In [4]:
def _make_dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unknown categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    later releases removed the old name, so the new spelling is tried first
    and the legacy one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def preprocess_train_dataset(df_train, df_features):
    """Preprocess the training dataset and persist the fitted transformers.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training sample; the columns 'id', 'buy_time', 'vas_id' and 'target'
        are read here.
    df_features : dask.dataframe.DataFrame
        Feature table with at least 'id' and 'buy_time'; only the rows whose
        'id' occurs in ``df_train`` are materialized.

    Returns
    -------
    tuple
        ``(X_train, y)``: the feature matrix restricted to the columns listed
        in ``tools/selected_features.pickle`` and the 'target' vector, both in
        the row order produced by the merge (sorted by 'buy_time').

    Side effects
    ------------
    Reads ``tools/nunique_2.pickle`` and ``tools/selected_features.pickle``;
    writes the fitted transformers to ``tools/transformers.pickle``.
    """
    # Storage for the fitted transformers.
    transformers = {}

    # Join the datasets: for every training row take the feature row of the
    # same 'id' whose 'buy_time' is nearest.
    train_ids = df_train['id']
    df_train_user_features = df_features[df_features['id'].isin(train_ids)].compute()
    df_train_extended = pd.merge_asof(
        left=df_train.sort_values(by='buy_time'),
        right=df_train_user_features.sort_values(by='buy_time'),
        on='buy_time',
        by='id',
        direction='nearest'
    )

    # Feature 'not_first_offer': 1 when the same 'id' already occurred in an
    # earlier row of the merged (buy_time-ordered) frame, otherwise 0.
    not_first_offer_train = df_train_extended.duplicated('id').astype(int)
    df_train_extended.insert(loc=0, column='not_first_offer', value=not_first_offer_train)

    # Feature 'buy_month': month of 'buy_time', read as a Unix timestamp in seconds.
    buy_month_train = pd.to_datetime(df_train_extended['buy_time'], unit='s').dt.month
    df_train_extended.insert(loc=0, column='buy_month', value=buy_month_train)

    # One-hot encode 'vas_id'.
    vas_id_OHE = _make_dense_one_hot_encoder()
    df_train_extended = feature_transformer(
        dataset=df_train_extended,
        source_list=['vas_id'],
        feature_name='vas_id',
        transformer=vas_id_OHE,
        prefitting=True
    )
    transformers['vas_id_OHE'] = vas_id_OHE

    # One-hot encode 'buy_month'.
    buy_month_OHE = _make_dense_one_hot_encoder()
    df_train_extended = feature_transformer(
        dataset=df_train_extended,
        source_list=['buy_month'],
        feature_name='buy_month',
        transformer=buy_month_OHE,
        prefitting=True
    )
    transformers['buy_month_OHE'] = buy_month_OHE

    # Ordinal-encode the binary features.
    # NOTE(review): pickle.load can execute arbitrary code; these files are
    # presumably produced by this project's own notebooks - do not point the
    # paths at untrusted data.
    with open(r'tools/nunique_2.pickle', 'rb') as f:
        nunique_2 = pickle.load(f)

    nunique_2_OE = OrdinalEncoder()
    df_train_extended[nunique_2] = nunique_2_OE.fit_transform(X=df_train_extended[nunique_2])
    # Keep the fitted encoder with the other transformers so that the exact
    # category-to-code mapping learned on train can be reapplied at inference.
    transformers['nunique_2_OE'] = nunique_2_OE

    # Build the feature matrix.
    X_train = df_train_extended.copy()

    # Shrink the encoded binary columns to int8.
    X_train[nunique_2] = X_train[nunique_2].astype('int8')

    # Keep only the previously selected features.
    with open(r'tools/selected_features.pickle', 'rb') as f:
        selected_features = pickle.load(f)

    X_train = X_train[selected_features]

    # Target vector, aligned row-for-row with X_train.
    y = df_train_extended['target'].copy()

    # Persist the fitted transformers, then return the result.
    with open(r'tools/transformers.pickle', 'wb') as f:
        pickle.dump(transformers, f, protocol=pickle.HIGHEST_PROTOCOL)

    return (X_train, y)

Предобработка тренировочного датасета:

In [5]:
%%time
# Build the feature matrix and target vector; as a side effect the call also
# writes the fitted transformers to tools/transformers.pickle.
X_train, y = preprocess_train_dataset(df_train, df_features)

CPU times: total: 14min 53s
Wall time: 14min 30s


## 4 Обучение модели

Настройка метода валидации модели:

In [6]:
# Stratified 5-fold cross-validation repeated 3 times (15 splits per
# candidate), with a fixed seed so the splits are reproducible.
cv_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=13)

Инициализация модели:

In [7]:
# Base estimator handed to the grid search: seeded, using all available cores.
xgbc = XGBClassifier(
    n_jobs=-1,
    random_state=13,
)

Установка грид-параметров модели:

In [8]:
# Hyper-parameter grid for the XGBoost classifier. Every parameter currently
# holds a single active value, so the search evaluates exactly one candidate.
# Values that were switched off are listed in the trailing comments.
xgbc_param_grid = {
    'colsample_bytree': [1],   # default; disabled: 0.5
    'learning_rate': [0.3],    # default; disabled: 0.1, 0.6
    'max_depth': [6],          # default; disabled: 3, 9, 12, 15, 18, 21, 24, 27, 30, 33, None
    'n_estimators': [100],     # default; disabled: 50, 200
    'n_jobs': [-1],
    'random_state': [13],
    'reg_alpha': [0.5],        # disabled: 0 (default), 1
    'reg_lambda': [0.5],       # disabled: 0, 1 (default)
    'subsample': [1],          # default; disabled: 0.5
}

Настройка модели:

In [9]:
# Exhaustive search over xgbc_param_grid, scored by macro-averaged F1 on the
# repeated stratified folds; candidates are evaluated in parallel.
xgbc_gscv = GridSearchCV(
    xgbc,
    xgbc_param_grid,
    scoring='f1_macro',
    cv=cv_rskf,
    n_jobs=-1,
    verbose=1,
)

In [10]:
%%time
# Run the cross-validated grid search on the preprocessed training data.
xgbc_gscv.fit(X=X_train, y=y)

Fitting 15 folds for each of 1 candidates, totalling 15 fits
CPU times: total: 2min 1s
Wall time: 3min 12s


## 5 Получение финального результата

Получение настроенной модели:

In [11]:
# Best estimator selected by the fitted grid search.
tuned_model = xgbc_gscv.best_estimator_

Сериализация настроенной модели:

In [12]:
# Persist the tuned estimator to tools/model.pickle.
with open(r'tools/model.pickle', 'wb') as model_file:
    pickle.dump(
        tuned_model,
        model_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )