# Mobile Service (model predict)

## 1 Подключение и настройка внешних модулей

In [1]:
# data processing
import pandas as pd
import numpy as np
import dask.dataframe as dd
import sklearn
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# отключение предупреждений
import warnings
warnings.filterwarnings('ignore')

# сериализация объектов
import pickle

## 2 Загрузка датасетов

In [2]:
df_test = pd.read_csv(r'data/data_test.csv')
df_features = dd.read_csv(r'data/features.csv', sep='\t', engine='python')

## 3 Предобработка данных

Вспомогательные структуры:

In [3]:
def feature_transformer(dataset, source_list, feature_name, transformer, prefitting=True):
    """Трансформация признаков."""

    # prefitting
    if prefitting:
        transformer.fit(dataset[source_list])

    # transforming
    if type(transformer) == sklearn.preprocessing._encoders.OneHotEncoder:

        df_OHE = pd.DataFrame(
            transformer.transform(dataset[source_list]),
            columns = transformer.get_feature_names_out()
        )
        dataset = pd.concat([dataset, df_OHE], axis=1).drop(source_list, axis=1)

    else:
        dataset[feature_name] = transformer.transform(dataset[source_list])
    
    # returning
    return dataset

In [4]:
with open(r'tools/transformers.pickle', 'rb') as f:
    transformers = pickle.load(f)

In [5]:
def preprocess_test_dataset(df_test, df_features):
    "Предобработка тестового датасета."

    # объединение датасетов
    test_ids = df_test['id']
    df_test_user_features = df_features[df_features['id'].isin(test_ids)].compute()
    df_test_extended = pd.merge_asof(
        left=df_test.sort_values(by='buy_time'),
        right=df_test_user_features.sort_values(by='buy_time'),
        on='buy_time',
        by='id',
        direction='nearest'
    )

    # создание признака 'not_first_offer'
    not_first_offer_test = df_test_extended.duplicated('id').astype(int)
    df_test_extended.insert(loc=0, column='not_first_offer', value=not_first_offer_test)

    # создание признака 'buy_month'
    buy_month_test = pd.to_datetime(df_test_extended['buy_time'], unit='s').dt.month
    df_test_extended.insert(loc=0, column='buy_month', value=buy_month_test)

    # загрузка трансформеров
    with open(r'tools/transformers.pickle', 'rb') as f:
        transformers = pickle.load(f)

    # предобработка признака 'vas_id'
    vas_id_OHE = transformers['vas_id_OHE']
    df_test_extended = feature_transformer(
        dataset=df_test_extended,
        source_list=['vas_id'],
        feature_name='vas_id',
        transformer=vas_id_OHE,
        prefitting=False
    )

    # предобработка признака 'buy_month'
    buy_month_OHE = transformers['buy_month_OHE']
    df_test_extended = feature_transformer(
        dataset=df_test_extended,
        source_list=['buy_month'],
        feature_name='buy_month',
        transformer=buy_month_OHE,
        prefitting=False
    )

    # кодирование бинарных признаков
    with open(r'tools/nunique_2.pickle', 'rb') as f:
        nunique_2 = pickle.load(f)

    nunique_2.remove('target')
    nunique_2_OE = OrdinalEncoder()
    df_test_extended[nunique_2] = nunique_2_OE.fit_transform(X=df_test_extended[nunique_2])

    # создание матрицы признаков
    X_test = df_test_extended.copy()

    # оптимизация типов данных в матрице признаков
    X_test[nunique_2] = X_test[nunique_2].astype('int8')

    # удаление лишних признаков
    with open(r'tools/selected_features.pickle', 'rb') as f:
        selected_features = pickle.load(f)

    X_test = X_test[selected_features]

    # возврат результата
    return X_test

Предобработка тестовго датасета:

In [6]:
%%time
X_test = preprocess_test_dataset(df_test, df_features)

CPU times: total: 14min 47s
Wall time: 14min 32s


## 4 Загрузка модели и получение предсказания

Загрузка модели:

In [7]:
with open(r'tools/model.pickle', 'rb') as f:
    model = pickle.load(f)

Получение предсказания:

In [8]:
%%time
predictions = model.predict_proba(X_test)[:,1]

CPU times: total: 328 ms
Wall time: 47.9 ms


## 5 Получение финального результата

Формирование датафрейма выходных данных:

In [9]:
output = pd.DataFrame(
    {
        'buy_time': df_test.buy_time,
        'id': df_test.id,
        'vas_id': df_test.vas_id,
        'target': predictions   
    }
)

Сохранение выходных данных в файл:

In [10]:
output.to_csv(r'data/answers_test.csv', index=False)

Проверка выходных данных:

In [11]:
pd.read_csv(r'data/answers_test.csv', index_col=0)

Unnamed: 0_level_0,id,vas_id,target
buy_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1548018000,3130519,2.0,0.007305
1548018000,2000860,4.0,0.006503
1546808400,1099444,2.0,0.271045
1547413200,1343255,5.0,0.281009
1546808400,1277040,2.0,0.265747
...,...,...,...
1548018000,2502453,5.0,0.265747
1548018000,1693213,2.0,0.007339
1548018000,1891350,2.0,0.006277
1548018000,2437172,2.0,0.265747
