In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn import set_config

from functions import reduce_mem_usage

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier


from imblearn.over_sampling import RandomOverSampler

import time

In [2]:
%matplotlib inline

warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 999)

In [3]:
RANDOM_STATE = 42

In [4]:
TRAIN_PATH = "data/data_train.csv"
TEST_PATH = "data/data_test.csv"
FEATURES_PATH = "data/features.csv"

data_train = pd.read_csv(TRAIN_PATH)
data_test = pd.read_csv(TEST_PATH)

In [5]:
data_train = reduce_mem_usage(data_train)
data_test = reduce_mem_usage(data_test)

Memory usage of the dataframe is 31.73 MB
Memory usage after optimization is: 15.86 MB
Decreased by 50.0%
Memory usage of the dataframe is 2.17 MB
Memory usage after optimization is: 1.09 MB
Decreased by 50.0%


In [6]:
answers_test = data_test

answers_test.shape

(71231, 4)

In [7]:
def process_featues(data_features, train, test):
    ids = np.unique(train['id'].append(test['id']))
    # Возьмем id только тех пользователей, что встречаются в трейн и тест выборках.
    data_features = data_features[data_features['id'].isin(ids)]
    # Дропнем Unnamed
    if 'Unnamed: 0' in data_features:
        data_features = data_features.drop(columns='Unnamed: 0')

    data_features = data_features.compute()
    # удалим признаки с единственным значением
    df_nunique = data_features.apply(lambda x: x.nunique(dropna=False))
    const = df_nunique[df_nunique ==1].index.tolist()
    data_features = data_features.drop(columns = const)
    # функция сжатия данных
    data_features = reduce_mem_usage(data_features)
    return data_features

In [8]:
import dask.dataframe as dd

data_features = dd.read_csv(FEATURES_PATH, sep ='\t')

In [9]:
data_features = process_featues(data_features, data_train, data_test)

Memory usage of the dataframe is 1728.06 MB
Memory usage after optimization is: 867.47 MB
Decreased by 49.8%


Мерджим данные по ближайшей дате. Возникает вопрос, как указать direction

Логично сделать direction='forward' чтобы время в data_features было не больше чем в выборках трейн и тест. Ведь мы не можем знать профиль пользователя "из будущего" при работе модели.

С другой стороны, direction='nearest' выбирает ближайшие даты. При обучении это будет более "свежий" профиль и даже если он "из будущего" - лучше характеризует пользователя в момент получения предложения подключить услугу. При реальном же использовании модели случаи получения фич "из будущего" будут исключены по-определению, ближайший доступный профиль априори будет иметь более раннюю дату, и подобные ситуации не будут возникать. Поэтому выбран данный способ объединения.

При этом я не знаю, какие именно признаки находятся в data_features, если точно известно что там отражено подключил ли юзер искомую услугу, то значимость данных признаков неоправданно вырастет, т.к. по сути они будут готовым ответом. Но реальной предсказательной силы они иметь не будут. В таком случае допустимо использовать только forward объединение. 

In [10]:
data_train = data_train.sort_values(by="buy_time")
data_test = data_test.sort_values(by="buy_time")
data_features = data_features.sort_values(by="buy_time")

In [11]:
train = pd.merge_asof(data_train, data_features, on='buy_time', by='id', direction='nearest')
valid = pd.merge_asof(data_test, data_features, on='buy_time', by='id', direction ='nearest')

In [12]:
del train['Unnamed: 0']
del valid['Unnamed: 0']

## data pipe

In [13]:
features = [f for f in valid.columns if f not in ['id']]

Разделим данные на X и y

In [14]:
X_train = train.drop('target', axis='columns')
y_train = train.target
X_valid = valid

Сохраним необходимые для ответа признаки в отдельную переменную, чтобы позже добавить target.

Разделим признаки на бинарные, категориальные и вещественные.

In [15]:
boolean_features = []
categorical_features = []
numeric_features = []
for col in X_train[features].fillna(0):
    val_count = len(X_train[col].unique())
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


Селектор колонок

In [16]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [17]:
# new_features_list = ['interval']
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)

        try:
#             if 'interval' in self.features_list:
#                 X['interval'] = X['buy_time_y'] - X['buy_time_x']
            

            return X
        except KeyError:            
            raise KeyError("DataFrame не содердит колонки buy_time")

Обработчик вещественных признаков
Поскольку вещественных признаков много больше чем предполагается оставить в конце, и чем остальных признаков, откинем менее значимые из них до объединения с другими признаками.

In [18]:
num_pipe = Pipeline([
    ('ncs', ColumnSelector(columns=numeric_features)),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
    ('nskb', SelectKBest(k=128, score_func=f_classif)),
])

Обработчик категориальных признаков

In [19]:
cat_pipe = Pipeline([
    ('ccs', ColumnSelector(columns=categorical_features)),
    ('csi', SimpleImputer(strategy="most_frequent")),
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

Обработчик булевых признаков

In [20]:
bool_pipe = Pipeline([
    ('bcs', ColumnSelector(columns=boolean_features)),
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

Собираем в общий пайплайн

In [21]:
transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

In [22]:
transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    ('fg', FeaturesGenerator(features_list=['interval'])),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


Отберем признаки с помощью SelectKBest и логистической регрессии с регуляризацией L1 (было 298 признаков)


In [23]:
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(k=64, score_func=f_classif),
    SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [24]:
set_config(display='diagram')

fs_pipe

In [25]:
fs_pipe.fit(X_train, y_train)


In [26]:
X_train = fs_pipe.transform(X_train)

In [27]:
X_valid = fs_pipe.transform(X_valid)

## Model

Соберем финальный пайплайн.


In [28]:
ros = RandomOverSampler(random_state=42)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [29]:
model = CatBoostClassifier(random_state=RANDOM_STATE)


In [30]:
model.fit(X_ros, y_ros)


Learning rate set to 0.236787
0:	learn: 0.4857543	total: 542ms	remaining: 9m 1s
1:	learn: 0.4077319	total: 822ms	remaining: 6m 50s
2:	learn: 0.3685533	total: 1.09s	remaining: 6m 2s
3:	learn: 0.3471900	total: 1.37s	remaining: 5m 41s
4:	learn: 0.3361187	total: 1.65s	remaining: 5m 28s
5:	learn: 0.3281733	total: 1.93s	remaining: 5m 19s
6:	learn: 0.3246187	total: 2.22s	remaining: 5m 14s
7:	learn: 0.3225191	total: 2.46s	remaining: 5m 4s
8:	learn: 0.3211967	total: 2.69s	remaining: 4m 56s
9:	learn: 0.3204051	total: 2.93s	remaining: 4m 50s
10:	learn: 0.3198662	total: 3.17s	remaining: 4m 44s
11:	learn: 0.3193046	total: 3.44s	remaining: 4m 42s
12:	learn: 0.3183402	total: 3.69s	remaining: 4m 40s
13:	learn: 0.3180816	total: 3.94s	remaining: 4m 37s
14:	learn: 0.3177915	total: 4.21s	remaining: 4m 36s
15:	learn: 0.3174058	total: 4.48s	remaining: 4m 35s
16:	learn: 0.3172682	total: 4.74s	remaining: 4m 34s
17:	learn: 0.3171261	total: 5.02s	remaining: 4m 34s
18:	learn: 0.3169894	total: 5.24s	remaining: 4m

157:	learn: 0.3036577	total: 39s	remaining: 3m 27s
158:	learn: 0.3035446	total: 39.3s	remaining: 3m 27s
159:	learn: 0.3034608	total: 39.5s	remaining: 3m 27s
160:	learn: 0.3033690	total: 39.7s	remaining: 3m 27s
161:	learn: 0.3032312	total: 40s	remaining: 3m 27s
162:	learn: 0.3031582	total: 40.3s	remaining: 3m 26s
163:	learn: 0.3030821	total: 40.5s	remaining: 3m 26s
164:	learn: 0.3030262	total: 40.7s	remaining: 3m 26s
165:	learn: 0.3029626	total: 41s	remaining: 3m 25s
166:	learn: 0.3028947	total: 41.3s	remaining: 3m 25s
167:	learn: 0.3028172	total: 41.5s	remaining: 3m 25s
168:	learn: 0.3027148	total: 41.8s	remaining: 3m 25s
169:	learn: 0.3026282	total: 42s	remaining: 3m 25s
170:	learn: 0.3025607	total: 42.3s	remaining: 3m 24s
171:	learn: 0.3024672	total: 42.5s	remaining: 3m 24s
172:	learn: 0.3023676	total: 42.8s	remaining: 3m 24s
173:	learn: 0.3022995	total: 43s	remaining: 3m 24s
174:	learn: 0.3022193	total: 43.2s	remaining: 3m 23s
175:	learn: 0.3021549	total: 43.5s	remaining: 3m 23s
176

313:	learn: 0.2919242	total: 1m 17s	remaining: 2m 49s
314:	learn: 0.2918458	total: 1m 17s	remaining: 2m 49s
315:	learn: 0.2917627	total: 1m 18s	remaining: 2m 49s
316:	learn: 0.2916935	total: 1m 18s	remaining: 2m 49s
317:	learn: 0.2916425	total: 1m 18s	remaining: 2m 48s
318:	learn: 0.2915902	total: 1m 18s	remaining: 2m 48s
319:	learn: 0.2915317	total: 1m 19s	remaining: 2m 48s
320:	learn: 0.2914401	total: 1m 19s	remaining: 2m 48s
321:	learn: 0.2913559	total: 1m 19s	remaining: 2m 47s
322:	learn: 0.2912802	total: 1m 19s	remaining: 2m 47s
323:	learn: 0.2912009	total: 1m 20s	remaining: 2m 47s
324:	learn: 0.2911219	total: 1m 20s	remaining: 2m 46s
325:	learn: 0.2910403	total: 1m 20s	remaining: 2m 46s
326:	learn: 0.2909727	total: 1m 20s	remaining: 2m 46s
327:	learn: 0.2909280	total: 1m 21s	remaining: 2m 46s
328:	learn: 0.2908723	total: 1m 21s	remaining: 2m 46s
329:	learn: 0.2908205	total: 1m 21s	remaining: 2m 45s
330:	learn: 0.2907365	total: 1m 21s	remaining: 2m 45s
331:	learn: 0.2906717	total:

465:	learn: 0.2821331	total: 1m 55s	remaining: 2m 12s
466:	learn: 0.2820859	total: 1m 55s	remaining: 2m 11s
467:	learn: 0.2820377	total: 1m 55s	remaining: 2m 11s
468:	learn: 0.2819963	total: 1m 55s	remaining: 2m 11s
469:	learn: 0.2819510	total: 1m 56s	remaining: 2m 11s
470:	learn: 0.2818586	total: 1m 56s	remaining: 2m 10s
471:	learn: 0.2817765	total: 1m 56s	remaining: 2m 10s
472:	learn: 0.2817087	total: 1m 56s	remaining: 2m 10s
473:	learn: 0.2816359	total: 1m 57s	remaining: 2m 10s
474:	learn: 0.2815860	total: 1m 57s	remaining: 2m 9s
475:	learn: 0.2815155	total: 1m 57s	remaining: 2m 9s
476:	learn: 0.2814643	total: 1m 58s	remaining: 2m 9s
477:	learn: 0.2814223	total: 1m 58s	remaining: 2m 9s
478:	learn: 0.2813491	total: 1m 58s	remaining: 2m 8s
479:	learn: 0.2812899	total: 1m 58s	remaining: 2m 8s
480:	learn: 0.2812628	total: 1m 58s	remaining: 2m 8s
481:	learn: 0.2812176	total: 1m 59s	remaining: 2m 8s
482:	learn: 0.2811833	total: 1m 59s	remaining: 2m 7s
483:	learn: 0.2811109	total: 1m 59s	r

619:	learn: 0.2730414	total: 2m 33s	remaining: 1m 34s
620:	learn: 0.2729780	total: 2m 34s	remaining: 1m 34s
621:	learn: 0.2729331	total: 2m 34s	remaining: 1m 33s
622:	learn: 0.2728706	total: 2m 34s	remaining: 1m 33s
623:	learn: 0.2728257	total: 2m 34s	remaining: 1m 33s
624:	learn: 0.2727623	total: 2m 35s	remaining: 1m 33s
625:	learn: 0.2727086	total: 2m 35s	remaining: 1m 32s
626:	learn: 0.2726511	total: 2m 35s	remaining: 1m 32s
627:	learn: 0.2725690	total: 2m 35s	remaining: 1m 32s
628:	learn: 0.2725302	total: 2m 36s	remaining: 1m 32s
629:	learn: 0.2724844	total: 2m 36s	remaining: 1m 31s
630:	learn: 0.2724417	total: 2m 36s	remaining: 1m 31s
631:	learn: 0.2723808	total: 2m 36s	remaining: 1m 31s
632:	learn: 0.2723364	total: 2m 37s	remaining: 1m 31s
633:	learn: 0.2722949	total: 2m 37s	remaining: 1m 30s
634:	learn: 0.2722314	total: 2m 37s	remaining: 1m 30s
635:	learn: 0.2721695	total: 2m 37s	remaining: 1m 30s
636:	learn: 0.2721409	total: 2m 38s	remaining: 1m 30s
637:	learn: 0.2720887	total:

774:	learn: 0.2649627	total: 3m 13s	remaining: 56s
775:	learn: 0.2649071	total: 3m 13s	remaining: 55.8s
776:	learn: 0.2648696	total: 3m 13s	remaining: 55.5s
777:	learn: 0.2647944	total: 3m 13s	remaining: 55.3s
778:	learn: 0.2647631	total: 3m 14s	remaining: 55s
779:	learn: 0.2647012	total: 3m 14s	remaining: 54.8s
780:	learn: 0.2646470	total: 3m 14s	remaining: 54.6s
781:	learn: 0.2646049	total: 3m 14s	remaining: 54.3s
782:	learn: 0.2645466	total: 3m 15s	remaining: 54.1s
783:	learn: 0.2644845	total: 3m 15s	remaining: 53.8s
784:	learn: 0.2644362	total: 3m 15s	remaining: 53.6s
785:	learn: 0.2643614	total: 3m 15s	remaining: 53.3s
786:	learn: 0.2643281	total: 3m 16s	remaining: 53.1s
787:	learn: 0.2642864	total: 3m 16s	remaining: 52.8s
788:	learn: 0.2642288	total: 3m 16s	remaining: 52.6s
789:	learn: 0.2641778	total: 3m 16s	remaining: 52.3s
790:	learn: 0.2641259	total: 3m 17s	remaining: 52.1s
791:	learn: 0.2640488	total: 3m 17s	remaining: 51.8s
792:	learn: 0.2640013	total: 3m 17s	remaining: 51.

930:	learn: 0.2571616	total: 3m 52s	remaining: 17.3s
931:	learn: 0.2570899	total: 3m 53s	remaining: 17s
932:	learn: 0.2570505	total: 3m 53s	remaining: 16.8s
933:	learn: 0.2570006	total: 3m 53s	remaining: 16.5s
934:	learn: 0.2569585	total: 3m 54s	remaining: 16.3s
935:	learn: 0.2569109	total: 3m 54s	remaining: 16s
936:	learn: 0.2568927	total: 3m 54s	remaining: 15.8s
937:	learn: 0.2568590	total: 3m 54s	remaining: 15.5s
938:	learn: 0.2568234	total: 3m 55s	remaining: 15.3s
939:	learn: 0.2567599	total: 3m 55s	remaining: 15s
940:	learn: 0.2567164	total: 3m 55s	remaining: 14.8s
941:	learn: 0.2566831	total: 3m 55s	remaining: 14.5s
942:	learn: 0.2566229	total: 3m 56s	remaining: 14.3s
943:	learn: 0.2565644	total: 3m 56s	remaining: 14s
944:	learn: 0.2565253	total: 3m 56s	remaining: 13.8s
945:	learn: 0.2564858	total: 3m 56s	remaining: 13.5s
946:	learn: 0.2564407	total: 3m 57s	remaining: 13.3s
947:	learn: 0.2563823	total: 3m 57s	remaining: 13s
948:	learn: 0.2563426	total: 3m 57s	remaining: 12.8s
949

<catboost.core.CatBoostClassifier at 0x7fe6f0f7aad0>

In [31]:
y_valid = model.predict_proba(X_valid)

## Сохраняем answers_test.csv

In [34]:
answers_test['target'] = y_valid[:, 1]

In [35]:
answers_test.to_csv('answers_test_1.csv', index=False,)

In [36]:
answers_test_loaded = pd.read_csv('answers_test_1.csv')

In [37]:
answers_test_loaded.head()

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,3130519,2.0,1548018000,0.000862
1,1,2000860,4.0,1548018000,0.004003
2,2,1099444,2.0,1546808400,0.801864
3,3,1343255,5.0,1547413200,0.877868
4,4,1277040,2.0,1546808400,0.709548


## Сохраняем модель.

In [38]:
import pickle


# save
with open('fs_pipe.pkl','wb') as f:
    pickle.dump(fs_pipe,f)

with open('model.pkl','wb') as f:
    pickle.dump(model,f)


In [39]:
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [40]:
loaded_model.predict(X_valid)

array([0., 0., 1., ..., 0., 1., 0.])