In [1]:
import os
import numpy as np
import pandas as pd
import tqdm

# в описании вакансии в требованиях указаны библиотеки бустинга
# добавим их ;)
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import  LGBMClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def compare_models(models, X, y, CV=5, scoring='roc_auc'):
    '''Сравнение моделей по метрике roc-auc на кросс-валидации''' 

    results = {}
    for model in tqdm.tqdm(models):
        name_model = str(model)[:str(model).find('(')]
        print(name_model, end=' ')
        curr_cv = cross_val_score(model, X, y, cv=CV, scoring='roc_auc')
        print('{}-fold CV is : {} +/- {}'.format(CV, curr_cv.mean().round(4), curr_cv.std().round(4)))
        results.update({name_model:[curr_cv.mean().round(4), curr_cv]})
    max_results = {k:v for k, v in results.items() if v == max(results.values())}
    print(f'Max results {max_results.keys()} = {max(results.values())}')
    return results

In [3]:
# смотрим список csv файлов
PATH = 'Churn_task/'
files = [filename for filename in os.listdir(PATH) if filename[-3:] == 'csv']
files

['TEST_ADDITIONAL_DATA.csv',
 'TEST_PREPARED.csv',
 'TEST_RAW_DATA.csv',
 'test_submit_example.csv',
 'TRAIN_ADDITIONAL_DATA.csv',
 'TRAIN_PREPARED.csv',
 'TRAIN_RAW_DATA.csv']

In [4]:
# загружаем данные
train = pd.read_csv(PATH + 'TRAIN_PREPARED.csv')
train_raw = pd.read_csv(PATH + 'TRAIN_PREPARED.csv')
train_add = pd.read_csv(PATH + 'TRAIN_ADDITIONAL_DATA.csv')
test = pd.read_csv(PATH + 'TEST_PREPARED.csv') 
test_raw = pd.read_csv(PATH + 'TEST_RAW_DATA.csv')
test_add = pd.read_csv(PATH + 'TEST_ADDITIONAL_DATA.csv')

In [5]:
train.head(3)

Unnamed: 0,cut_date,days_to_end,email,first_date,last_date,num_country_max_1days,num_city_max_1days,android_max_1days,smarttv_max_1days,iphone_max_1days,...,apple_max_30days,pc_max_30days,activity_1to3,activity_1to7,activity_1to14,activity_7to14,activity_7to21,activity_7to30,activity_14to30,label
0,2019-04-02 00:00:00,11,8ba752f2c5,2019-02-10 00:00:00,2019-03-13 09:36:59,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-04-02 00:00:00,10,752a6d96f7,2018-10-28 00:00:00,2019-04-01 20:51:29,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.005807,0.005774,0.00303,0.521792,1.0
2,2019-04-02 00:00:00,4,827f6afef3,2018-12-14 00:00:00,2019-04-01 18:00:49,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.0,0.0,0.0,0.937031,0.832238,0.598121,0.638308,0.0


Видно, что датафрейм уже готов. 145 колонок с признаками и 1 label. Можно попробовать обучить модели.

In [6]:
#посмотрим, менятеся ли label с течением времени
temp_df = train.groupby('email').agg({'label':'mean'})
temp_df[(temp_df.label != 0) & (temp_df.label != 1)].shape[0] / train.email.nunique() * 100

0.19

Всего у 0.19% email видно изменение label 

In [7]:
#посмотрим, есть ли пересечения email на train и на test
inner = train.merge(test, on='email')
inner.shape[0]

0

In [8]:
#пересечений нет. Уберем email и даты из данных.
X = train.drop(['cut_date', 'first_date', 'last_date','email','label'], axis=1)
y = train['label']

X_test = test.drop(['cut_date', 'first_date', 'last_date','email'], axis=1)

In [9]:
#возьмем модели из "коробки"
models = [LogisticRegression(solver='liblinear'),
          Lasso(), 
          Ridge(),
          CatBoostClassifier(n_estimators=200, silent=True), #verbose=1000
          RandomForestClassifier(n_estimators=100,n_jobs=-1),
          #XGBClassifier(n_estimators=200),
          LGBMClassifier(n_estimators=200), 
         ]

In [10]:
#сравним их на CV=3
results_0 = compare_models(models, X, y, CV=3)

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

LogisticRegression 3-fold CV is : 0.6441 +/- 0.0048


 17%|██████████████                                                                      | 1/6 [00:41<03:27, 41.42s/it]

Lasso 3-fold CV is : 0.6144 +/- 0.0015


 33%|████████████████████████████                                                        | 2/6 [00:48<02:05, 31.26s/it]

Ridge 3-fold CV is : 0.6613 +/- 0.0018


 50%|██████████████████████████████████████████                                          | 3/6 [00:50<01:06, 22.20s/it]

<catboost.core.CatBoostClassifier object at 0x000001BFD4585188 3-fold CV is : 0.809 +/- 0.002


 67%|████████████████████████████████████████████████████████                            | 4/6 [01:25<00:52, 26.17s/it]

RandomForestClassifier 3-fold CV is : 0.9709 +/- 0.0019


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [02:34<00:38, 38.88s/it]

LGBMClassifier 3-fold CV is : 0.832 +/- 0.003


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [02:51<00:00, 28.65s/it]


Max results dict_keys(['RandomForestClassifier']) = [0.9709, array([0.9733731 , 0.96884724, 0.97043379])]


Хочется выбрать RandomForest, но есть ощущение, что модель переобучилась.  

Так как пользователи перемещаны, и при изменением days_to_end значение label не меняется, то получается, что при кросс-валидации попадаются пользователи, которых модель уже видела. Поэтому такой высокий скор.  

Нужно проверить это предположение, отсортировать пользователей по email и использовать 15% holdout для верности.

In [11]:
train_sorted = train.sort_values(by='email').reset_index().drop('index', axis=1)
X_train = train_sorted[:153007].drop(['cut_date', 'first_date', 'last_date','email','label'], axis=1)
y_train = train_sorted[:153007]['label']
X_holdout = train_sorted[153007:].drop(['cut_date', 'first_date', 'last_date','email','label'], axis=1)
y_holdout = train_sorted[153007:]['label']

In [12]:
results_1 = compare_models(models, X_train, y_train, CV=3)

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

LogisticRegression 3-fold CV is : 0.6382 +/- 0.0085


 17%|██████████████                                                                      | 1/6 [00:28<02:20, 28.18s/it]

Lasso 3-fold CV is : 0.6146 +/- 0.0076


 33%|████████████████████████████                                                        | 2/6 [00:34<01:26, 21.69s/it]

Ridge 3-fold CV is : 0.6553 +/- 0.0094


 50%|██████████████████████████████████████████                                          | 3/6 [00:35<00:46, 15.44s/it]

<catboost.core.CatBoostClassifier object at 0x000001BFD4585188 3-fold CV is : 0.633 +/- 0.0124


 67%|████████████████████████████████████████████████████████                            | 4/6 [01:06<00:40, 20.12s/it]

RandomForestClassifier 3-fold CV is : 0.626 +/- 0.0117


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [01:48<00:26, 26.62s/it]

LGBMClassifier 3-fold CV is : 0.6465 +/- 0.0094


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [02:01<00:00, 20.25s/it]


Max results dict_keys(['Ridge']) = [0.6553, array([0.66832776, 0.64630109, 0.6511627 ])]


Похоже на правду, но скор низковат :(

Отберем признаки для моделей  
см. feature_selection_lgbm и feature_selection_linear

In [13]:
#минимальный тюнинг параметров для модели1
clf1 = LGBMClassifier(n_estimators=100)
param_grid1 = {'n_estimators':[100,200,500]}
grid_cv1 = GridSearchCV(clf1, param_grid1, cv=3, scoring='roc_auc', n_jobs=-1, return_train_score=True)
grid_cv1.fit(X,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid={'n_estimators': [100, 200, 500]},
             pre_dispatch='

In [14]:
grid_cv1.best_params_

{'n_estimators': 500}

In [15]:
# фичи из feature_selection_linear
linears_features = ['num_country_max_1days', 'time_spent_sum_3days', 'time_spent_max_3days',
       'num_title_ep_max_3days', 'num_title_ru_sum_3days', 'top_4_sum_3days',
       'top_8_sum_3days', 'time_spent_sum_7days', 'num_title_ep_max_7days',
       'num_title_ru_sum_7days', 'num_title_ru_max_7days',
       'num_city_max_7days', 'top_1_sum_7days', 'top_5_sum_7days',
       'top_9_sum_7days', 'top_10_sum_7days', 'iphone_max_7days',
       'ipad_max_7days', 'time_spent_max_14days', 'num_title_ep_max_14days',
       'num_title_ru_sum_14days', 'num_title_ru_max_14days',
       'num_country_max_14days', 'top_1_sum_14days', 'top_4_sum_14days',
       'top_5_sum_14days', 'top_6_sum_14days', 'top_9_sum_14days',
       'smarttv_max_14days', 'ipad_max_14days', 'time_spent_sum_21days',
       'num_title_ru_sum_21days', 'num_title_ru_max_21days',
       'num_city_max_21days', 'top_3_sum_21days', 'top_5_sum_21days',
       'top_7_sum_21days', 'top_8_sum_21days', 'top_10_sum_21days',
       'iphone_max_21days', 'apple_max_21days', 'num_title_ep_max_30days',
       'top_1_sum_30days', 'top_2_sum_30days', 'top_4_sum_30days',
       'top_6_sum_30days', 'top_8_sum_30days', 'android_max_30days',
       'smarttv_max_30days', 'apple_max_30days', 'activity_1to3',
       'activity_7to30']

In [16]:
clf2 = LogisticRegression(solver='liblinear', random_state=17)
clf2.fit(X[linears_features],y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
predictions_lgbm = grid_cv1.best_estimator_.predict_proba(X_test)[:,1]
predictions_linear = clf2.predict_proba(X_test[linears_features])[:,1]
# просто усредним прогнозы, хотя можно подобрать коэфициент
predictions = (predictions_lgbm + predictions_linear) / 2
#делаем сабмит
pd.Series(predictions,
          index=test.email,
          name='prediction').to_csv('submit_01.csv', index_label='email', header=True)

В этом решении не предусмотра генерация фичей.
Чтобы разобраться, какие фичи уже были предусмотрены 'TRAIN_PREPARED.csv' а какие еще можно сгенерить, нужно больше времени ;)
Надеюсь, это не отразить на оценке моего решения.