# Решение контеста https://boosters.pro/champ_10

### Амир Мирас Сабыргалиулы

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from tqdm import tqdm_notebook
import re
from dateutil.relativedelta import relativedelta
import ffm
from sklearn.model_selection import cross_val_score, cross_val_predict
import  matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold, GroupKFold, train_test_split
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import Imputer, MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn import ensemble
import scipy.sparse as sp

%reload_ext autoreload
%autoreload 2
%matplotlib inline

### Вспомогательные патчи

In [2]:
def df_only(df, *patterns):
    columns = set()
    for pattern in patterns:
        columns |= set([x for x in df.columns if re.search("^{}".format(pattern), x)])
    return df[list(columns)]

def df_omit(df, *patterns):
    columns = set(df.columns)
    for pattern in patterns:
        columns &= set([x for x in df.columns if not bool(re.search("^{}".format(pattern), x))])
    return df[list(columns)]


pd.DataFrame.only = df_only
pd.DataFrame.omit = df_omit

### Обработчики данных

In [3]:
def handler_client_data(df):
    h_df = pd.DataFrame()
    h_df['fn:client_data:age'] = df['age']
    h_df['fc:client_data:sex'] = df['sex'].map({'female': 0, 'male': 1})
    h_df['dt:client_data:create'] = df['create_datetime']
    h_df['id_user'] = df['id_user']
    return h_df

def handler_show_data(df, bag_of_words=True):
    h_df = df.groupby('id_show').first().reset_index()
    h_df['IdBuilding'] = df.groupby('id_show').apply(
        lambda x: sum(x['IdBuilding'].apply(lambda x: [str(x)]), [])).values
    h_df['fn:show_data:building_count'] = h_df['IdBuilding'].apply(len)
    h_df = h_df.rename(
        columns={
            'organizer_id': 'id:show_data:organizer', 
            'age_category': 'fc:show_data:age_category', 
            'duration': 'fn:show_data:duration', 
            'parent_genre_id': 'list:show_data:parent_genre',
            'child_genre_id': 'list:show_data:child_genre', 
            'IdBuilding': 'list:show_data:building', 
            'show_maxprice': 'fn:show_data:show_maxprice', 
            'show_minprice': 'fn:show_data:show_minprice', 
            'show_meanprice': 'fn:show_data:show_meanprice', 
            'show_stdprice': 'fn:show_data:show_stdprice' 
        }
    )
    find_numbers = lambda x: str(x) if x != x else re.findall("[0-9]+", x)
    h_df['list:show_data:child_genre'] = h_df['list:show_data:child_genre'].apply(find_numbers)
    h_df['list:show_data:parent_genre'] = h_df['list:show_data:parent_genre'].apply(find_numbers)
    return h_df

def handler_impressions(df):
    h_df = df.copy()
    return h_df.rename(
        columns={
            'rank': 'fn:impressions:rank', 
            'event_datetime_m': 'dt:impressions:event'
        }
    ).sort_values('dt:impressions:event').reset_index(drop=True)

def handler_no_impressions(df):
    h_df = df.copy()
    return h_df.rename(
        columns={
            'event_datetime_m': 'dt:no_impressions:event'
        }
    )

def handler_show_images(df):
    h_df = df.copy()
    return df.rename(
        columns={
            'image_id': 'id:show_images:image', 
        }
    )

def handler_show_rating(df):
    h_df = df.copy()
    return df.rename(
        columns={
            'date_time': 'dt:show_rating', 
            'rating': 'fn:show_rating:rating',
            'rating_count': 'fn:show_rating:rating_count',
            'review_count': 'fn:show_rating:review_count'
        }
    )

### Загрузка и обработка данных

In [4]:
impressions = handler_impressions(pd.read_csv("impressions.сsv", parse_dates=['event_datetime_m']))
test_impressions = handler_impressions(pd.read_csv("test.csv", parse_dates=['event_datetime_m']))
click_impressions = handler_no_impressions(pd.read_csv("clicks_no_impressions.сsv", parse_dates=['event_datetime_m']))
show_data = handler_show_data(pd.read_csv("show_data.сsv"))
client_data = handler_client_data(pd.read_csv("client_data.сsv", parse_dates=['create_datetime']))
show_rating = handler_show_rating(pd.read_csv("show_rating.сsv", parse_dates=['date_time']))
show_images = handler_show_images(pd.read_csv("show_images.сsv"))

In [5]:
df = pd.concat([impressions, test_impressions]).reset_index(drop=True)

In [6]:
df['dt:impressions:event:hour'] = df['dt:impressions:event'].apply(
    lambda x: datetime(x.year, x.month, x.day, x.hour))
df['dt:impressions:event:day'] = df['dt:impressions:event'].apply(
    lambda x: datetime(x.year, x.month, x.day))

### Маски для каждого месяца: будем обучаться на марте и предсказывать апрель

In [7]:
mask_train = (df['is_clicked'].notnull())
apr = df['is_clicked'].isnull()
yan = mask_train & (
    df['dt:impressions:event'] < datetime(2017, 2, 1, 0, 0, 0)) & (
    df['dt:impressions:event'] >= datetime(2017, 1, 1, 0, 0, 0))
feb = mask_train & (
    df['dt:impressions:event'] < datetime(2017, 3, 1, 0, 0, 0)) & (
    df['dt:impressions:event'] >= datetime(2017, 2, 1, 0, 0, 0))
mar = mask_train & (
    df['dt:impressions:event'] >= datetime(2017, 3, 1, 0, 0, 0))

In [8]:
yan_no = (
    click_impressions['dt:no_impressions:event'] < datetime(2017, 2, 1, 0, 0, 0)) & (
    click_impressions['dt:no_impressions:event'] >= datetime(2017, 1, 1, 0, 0, 0))
feb_no = (
    click_impressions['dt:no_impressions:event'] < datetime(2017, 3, 1, 0, 0, 0)) & (
    click_impressions['dt:no_impressions:event'] >= datetime(2017, 2, 1, 0, 0, 0))
mar_no = (
    click_impressions['dt:no_impressions:event'] < datetime(2017, 4, 1, 0, 0, 0)) & (
    click_impressions['dt:no_impressions:event'] >= datetime(2017, 3, 1, 0, 0, 0))

In [9]:
yan_sr = (
    show_rating['dt:show_rating'] < datetime(2017, 2, 1)) & (
    show_rating['dt:show_rating'] >= datetime(2017, 1, 1))
feb_sr = (
    show_rating['dt:show_rating'] < datetime(2017, 3, 1)) & (
    show_rating['dt:show_rating'] >= datetime(2017, 2, 1))
mar_sr = (
    show_rating['dt:show_rating'] < datetime(2017, 4, 1)) & (
    show_rating['dt:show_rating'] >= datetime(2017, 3, 1))

### Фичи по клиентам и мероприятиям

In [10]:
df = df.merge(show_data, on='id_show', how='left')
df = df.merge(client_data, on='id_user', how='left')
df = df.merge(show_images, on='id_show', how='left')
df = df.merge(
    df.groupby(['id_user', 'id_show']).size().reset_index().rename(
        columns={0: 'fk:id_user_id_show'}), 
    on=['id_user', 'id_show'],
    how='left'
)

In [11]:
df['fk:id_user'] = df['id_user'].map(df['id_user'].value_counts())
df['fk:id_show'] = df['id_show'].map(df['id_show'].value_counts())

df['fn:impressions:event:hour'] = df['dt:impressions:event'].dt.hour
df['fn:impressions:event:weekday'] = df['dt:impressions:event'].dt.weekday
df['fn:impressions:event:minute'] = df['dt:impressions:event'].dt.minute
df['fc:show_images:have_image'] = df['id:show_images:image'].notnull()

### Счетчики

In [12]:
merge_columns = [
    ['dt:impressions:event', 'id_user'],
    ['dt:impressions:event', 'id_user', 'id_show'],
    ['dt:impressions:event', 'id_user', 'fn:impressions:rank'],
    ['dt:impressions:event', 'id_user', 'id_show', 'fn:impressions:rank'],
    ['dt:impressions:event', 'id_show'],
    ['dt:impressions:event', 'id_show', 'fn:impressions:rank'],
    ['dt:impressions:event:hour', 'id_user'],
    ['dt:impressions:event:hour', 'id_user', 'id_show'],
    ['dt:impressions:event:hour', 'id_user', 'fn:impressions:rank'],
    ['dt:impressions:event:hour', 'id_user', 'id_show', 'fn:impressions:rank'],
    ['dt:impressions:event:hour', 'id_show'],
    ['dt:impressions:event:hour', 'id_show', 'fn:impressions:rank'],
    ['dt:impressions:event:day', 'id_user'],
    ['dt:impressions:event:day', 'id_user', 'id_show'],
    ['dt:impressions:event:day', 'id_user', 'fn:impressions:rank'],
    ['dt:impressions:event:day', 'id_user', 'id_show', 'fn:impressions:rank'],
    ['dt:impressions:event:day', 'id_show'],
    ['dt:impressions:event:day', 'id_show', 'fn:impressions:rank'],
]

for cols in merge_columns:
    print('fm:{}'.format('_'.join([x.split(":")[-1] for x in cols])))
    df = df.merge(
        df.groupby(cols).size().reset_index().rename(
            columns={0: 'fm:{}'.format('_'.join([x.split(":")[-1] for x in cols]))}), 
        on=cols,
        how='left'
    )

fm:event_id_user
fm:event_id_user_id_show
fm:event_id_user_rank
fm:event_id_user_id_show_rank
fm:event_id_show
fm:event_id_show_rank
fm:hour_id_user
fm:hour_id_user_id_show
fm:hour_id_user_rank
fm:hour_id_user_id_show_rank
fm:hour_id_show
fm:hour_id_show_rank
fm:day_id_user
fm:day_id_user_id_show
fm:day_id_user_rank
fm:day_id_user_id_show_rank
fm:day_id_show
fm:day_id_show_rank


In [13]:
train = df.loc[mar].copy()
test = df.loc[apr].copy()
val_train = df.loc[feb].copy()
val_test = df.loc[mar].copy()

y = df.loc[mar, 'is_clicked']
y_train = df.loc[feb, 'is_clicked']
y_test = df.loc[mar, 'is_clicked']

### Фичи по кликабельности из clicks_no_impressions

In [14]:
def click_no_features(df, df_clicks, days=None, month=None):
    max_date = df['dt:impressions:event'].min()
    if days is not None:
        min_date = max_date - relativedelta(days=days)
        name = "{}_days".format(days)
    elif month is not None:
        min_date = max_date - relativedelta(month=month)
        name = "{}_month".format(month)
    else: 
        raise Exception
    date = df_clicks['dt:no_impressions:event']
    mask = (date >= min_date) & (date < max_date)
    
    return_df = df.merge(
        df_clicks[mask].groupby(['id_user', 'id_show']).size().reset_index().rename(
            columns={0: 'fi:id_user_id_show_{}'.format(name)}), 
        on=['id_user', 'id_show'],
        how='left'
    )
    return_df['fi:id_user_{}'.format(name)] = return_df['id_user'].map(
        df_clicks.loc[mask, 'id_user'].value_counts(normalize=True))
    return_df['fi:id_show_{}'.format(name)] = return_df['id_show'].map(
        df_clicks.loc[mask, 'id_show'].value_counts(normalize=True))
    return return_df

In [15]:
val_train = click_no_features(val_train, click_impressions, month=1)
val_train = click_no_features(val_train, click_impressions, days=3)
val_train = click_no_features(val_train, click_impressions, days=7)
val_train = click_no_features(val_train, click_impressions, days=14)
val_train = click_no_features(val_train, click_impressions, days=21)
val_train = click_no_features(val_train, click_impressions, days=45)

val_test = click_no_features(val_test, click_impressions, month=1)
val_test = click_no_features(val_test, click_impressions, days=3)
val_test = click_no_features(val_test, click_impressions, days=7)
val_test = click_no_features(val_test, click_impressions, days=14)
val_test = click_no_features(val_test, click_impressions, days=21)
val_test = click_no_features(val_test, click_impressions, days=45)

In [16]:
train = click_no_features(train, click_impressions, month=1)
train = click_no_features(train, click_impressions, days=3)
train = click_no_features(train, click_impressions, days=7)
train = click_no_features(train, click_impressions, days=14)
train = click_no_features(train, click_impressions, days=21)
train = click_no_features(train, click_impressions, days=45)

test = click_no_features(test, click_impressions, month=1)
test = click_no_features(test, click_impressions, days=3)
test = click_no_features(test, click_impressions, days=7)
test = click_no_features(test, click_impressions, days=14)
test = click_no_features(test, click_impressions, days=21)
test = click_no_features(test, click_impressions, days=45)

### Фичи по рейтингу мероприятия

In [17]:
val_train = val_train.merge(
    show_rating.loc[yan_sr, ['fn:show_rating:rating', 'fn:show_rating:rating_count', 
                         'fn:show_rating:review_count', 'id_show']
               ].groupby('id_show').mean().reset_index().rename(
        columns={
            'fn:show_rating:rating': 'fs:show_rating:rating',
            'fn:show_rating:rating_count': 'fs:show_rating:rating_count', 
            'fn:show_rating:review_count': 'fs:show_rating:review_count'
        }), 
    on=['id_show'],
    how='left'
)

val_test = val_test.merge(
    show_rating.loc[feb_sr, ['fn:show_rating:rating', 'fn:show_rating:rating_count', 
                         'fn:show_rating:review_count', 'id_show']
               ].groupby('id_show').mean().reset_index().rename(
        columns={
            'fn:show_rating:rating': 'fs:show_rating:rating',
            'fn:show_rating:rating_count': 'fs:show_rating:rating_count', 
            'fn:show_rating:review_count': 'fs:show_rating:review_count'
        }), 
    on=['id_show'],
    how='left'
)

In [18]:
train = train.merge(
    show_rating.loc[feb_sr, ['fn:show_rating:rating', 'fn:show_rating:rating_count', 
                         'fn:show_rating:review_count', 'id_show']
               ].groupby('id_show').mean().reset_index().rename(
        columns={
            'fn:show_rating:rating': 'fs:show_rating:rating',
            'fn:show_rating:rating_count': 'fs:show_rating:rating_count', 
            'fn:show_rating:review_count': 'fs:show_rating:review_count'
        }), 
    on=['id_show'],
    how='left'
)

test = test.merge(
    show_rating.loc[mar_sr, ['fn:show_rating:rating', 'fn:show_rating:rating_count', 
                         'fn:show_rating:review_count', 'id_show']
               ].groupby('id_show').mean().reset_index().rename(
        columns={
            'fn:show_rating:rating': 'fs:show_rating:rating',
            'fn:show_rating:rating_count': 'fs:show_rating:rating_count', 
            'fn:show_rating:review_count': 'fs:show_rating:review_count'
        }), 
    on=['id_show'],
    how='left'
)

### Сглаженные средние по предыдущему месяцу

In [19]:
FEB_MEAN = df.loc[feb, 'is_clicked'].mean()
YAN_MEAN = df.loc[yan, 'is_clicked'].mean()
MAR_MEAN = df.loc[mar, 'is_clicked'].mean()

yan_user_mean = df.loc[yan].groupby('id_user')['is_clicked'].mean()
feb_user_mean = df.loc[feb].groupby('id_user')['is_clicked'].mean()
mar_user_mean = df.loc[mar].groupby('id_user')['is_clicked'].mean()

yan_show_mean = df.loc[yan].groupby('id_show')['is_clicked'].mean()
feb_show_mean = df.loc[feb].groupby('id_show')['is_clicked'].mean()
mar_show_mean = df.loc[mar].groupby('id_show')['is_clicked'].mean()

yan_user_size = df.loc[yan].groupby('id_user')['is_clicked'].size()
feb_user_size = df.loc[feb].groupby('id_user')['is_clicked'].size()
mar_user_size = df.loc[mar].groupby('id_user')['is_clicked'].size()

yan_show_size = df.loc[yan].groupby('id_show')['is_clicked'].size()
feb_show_size = df.loc[feb].groupby('id_show')['is_clicked'].size()
mar_show_size = df.loc[mar].groupby('id_show')['is_clicked'].size()

ALPHA = 2
yan_user_target_ctr = (yan_user_mean * yan_user_size + ALPHA * YAN_MEAN) / (yan_user_size + ALPHA)
feb_user_target_ctr = (feb_user_mean * feb_user_size + ALPHA * FEB_MEAN) / (feb_user_size + ALPHA)
mar_user_target_ctr = (mar_user_mean * mar_user_size + ALPHA * MAR_MEAN) / (mar_user_size + ALPHA)

yan_show_target_ctr = (yan_show_mean * yan_show_size + ALPHA * YAN_MEAN) / (yan_show_size + ALPHA)
feb_show_target_ctr = (feb_show_mean * feb_show_size + ALPHA * FEB_MEAN) / (feb_show_size + ALPHA)
mar_show_target_ctr = (mar_show_mean * mar_show_size + ALPHA * MAR_MEAN) / (mar_show_size + ALPHA)

val_train['fo:id_user'] = val_train['id_user'].map(yan_user_target_ctr)
val_test['fo:id_user'] = val_test['id_user'].map(feb_user_target_ctr)

val_train['fo:id_show'] = val_train['id_show'].map(yan_show_target_ctr)
val_test['fo:id_show'] = val_test['id_show'].map(feb_show_target_ctr)

train['fo:id_user'] = train['id_user'].map(feb_user_target_ctr)
test['fo:id_user'] = test['id_user'].map(mar_user_target_ctr)

train['fo:id_show'] = train['id_show'].map(feb_show_target_ctr)
test['fo:id_show'] = test['id_show'].map(mar_show_target_ctr)

### Сглаженные средние по предыдущим месяцам

In [24]:
FEB_MEAN = df.loc[feb | yan, 'is_clicked'].mean()
YAN_MEAN = df.loc[yan, 'is_clicked'].mean()
MAR_MEAN = df.loc[mar | feb | yan, 'is_clicked'].mean()

yan_user_mean = df.loc[yan].groupby('id_user')['is_clicked'].mean()
feb_user_mean = df.loc[feb | yan].groupby('id_user')['is_clicked'].mean()
mar_user_mean = df.loc[mar | feb | yan].groupby('id_user')['is_clicked'].mean()

yan_show_mean = df.loc[yan].groupby('id_show')['is_clicked'].mean()
feb_show_mean = df.loc[feb | yan].groupby('id_show')['is_clicked'].mean()
mar_show_mean = df.loc[mar | feb | yan].groupby('id_show')['is_clicked'].mean()

yan_user_size = df.loc[yan].groupby('id_user')['is_clicked'].size()
feb_user_size = df.loc[feb | yan].groupby('id_user')['is_clicked'].size()
mar_user_size = df.loc[mar | feb | yan].groupby('id_user')['is_clicked'].size()

yan_show_size = df.loc[yan].groupby('id_show')['is_clicked'].size()
feb_show_size = df.loc[feb | yan].groupby('id_show')['is_clicked'].size()
mar_show_size = df.loc[mar | feb | yan].groupby('id_show')['is_clicked'].size()

ALPHA = 3
yan_user_target_ctr = (yan_user_mean * yan_user_size + ALPHA * YAN_MEAN) / (yan_user_size + ALPHA)
feb_user_target_ctr = (feb_user_mean * feb_user_size + ALPHA * FEB_MEAN) / (feb_user_size + ALPHA)
mar_user_target_ctr = (mar_user_mean * mar_user_size + ALPHA * MAR_MEAN) / (mar_user_size + ALPHA)

yan_show_target_ctr = (yan_show_mean * yan_show_size + ALPHA * YAN_MEAN) / (yan_show_size + ALPHA)
feb_show_target_ctr = (feb_show_mean * feb_show_size + ALPHA * FEB_MEAN) / (feb_show_size + ALPHA)
mar_show_target_ctr = (mar_show_mean * mar_show_size + ALPHA * MAR_MEAN) / (mar_show_size + ALPHA)

val_train['fr:id_user'] = val_train['id_user'].map(yan_user_target_ctr)
val_test['fr:id_user'] = val_test['id_user'].map(feb_user_target_ctr)

val_train['fr:id_show'] = val_train['id_show'].map(yan_show_target_ctr)
val_test['fr:id_show'] = val_test['id_show'].map(feb_show_target_ctr)

train['fr:id_user'] = train['id_user'].map(feb_user_target_ctr)
test['fr:id_user'] = test['id_user'].map(mar_user_target_ctr)

train['fr:id_show'] = train['id_show'].map(feb_show_target_ctr)
test['fr:id_show'] = test['id_show'].map(mar_show_target_ctr)

In [26]:
fm = val_train.only('fm').columns.tolist()
fc = val_train.only('fc').columns.tolist()
fo = val_train.only('fo').columns.tolist()
fn = val_train.only('fn').columns.tolist()
fk = val_train.only('fk').columns.tolist()
fs = val_train.only('fs').columns.tolist()
fi = val_train.only('fi').columns.tolist()
fr = val_train.only('fr').columns.tolist()

columns = fm + fc + fo + fn + fk + fs + fi + fr

### Метапризнаки

In [28]:
def meta_features(df_val, t_val, df_train, t_train, df_test, features):
    models = [
        LGBMClassifier(n_estimators=200, learning_rate=0.1, 
                     max_depth=4,
                     random_state=1, n_jobs=-1),
        LGBMRegressor(n_estimators=200, learning_rate=0.1, 
                     max_depth=5,
                     random_state=1, n_jobs=-1),
        XGBRegressor(n_estimators=200, learning_rate=0.1, 
                     max_depth=4,
                     random_state=1, n_jobs=-1),
        ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=200, 
                                      max_depth=5, random_state=1),
        ensemble.RandomForestClassifier(
            n_jobs=-1, n_estimators=200, max_depth=5, random_state=1)
    ]
    imp = Imputer()
    scaler = MinMaxScaler()
    f_train = []
    f_test = []
    for model in tqdm_notebook(models):
        model.fit(scaler.fit_transform(
            imp.fit_transform(df_val[features].values)), t_val.values)
        if hasattr(model, 'predict_proba'):
            f_tr = model.predict_proba(
                scaler.transform(imp.transform(df_train[features].values)))[:, 1]
        else:
            f_tr = model.predict(
                scaler.transform(imp.transform(df_train[features].values)))

        model.fit(scaler.fit_transform(
            imp.fit_transform(df_train[features].values)), t_train.values)
        if hasattr(model, 'predict_proba'):
            f_te = model.predict_proba(
                scaler.transform(imp.transform(df_test[features].values)))[:, 1]
        else:
            f_te = model.predict(
                scaler.transform(imp.transform(df_test[features].values)))
        f_train.append(f_tr.reshape(-1, 1))
        f_test.append(f_te.reshape(-1, 1))
    return pd.DataFrame(
        np.concatenate(f_train, axis=1),
        index=df_train.index
    ), pd.DataFrame(
        np.concatenate(f_test, axis=1),
        index=df_test.index
    )

In [29]:
mcnk = fm + fc + fn + fk
mkois = fm + fk + fo + fi + fs
mnkoi = fm + fn + fk + fo + fi

mcnk_train, mcnk_test = meta_features(val_train, y_train, train, y, test, mcnk)
mkois_train, mkois_test = meta_features(val_train, y_train, train, y, test, mkois)
mnkoi_train, mnkoi_test = meta_features(val_train, y_train, train, y, test, mnkoi)










In [39]:
meta_train = [mcnk_train, mkois_train, mnkoi_train]

meta_test = [mcnk_test, mkois_test, mnkoi_test]

In [31]:
f_train = []
f_test = []
for i, meta in enumerate(meta_train):
    meta.columns = [
        "lgbclf_{}".format(i),
        "lgbreg_{}".format(i),
        "xgbreg_{}".format(i),
        "etclf_{}".format(i),
        "rfclf_{}".format(i),
    ]
    f_train.append(meta)

for i, meta in enumerate(meta_test):
    meta.columns = [
        "lgbclf_{}".format(i),
        "lgbreg_{}".format(i),
        "xgbreg_{}".format(i),
        "etclf_{}".format(i),
        "rfclf_{}".format(i),
    ]
    f_test.append(meta)

In [32]:
stack_train = pd.concat(f_train, axis=1)
stack_test = pd.concat(f_test, axis=1)

In [33]:
filt = []
for col in stack_train.columns:
    if log_loss(
            y, 
            np.concatenate(
                [1 - stack_train[col].values.reshape(-1, 1), stack_train[col].values.reshape(-1, 1)], axis=1
            )
        ) <= 0.3:
        filt.append(col)

    print(col, 
        log_loss(
            y, 
            np.concatenate(
                [1 - stack_train[col].values.reshape(-1, 1), stack_train[col].values.reshape(-1, 1)], axis=1
            )
        )
    )

lgbclf_0 0.114758013355
lgbreg_0 0.397655539248
xgbreg_0 0.360116542298
etclf_0 0.249134746924
rfclf_0 0.093029819474
lgbclf_1 0.114006274527
lgbreg_1 0.389707535492
xgbreg_1 0.118376452903
etclf_1 0.261404240198
rfclf_1 0.0926040159345
lgbclf_2 0.106920849378
lgbreg_2 0.39599138448
xgbreg_2 0.304324002502
etclf_2 0.267616010386
rfclf_2 0.0915280882243


In [34]:
strain = stack_train[filt]
stest = stack_test[filt]

### Обучаем 5 lgb-ов и усредняем ответы

In [35]:
p = []
for i in [0, 1, 2, 3, 4]:
    lgb = LGBMClassifier(n_estimators=420, learning_rate=0.01, 
                         max_depth=6, subsample=1.0 - i / 100,
                         random_state=i * 777, n_jobs=-1)
    lgb.fit(pd.concat([train[columns], strain], axis=1).values, y.values)
    preds = lgb.predict_proba(pd.concat([test[columns], stest], axis=1).values)
    print(preds[:, 1].mean())
    p.append(preds[:, 1])

0.135920757928
0.136250732593
0.136172055813
0.136474041405
0.136439349003


In [36]:
preds = []
for ans in p:
    preds.append(ans)
    print(ans.mean())

0.135920757928
0.136250732593
0.136172055813
0.136474041405
0.136439349003


In [37]:
sub = pd.DataFrame(np.mean(preds, axis=0), index=test['id'].astype(int)).rename_axis(
    {0: 'answer'}, axis=1)
sub.to_csv('stackv11.csv')