![](header.png)

## Постановка задачи
Необходимо определить стоимость транзакций для каждого потенциального клиента. Это первый шаг на пути к персонализации услуг.

Согласно исследованиям Epsilon, 80% клиентов с большей вероятностью будут вести бизнес с теми, кто предоставляет персонализированное обслуживание. Банковское дело не является исключением.

Цифровизация повседневной жизни привела к тому, что клиенты ожидают персонализированного и своевременного обслуживания — причём ещё до того, как сами осознают, что услуга им нужна.

Метрика оценки — RMSLE (Root Mean Squared Logarithmic Error).

## Подход к решению задачи
Применяем различные подходы и методы ML, после чего объединяем (блендим) полученные решения в одно.

## Data Leak
Чтобы разобраться в утечке данных (data leak), обнаруженной в этом соревновании пользователем [Giba](https://www.kaggle.com/titericz), рекомендую почитать [обсуждение](https://www.kaggle.com/c/santander-value-prediction-challenge/discussion/61329) и посмотреть [кернел](https://www.kaggle.com/johnfarrell/giba-s-property-extended-extended-result).

## Подгружаем все необходимое

In [1]:
# Core numerics / data handling.
import numpy as np 
import pandas as pd 
import os
# List the datasets attached to the kernel (the result is shown below the cell).
print(os.listdir("../input"))

# Modelling and statistics helpers.
# NOTE(review): none of these names are referenced in the visible part of the
# notebook — confirm they are needed further down before keeping them.
import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

# Plotting (not used in the visible part of the notebook).
import matplotlib.pyplot as plt
import seaborn as sns

# Dask (not used in the visible part of the notebook).
import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
# Called so that pandas objects gain `progress_apply`, which `_get_leak` uses.
# NOTE(review): passing `tqdm_notebook` positionally is presumably meant to
# select the notebook-style bar — verify against the installed tqdm version.
tqdm.pandas(tqdm_notebook)

['santander-value-prediction-challenge', 'publicsubs']


In [3]:
# Read the competition train / test tables.
train = pd.read_csv("../input/santander-value-prediction-challenge/train.csv")
test = pd.read_csv("../input/santander-value-prediction-challenge/test.csv")

# Every column except the row identifier and the label.
transact_cols = [name for name in train.columns if name not in ("ID", "target")]
# Label on the log1p scale, as a plain array.
y = np.log1p(train["target"].values)

Столбцы, образующие временной ряд, берём [отсюда](https://www.kaggle.com/johnfarrell/giba-s-property-extended-result).

In [4]:
# The 40 column names treated as one ordered sequence.
# The order matters: `_get_leak` slices and shifts this list by position.
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'] 

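Ищем data leak (для каждого лага `lag`, начиная с 0):
1.  Для каждой строки таблицы склеиваем в одну текстовую строку все значения, кроме первых двух (в общем случае — кроме первых `lag + 2`).
1.  Сдвигаем значения каждой строки таблицы на два столбца вправо (в общем случае — на `lag + 2`) и снова склеиваем их в текстовую строку.
1.  Для каждой строки таблицы находим строки, у которых текстовая строка из шага 1 совпадает с её текстовой строкой из шага 2.
1.  Берём из найденной строки значение первого столбца (в общем случае — столбца с номером `lag`), но только если на шаге 3 нашлось ровно одно совпадение; иначе записываем 0.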

In [5]:
from multiprocessing import Pool
# NOTE(review): neither `Pool` nor `CPU_CORES` is used in the visible part of
# the notebook — confirm they are needed further down.
CPU_CORES = 1
def _get_leak(df, cols, lag=0):
    """Look up a leaked value for every row of ``df`` at the given ``lag``.

    Each row's values in ``cols[lag + 2:]`` are rounded to 2 decimals and
    joined with ``"_"`` into a signature string. A second signature is built
    the same way after shifting the row ``lag + 2`` columns to the right, so
    it is made of the row's leading values instead. For row *i*, every row *j*
    whose plain signature equals row *i*'s shifted signature is collected;
    when exactly one such row exists, its value in column ``cols[lag]`` is
    returned for row *i*, otherwise 0.

    Every shifted signature is compared against all plain signatures, so the
    cost grows quadratically with the number of rows.

    Args:
        df: DataFrame containing all columns named in ``cols``.
        cols: Ordered list of column names; the order defines the shift.
        lag: Non-negative offset added to the base shift of two columns.

    Returns:
        A Series aligned with ``df.index`` holding the matched value or 0.
    """
    def signature(row):
        return "_".join(row.round(2).astype(str))

    tail_cols = cols[lag + 2:]
    series_str = df[tail_cols].apply(signature, axis=1)
    series_shifted_str = df[cols].shift(lag + 2, axis=1)[tail_cols].apply(signature, axis=1)

    # np.where yields row *positions*, so the value lookup has to be
    # positional as well; label-based .loc is only correct for a 0..n-1 index.
    haystack = np.asarray(series_str, dtype=object)
    leak_source = df[cols[lag]].values

    # Use tqdm's progress_apply when tqdm.pandas() has been registered,
    # otherwise fall back to the plain apply.
    apply_rows = getattr(series_shifted_str, "progress_apply", series_shifted_str.apply)
    target_rows = apply_rows(lambda sig: np.where(haystack == sig)[0])
    target_vals = target_rows.apply(
        lambda hits: leak_source[hits[0]] if len(hits) == 1 else 0
    )
    return target_vals

def get_all_leak(df, cols=None, nlags=15):
    """Return a copy of ``df`` extended with one leak column per lag.

    For every lag in ``range(nlags)`` a progress line is printed and the
    result of ``_get_leak`` is stored in a new column named
    ``"leaked_target_" + str(lag)``. The caller's frame is not modified.

    NOTE(review): ``cols`` defaults to None but is handed straight to
    ``_get_leak``, which slices it — callers have to pass a list.
    """
    augmented = df.copy()

    for lag in range(nlags):
        print("Processing lag {}".format(lag))
        column_name = "leaked_target_" + str(lag)
        augmented[column_name] = _get_leak(augmented, cols, lag)
    return augmented

In [6]:
# The test frame is given a "target" column (filled with the train mean) so
# that the same column selection works for both frames below.
test["target"] = train["target"].mean()

# Stack train on top of test, keeping only ID, target and the selected
# columns, and renumber the rows 0..n-1.
all_df = pd.concat(
    [part[["ID", "target"] + cols] for part in (train, test)],
    ignore_index=True,
)
all_df.head()

Unnamed: 0,ID,target,f190486d6,58e2e02e6,eeb9cd3aa,9fd594eec,6eef030c1,15ace8c9f,fb0f5dbfe,58e056e12,20aa07010,024c577b9,d6bb78916,b43a7cfd5,58232a6fb,1702b5bf0,324921c7b,62e59a501,2ec5b290f,241f0f867,fb49e4212,66ace2992,f74e8f13d,5c6487af1,963a49cdc,26fc93eb7,1931ccfdd,703885424,70feb1494,491b9ee45,23310aa6f,e176a204a,6619d81fc,1db387535,fc99f9426,91f701ba2,0572565c2,190db8488,adb64ff71,c47340d97,c5a231d81,0ff32eb98
0,000d6aaf2,38000000.0,1866666.66,12066666.66,700000.0,600000.0,900000.0,4100000.0,0.0,0.0,0.0,0.0,0.0,0.0,28000000.0,0.0,6050000.0,0.0,0.0,950000.0,0.0,0.0,1733333.34,0.0,13200000.0,3205000.0,2000000.0,0.0,1200000.0,0.0,0.0,0.0,400000.0,0.0,0.0,5000000.0,400000.0,0.0,0.0,0.0,0.0,0.0
1,000fbd867,600000.0,0.0,2850000.0,2225000.0,1800000.0,800000.0,0.0,0.0,3300000.0,2200000.0,0.0,2000000.0,0.0,0.0,16000000.0,7000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0027d6b71,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5500000.0,0.0,0.0,12000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,2000000.0,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002a68644,14400000.0,0.0,0.0,0.0,0.0,37662000.0,0.0,4000000.0,6700000.0,2000000.0,5400000.0,0.0,0.0,0.0,0.0,0.0,0.0,1180000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8000000.0,0.0,0.0


In [None]:
# Number of lags to scan; overrides get_all_leak's default of 15.
NLAGS = 25
# Adds the columns leaked_target_0 .. leaked_target_24 to all_df.
all_df = get_all_leak(all_df, cols=cols, nlags=NLAGS)

Processing lag 0


In [None]:
# Public submission files to blend. Each comment gives the source kernel and
# the score noted by the author ("LB" presumably stands for leaderboard).
# NOTE(review): in the visible part of the notebook BEST_69 is only used as a
# template for the output frame, and ROUNED_MIN2, NOFAKE and XGB appear only
# in commented-out lines.

# https://www.kaggle.com/tezdhar/breaking-lb-fresh-start, LB 0.69
BEST_69 = pd.read_csv("../input/publicsubs/baseline_submission_with_leaks.csv")

# Source not recorded (the original note here was just a question mark).
ROUNED_MIN2 = pd.read_csv("../input/publicsubs/baseline_submission_with_leaks_ROUNDED_MINUS2.csv")

# https://www.kaggle.com/johnfarrell/baseline-with-lag-select-fake-rows-dropped, LB: 0.69
NOFAKE = pd.read_csv("../input/publicsubs/non_fake_sub_lag_29.csv")

# https://www.kaggle.com/ogrellier/feature-scoring-vs-zeros/output, xgb, LB 0.66
XGB = pd.read_csv("../input/publicsubs/leaky_submission.csv")

# https://www.kaggle.com/zeus75/xgboost-features-scoring-with-ligthgbm-model/output, LB 0.65
XGB1 = pd.read_csv("../input/publicsubs/leaky_submission1.csv")

# https://www.kaggle.com/the1owl/love-is-the-answer/output?scriptVersionId=4733381, LB 0.63
BLEND04 = pd.read_csv("../input/publicsubs/blend04.csv") 

# https://www.kaggle.com/prashantkikani/santad-label-is-present-in-row/output, LB 0.63
ISLABEL = pd.read_csv("../input/publicsubs/final.csv")

# https://www.kaggle.com/danil328/ligthgbm-with-bayesian-optimization/output, LB 0.65
MYSUB = pd.read_csv("../input/publicsubs/my_submission.csv")

# https://www.kaggle.com/nulldata/jiazhen-to-armamut-via-gurchetan1000-0-56/output, LB 0.56
JIAZHEN = pd.read_csv("../input/publicsubs/baseline_sub_lag_37.csv")

In [None]:
# Put the `target` column of each submission side by side, one column per
# submission, so that they can be compared with each other.
CORR = pd.DataFrame()
# The four submissions below are currently left out of the comparison.
#CORR['BEST_69'] = BEST_69.target
#CORR['ROUNED_MIN2'] = ROUNED_MIN2.target
#CORR['NOFAKE'] = NOFAKE.target
#CORR['XGB'] = XGB.target
CORR['XGB1'] = XGB1.target
CORR['BLEND04'] = BLEND04.target
CORR['ISLABEL'] = ISLABEL.target
CORR['MYSUB'] = MYSUB.target
CORR['JIAZHEN'] = JIAZHEN.target

In [None]:
# Pairwise correlation matrix of the collected prediction columns.
print(CORR.corr())

In [None]:
# Weighted blend of five public submissions: JIAZHEN gets 0.6, the other four
# 0.1 each. BEST_69 only serves as a template for the output frame.
# NOTE(review): the weighted sum aligns rows by index, not by ID — this assumes
# every submission lists its rows in the same order; confirm.
ENS_LEAKS = BEST_69.copy()

# Bracket assignment always writes the column; attribute assignment would set
# a plain Python attribute instead if "target" were not already a column.
ENS_LEAKS["target"] = (
    0.6 * JIAZHEN["target"]
    + 0.1 * BLEND04["target"]
    + 0.1 * ISLABEL["target"]
    + 0.1 * XGB1["target"]
    + 0.1 * MYSUB["target"]
)

# `index` is documented as a bool; False states the intent explicitly.
ENS_LEAKS.to_csv("ENS_LEAKS.csv", index=False)

In [None]:
# Alternative weighting of the same five submissions: JIAZHEN 0.5, BLEND04 and
# MYSUB 0.15 each, ISLABEL and XGB1 0.1 each. BEST_69 only serves as a
# template for the output frame.
# NOTE(review): this writes to the same ENS_LEAKS.csv as the preceding cell,
# so whichever cell runs last determines the file's content.
# NOTE(review): the weighted sum aligns rows by index, not by ID — this assumes
# every submission lists its rows in the same order; confirm.
ENS_LEAKS = BEST_69.copy()

# Bracket assignment always writes the column; attribute assignment would set
# a plain Python attribute instead if "target" were not already a column.
ENS_LEAKS["target"] = (
    0.5 * JIAZHEN["target"]
    + 0.15 * BLEND04["target"]
    + 0.1 * ISLABEL["target"]
    + 0.1 * XGB1["target"]
    + 0.15 * MYSUB["target"]
)

# `index` is documented as a bool; False states the intent explicitly.
ENS_LEAKS.to_csv("ENS_LEAKS.csv", index=False)