In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt

In [2]:
# Positional column indices used as an `iloc` column selector later in the
# notebook, where the selected columns are multiplied by -1 (sign flip).
# NOTE(review): the entries are not strictly ascending (41, 47, 55 and 103 are
# out of place). The list is used on both sides of the same assignment, so
# ordering should not change the result -- confirm before sorting it.
reverse_list = [0,1,2,3,4,5,6,7,8,11,15,16,18,19,
                22,24,25,26,27,41,29,
                32,35,37,40,48,49,47,
                55,51,52,53,60,61,62,103,65,66,67,69,
                70,71,74,78,79,
                82,84,89,90,91,94,95,96,97,99,
                105,106,110,111,112,118,119,125,128,
                130,133,134,135,137,138,
                140,144,145,147,151,155,157,159,
                161,162,163,164,167,168,
                170,171,173,175,176,179,
                180,181,184,185,187,189,
                190,191,195,196,199]

In [3]:
# Read the raw train and test tables from the input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [4]:
def scale_dfs(train_df, test_df, public_index=None, private_index=None):
    """Standardise train and test features with shared per-column statistics.

    The mean and (sample, ddof=1) standard deviation of every column are
    computed over the training rows plus the test rows selected positionally
    by ``private_index`` and ``public_index``. Both frames are then
    transformed as ``(x - mean) / std``.

    Unlike the previous implementation this function does NOT modify
    ``test_df`` in place; new frames are returned instead. It also no longer
    loads ``synthetic_samples_indexes.npy``, which was read but never used.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Feature frames sharing the same (numeric) columns.
    public_index, private_index : array-like of int, optional
        Positional row indices into ``test_df`` (used with ``iloc``). When
        omitted they are loaded from ``../input/public_LB.npy`` and
        ``../input/private_LB.npy`` respectively.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The scaled training rows and the scaled test rows (all test rows are
        scaled, not only the indexed ones).

    Notes
    -----
    A column with zero standard deviation yields NaN/inf after the division,
    exactly as before; no special handling is applied.
    """
    if public_index is None:
        public_index = np.load("../input/public_LB.npy")
    if private_index is None:
        private_index = np.load("../input/private_LB.npy")

    # Same row order as before: train, then private rows, then public rows.
    reference = pd.concat(
        [train_df, test_df.iloc[private_index], test_df.iloc[public_index]]
    )
    mean = reference.mean()
    std = reference.std()

    # Scale the reference frame and keep only its leading training rows.
    scaled_train = ((reference - mean) / std).iloc[:len(train_df)].copy()

    # Selecting by reference.columns raises KeyError if test_df lacks a column,
    # matching the old per-column loop; then restore test_df's column order.
    scaled_test = (test_df[reference.columns] - mean) / std
    scaled_test = scaled_test[test_df.columns]
    return scaled_train, scaled_test

In [5]:
# Capture the row index and the labels first, then strip the identifier and
# label columns so that only feature columns remain in both frames.
train_index = np.array(train_df.index)
train_labels = train_df['target']

train_df = train_df.drop(columns=['ID_code', 'target'])
test_df = test_df.drop(columns=['ID_code'])

In [6]:
# Replace both frames with their standardised versions.
train_df, test_df = scale_dfs(train_df=train_df, test_df=test_df)

In [7]:
# Stack train on top of test, negate the columns selected by reverse_list,
# then cut the stacked frame back into its train and test parts.
# Caution: running this cell a second time flips the signs back.
df = pd.concat([train_df, test_df])
df.iloc[:, reverse_list] = df.iloc[:, reverse_list].mul(-1)
train_df, test_df = df.iloc[:len(train_df)], df.iloc[len(train_df):]

In [8]:
# Preview the first rows of the training frame after scaling and sign flipping.
train_df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0.575795,1.274444,-0.453379,0.830418,-0.23635,0.537107,0.33672,-0.609621,1.561255,-1.474505,...,-0.266249,1.150882,0.817744,-0.410031,0.169047,1.58061,-1.023351,-0.375911,-1.028085,-0.211599
1,-0.271662,0.622371,-1.192676,0.685742,-0.792504,-1.539111,-0.24278,0.002261,-0.859338,0.419448,...,-0.969442,-0.091191,0.443836,1.908395,-0.816605,-1.520076,-1.068886,-0.131172,0.824545,-0.503319
2,0.679856,0.276048,-0.518717,-0.538043,0.305429,0.511711,-1.768452,0.563124,1.561045,-1.308042,...,0.069197,-0.775204,-0.174022,-0.411333,1.151288,-2.29516,1.617364,-0.697307,-0.382853,-0.354228
3,-0.126794,0.129301,0.666879,-0.197321,-0.929708,-0.409944,-0.499001,0.472634,1.84359,0.548973,...,-0.273091,0.893323,-0.818467,-0.477527,1.607266,0.791952,0.958309,1.501107,0.69619,0.546479
4,0.275856,-0.035855,-0.819674,0.075511,-0.740066,-0.954817,-0.611762,-0.792297,-1.795132,0.090005,...,1.033234,-0.686243,-1.406086,1.468424,-1.499663,0.96092,-0.298661,0.644304,0.705394,0.528342


In [26]:
# Preview the first rows of the combined (train followed by test) frame.
df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0.575795,1.274444,-0.453379,0.830418,-0.23635,0.537107,0.33672,-0.609621,1.561255,-1.474505,...,-0.266249,1.150882,0.817744,-0.410031,0.169047,1.58061,-1.023351,-0.375911,-1.028085,-0.211599
1,-0.271662,0.622371,-1.192676,0.685742,-0.792504,-1.539111,-0.24278,0.002261,-0.859338,0.419448,...,-0.969442,-0.091191,0.443836,1.908395,-0.816605,-1.520076,-1.068886,-0.131172,0.824545,-0.503319
2,0.679856,0.276048,-0.518717,-0.538043,0.305429,0.511711,-1.768452,0.563124,1.561045,-1.308042,...,0.069197,-0.775204,-0.174022,-0.411333,1.151288,-2.29516,1.617364,-0.697307,-0.382853,-0.354228
3,-0.126794,0.129301,0.666879,-0.197321,-0.929708,-0.409944,-0.499001,0.472634,1.84359,0.548973,...,-0.273091,0.893323,-0.818467,-0.477527,1.607266,0.791952,0.958309,1.501107,0.69619,0.546479
4,0.275856,-0.035855,-0.819674,0.075511,-0.740066,-0.954817,-0.611762,-0.792297,-1.795132,0.090005,...,1.033234,-0.686243,-1.406086,1.468424,-1.499663,0.96092,-0.298661,0.644304,0.705394,0.528342


In [30]:
# For every one of the first 200 columns except "var_68", compute the range
# (max - min) of that column within each group of rows sharing the same
# "var_68" value, and attach it to every row as "maxmin_diff_<position>".
#
# Changes compared with the previous per-column loop:
#   * the nested-dict renamer form of `.agg({col: {name: func}})` is no longer
#     accepted by pandas >= 1.0, so plain groupby max()/min() is used instead;
#   * one groupby and one merge replace 199 groupby + merge passes over an
#     ever-growing frame;
#   * previously generated range columns are dropped first so the cell can be
#     re-run without producing duplicated / suffixed columns.
# Only the range columns are added; per-group max and min are not kept.
source_cols = []
range_cols = []
for i, col in enumerate(df.columns[:200]):
    if col != "var_68":
        source_cols.append(col)
        range_cols.append("maxmin_diff_{}".format(i))

df = df.drop(columns=range_cols, errors="ignore")

grouped = df.groupby("var_68")[source_cols]
group_ranges = grouped.max() - grouped.min()
group_ranges.columns = range_cols

# A left merge keeps every row of df in its original order and resets the
# index to a fresh RangeIndex, as the previous repeated merges did.
df = pd.merge(df, group_ranges.reset_index(), how="left", on="var_68")

In [31]:
# Preview the combined frame after the per-"var_68" range features were merged in.
# NOTE(review): the saved output lists max_var_* / min_var_* columns, which the
# feature cell as currently written drops -- it appears to come from an earlier
# version of that cell; re-run to refresh.
df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,maxmin_diff_196,max_var_197,min_var_197,maxmin_diff_197,max_var_198,min_var_198,maxmin_diff_198,max_var_199,min_var_199,maxmin_diff_199
0,0.575795,1.274444,-0.453379,0.830418,-0.23635,0.537107,0.33672,-0.609621,1.561255,-1.474505,...,5.054581,3.210043,-2.762058,5.972101,2.545896,-2.714581,5.260477,3.060445,-2.222139,5.282585
1,-0.271662,0.622371,-1.192676,0.685742,-0.792504,-1.539111,-0.24278,0.002261,-0.859338,0.419448,...,4.735194,2.493199,-2.577473,5.070672,2.95987,-2.573666,5.533535,2.961988,-2.591228,5.553215
2,0.679856,0.276048,-0.518717,-0.538043,0.305429,0.511711,-1.768452,0.563124,1.561045,-1.308042,...,5.255114,3.654894,-2.654673,6.309567,2.659698,-3.181518,5.841216,2.958585,-2.420722,5.379307
3,-0.126794,0.129301,0.666879,-0.197321,-0.929708,-0.409944,-0.499001,0.472634,1.84359,0.548973,...,4.735194,2.493199,-2.577473,5.070672,2.95987,-2.573666,5.533535,2.961988,-2.591228,5.553215
4,0.275856,-0.035855,-0.819674,0.075511,-0.740066,-0.954817,-0.611762,-0.792297,-1.795132,0.090005,...,5.038681,2.869754,-2.588548,5.458302,2.560283,-2.507378,5.067661,2.794742,-2.991413,5.786154


In [33]:
# Cut the feature-augmented frame back into train (leading rows) and test (the rest).
train_df, test_df = df.iloc[:len(train_df)], df.iloc[len(train_df):]

In [34]:
# Two-sample Kolmogorov-Smirnov statistic between the target == 0 and
# target == 1 training rows, for every column after the first 200.
#
# The class masks do not change between iterations, so they are built once,
# and only the single column under test is filtered. Previously the whole
# wide frame was boolean-filtered twice per column.
is_target0 = train_labels == 0
is_target1 = train_labels == 1

statistics_dict = {}
for col in train_df.columns[200:]:
    col_values = train_df[col]
    statistic, pvalue = ks_2samp(col_values[is_target0], col_values[is_target1])
    statistics_dict[col] = statistic
    # Optional per-feature density plot (disabled):
    #fig, ax = plt.subplots(1, 1, figsize=(10, 4))
    #sns.kdeplot(col_values[is_target0], ax=ax, label='Target == 0')
    #sns.kdeplot(col_values[is_target1], ax=ax, label='Target == 1')
    #ax.set_title('name: {}, statistics: {:.5f}, pvalue: {:5f}'.format(col, statistic, pvalue))
    #plt.show()

In [35]:
# Rank the features by KS statistic, largest first.
sorted(statistics_dict.items(), key=lambda name_stat: name_stat[1], reverse=True)

[('maxmin_diff_188', 0.017845399865922984),
 ('maxmin_diff_130', 0.01694960582352817),
 ('min_var_78', 0.016945191704360207),
 ('maxmin_diff_107', 0.016562184447522887),
 ('min_var_41', 0.016548912220039624),
 ('max_var_165', 0.01611539980648169),
 ('max_var_188', 0.016085311333782326),
 ('maxmin_diff_125', 0.01607542049858901),
 ('min_var_43', 0.016052441080970647),
 ('max_var_171', 0.015965038202558512),
 ('maxmin_diff_54', 0.015925764158066846),
 ('max_var_17', 0.01592251773383163),
 ('maxmin_diff_16', 0.015917908353502486),
 ('maxmin_diff_147', 0.01578452174820416),
 ('maxmin_diff_78', 0.015678485534166453),
 ('min_var_26', 0.015601081907909597),
 ('maxmin_diff_18', 0.015540250035556591),
 ('maxmin_diff_53', 0.015525422356557073),
 ('maxmin_diff_113', 0.015501218269786121),
 ('maxmin_diff_135', 0.015368052923593944),
 ('max_var_130', 0.015351311906473875),
 ('max_var_127', 0.015278876100298155),
 ('min_var_59', 0.015261314765042),
 ('maxmin_diff_165', 0.015084605073609147),
 ('maxm

In [37]:
# List the column labels of the combined frame.
# NOTE(review): the saved output reports length=797 and includes max_var_* /
# min_var_* names, which does not match the feature cell as currently written
# (it keeps only maxmin_diff_* columns) -- presumably stale; re-run to confirm.
df.columns

Index(['var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7',
       'var_8', 'var_9',
       ...
       'maxmin_diff_196', 'max_var_197', 'min_var_197', 'maxmin_diff_197',
       'max_var_198', 'min_var_198', 'maxmin_diff_198', 'max_var_199',
       'min_var_199', 'maxmin_diff_199'],
      dtype='object', length=797)