In [2]:
import numpy as np
import pandas as pd
import gc
import time
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import base64
import os
from sklearn.preprocessing import StandardScaler
import datetime

In [3]:
# Notebook-wide settings, kept in one place so they are easy to tune.
SEED = 42             # base random seed used by the augmentation helpers
num_folds = 5         # number of folds for the stratified cross-validation split
windows_flag = False  # NOTE(review): not referenced in the visible cells -- confirm its purpose

In [4]:
# Read the raw train/test CSVs (paths are relative to the notebook's working directory).
print('Load Train Data.')
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
shape_report = '\nShape of Train Data: {}\t Shape of Test Data: {}'
print(shape_report.format(train_df.shape, test_df.shape))

# The identifier column is removed from both frames before any modelling.
train_df = train_df.drop(columns=['ID_code'])
test_df = test_df.drop(columns=['ID_code'])

# Detach the label column: afterwards train_df no longer contains 'target'.
train_labels = train_df.pop('target')
train_index = np.array(train_df.index)

# Zero-initialised prediction buffers: one slot per train row and per test row.
oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

Load Train Data.

Shape of Train Data: (200000, 202)	 Shape of Test Data: (200000, 201)


In [13]:
def shuffle_features(df, target, seed):
    """Return a copy of ``df`` in which every column is shuffled independently.

    Column ``k`` (0-based, in ``df.columns`` order) is permuted with
    ``random_state=seed + k``. The result is therefore reproducible for a
    given ``seed``, while the values of one output row come from different
    input rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are shuffled. It is not modified.
    target : object
        Unused. Kept so existing callers that pass a class label keep working.
    seed : int
        Seed for the first column; each following column uses the next integer.

    Returns
    -------
    pandas.DataFrame
        Same columns and row count as ``df``, with a fresh 0..n-1 index.
    """
    # Collect every shuffled column first and build the frame in one step.
    # Inserting columns one at a time into an empty DataFrame fragments it and
    # makes pandas emit a PerformanceWarning once there are >100 columns.
    shuffled = {
        column: df[column].sample(frac=1, random_state=seed + offset).values
        for offset, column in enumerate(df.columns)
    }
    return pd.DataFrame(shuffled)

In [14]:
def upsample_data(df, target, n_copies=8):
    """Oversample the positive class with column-shuffled copies of its rows.

    The rows of ``df`` whose label equals 1 are passed ``n_copies`` times
    through ``shuffle_features`` (seeds ``SEED + 10 * i``). Each synthetic
    copy is appended to the data with label 1, and the combined set is then
    shuffled reproducibly with ``SEED``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame.
    target : pandas.Series
        Labels for ``df``. Must share ``df``'s index, because it is used as a
        boolean mask on ``df``.
    n_copies : int, default 8
        Number of synthetic positive copies to append.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Shuffled features and the matching labels, both re-indexed 0..n-1.
        The appended labels come from ``np.ones`` and are floats, so the
        returned labels are presumably float64 even for integer input.
    """
    positives = df[target == 1]
    synthetic = [shuffle_features(positives, 1, SEED + 10 * i) for i in range(n_copies)]
    synthetic_labels = [pd.Series(np.ones(len(positives))) for _ in range(n_copies)]

    # Concatenate once: growing the frame inside a loop re-copies all of the
    # accumulated data on every iteration.
    df = pd.concat([df] + synthetic, axis=0, sort=False, ignore_index=True)
    target = pd.concat([target] + synthetic_labels, axis=0, ignore_index=True)

    # Draw the permutation once and apply it to both objects, so rows and
    # labels stay paired by construction rather than by two separate
    # sample() calls happening to agree.
    df = df.sample(frac=1, random_state=SEED)
    target = target.loc[df.index]
    return df.reset_index(drop=True), target.reset_index(drop=True)

In [18]:
def expand_data(df, target, n_copies=4):
    """Enlarge the data set with column-shuffled copies of both classes.

    The output contains the label-1 rows, then the label-0 rows, then
    ``n_copies`` shuffled copies of the positives (seeds ``SEED + 10 * i``),
    then ``n_copies`` shuffled copies of the negatives (seeds
    ``(i + 2) * SEED + 10 * i``). Everything is finally shuffled reproducibly
    with ``SEED``. Rows whose label is neither 0 nor 1 are not carried over.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame.
    target : pandas.Series
        Labels for ``df``. Must share ``df``'s index, because it is used as a
        boolean mask on ``df``.
    n_copies : int, default 4
        Number of synthetic copies appended per class.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Shuffled features and the matching labels, both re-indexed 0..n-1.
        The synthetic labels come from ``np.ones`` / ``np.zeros`` and are
        floats.
    """
    is_positive = target == 1
    is_negative = target == 0
    positives, negatives = df[is_positive], df[is_negative]

    # Originals first (positives, then negatives), followed by the synthetic copies.
    frames = [positives, negatives]
    labels = [target[is_positive], target[is_negative]]
    for i in range(n_copies):
        frames.append(shuffle_features(positives, 1, SEED + 10 * i))
        labels.append(pd.Series(np.ones(len(positives))))
    for i in range(n_copies):
        frames.append(shuffle_features(negatives, 0, (i + 2) * SEED + 10 * i))
        labels.append(pd.Series(np.zeros(len(negatives))))

    # Concatenate once: growing the frame inside a loop re-copies all of the
    # accumulated data on every iteration.
    df = pd.concat(frames, axis=0, sort=False, ignore_index=True)
    target = pd.concat(labels, axis=0, ignore_index=True)

    # Draw the permutation once and apply it to both objects, so rows and
    # labels stay paired by construction.
    df = df.sample(frac=1, random_state=SEED)
    target = target.loc[df.index]
    return df.reset_index(drop=True), target.reset_index(drop=True)

In [19]:
# Stratified, shuffled K-fold splitter. The seed comes from the SEED config
# constant (same value as the former literal 42) so that changing SEED also
# changes the folds, instead of leaving a second hard-coded seed behind.
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)

In [20]:
# For each CV fold: augment only the training part (upsample the positives,
# then expand both classes) and report shapes and class ratios. The
# validation part is left untouched.
for counter, ids in enumerate(skf.split(train_index, train_labels)):
    train_idx, val_idx = ids
    print('\nFold {}'.format(counter+1))

    # split() yields positional indices, so rows are selected with .iloc.
    # Label-based .loc only gives the same rows while the index is exactly 0..n-1.
    X_train, y_train = train_df.iloc[train_idx], train_labels.iloc[train_idx]
    X_val, y_val = train_df.iloc[val_idx], train_labels.iloc[val_idx]

    print("Upsample train data in fold.")
    X_train, y_train = upsample_data(X_train, y_train)
    train_counts = y_train.value_counts()
    print("Shape in train before expanding:", X_train.shape, "\tratio of positive to negative samples:", train_counts[1]/train_counts[0])

    print("Expanding data.")
    X_train, y_train = expand_data(X_train, y_train)
    train_counts = y_train.value_counts()
    print("Shape in train after expanding:", X_train.shape, "\tratio of positive to negative samples:", train_counts[1]/train_counts[0])

    val_counts = y_val.value_counts()
    print("Shape in val:", X_val.shape, "\tratio of positive to negative samples:", val_counts[1]/val_counts[0])


Fold 1
Upsample train data in fold.
Shape in train before expanding: (288623, 200) 	ratio of positive to negative samples: 1.0054265881976918
Expanding data.
Shape in train after expanding: (1443115, 200) 	ratio of positive to negative samples: 1.0054265881976918
Shape in val: (40001, 200) 	ratio of positive to negative samples: 0.11172563297295796

Fold 2
Upsample train data in fold.
Shape in train before expanding: (288623, 200) 	ratio of positive to negative samples: 1.0054265881976918
Expanding data.
Shape in train after expanding: (1443115, 200) 	ratio of positive to negative samples: 1.0054265881976918
Shape in val: (40001, 200) 	ratio of positive to negative samples: 0.11172563297295796

Fold 3
Upsample train data in fold.
Shape in train before expanding: (288624, 200) 	ratio of positive to negative samples: 1.0054196022845707
Expanding data.
Shape in train after expanding: (1443120, 200) 	ratio of positive to negative samples: 1.0054196022845707
Shape in val: (40000, 200) 	rat