# Run LGBM on other training set

In last sections, I get lgbm's parameters on a training set and achieved good performance on the validation set

In this section, I will build other form of training set, and check whether the performance on validation set can be better

**Specifically, I will change the proportion of resampling and augmentation, and check whether the performance on validation set can be better**

In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.combine import SMOTETomek
from time import time
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

## 1. Load data

In [3]:
# load the data
df = pd.read_csv("data/train.csv")
print(df.shape)

(200000, 202)


In [4]:
df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
df_data, df_label = df[df.columns[2:]], df[df.columns[1]]

In [6]:
df_data.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [7]:
df_label = pd.DataFrame(df_label)
df_label.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [35]:
# df_label.to_csv("data/label_nosmote.csv")

## 2. Preprocessing

+ Add sum / min / max / mean / std / skew / kurtosis / median / moving average
    + For moving average, read np.ma.average for more details
+ Variance of each column
+ Here is also a magic feature, read https://www.kaggle.com/c/santander-customer-transaction-prediction/discussion/87486#latest-506429 for more details

In [9]:
# add some basic columns
def basic_preprocessing(df):
    columns = df.columns
    df['sum'] = df[columns].sum(axis=1)  
    df['min'] = df[columns].min(axis=1)
    df['max'] = df[columns].max(axis=1)
    df['mean'] = df[columns].mean(axis=1)
    df['std'] = df[columns].std(axis=1)
    df['skew'] = df[columns].skew(axis=1)
    df['kurt'] = df[columns].kurtosis(axis=1)
    df['med'] = df[columns].median(axis=1)
    
    # add round features
    for column in columns:
        df['r1_' + column] = np.round(df[column], 1)
        df['r2_' + column] = np.round(df[column], 2)
    return df

In [10]:
df_data_prep = df_data.copy()
df_data_prep = basic_preprocessing(df_data_prep)
print(df_data_prep.shape)
df_data_prep.head()

(200000, 608)


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,r1_var_195,r2_var_195,r1_var_196,r2_var_196,r1_var_197,r2_var_197,r1_var_198,r2_var_198,r1_var_199,r2_var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,-2.4,-2.4,7.9,7.88,8.6,8.56,12.8,12.78,-1.1,-1.09
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,2.0,2.03,8.1,8.13,8.8,8.79,18.4,18.36,2.0,1.95
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,3.1,3.14,-6.5,-6.52,8.3,8.27,14.7,14.72,0.4,0.4
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,-1.3,-1.27,-2.9,-2.93,10.3,10.29,18.0,17.97,-9.0,-9.0
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.5,-1.51,3.9,3.93,9.5,9.5,18.0,18.0,-8.8,-8.81


---
## 3. Split training and validation set

20% validation data

In [11]:
def split_train_val(df, train_path=None, val_path=None):
    df_train, df_val = df[:160000], df[160000:]
    print(df_train.shape, df_val.shape)
    # df_train.to_csv(train_path)
    # df_val.to_csv(val_path)
    return df_train, df_val

In [13]:
# train_path = "data/train_data.csv"
# val_path = "data/val_data.csv"
train_data, val_data = split_train_val(df_data_prep)
# train_path = "data/train_label.csv"
# val_path = "data/val_label.csv"
train_label, val_label = split_train_val(df_label)

(160000, 608) (40000, 608)
(160000, 1) (40000, 1)


---

## 4. Resample and augmentation

In [14]:
# this function is modified from https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment
def augment(df, features, t):
    x = df.iloc[:,:-1].values
    y = df['target'].values
    
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn]).astype(np.uint64)
    
    features = pd.DataFrame(x, columns=features)
    labels = pd.DataFrame(y, columns=['target'])
    combined = pd.concat([features, labels], axis=1)
    return combined

In [22]:
def resample_and_augment(df_data, df_label, t=3, zero_fraction=0.75):
    """
    we will sample only 1/4 data from 0s, then combine them with 1s
    remember to combine features with label before feeding into this function
    """
    # combine data and target
    features = df_data.columns
    df_data = pd.concat([df_data, df_label], axis=1)
    df_ones = df_data[df_data['target'] == 1]
    df_zeros = df_data[df_data['target'] == 0]
    print("Original 1s {}, 0s {}".format(df_ones.shape[0], df_zeros.shape[0]))
    # augment 1s
    aug_ones = augment(df_ones, features, t)
    print("Now we have {} 1s".format(aug_ones.shape[0]))
    
    df_zeros_part = df_zeros.sample(frac=zero_fraction)
    print("part of 0s: {}".format(df_zeros_part.shape[0]))
    
    # combine and shuffle
    df_combine = pd.concat([df_zeros_part, aug_ones]).sample(frac=1)
    print("Combined: {}".format(df_combine.shape))
    train_data_aug = df_combine.iloc[:,:-1]
    train_label_aug = df_combine.loc[:,'target']
    return train_data_aug, train_label_aug

In [75]:
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=3, zero_fraction=0.75)
train_data_aug.shape

Original 1s 16049, 0s 143951
Now we have 64196 1s
part of 0s: 107963
Combined: (172159, 609)


(172159, 608)

In [81]:
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug.head()

Unnamed: 0,target
90426,0.0
36931,0.0
26352,0.0
3559,0.0
59031,1.0


## LGBM on different kinds of training set

In [16]:
param_base = {
    'boosting_type': 'gbdt', 
    'boost_from_average': False, 
    'objective': 'binary', 
    'tree_learner': 'serial', 
    'verbosity': 1,
    'learning_rate': 0.01, 
    'num_threads': 22,
    'metric':'auc',
    
    'num_leaves': 5, 
    'max_depth': 15,
    'min_data_in_leaf': 150,
    'min_sum_hessian_in_leaf': 10,
    
    'bagging_freq': 1,
    'bagging_fraction': 0.6,
    'feature_fraction': 0.05
    }

In [17]:
def train_lgbm(param, train_data, train_label, val_data, val_label):
    print("===== Build dataset for lgbm")
    lgbm_train_data = lgb.Dataset(train_data, label=train_label)
    lgbm_val_data = lgb.Dataset(val_data, label=val_label)
    
    print("===== Start training")
    start_time = time()
    clf = lgb.train(param, 
                    lgbm_train_data, 
                    1000000, 
                    valid_sets = [lgbm_train_data, lgbm_val_data], 
                    verbose_eval = 1000, 
                    early_stopping_rounds = 3000)
    training_time = (time() - start_time) / 60.
    print("===== Training time: {:.2f}min".format(training_time))
    
    # compute auroc
    print("===== Get prediction")
    pred_tr = clf.predict(train_data, num_iteration=clf.best_iteration)
    pred_cv = clf.predict(val_data, num_iteration=clf.best_iteration)
    # get metrics
    print("===== Build metrics")
    train_label, val_label = np.ravel(train_label), np.ravel(val_label)
    a_tr = roc_auc_score(train_label, pred_tr)
    a_cv = roc_auc_score(val_label, pred_cv)
    progress = "auroc|train {:.4f}|val {:.4f}".format(a_tr, a_cv)
    print("Final result")
    print(progress)

In [None]:
# t=3, zero_fraction=0.75 (default)
# auroc|train 0.9830|val 0.9026

In [23]:
# variant 001: t = 3, zero_fraction = 1
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=3, zero_fraction=1)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")

Original 1s 16049, 0s 143951
Now we have 64196 1s
part of 0s: 143951
Combined: (208147, 609)


In [27]:
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)
# training's auc: 0.983164 valid_1's auc: 0.90323

===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.954629	valid_1's auc: 0.835267
[2000]	training's auc: 0.966091	valid_1's auc: 0.863185
[3000]	training's auc: 0.970495	valid_1's auc: 0.876884
[4000]	training's auc: 0.973023	valid_1's auc: 0.884786
[5000]	training's auc: 0.974708	valid_1's auc: 0.890103
[6000]	training's auc: 0.975992	valid_1's auc: 0.893918
[7000]	training's auc: 0.97696	valid_1's auc: 0.896595
[8000]	training's auc: 0.977747	valid_1's auc: 0.898532
[9000]	training's auc: 0.978393	valid_1's auc: 0.899994
[10000]	training's auc: 0.978955	valid_1's auc: 0.900874
[11000]	training's auc: 0.97946	valid_1's auc: 0.90165
[12000]	training's auc: 0.979942	valid_1's auc: 0.90215
[13000]	training's auc: 0.980371	valid_1's auc: 0.90253
[14000]	training's auc: 0.980812	valid_1's auc: 0.902745
[15000]	training's auc: 0.981236	valid_1's auc: 0.902948
[16000]	training's auc: 0.981668	valid_1's au

NameError: name 'roc_auc_score' is not defined

In [29]:
# variant 002: t = 3, zero_fraction = 0.50
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=3, zero_fraction=0.50)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

Original 1s 16049, 0s 143951
Now we have 64196 1s
part of 0s: 71976
Combined: (136172, 609)
===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.951308	valid_1's auc: 0.833722
[2000]	training's auc: 0.96557	valid_1's auc: 0.8627
[3000]	training's auc: 0.970548	valid_1's auc: 0.877038
[4000]	training's auc: 0.973331	valid_1's auc: 0.88518
[5000]	training's auc: 0.975186	valid_1's auc: 0.890699
[6000]	training's auc: 0.97654	valid_1's auc: 0.894449
[7000]	training's auc: 0.977578	valid_1's auc: 0.89705
[8000]	training's auc: 0.978431	valid_1's auc: 0.898908
[9000]	training's auc: 0.979134	valid_1's auc: 0.900168
[10000]	training's auc: 0.979757	valid_1's auc: 0.901066
[11000]	training's auc: 0.980326	valid_1's auc: 0.901606
[12000]	training's auc: 0.980847	valid_1's auc: 0.901929
[13000]	training's auc: 0.981373	valid_1's auc: 0.902246
[14000]	training's auc: 0.981878	valid_1's auc: 0.902344
[15000]	tra

In [30]:
# variant 003: t = 3, zero_fraction = 0.25
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=3, zero_fraction=0.25)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

Original 1s 16049, 0s 143951
Now we have 64196 1s
part of 0s: 35988
Combined: (100184, 609)
===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.946879	valid_1's auc: 0.830813
[2000]	training's auc: 0.963882	valid_1's auc: 0.86097
[3000]	training's auc: 0.970503	valid_1's auc: 0.875983
[4000]	training's auc: 0.973839	valid_1's auc: 0.88449
[5000]	training's auc: 0.975921	valid_1's auc: 0.890018
[6000]	training's auc: 0.977406	valid_1's auc: 0.89368
[7000]	training's auc: 0.978537	valid_1's auc: 0.8961
[8000]	training's auc: 0.979453	valid_1's auc: 0.897778
[9000]	training's auc: 0.980227	valid_1's auc: 0.899026
[10000]	training's auc: 0.98093	valid_1's auc: 0.899872
[11000]	training's auc: 0.981571	valid_1's auc: 0.900366
[12000]	training's auc: 0.982205	valid_1's auc: 0.900821
[13000]	training's auc: 0.982817	valid_1's auc: 0.901103
[14000]	training's auc: 0.983414	valid_1's auc: 0.901229
[15000]	tra

In [32]:
# variant 004: t = 1, zero_fraction = 1, i.e. this is dataset without resampling and augmenting
# train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=1, zero_fraction=1)
train_data_aug = train_data.copy()
train_label_aug = train_label.copy().astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.850095	valid_1's auc: 0.847071
[2000]	training's auc: 0.8802	valid_1's auc: 0.872906
[3000]	training's auc: 0.894248	valid_1's auc: 0.884561
[4000]	training's auc: 0.902668	valid_1's auc: 0.890981
[5000]	training's auc: 0.908267	valid_1's auc: 0.895264
[6000]	training's auc: 0.912363	valid_1's auc: 0.897825
[7000]	training's auc: 0.915532	valid_1's auc: 0.899648
[8000]	training's auc: 0.918101	valid_1's auc: 0.900834
[9000]	training's auc: 0.920288	valid_1's auc: 0.901627
[10000]	training's auc: 0.922338	valid_1's auc: 0.902165
[11000]	training's auc: 0.924199	valid_1's auc: 0.902358
[12000]	training's auc: 0.925984	valid_1's auc: 0.902624
[13000]	training's auc: 0.927725	valid_1's auc: 0.9027
[14000]	training's auc: 0.92947	valid_1's auc: 0.902749
[15000]	training's auc: 0.931133	valid_1's auc: 0.902734
[16000]	training's auc: 0.932756	valid_1's au

In [33]:
# variant 005: t = 2, zero_fraction = 1
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=2, zero_fraction=1)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

Original 1s 16049, 0s 143951
Now we have 48147 1s
part of 0s: 143951
Combined: (192098, 609)
===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.94184	valid_1's auc: 0.838381
[2000]	training's auc: 0.955878	valid_1's auc: 0.865369
[3000]	training's auc: 0.96144	valid_1's auc: 0.8784
[4000]	training's auc: 0.964635	valid_1's auc: 0.886184
[5000]	training's auc: 0.966826	valid_1's auc: 0.891305
[6000]	training's auc: 0.968427	valid_1's auc: 0.894746
[7000]	training's auc: 0.969693	valid_1's auc: 0.897219
[8000]	training's auc: 0.970696	valid_1's auc: 0.89902
[9000]	training's auc: 0.971524	valid_1's auc: 0.900204
[10000]	training's auc: 0.972264	valid_1's auc: 0.901096
[11000]	training's auc: 0.972919	valid_1's auc: 0.901803
[12000]	training's auc: 0.973539	valid_1's auc: 0.902261
[13000]	training's auc: 0.974121	valid_1's auc: 0.902516
[14000]	training's auc: 0.974721	valid_1's auc: 0.902682
[15000]	t

In [34]:
# variant 006: t = 2, zero_fraction = 0.75
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=2, zero_fraction=0.75)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

Original 1s 16049, 0s 143951
Now we have 48147 1s
part of 0s: 107963
Combined: (156110, 609)
===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.941058	valid_1's auc: 0.837121
[2000]	training's auc: 0.955865	valid_1's auc: 0.865111
[3000]	training's auc: 0.961501	valid_1's auc: 0.878448
[4000]	training's auc: 0.964794	valid_1's auc: 0.88613
[5000]	training's auc: 0.967075	valid_1's auc: 0.89119
[6000]	training's auc: 0.968732	valid_1's auc: 0.89469
[7000]	training's auc: 0.970041	valid_1's auc: 0.897245
[8000]	training's auc: 0.971045	valid_1's auc: 0.898989
[9000]	training's auc: 0.971914	valid_1's auc: 0.900209
[10000]	training's auc: 0.97265	valid_1's auc: 0.901083
[11000]	training's auc: 0.973336	valid_1's auc: 0.901664
[12000]	training's auc: 0.973989	valid_1's auc: 0.902021
[13000]	training's auc: 0.974614	valid_1's auc: 0.902251
[14000]	training's auc: 0.975223	valid_1's auc: 0.90233
[15000]	t

In [35]:
# variant 007: t = 4, zero_fraction = 1
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=4, zero_fraction=1)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

Original 1s 16049, 0s 143951
Now we have 80245 1s
part of 0s: 143951
Combined: (224196, 609)
===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.960825	valid_1's auc: 0.834688
[2000]	training's auc: 0.972488	valid_1's auc: 0.862765
[3000]	training's auc: 0.976149	valid_1's auc: 0.876318
[4000]	training's auc: 0.978205	valid_1's auc: 0.884361
[5000]	training's auc: 0.979588	valid_1's auc: 0.889884
[6000]	training's auc: 0.98061	valid_1's auc: 0.893642
[7000]	training's auc: 0.981415	valid_1's auc: 0.896445
[8000]	training's auc: 0.982046	valid_1's auc: 0.89832
[9000]	training's auc: 0.982574	valid_1's auc: 0.899888
[10000]	training's auc: 0.983027	valid_1's auc: 0.900921
[11000]	training's auc: 0.98343	valid_1's auc: 0.901615
[12000]	training's auc: 0.983798	valid_1's auc: 0.902113
[13000]	training's auc: 0.984164	valid_1's auc: 0.902474
[14000]	training's auc: 0.984519	valid_1's auc: 0.902687
[15000]

In [36]:
# variant 008: t = 4, zero_fraction = 0.75
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=4, zero_fraction=0.75)
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug = train_label_aug.astype("int64")
train_lgbm(param_base, train_data_aug, train_label_aug, val_data, val_label)

Original 1s 16049, 0s 143951
Now we have 80245 1s
part of 0s: 107963
Combined: (188208, 609)
===== Build dataset for lgbm
===== Start training
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.960205	valid_1's auc: 0.833062
[2000]	training's auc: 0.972098	valid_1's auc: 0.861646
[3000]	training's auc: 0.976043	valid_1's auc: 0.875457
[4000]	training's auc: 0.978251	valid_1's auc: 0.883816
[5000]	training's auc: 0.979698	valid_1's auc: 0.889454
[6000]	training's auc: 0.980735	valid_1's auc: 0.892974
[7000]	training's auc: 0.981557	valid_1's auc: 0.895876
[8000]	training's auc: 0.982227	valid_1's auc: 0.897878
[9000]	training's auc: 0.982766	valid_1's auc: 0.899305
[10000]	training's auc: 0.983247	valid_1's auc: 0.900334
[11000]	training's auc: 0.983674	valid_1's auc: 0.901227
[12000]	training's auc: 0.984073	valid_1's auc: 0.901781
[13000]	training's auc: 0.984448	valid_1's auc: 0.902041
[14000]	training's auc: 0.984816	valid_1's auc: 0.902273
[150

## Finished, in next part we will try to run on dataset with dimension reduction

## PCA following is not used --> reduce the performance

## 3. PCA
Here I reduce the dimension to 60

In [38]:
def dimension_reduction(df, n_components=200):
    pca = PCA(n_components=n_components)
    df_processed = pca.fit_transform(df)
    df_processed = pd.DataFrame(df_processed, columns = ['var_pca_{}'.format(i) for i in range(n_components)])
    return df_processed

In [40]:
df_data_pca = dimension_reduction(df_data_prep)
print(df_data_pca.shape)
df_data_pca.head()

(200000, 200)


Unnamed: 0,var_pca_0,var_pca_1,var_pca_2,var_pca_3,var_pca_4,var_pca_5,var_pca_6,var_pca_7,var_pca_8,var_pca_9,...,var_pca_190,var_pca_191,var_pca_192,var_pca_193,var_pca_194,var_pca_195,var_pca_196,var_pca_197,var_pca_198,var_pca_199
0,-105.860128,-5.400743,37.417087,3.935585,3.77736,4.637825,-18.165755,14.889708,-29.852264,6.936472,...,-0.087427,0.108608,-0.786297,0.853786,-0.416321,-0.124596,0.560179,0.046222,-0.049892,-0.254745
1,-54.355348,-70.943602,-28.984502,-30.938672,1.802316,-2.775054,2.662385,-21.835452,-10.575077,21.371416,...,0.06977,-0.95839,0.152369,0.402786,0.310333,-0.335566,-0.435642,0.488061,-0.054247,0.210107
2,110.48636,24.541343,3.941811,23.961675,-15.154132,-9.355453,4.965564,35.465815,4.862782,0.919669,...,0.24064,-0.281654,0.025007,0.780489,-0.774717,0.37703,-0.357587,0.188147,0.243865,-0.336344
3,69.503042,-10.319518,-26.484079,22.524163,-37.881464,-15.064502,27.688344,14.940958,14.542261,-4.162862,...,-0.109669,-0.399767,-0.412265,-0.472733,0.074901,-0.754768,-0.589803,-0.404724,-0.30419,-0.307361
4,13.596137,-93.729275,-43.654269,35.223109,44.294176,-10.231406,19.801837,-28.270031,31.285243,-27.998473,...,0.405957,2.634627,-0.538385,-0.123605,-0.193773,0.195125,-0.09121,-0.423971,-0.280581,0.246918


In [41]:
train_data, val_data = split_train_val(df_data_pca)
train_label, val_label = split_train_val(df_label)

(160000, 200) (40000, 200)
(160000, 1) (40000, 1)
