In [25]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import xgboost as xgb


In [26]:
# Load the raw train and test tables.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Report the (rows, columns) shape of each frame, then the training column names.
print('Train shape:', df_train.shape)
print('Test shape:', df_test.shape)

print('Columns:', df_train.columns)

# Set the label vector and the row identifiers aside before the frames are trimmed.
y_train = df_train['target'].values
id_train, id_test = df_train['id'].values, df_test['id'].values

# The 'ps_calc_*' columns are reported to be useless, so remove them from both frames.
# The list is derived from the training frame and applied unchanged to the test frame.
col_to_drop = df_train.columns[df_train.columns.str.startswith('ps_calc_')]
df_train, df_test = (frame.drop(col_to_drop, axis=1) for frame in (df_train, df_test))

# Feature matrices: everything except the label and the id, which must not be trained on.
# Per the original author's note the remaining columns are numeric and are used
# without further preprocessing.
x_train = df_train.drop(['target', 'id'], axis=1)
x_test = df_test.drop(['id'], axis=1)

# Remember the row counts so the stacked frame can be split apart again later.
len_train, len_test = len(x_train), len(x_test)

# Stack train on top of test so later feature transforms see a single frame.
x_all = pd.concat([x_train, x_test])

Train shape: (595212, 59)
Test shape: (892816, 58)
Columns: Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'p

In [27]:
# https://www.kaggle.com/tilii7/dimensionality-reduction-pca-tsne

from sklearn.decomposition import PCA, FastICA


n_comp = 8

# Fit both decompositions on the original feature columns only. This cell writes
# 'pca_*' / 'ica_*' columns back into x_all further down; without this filter,
# re-running the cell would feed those derived columns into PCA/ICA and silently
# change the result. On a first run the filter selects every column.
base_cols = [c for c in x_all.columns if not c.startswith(('pca_', 'ica_'))]
base_features = x_all[base_cols]

# PCA
print('\nRunning PCA ...')
pca = PCA(n_components=n_comp, svd_solver='full', random_state=1001)
x_pca = pca.fit_transform(base_features)
print('Explained variance: %.4f' % pca.explained_variance_ratio_.sum())

print('Individual variance contributions:')
for j in range(n_comp):
    print(pca.explained_variance_ratio_[j])

# ICA (same inputs as PCA, same seed)
ica = FastICA(n_components=n_comp, random_state=1001)
ica2_results = ica.fit_transform(base_features)

# Append the components as new feature columns, interleaved as
# pca_1, ica_1, pca_2, ica_2, ... (component k lives in array column k-1).
for i in range(1, n_comp+1):
    x_all['pca_' + str(i)] = x_pca[:, i-1]
    x_all['ica_' + str(i)] = ica2_results[:, i-1]

    





Running PCA ...
Explained variance: 0.9945
Individual variance contributions:
0.936701152817
0.026236607816
0.0113679015548
0.00632251660249
0.00553843010137
0.00374594338885
0.00304063774345
0.00156651044832


In [28]:
print('Columns:', x_all.columns)

# Undo the earlier train/test stacking by position: the first len_train rows are
# the training features and the last len_test rows are the test features.
x_train = x_all.iloc[:len_train]
x_test = x_all.iloc[len(x_all) - len_test:]

Columns: Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'pca_1', 'ica_1',
       'pca_2', 'ica_2', 'pca_3', 'ica_3', 'pca_4', 'ica_4', 'pca_5', 'ica_5',
       'pca_6', 'ica_6', 'pca_7', 'ica_7', 'pca_8', 'ica_8'],
      dtype='object')


In [9]:
# Take a random 20% of the training rows as validation data.
# The split is taken from the full training matrix (first len_train rows of x_all)
# and the full label vector, not from x_train / y_train themselves: this cell
# rebinds those two names, so splitting them in place would shrink the training
# set by another 20% every time the cell is re-run.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_all.head(len_train), df_train['target'].values,
    test_size=0.2, random_state=4242)
print('Train samples: {} Validation samples: {}'.format(len(x_train), len(x_valid)))

Train samples: 476169 Validation samples: 119043


In [10]:
# Wrap each data split in XGBoost's DMatrix container (the test set has no labels).
d_train, d_valid = xgb.DMatrix(x_train, y_train), xgb.DMatrix(x_valid, y_valid)
d_test = xgb.DMatrix(x_test)

# XGBoost training parameters
params = {
    'objective': 'binary:logistic',
    'eta': 0.02,
    'silent': True,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
}

# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini score of `pred` against `actual`.

    Rows are ordered by descending `pred`, with ties broken by original
    position. The score is the sum of the cumulative share of `actual` over
    that ordering, minus (n + 1) / 2, divided by n.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (labels / losses).
    pred : sequence of numbers
        Predicted scores, same length as `actual`.
    cmpcol, sortcol :
        Unused by this implementation; kept so existing callers keep working.

    Returns
    -------
    float
        The Gini score.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert( len(actual) == len(pred) )
    n = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    # The builtin `float` replaces the `np.float` alias, which NumPy has removed.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then position.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Scale the Gini of predictions `p` by the Gini of a perfect ordering.

    The denominator scores the actual values `a` against themselves.
    """
    achieved = gini(a, p)
    best_possible = gini(a, a)
    return achieved / best_possible

# Adapter that exposes the normalized Gini as an XGBoost custom metric

def gini_xgb(preds, dtrain):
    """Custom evaluation function for `xgb.train(feval=...)`.

    Reads the labels from `dtrain` via `get_label()` and returns a
    one-element list holding the ('gini', score) pair.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

# Datasets XGBoost evaluates after each boosting round
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train for at most 10,000 rounds, stopping early once the metric has not improved
# for 100 rounds. maximize=True tells xgb that a higher custom metric is better.
mdl = xgb.train(
    params,
    d_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=100,
    feval=gini_xgb,
    maximize=True,
    verbose_eval=10,
)

# Score the test set
p_test = mdl.predict(d_test)

# Assemble the submission (id, target) and write it to disk
sub = pd.DataFrame({'id': id_test, 'target': p_test}, columns=['id', 'target'])
sub.to_csv('xgb1.csv', index=False)

print(sub.head())

[0]	train-gini:0.207231	valid-gini:0.185754
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-gini:0.258187	valid-gini:0.232299
[20]	train-gini:0.266275	valid-gini:0.238546
[30]	train-gini:0.269701	valid-gini:0.239675
[40]	train-gini:0.271273	valid-gini:0.240487
[50]	train-gini:0.274406	valid-gini:0.241715
[60]	train-gini:0.277528	valid-gini:0.243946
[70]	train-gini:0.279208	valid-gini:0.243919
[80]	train-gini:0.283517	valid-gini:0.246366
[90]	train-gini:0.285353	valid-gini:0.246499
[100]	train-gini:0.289745	valid-gini:0.248597
[110]	train-gini:0.293176	valid-gini:0.250028
[120]	train-gini:0.297959	valid-gini:0.251996
[130]	train-gini:0.302551	valid-gini:0.25352
[140]	train-gini:0.307219	valid-gini:0.25513
[150]	train-gini:0.310734	valid-gini:0.257352
[160]	train-gini:0.315883	valid-gini:0.259297
[170]	train-gini:0.320993	valid-gini:0.260987
[180]	train-gini:0.326468	valid-gini:0.

In [29]:

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc

def gini(y, pred):
    """Return the (un-normalized) Gini score of `pred` against `y`.

    Rows are ordered by descending `pred`, with ties broken by original
    position. The score is the sum of the cumulative share of `y` over that
    ordering, minus (n + 1) / 2, divided by n.

    Parameters
    ----------
    y : sequence of numbers
        Observed values (labels).
    pred : sequence of numbers
        Predicted scores, same length as `y`.

    Returns
    -------
    float
        The Gini score.
    """
    n = len(y)
    # Columns: 0 = label, 1 = prediction, 2 = original row position.
    # The builtin `float` replaces the `np.float` alias, which NumPy has removed.
    g = np.asarray(np.c_[y, pred, np.arange(n)], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then position.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (n + 1) / 2.
    return gs / n

def gini_xgb(pred, y):
    """Custom XGBoost metric: normalized Gini as a ('gini', score) pair.

    `y` is the evaluation dataset object; its labels are read with
    `get_label()`. The raw Gini is divided by the Gini of the labels scored
    against themselves.
    """
    labels = y.get_label()
    normalized = gini(labels, pred) / gini(labels, labels)
    return 'gini', normalized

def gini_lgb(preds, dtrain):
    """Custom LightGBM metric: normalized Gini.

    Returns the triple ('gini', score, True); the trailing True is the
    "higher is better" flag of LightGBM's feval protocol.
    """
    labels = list(dtrain.get_label())
    normalized = gini(labels, preds) / gini(labels, labels)
    return 'gini', normalized, True



# xgb
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

# Rebuild the full training matrix from x_all instead of reading x_train: the
# validation-split cell rebinds x_train to an 80% subset, which would no longer
# line up row-for-row with the full label vector taken from df_train below.
X = x_all.head(len_train)
features = X.columns
X = X.values
y = df_train['target'].values
assert len(X) == len(y), 'feature matrix and label vector must have the same number of rows'

# Submission frame; 'target' accumulates the averaged fold predictions.
sub=df_test['id'].to_frame()
sub['target']=0

nrounds=2000  # maximum boosting rounds per fold
kfold = 5  # number of cross-validation folds
# random_state is omitted on purpose: without shuffle=True it has no effect on
# older scikit-learn and is rejected with a ValueError by newer releases. The
# folds are therefore the same unshuffled, stratified folds as before. Pass
# shuffle=True together with a seed if shuffled folds are wanted.
skf = StratifiedKFold(n_splits=kfold)

# The test matrix is identical for every fold, so build its DMatrix once.
d_test_cv = xgb.DMatrix(x_test[features].values)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=10)
    # Each fold contributes 1 / (2*kfold): the xgb and lgb models together make
    # 2*kfold predictions that are averaged into the final submission.
    # NOTE(review): predicting with 50 trees beyond the early-stopping optimum is
    # kept from the original; the intent is not documented - confirm.
    sub['target'] += xgb_model.predict(d_test_cv, 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
gc.collect()

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':5, 'num_leaves': 10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

# random_state is omitted on purpose: without shuffle=True it has no effect on
# older scikit-learn and is rejected with a ValueError by newer releases. The
# folds are therefore the same unshuffled, stratified folds as before. Pass
# shuffle=True together with a seed if shuffled folds are wanted.
skf = StratifiedKFold(n_splits=kfold)

# The test matrix is identical for every fold, so materialize it once.
test_matrix = x_test[features].values

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=10, 
                  feval=gini_lgb, early_stopping_rounds=100)
    # Each fold contributes 1 / (2*kfold), completing the average that the xgb
    # folds started in the same 'target' column.
    sub['target'] += lgb_model.predict(test_matrix, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)
    
# Write the blended submission, then show its first two rows as the cell output.
sub.to_csv('sub10.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

 xgb kfold: 1  of  5 : 
[0]	train-gini:0.184787	valid-gini:0.181391
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-gini:0.235278	valid-gini:0.233369
[20]	train-gini:0.237198	valid-gini:0.233442
[30]	train-gini:0.242538	valid-gini:0.237413
[40]	train-gini:0.24421	valid-gini:0.239684
[50]	train-gini:0.244821	valid-gini:0.240043
[60]	train-gini:0.247574	valid-gini:0.241823
[70]	train-gini:0.248363	valid-gini:0.24191
[80]	train-gini:0.252457	valid-gini:0.244597
[90]	train-gini:0.253192	valid-gini:0.244584
[100]	train-gini:0.254692	valid-gini:0.245033
[110]	train-gini:0.256448	valid-gini:0.245302
[120]	train-gini:0.259962	valid-gini:0.247203
[130]	train-gini:0.263019	valid-gini:0.248174
[140]	train-gini:0.266401	valid-gini:0.250509
[150]	train-gini:0.26896	valid-gini:0.252256
[160]	train-gini:0.272418	valid-gini:0.254064
[170]	train-gini:0.27541	valid-gini:0.255314
[180]	train-gini:

Unnamed: 0,id,target
0,0,0.027331
1,1,0.026201
