In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from  sklearn.preprocessing import LabelEncoder
from  sklearn.preprocessing import OneHotEncoder
import scipy 
from scipy.sparse import coo_matrix, hstack

from __future__ import division
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
 

In [2]:
# Load the train and test feature matrices that were saved as NumPy .npy files.
train_matrix_path = "../data/sparse/train_new.csv.npy"
test_matrix_path = "../data/sparse/test_new.csv.npy"

X = np.load(train_matrix_path)
X_test = np.load(test_matrix_path)


In [3]:
# Integer label for every feature (column) of X.
# Rows of X are samples (they are indexed by the CV fold indices and line up
# with labels_train), so the feature count is X.shape[1], not X.shape[0].
feat_labels = np.arange(X.shape[1])

In [4]:
# Training targets: the CSV is read without a header row and column 1 is
# extracted as a NumPy array.
labels_frame = pd.read_csv("../data/labels_train.csv", header=None)
labels_train = labels_frame[1].values

In [5]:
# Hold out 40% of the labelled training rows (fixed random_state for
# reproducibility).
# The held-out features are stored as X_holdout rather than X_test: unpacking
# into X_test would overwrite the test matrix loaded from disk, and the CV
# cell would then size its prediction buffers from, and predict on, a slice of
# the training data instead of the real test set.
# y_test keeps its original name and holds the targets that match X_holdout.
X_train, X_holdout, y_train, y_test = train_test_split(
    X, labels_train, test_size=0.4, random_state=0
)

In [6]:
# Hyper-parameters for the LightGBM binary classifier trained in the CV cell.
learning_rate = 0.1
num_leaves = 15
# NOTE(review): min_data_in_leaf is assigned here but never placed into
# `params`; the dict sets "min_child_samples" to 10 instead. Confirm which
# value is intended.
min_data_in_leaf = 2000
feature_fraction = 0.9
# Upper bound on boosting rounds handed to lgb.train (which is also given
# early_stopping_rounds).
num_boost_round = 10000

params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=learning_rate,
    metric=["auc", "binary_logloss"],
    num_leaves=num_leaves,
    max_bin=256,
    feature_fraction=feature_fraction,
    verbosity=0,
    # NOTE(review): drop_rate / max_drop look like DART-booster settings and
    # are presumably ignored with boosting_type="gbdt" - confirm.
    drop_rate=0.1,
    is_unbalance=True,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    # NOTE(review): no bagging frequency is set, so this row subsampling may
    # have no effect - confirm against the LightGBM parameter docs.
    subsample=0.9,
)

In [7]:
# Seed-averaged, stratified K-fold cross-validation of the LightGBM model.
#
# For every seed the model is trained once per fold with early stopping on the
# validation fold. Two prediction vectors are produced per seed:
#   cv_train - out-of-fold predictions for every row of X
#   cv_pred  - predictions for X_test, averaged over the folds
# These are summed over seeds into final_cv_train / final_cv_pred; divide by
# the number of completed seeds to obtain the seed-averaged predictions.
# x_score collects the out-of-fold AUC of each individual seed.
#
# The same `kfold` object is reused for every seed; only params['seed']
# changes between repetitions.
NFOLDS = 5
NSEEDS = 16  # number of differently seeded repetitions to accumulate
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)
final_cv_train = np.zeros(len(labels_train))
final_cv_pred = np.zeros(len(X_test))
x_score = []
for s in range(NSEEDS):
    cv_train = np.zeros(len(labels_train))
    cv_pred = np.zeros(len(X_test))

    params['seed'] = s

    best_trees = []
    fold_scores = []

    for train_fold, validate in kfold.split(X, labels_train):
        # Fold slices use their own names so the X_train produced by the
        # earlier train_test_split cell is not silently overwritten.
        X_fold_train, X_validate = X[train_fold, :], X[validate, :]
        label_train, label_validate = labels_train[train_fold], labels_train[validate]

        dtrain = lgb.Dataset(X_fold_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                        verbose_eval=100, early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)

        # Predict both the test rows and the validation fold with the
        # early-stopped iteration, so the two are produced the same way.
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        cv_train[validate] += bst.predict(X_validate, num_iteration=bst.best_iteration)

        score = roc_auc_score(label_validate, cv_train[validate])
        print(score)
        fold_scores.append(score)

    cv_pred /= NFOLDS
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    # AUC of this seed alone, and of the running average over seeds so far.
    seed_score = roc_auc_score(labels_train, cv_train)
    running_score = roc_auc_score(labels_train, final_cv_train / (s + 1.))

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("cv score:")
    print(seed_score)
    print("current score: %s %s" % (running_score, s + 1))
    print(fold_scores)
    print("%s %s" % (best_trees, np.mean(best_trees)))

    x_score.append(seed_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.603812	valid_0's auc: 0.710271
[200]	valid_0's binary_logloss: 0.579053	valid_0's auc: 0.719163
[300]	valid_0's binary_logloss: 0.561839	valid_0's auc: 0.719873
[400]	valid_0's binary_logloss: 0.546489	valid_0's auc: 0.719172
Early stopping, best iteration is:
[339]	valid_0's binary_logloss: 0.555673	valid_0's auc: 0.720215
0.7202149454900566
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.60546	valid_0's auc: 0.709878
[200]	valid_0's binary_logloss: 0.581408	valid_0's auc: 0.717052
[300]	valid_0's binary_logloss: 0.563946	valid_0's auc: 0.719065
[400]	valid_0's binary_logloss: 0.548996	valid_0's auc: 0.719217
[500]	valid_0's binary_logloss: 0.534752	valid_0's auc: 0.719564
Early stopping, best iteration is:
[459]	valid_0's binary_logloss: 0.540422	valid_0's auc: 0.719722
0.7197219135216482
Training until validation scores don't improve for 1

[300]	valid_0's binary_logloss: 0.563026	valid_0's auc: 0.719456
[400]	valid_0's binary_logloss: 0.548013	valid_0's auc: 0.71966
Early stopping, best iteration is:
[357]	valid_0's binary_logloss: 0.554341	valid_0's auc: 0.720198
0.7201979094120844
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.605355	valid_0's auc: 0.710566
[200]	valid_0's binary_logloss: 0.580962	valid_0's auc: 0.720017
[300]	valid_0's binary_logloss: 0.563384	valid_0's auc: 0.721019
[400]	valid_0's binary_logloss: 0.548099	valid_0's auc: 0.720611
Early stopping, best iteration is:
[316]	valid_0's binary_logloss: 0.560675	valid_0's auc: 0.721539
0.7215389627742643
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.605867	valid_0's auc: 0.711866
[200]	valid_0's binary_logloss: 0.581555	valid_0's auc: 0.720803
[300]	valid_0's binary_logloss: 0.563822	valid_0's auc: 0.722626
[400]	valid_0's binary_logloss: 0.549024	valid_0's a

Early stopping, best iteration is:
[438]	valid_0's binary_logloss: 0.543172	valid_0's auc: 0.722531
0.7225314061012652
cv score:
0.7204791740324379
current score: 0.726394237884844 7
[0.7214444369990691, 0.7184811856937755, 0.7198605780730781, 0.7208473499900011, 0.7225314061012652]
([396, 261, 360, 371, 438], 365.2)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.604167	valid_0's auc: 0.709609
[200]	valid_0's binary_logloss: 0.578961	valid_0's auc: 0.719844
[300]	valid_0's binary_logloss: 0.561383	valid_0's auc: 0.721369
Early stopping, best iteration is:
[264]	valid_0's binary_logloss: 0.56723	valid_0's auc: 0.721527
0.7215274369024931
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.605334	valid_0's auc: 0.710092
[200]	valid_0's binary_logloss: 0.580734	valid_0's auc: 0.718771
[300]	valid_0's binary_logloss: 0.563201	valid_0's auc: 0.71996
[400]	valid_0's binary_logloss: 0.54835	valid_0'

[400]	valid_0's binary_logloss: 0.549278	valid_0's auc: 0.718464
Early stopping, best iteration is:
[316]	valid_0's binary_logloss: 0.561465	valid_0's auc: 0.71974
0.7197403313546276
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.604628	valid_0's auc: 0.707904
[200]	valid_0's binary_logloss: 0.580789	valid_0's auc: 0.717604
[300]	valid_0's binary_logloss: 0.563257	valid_0's auc: 0.719177
[400]	valid_0's binary_logloss: 0.547807	valid_0's auc: 0.719348
Early stopping, best iteration is:
[351]	valid_0's binary_logloss: 0.555073	valid_0's auc: 0.719817
0.7198168312666976
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.604884	valid_0's auc: 0.71227
[200]	valid_0's binary_logloss: 0.580929	valid_0's auc: 0.719554
[300]	valid_0's binary_logloss: 0.563602	valid_0's auc: 0.720013
Early stopping, best iteration is:
[293]	valid_0's binary_logloss: 0.56478	valid_0's auc: 0.720144
0.7201439348303682


KeyboardInterrupt: 