In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the engineered feature tables (CSV with a header row) for training and for test set B.
train_df = pd.read_csv('../data/train/train_feature_21to41.csv', header = 0, encoding = 'utf-8')
test_b_df = pd.read_csv('../data/Test-B/test_b_feature_21to41.csv', header = 0, encoding = 'utf-8')

In [4]:
# Wrap the features for LightGBM; 'uid' is an identifier and 'label' is the target, so both are dropped from X.
dtrain = lgb.Dataset(train_df.drop(['uid','label'],axis=1),label=train_df.label)
# NOTE(review): dtest is not referenced by any later visible cell (prediction uses the DataFrame directly).
dtest = lgb.Dataset(test_b_df.drop(['uid'],axis=1))

In [5]:
# One-column frame holding the raw test-B uids; formatted into submission ids below.
id_series = pd.DataFrame({'uid': test_b_df.uid})
def do3(x):
    """Return the submission id for a row: 'u' plus its ``uid`` zero-padded to 4 digits."""
    return 'u%04d' % x['uid']
# Replace each numeric uid with its formatted string id (row-wise apply of do3).
id_series.loc[:, 'uid'] = id_series.apply(do3, axis = 1)

In [55]:
# LightGBM settings shared by the cross-validation run and the final fit.
lgb_params = {
    # Task definition
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'metric_freq': 100,
    'is_training_metric': True,
    # Tree shape and learning rate
    'min_data_in_leaf': 360,
    'num_leaves': 60,
    'learning_rate': 0.07,
    'is_unbalance': True,
    # Feature / row subsampling
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'min_hessian': 0.05,
    'verbosity': -1,
    # Options tried earlier and currently disabled:
    #   'gpu_device_id': 2, 'device': 'gpu',
    #   'lambda_l1': 0.001, 'lambda_l2': 0.005,
    #   'skip_drop': 0.95, 'max_drop': 10,
    #   'num_threads': 18,
}

In [56]:
def evalMetric(preds,dtrain):
    """Custom LightGBM evaluation function: 0.6 * AUC + 0.4 * F1.

    Parameters
    ----------
    preds : array-like
        Scores for each row of ``dtrain``; thresholded at 0.5 for the F1 part
        (assumes these are probabilities -- TODO confirm for the objective used).
    dtrain : object exposing ``get_label()``
        Dataset whose labels are compared against ``preds``.

    Returns
    -------
    tuple
        ``('res', score, True)`` -- metric name, value, and a flag marking
        the metric as higher-is-better.
    """
    label = np.asarray(dtrain.get_label())
    scores = np.asarray(preds)

    # Neither AUC nor F1 depends on row order, so no sorting or DataFrame is
    # needed; this function runs at every evaluation, so keep it vectorised.
    auc = metrics.roc_auc_score(label, scores)
    hard_preds = (scores >= 0.5).astype(int)
    f1 = metrics.f1_score(label, hard_preds)

    res = 0.6*auc + 0.4*f1

    return 'res',res,True

In [67]:
# 3-fold CV scored with the custom 'res' metric; stops after 150 rounds without improvement.
# NOTE(review): 'evalMetric' is passed as a metric *name* here, while the logged output only
# shows the feval-produced 'res' -- confirm this is the intended way to disable built-in metrics.
cv_results = lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=150,verbose_eval=5,num_boost_round=500,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.741064 + 0.00517984
[10]	cv_agg's res: 0.756045 + 0.00699429
[15]	cv_agg's res: 0.76177 + 0.012525
[20]	cv_agg's res: 0.760009 + 0.0132469
[25]	cv_agg's res: 0.764315 + 0.013599
[30]	cv_agg's res: 0.770501 + 0.013461
[35]	cv_agg's res: 0.773208 + 0.0127656
[40]	cv_agg's res: 0.77833 + 0.0120968
[45]	cv_agg's res: 0.780375 + 0.0121068
[50]	cv_agg's res: 0.78173 + 0.012146
[55]	cv_agg's res: 0.781852 + 0.0115397
[60]	cv_agg's res: 0.78576 + 0.0111986
[65]	cv_agg's res: 0.78746 + 0.0105917
[70]	cv_agg's res: 0.789062 + 0.0112123
[75]	cv_agg's res: 0.78972 + 0.0121855
[80]	cv_agg's res: 0.790743 + 0.0122963
[85]	cv_agg's res: 0.790211 + 0.0118849
[90]	cv_agg's res: 0.791217 + 0.0116651
[95]	cv_agg's res: 0.792203 + 0.0117908
[100]	cv_agg's res: 0.791838 + 0.0130006
[105]	cv_agg's res: 0.794839 + 0.0124376
[110]	cv_agg's res: 0.7948 + 0.0119398
[115]	cv_agg's res: 0.795611 + 0.0123952
[120]	cv_agg's res: 0.796221 + 0.011398
[125]	cv_agg's res: 0.796615 + 0.0110769
[130]	

In [68]:
# Best fold-averaged 'res' score reached over all boosting rounds.
res_mean = pd.Series(cv_results['res-mean']).max()
print(res_mean)

0.8093868157999052


In [71]:
# Final fit on the full training set. valid_sets=[dtrain] means the logged scores are
# training scores, not held-out ones.
# NOTE(review): num_boost_round=410 is presumably taken from the CV run above -- TODO confirm.
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=410,valid_sets=[dtrain])

[5]	training's res: 0.766355
[10]	training's res: 0.784702
[15]	training's res: 0.787777
[20]	training's res: 0.785147
[25]	training's res: 0.792233
[30]	training's res: 0.799752
[35]	training's res: 0.80836
[40]	training's res: 0.813431
[45]	training's res: 0.818275
[50]	training's res: 0.822123
[55]	training's res: 0.825812
[60]	training's res: 0.828663
[65]	training's res: 0.832476
[70]	training's res: 0.835648
[75]	training's res: 0.838653
[80]	training's res: 0.841609
[85]	training's res: 0.844488
[90]	training's res: 0.846826
[95]	training's res: 0.849786
[100]	training's res: 0.853324
[105]	training's res: 0.855972
[110]	training's res: 0.858206
[115]	training's res: 0.861998
[120]	training's res: 0.864252
[125]	training's res: 0.867077
[130]	training's res: 0.868807
[135]	training's res: 0.871336
[140]	training's res: 0.874778
[145]	training's res: 0.878155
[150]	training's res: 0.879234
[155]	training's res: 0.881702
[160]	training's res: 0.8833
[165]	training's res: 0.885504


In [72]:
# Score test set B with the same feature columns used for training ('uid' removed).
pred=model.predict(test_b_df.drop(['uid'],axis=1))

In [73]:
# Pair each formatted uid with its predicted score.
res =pd.DataFrame({'uid':id_series['uid'],'label':pred})

In [74]:
print(pred)

[0.72698149 0.81999519 0.01719991 ... 0.70888424 0.39057942 0.52914757]


In [75]:
# Order rows by descending score, then binarise the score at 0.5.
# The sort must come first: after thresholding only 0/1 remain.
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
# res.label = res.label.map(lambda x: int(x))

In [76]:
print(res)

      label    uid
1749      1  u8749
2227      1  u9227
2406      1  u9406
628       1  u7628
2233      1  u9233
554       1  u7554
1760      1  u8760
1076      1  u8076
1133      1  u8133
406       1  u7406
1136      1  u8136
893       1  u7893
606       1  u7606
693       1  u7693
639       1  u7639
1477      1  u8477
337       1  u7337
2986      1  u9986
886       1  u7886
1603      1  u8603
1135      1  u8135
609       1  u7609
1326      1  u8326
1250      1  u8250
1098      1  u8098
353       1  u7353
1530      1  u8530
1044      1  u8044
2110      1  u9110
547       1  u7547
...     ...    ...
41        0  u7041
2475      0  u9475
782       0  u7782
1404      0  u8404
2247      0  u9247
1696      0  u8696
536       0  u7536
505       0  u7505
1037      0  u8037
1321      0  u8321
869       0  u7869
889       0  u7889
2430      0  u9430
2023      0  u9023
154       0  u7154
2917      0  u9917
1324      0  u8324
446       0  u7446
1534      0  u8534
1081      0  u8081
2809      0 

In [77]:
# Write the submission as headerless "uid,label" rows (no index column).
res.to_csv('../result/lgb-baseline-10.csv',index=False,header=False,sep=',',columns=['uid','label'])

# Two-layer (stacked) model

In [503]:
# Reload the training features for the stacked model; this rebinds train_df to a different feature file.
train_df = pd.read_csv('../data/train/train_feature_21to33.csv', header = 0, encoding = 'utf-8')

In [504]:
# Test set A features, built from the same 21to33 feature set as the training file above.
test_a_df = pd.read_csv('../data/Test-A/test_a_feature_21to33.csv', header = 0, encoding = 'utf-8')

In [505]:
# Move 'uid' into the index so it is not treated as a feature column.
# Not idempotent: re-running this cell fails because 'uid' is no longer a column.
train_df = train_df.set_index(u'uid')
test_a_df = test_a_df.set_index(u'uid')

In [506]:
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.cross_validation import KFold

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls


import warnings
warnings.filterwarnings('ignore')

In [507]:
# Shared settings for the stacking stage; get_oof reads these as module-level globals.
ntrain = train_df.shape[0]  # number of training rows
ntest = test_a_df.shape[0]  # number of test rows
SEED = 0 # for reproducibility
NFOLDS = 4 # set folds for out-of-fold prediction
# Legacy sklearn.cross_validation.KFold signature: (n_samples, n_folds=...); the object is iterated directly.
# NOTE(review): no shuffle is requested, so random_state presumably has no effect here -- confirm.
kf = KFold(ntrain, n_folds = NFOLDS, random_state = SEED)

In [508]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Thin wrapper giving estimators a uniform train/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); invoked as ``clf(**params)``.
    seed : int
        Value passed to the estimator as ``random_state``.
    params : dict or None
        Keyword arguments for ``clf``. The dict is copied, so the caller's
        object is never modified; ``None`` means "no extra arguments".
    """
    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: the original code wrote 'random_state' into the
        # caller's dict and raised TypeError for the default params=None.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit on ``(x, y)`` and print the estimator's ``feature_importances_``."""
        print (self.clf.fit(x, y).feature_importances_)

In [509]:
import time
def get_time_stamp():
    """Return the current Unix time truncated to whole seconds."""
    return int(time.time())

In [530]:
# Hyper-parameters for the first-layer classifiers.
# SklearnHelper adds 'random_state' itself, so it is not listed here.

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # Must stay False: the same estimator object is re-fitted several times
    # (once per fold in get_oof, and again in feature_importances). With
    # warm_start=True and an unchanged n_estimators, later fit() calls keep
    # the forest from the first call instead of training on the new data.
    'warm_start': False,
    #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
    'loss': 'deviance'
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [531]:
# Instantiate the five candidate first-layer models, all seeded with SEED.
# Only et, ada and gb are actually trained further down; rf and svc are currently unused.
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [532]:
# Convert the train features, test features and target ('label') to NumPy arrays for the models.
y_train = train_df['label'].ravel()
train_noy = train_df.drop(['label'], axis=1)  # features only
x_train = train_noy.values # Creates an array of the train data
x_test = test_a_df.values # Creates an array of the test data
print (x_train.shape, x_test.shape, y_train.shape)

((4999L, 48L), (2000L, 48L), (4999L,))


In [533]:
# Rebinds dtrain (previously the lgb.Dataset) to the uid-indexed DataFrame;
# get_oof passes it to cal_precision, which reads dtrain['label'].
dtrain = train_df

In [534]:
def cal_precision(preds, dtrain):
    """Score predictions against ``dtrain['label']`` as 0.6 * AUC + 0.4 * F1.

    Parameters
    ----------
    preds : array-like
        One score per row of ``dtrain``, in the same row order. Values are
        thresholded at 0.5 for the F1 part.
    dtrain : mapping / DataFrame
        Must provide the true labels under the key ``'label'``.

    Returns
    -------
    tuple
        ``('res', score, True)`` -- name, value, higher-is-better flag.
    """
    # Compare by position, not by index label. Building a DataFrame from two
    # Series aligns them on their indexes; predictions carry a 0..n-1 index
    # while the labels are indexed by uid, which shifts the pairs and
    # introduces NaNs whenever the two indexes differ.
    label = np.asarray(dtrain['label'])
    scores = np.asarray(preds)

    auc = metrics.roc_auc_score(label, scores)

    hard_preds = (scores >= 0.5).astype(int)
    f1 = metrics.f1_score(label, hard_preds)

    res = 0.6*auc + 0.4*f1

    return 'res',res,True

In [535]:
def get_oof(clf, x_train, y_train, x_test):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    For every split yielded by the module-level ``kf``, ``clf`` is trained on
    the fitting part and used to predict the held-out part of ``x_train`` and
    all of ``x_test``. Relies on the globals ``ntrain``, ``ntest``, ``NFOLDS``,
    ``kf`` and ``dtrain``; prints timings, writes the train predictions to
    '../data/oof_train.csv' and prints their ``cal_precision`` score.

    Returns a pair of column vectors: (train predictions, mean test predictions).
    """
    train_preds = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold_no, (fit_idx, holdout_idx) in enumerate(kf):
        fit_started = get_time_stamp()
        clf.train(x_train[fit_idx], y_train[fit_idx])
        print("time for training:")
        print(get_time_stamp() - fit_started)

        predict_started = get_time_stamp()
        # Each training row is predicted by the one model that never saw it.
        train_preds[holdout_idx] = clf.predict(x_train[holdout_idx])
        fold_test_preds[fold_no, :] = clf.predict(x_test)
        print("time for predicting")
        print(get_time_stamp() - predict_started)

    # Average the per-fold test predictions into one vector.
    test_preds = fold_test_preds.mean(axis=0)
    pd.Series(train_preds).to_csv('../data/oof_train.csv', header = None)
    print(cal_precision(pd.Series(train_preds), dtrain))
    return train_preds.reshape(-1, 1), test_preds.reshape(-1, 1)

In [536]:
def get_oof2(clf, x_train, y_train, x_test):
    """Fit ``clf`` once on the whole training set and predict train and test.

    Despite the name, the train predictions are NOT out-of-fold: the model
    predicts the very rows it was fitted on, so they are in-sample and will
    look optimistic when used as second-layer features. Use ``get_oof`` for
    genuine out-of-fold predictions.

    Parameters
    ----------
    clf : object with ``train(x, y)`` and ``predict(x)``
    x_train, y_train : training features and labels
    x_test : test features

    Returns
    -------
    tuple
        (train predictions, test predictions), each reshaped to a column vector.
    """
    # The original pre-allocated zero arrays from the globals ntrain/ntest and
    # then overwrote them immediately; they are dropped so the function depends
    # only on its arguments.
    time_before_train = get_time_stamp()
    clf.train(x_train, y_train)
    print("time for training:")
    print(get_time_stamp() - time_before_train)

    time_before_predict = get_time_stamp()
    oof_train = clf.predict(x_train)
    oof_test = clf.predict(x_test)
    print("time for predicting")
    print(get_time_stamp() - time_before_predict)

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [537]:
# Build the first-layer predictions that become the second-layer features.
# NOTE: get_oof2 fits on the full training set and predicts that same data, so the
# "*_oof_train" arrays are in-sample predictions rather than true out-of-fold ones.
et_oof_train, et_oof_test = get_oof2(et, x_train, y_train, x_test) # Extra Trees
print("Training is complete")

# rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
# print("Training is complete")

ada_oof_train, ada_oof_test = get_oof2(ada, x_train, y_train, x_test) # AdaBoost 
print("Training is complete")

gb_oof_train, gb_oof_test = get_oof2(gb,x_train, y_train, x_test) # Gradient Boost
print("Training is complete")

# svc_oof_train, svc_oof_test = get_oof2(svc,x_train, y_train, x_test) # Support Vector Classifier
# print("Training is complete")

print("Training is complete!!!")

time for training:
1
time for predicting
1
Training is complete
time for training:
8
time for predicting
1
Training is complete
time for training:
13
time for predicting
0
Training is complete
Training is complete!!!


In [538]:
# Second-layer inputs: one column per first-layer model (ET, AdaBoost, GB).
# This overwrites x_train / x_test, so the first-layer cells cannot be re-run after this one
# without rebuilding the original feature arrays.
x_train = np.concatenate(( et_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [539]:
import xgboost as xgb

# Second-layer model: XGBoost classifier fitted on the three first-layer prediction columns.
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
# Final predictions for test set A, taken from the classifier's predict().
predictions = gbm.predict(x_test)


In [540]:
# uid now lives in the index of test_a_df (set_index above); pull it back out as a column.
id_series = pd.DataFrame({'uid': test_a_df.index})

In [541]:
def do3(x):
    """Return the submission id for a row: 'u' plus its ``uid`` zero-padded to 4 digits."""
    # This re-defines the identical helper from earlier in the notebook.
    return 'u%04d' % x['uid']
# Replace each numeric uid with its formatted string id (row-wise apply of do3).
id_series.loc[:, 'uid'] = id_series.apply(do3, axis = 1)

In [542]:
print(id_series['uid'])

0       u5000
1       u5001
2       u5002
3       u5003
4       u5004
5       u5005
6       u5006
7       u5007
8       u5008
9       u5009
10      u5010
11      u5011
12      u5012
13      u5013
14      u5014
15      u5015
16      u5016
17      u5017
18      u5018
19      u5019
20      u5020
21      u5021
22      u5022
23      u5023
24      u5024
25      u5025
26      u5026
27      u5027
28      u5028
29      u5029
        ...  
1970    u6970
1971    u6971
1972    u6972
1973    u6973
1974    u6974
1975    u6975
1976    u6976
1977    u6977
1978    u6978
1979    u6979
1980    u6980
1981    u6981
1982    u6982
1983    u6983
1984    u6984
1985    u6985
1986    u6986
1987    u6987
1988    u6988
1989    u6989
1990    u6990
1991    u6991
1992    u6992
1993    u6993
1994    u6994
1995    u6995
1996    u6996
1997    u6997
1998    u6998
1999    u6999
Name: uid, Length: 2000, dtype: object


In [543]:
# Submission frame with a fixed column order: uid first, then the predicted label.
ans = pd.DataFrame({'uid': id_series['uid'], 'label': predictions}, columns = ['uid', 'label'])

In [544]:
# Order rows by descending predicted label.
ans = ans.sort_values(by='label', ascending=False)

In [545]:
# Write the stacked-model submission as headerless "uid,label" rows.
ans.to_csv('../result/ans_4_df.csv', index=False, header = False)