In [38]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
train = pd.read_csv('./data/train_featureV2.csv')
test = pd.read_csv('./data/test_featureV2.csv')

dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [40]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.0496,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
}    

In [41]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre = pre.sort_values(by='preds',ascending=False)
    auc = metrics.roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.2 else 0)
    f1 = metrics.f1_score(pre.label,pre.preds)
    
    res = 0.6*auc +0.4*f1
    return 'res',res,True

In [42]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.652484 + 0.00496582
[10]	cv_agg's res: 0.659985 + 0.00467271
[15]	cv_agg's res: 0.662978 + 0.0055251
[20]	cv_agg's res: 0.714503 + 0.00616572
[25]	cv_agg's res: 0.762933 + 0.0107428
[30]	cv_agg's res: 0.779195 + 0.0103663
[35]	cv_agg's res: 0.789528 + 0.00996587
[40]	cv_agg's res: 0.794415 + 0.00954591
[45]	cv_agg's res: 0.795063 + 0.00979173
[50]	cv_agg's res: 0.798517 + 0.00996682
[55]	cv_agg's res: 0.800581 + 0.0121477
[60]	cv_agg's res: 0.801993 + 0.0127354
[65]	cv_agg's res: 0.804032 + 0.0130346
[70]	cv_agg's res: 0.804009 + 0.0136653
[75]	cv_agg's res: 0.803715 + 0.0122672
[80]	cv_agg's res: 0.80289 + 0.0124797
[85]	cv_agg's res: 0.804906 + 0.011856
[90]	cv_agg's res: 0.804793 + 0.0128715
[95]	cv_agg's res: 0.805991 + 0.0150046
[100]	cv_agg's res: 0.804949 + 0.0151617
[105]	cv_agg's res: 0.805803 + 0.0156095
[110]	cv_agg's res: 0.806382 + 0.0147994
[115]	cv_agg's res: 0.805574 + 0.0148936
[120]	cv_agg's res: 0.805679 + 0.0158333
[125]	cv_agg's res: 0.808018 + 

{'res-mean': [0.615175017401903,
  0.6376254246339356,
  0.640321927790149,
  0.6486317035010334,
  0.6524844955386857,
  0.6560691795585459,
  0.6576442146444426,
  0.6588153756261408,
  0.6593670001808061,
  0.6599848561263796,
  0.660691293830571,
  0.6612944783197309,
  0.6617343135490057,
  0.6618849869018965,
  0.6629783751873662,
  0.6633873358986594,
  0.6634742143266971,
  0.6635036165044325,
  0.6827019924831049,
  0.7145026050073885,
  0.7308831998755257,
  0.7433118780338498,
  0.7497090397623456,
  0.7557972254068148,
  0.7629334689776689,
  0.767633806173534,
  0.770917198346401,
  0.7737098591112067,
  0.7760037034325403,
  0.779194731962432,
  0.7830160109357295,
  0.7838263369174562,
  0.7872389771392253,
  0.7889662525675023,
  0.7895282590163476,
  0.7923004764333407,
  0.7936625817154059,
  0.7933072987457761,
  0.794682841907539,
  0.7944150081494414,
  0.7950237929866314,
  0.7950429737293723,
  0.7957012693320712,
  0.7947112664732452,
  0.7950633171455433,
  0.7

In [43]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.702743
[10]	training's res: 0.705277
[15]	training's res: 0.707844
[20]	training's res: 0.76384
[25]	training's res: 0.835722
[30]	training's res: 0.867628
[35]	training's res: 0.887093
[40]	training's res: 0.902755
[45]	training's res: 0.9125
[50]	training's res: 0.920982
[55]	training's res: 0.929715
[60]	training's res: 0.936577
[65]	training's res: 0.942991
[70]	training's res: 0.949105
[75]	training's res: 0.954251
[80]	training's res: 0.959168
[85]	training's res: 0.963947
[90]	training's res: 0.967886
[95]	training's res: 0.970934
[100]	training's res: 0.974024
[105]	training's res: 0.977754
[110]	training's res: 0.98055
[115]	training's res: 0.98441
[120]	training's res: 0.986059
[125]	training's res: 0.988559
[130]	training's res: 0.991092
[135]	training's res: 0.993228
[140]	training's res: 0.994953
[145]	training's res: 0.996258
[150]	training's res: 0.997132
[155]	training's res: 0.997351
[160]	training's res: 0.99757
[165]	training's res: 0.99845
[170

In [44]:
pred=model.predict(test.drop(['uid'],axis=1))

res =pd.DataFrame({'uid':test.uid,'label':pred})
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.2 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv('./result/lgb4.csv',index=False,header=False,sep=',',columns=['uid','label'])
