In [2]:
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import lightgbm as lgb

In [3]:
# Input locations used by this notebook.
# Smaller sample of the training file (currently disabled):
# small_data_path = './data/small_size/sample_atec_anti_fraud_train.csv'
# Full training CSV; the path is relative to the notebook's working directory.
full_data_path = '../data/full_size/atec_anti_fraud_train.csv'

In [4]:
# Read the full training CSV into a DataFrame, using its first column as the row index.
df = pd.read_csv(full_data_path,index_col = 0)

In [5]:
# Preprocess the training frame.
# Method 1: discard every row whose label is -1, then order the remainder by date.
df = df.loc[df['label'] != -1].sort_values(by=['date'])
# Optional imputation (disabled): replace each NaN with its column's most common value.
# df = df.fillna(df.mode().iloc[0])

In [6]:
# Positional split: the leading 80% of rows train the model, the trailing 20% are held out.
# NOTE(review): this is only a time-ordered split if df is still sorted by 'date' -- confirm.
train_ratio = 0.8
train_num = int(train_ratio * len(df))

train_rows = slice(None, train_num)
test_rows = slice(train_num, None)

# Column 0 is handed to LightGBM as the label; every remaining column is a feature.
train_x, train_y = df.iloc[train_rows, 1:], df.iloc[train_rows, 0]
test_x, test_y = df.iloc[test_rows, 1:], df.iloc[test_rows, 0]

train_data = lgb.Dataset(data=train_x, label=train_y)
test_data = lgb.Dataset(data=test_x, label=test_y)

In [7]:
# LightGBM training parameters: binary objective, evaluated with binary log-loss.
# NOTE(review): 'num_trees' looks like an iteration-count setting, and the training
# cell also passes num_round -- confirm which of the two LightGBM actually honours.
param = {
    'num_leaves': 31,
    'num_trees': 100,
    'objective': 'binary',
    'metric': 'binary_logloss',
}

In [9]:
# Train the booster, scoring the held-out set after every round and stopping once
# its metric has failed to improve for 20 consecutive rounds.
num_round = 200
bst = lgb.train(
    params=param,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    early_stopping_rounds=20,
)

[1]	valid_0's binary_logloss: 0.601413
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's binary_logloss: 0.526239
[3]	valid_0's binary_logloss: 0.463591
[4]	valid_0's binary_logloss: 0.410671
[5]	valid_0's binary_logloss: 0.365467
[6]	valid_0's binary_logloss: 0.326567
[7]	valid_0's binary_logloss: 0.292834
[8]	valid_0's binary_logloss: 0.263547
[9]	valid_0's binary_logloss: 0.237834
[10]	valid_0's binary_logloss: 0.2152
[11]	valid_0's binary_logloss: 0.195239
[12]	valid_0's binary_logloss: 0.177564
[13]	valid_0's binary_logloss: 0.161934
[14]	valid_0's binary_logloss: 0.148025
[15]	valid_0's binary_logloss: 0.135682
[16]	valid_0's binary_logloss: 0.124635
[17]	valid_0's binary_logloss: 0.114831
[18]	valid_0's binary_logloss: 0.105961
[19]	valid_0's binary_logloss: 0.0980647
[20]	valid_0's binary_logloss: 0.0909679
[21]	valid_0's binary_logloss: 0.0845886
[22]	valid_0's binary_logloss: 0.0788938
[23]	valid_0's binary_logloss: 0.0737887
[24]	valid_0's binary_lo

In [6]:
# Class balance of the training labels: number of rows labelled 0 and labelled 1.
counts = train_y.value_counts()
neg_num, pos_num = counts[0], counts[1]

In [22]:
# Display the (rows, columns) shape of the training feature frame.
train_x.shape

(742504, 298)

In [11]:
import numpy as np
def func(t):
    """Convert one predicted score into a hard 0/1 class label.

    Args:
        t: A single score, compared against the fixed 0.5 cut-off.

    Returns:
        0 when ``t < 0.5``, otherwise 1 (a score of exactly 0.5 maps to 1).
    """
    return 0 if t < 0.5 else 1


# Element-wise version of ``func`` for arrays of scores. ``otypes`` is given
# explicitly so the result dtype does not have to be inferred from a trial call on
# the first element; without it np.vectorize raises ValueError on size-0 input.
vfunc = np.vectorize(func, otypes=[int])

In [14]:
from sklearn import metrics
print('predicting')
# Score the held-out rows with the best early-stopping iteration -- the same iteration
# count the model is saved with -- instead of relying on predict()'s default, which is
# not the same in every LightGBM release.
y_predict = bst.predict(test_x, num_iteration=bst.best_iteration)
# Threshold the scores at 0.5 to obtain hard labels for precision/recall.
y_predict = vfunc(y_predict)
print("precision:{},recall:{}".format(metrics.precision_score(test_y,y_predict),metrics.recall_score(test_y,y_predict)))

predicting
precision:0.6573628488931665,recall:0.5193916349809886


In [15]:
# Re-score the held-out rows (y_predict currently holds thresholded labels) using the
# best early-stopping iteration, so the exported scores come from the same trees that
# bst.save_model(..., num_iteration=bst.best_iteration) keeps.
y_predict = bst.predict(test_x, num_iteration=bst.best_iteration)
# Store each score next to its ground-truth label, keyed by the original row index.
result = pd.DataFrame({'score':y_predict,'truth':test_y},index = test_x.index)
result.to_csv('../result/large_lgb_default_early_stop.csv')

In [35]:
# Persist the trained booster, keeping iterations up to bst.best_iteration.
model_name = '../models/lgb_default_early_stop.txt'
bst.save_model(filename=model_name, num_iteration=bst.best_iteration)

In [2]:
# Load a previously saved model into `clf`.
# NOTE(review): `joblib` is not imported in any visible cell, and `model_name` is the
# file written by bst.save_model -- confirm joblib.load can actually read that file.
# joblib.load unpickles its input, so only use it on trusted files.
clf = joblib.load(model_name)

In [53]:
# Class-probability predictions from the loaded model.
# NOTE(review): `x_test` is not defined in any visible cell (the hold-out features are
# named `test_x`), and `yp_pedict` is not read by any visible cell -- confirm both names.
yp_pedict = clf.predict_proba(x_test)

  np.exp(prob, prob)


In [54]:
# Load the real test set, using its first column as the row index.
# The path is resolved from the same working directory as full_data_path, i.e. under
# '../data/full_size/', matching the '../result' and '../models' locations used elsewhere.
# NOTE: this rebinds `test_data`, which earlier held the lgb.Dataset used for validation.
test_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_a.csv',index_col = 0)

In [55]:
# Impute missing values: each NaN becomes its column's most frequent value
# (the first row returned by DataFrame.mode()).
# NOTE(review): the matching imputation on the training frame is commented out -- confirm
# that train-time and test-time preprocessing are meant to differ.
column_modes = test_data.mode().iloc[0]
test_data = test_data.fillna(column_modes)
# Drop the first column and score the remaining ones with the loaded model.
test_data_x = test_data.iloc[:, 1:]
test_predict_y = clf.predict_proba(test_data_x)

  np.exp(prob, prob)


In [16]:
from sklearn import metrics
# NOTE(review): this rebinds `test_y` -- the hold-out labels produced by the train/test
# split -- to the model's class probabilities, repeating the call that already filled
# `test_predict_y`; confirm this is intended.
test_y = clf.predict_proba(test_data_x)

In [56]:
# Keep only the second probability column (index 1) as the score, indexed like test_data.
result = pd.DataFrame(
    test_predict_y[:, 1],
    index=test_data.index,
    columns=['score'],
)

In [58]:
# Write the score table to bm.csv in the current working directory.
result.to_csv('bm.csv')

In [65]:
# Average predicted score across the test rows.
# NOTE(review): the recorded output (~6.1e-06) is almost zero -- confirm the model is not
# producing degenerate predictions.
result['score'].mean()

6.1016783683298489e-06

In [60]:
# Display the per-row scores, indexed by id.
result['score']

id
8e8290c270ec4bc3448dd5edd35c6f059b42d38f9ddd6fda5578f0e340908fa4    0.0
f36c5fa5c0e7afccf733a4d74c7e06ffe43cc8fd24eda7c335237327bb02e02b    0.0
ee85f808b1fd49eaba308527e1686c509dc8e3e50574888e0f5f8bc21733711c    0.0
0df905aa187938d56a9b0816b13f54ac1f87d658a33cd06505b738c66b54fd3f    0.0
15f532f979c4f092bbbe28e5409c8c3b8454ece8f1ab3ee00755b4949ef75c03    0.0
614728f2aba299d20ddffca7e1ebc38ae8f239663562a9a3602e1e2c373f7299    0.0
3dc22d969895bc4afdaa5266d1586ae1a360adfcc638b5fb1e2f242bbbc869eb    0.0
b6cba0173d6a894f443578b5d82c00444dfe24c5e1705d4b2b835062eeb61d7a    0.0
84d010a48c747947588589473e2ebdc59db3d348849c5b67759dfbcd1d92f35e    0.0
bc00fec7ff8b3e4f8bd826048da109e7fe40fcc604ba28ce78576e05a4e67fba    0.0
85426887c8a4f4fd881ae1e79654940e35c68bfd681a5eaac4787f0f82483780    0.0
ee84ac886d80e779aeb5c4af65dbb8e087014d7c98b7973bd9d91d8e2467a341    0.0
4cbb83002d62e3257d49d3742b0a27eb677ae1c464f69e4b19fe73d095ab8eae    0.0
4e6ba7f2c84040ec8afde9f42c5bee3d53fd24e3df292980a6475fc06138d