# Read Data

In [20]:
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# constant definition
# small_data_path = './data/small_size/sample_atec_anti_fraud_train.csv'
train_path = '../data/full_size/atec_anti_fraud_train.csv'
testb_path='../data/full_size/atec_anti_fraud_test_b.csv'
train_data = pd.read_csv(train_path,index_col = 0)
testb_data = pd.read_csv(testb_path,index_col = 0)

# Feature Selection

In [21]:
# find no missing value features
no_nan_features = ['date']
i = 1
while i<len(testb_data.columns):
    name = testb_data.columns[i]
    if train_data[name].isnull().sum()==0:
        no_nan_features.append(name)
    i+=1
    
# find small missing features
small_missing_features=[]
i = 1
while i<len(testb_data.columns):
    name = testb_data.columns[i]
    train_missing_rate = train_data[name].isnull().sum()/train_data.shape[0]
    test_missing_rate = testb_data[name].isnull().sum()/testb_data.shape[0]
    if 0<train_missing_rate<0.3 and abs(test_missing_rate-train_missing_rate)<0.1:
        small_missing_features.append(name)
    i+=1
    
filldable_features = small_missing_features+no_nan_features
# feature selection
feature_score_files=['xgb_feature_scores.csv','lgb_feature_scores2.csv']
common_important_features=set()
all_important_features=set()
top=100
for file in feature_score_files:
    features = set(pd.read_csv(file,index_col = 0,header=None).sort_values(by=1,ascending=False).iloc[:top,0].index.tolist())
    all_important_features = all_important_features|features
    if common_important_features:
        common_important_features = common_important_features&features
    else:
        common_important_features=features

In [22]:
selected_features = filldable_features

# Fill selected features

In [23]:
train_data[filldable_features] = train_data[filldable_features].fillna(train_data[filldable_features].mean())
testb_data[filldable_features] = testb_data[filldable_features].fillna(testb_data[filldable_features].mean())

# train test split

In [25]:
# preprocess sample data
train_data = train_data[['label']+selected_features]
train_data = train_data[train_data['label']!=-1] # delete all -1 labeled data
train_data = train_data.sort_values(by=['date'])

train_num = int(0.8*train_data.shape[0])
test_data = train_data.iloc[train_num:,:]
train_data = train_data.iloc[:train_num,:]
train_x = train_data.drop(columns=['label'])
train_y = train_data['label']
test_x = test_data.drop(columns=['label'])
test_y = test_data['label']

# Build Bayessian Model

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
multinomial_nb_clf = BernoulliNB()
multinomial_nb_clf.fit(train_x,train_y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [72]:
from sklearn.metrics import precision_score,recall_score
predict_y = multinomial_nb_clf.predict(test_x)
predict_y_proba = multinomial_nb_clf.predict_proba(test_x)[:,1]
print(precision_score(test_y,predict_y),recall_score(test_y,predict_y))

0.0721766772628409 0.8201520912547529


In [94]:
predict_y_proba = pd.Series(predict_y_proba,index = test_y.index).apply(lambda x:1 if x>0.996 else 0)

In [98]:
recall_score(test_y,predict_y_proba)

0.10950570342205324

In [77]:
test_y.value_counts()

0    195372
1      2630
Name: label, dtype: int64

# Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100,max_depth=9, random_state=0)
rf_clf.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [64]:
predict_y=rf_clf.predict(test_x)
from sklearn.metrics import precision_score,recall_score
recall_score(train_y,predict_y)

0.3166877370417193