# Algorithms
+ Here 4 classification algorithms will be used:
    + Logistic regression
    + Decision tree
    + Random forest
    + Light GBM

In [1]:
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

## 1. Load the data

In [3]:
training_data = pd.read_csv("data/609_training_data.csv")
training_data.drop(columns=['Unnamed: 0'], inplace=True)
print(training_data.shape)
training_data.head(3)

(58508, 197)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,187,188,189,190,191,192,193,194,195,196
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
training_label = pd.read_csv("data/609_training_label.csv")
training_label.drop(columns=['Unnamed: 0'], inplace=True)
print(training_label.shape)
training_label.head(3)

(58508, 1)


Unnamed: 0,Final_Y
0,0
1,0
2,0


In [5]:
testing_data = pd.read_csv("data/609_testing_data.csv")
testing_data.drop(columns=['Unnamed: 0'], inplace=True)
print(testing_data.shape)
testing_data.head(3)

(8238, 197)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,187,188,189,190,191,192,193,194,195,196
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
print(training_label.groupby('Final_Y').size())

Final_Y
0    29254
1    29254
dtype: int64


---

## 2. helper functions

In [7]:
# 构建上传文件 
def get_submission_file(optimized_model, testing_data, save_path):
    testing_label_pred = optimized_model.predict(testing_data)
    pd_testing_label_pred = pd.DataFrame(testing_label_pred, columns=['Final_Y'])
    pd_testing_label_pred.index = range(1,len(pd_testing_label_pred) + 1)
    pd_testing_label_pred.head()
    pd_testing_label_pred.to_csv(save_path)

In [8]:
# cross validation training 80%training, 20% test, mean result
# to check the training progress and select suitable parameters
def kfold_test(model, training_data, training_label):
    kfold = KFold(n_splits=5, shuffle=True, random_state=0) # shuffle the data
    precision_train = []
    precision_cv = []
    recall_train = []
    recall_cv = []
    f1_train = []
    f1_cv = []
    roc_auc_train = []
    roc_auc_cv = []
    for train_index, cv_index in kfold.split(training_data):
        # print("Round {}".format(round_count))
        train_x = training_data.iloc[train_index]
        train_y = training_label.iloc[train_index]
        cv_x = training_data.iloc[cv_index]
        cv_y = training_label.iloc[cv_index]
        model.fit(train_x, np.ravel(train_y))
        # Get training and cv result
        pred_tr_y = model.predict(train_x)
        pred_tr_y_proba = model.predict_proba(train_x)[:,1]
        pred_y = model.predict(cv_x)
        pred_y_proba = model.predict_proba(cv_x)[:,1]
        # Record result
        precision_train.append(precision_score(pred_tr_y, train_y))
        recall_train.append(recall_score(pred_tr_y, train_y))
        f1_train.append(f1_score(pred_tr_y, train_y))
        roc_auc_train.append(roc_auc_score(train_y, pred_tr_y_proba))
        precision_cv.append(precision_score(pred_y, cv_y))
        recall_cv.append(recall_score(pred_y, cv_y))
        f1_cv.append(f1_score(pred_y, cv_y))
        roc_auc_cv.append(roc_auc_score(cv_y, pred_y_proba))
    # mean result
    tr_print = "TR_Precision {:.4f} |TR_Recall {:.4f} |TR_F1 {:.4f} |TR_AUROC {:.4f}".format(
                np.mean(precision_train),np.mean(recall_train),
                np.mean(f1_train),np.mean(roc_auc_train)
                )
    cv_print = "CV_Precision {:.4f} |CV_Recall {:.4f} |CV_F1 {:.4f} |CV_AUROC {:.4f}".format(
                np.mean(precision_cv),np.mean(recall_cv),
                np.mean(f1_cv),np.mean(roc_auc_cv)
                )
    return tr_print, cv_print

In [9]:
# check number of predicted 1s in testing data
def check_test_zeros(trained_model, testing_data):
    pred = trained_model.predict(testing_data)
    return pred.sum()

In [10]:
# combined, i will use this in next stage
def single_test(model, training_data, training_label, testing_data,save_path=None):
    # 1 compare training (80% of training data) and validation (20%)
    tr_print, cv_print = kfold_test(model, training_data, training_label)
    print(tr_print)
    print(cv_print)
    # 2 train on 100% training data
    model.fit(training_data, np.ravel(training_label))
    # 3 check auroc value on 100% training data
    y_predprob = model.predict_proba(training_data)[:,1]
    score_print = "Train roc_auc_score: {:.4f}".format(roc_auc_score(training_label, y_predprob))
    print(score_print)
    # 4 check how many 1s are predicted in testing data
    pred1s = check_test_zeros(model, testing_data)
    print("1s in predictions: {}".format(pred1s))
    # 5 save model if the path exists
    if save_path is not None:
        print("Save to {}".format(save_path))
        get_submission_file(optimized_model=model, testing_data=testing_data, save_path=save_path)

---

## 3. LR & DecisionTree

### LR

In [15]:
# basic， training and testing  result is low, but similar, the model is underfitting
lr_basic = LogisticRegression(random_state=0, solver='lbfgs', max_iter=10000)
single_test(lr_basic, training_data, training_label, testing_data)

TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793


In [19]:
lr_basic

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
# parameter C, helpful to handle overfitting: 1
param_grid = [100.0,10.0,1.0,0.5,0.2,0.1,0.05,0.02,0.01,0.005,0.002,0.001] # default 1.0
for para in param_grid:
    print("Current {}".format(para))
    estimator = LogisticRegression(random_state=0, solver='lbfgs', max_iter=10000, C=para)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current 100.0
TR_Precision 0.9244 |TR_Recall 0.8764 |TR_F1 0.8998 |TR_AUROC 0.9500
CV_Precision 0.9234 |CV_Recall 0.8753 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793
-----
Current 10.0
TR_Precision 0.9244 |TR_Recall 0.8764 |TR_F1 0.8998 |TR_AUROC 0.9500
CV_Precision 0.9234 |CV_Recall 0.8753 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793
-----
Current 1.0
TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793
-----
Current 0.5
TR_Precision 0.9245 |TR_Recall 0.8760 |TR_F1 0.8996 |TR_AUROC 0.9500
CV_Precision 0.9238 |CV_Recall 0.8749 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9498
1s in predictions: 1790
-----
Current 0.2
TR_Precision 0.9244 |TR_Recall 0.8759 |TR_F1 0.8995 |TR_AUROC 0.9499
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8986 |CV_AUROC 0.9489
Train roc_auc_s

In [17]:
# solver, algorithmm, lbfgs
for para in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
    print("Current {}".format(para))
    estimator = LogisticRegression(random_state=0, solver=para, max_iter=10000, C)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current newton-cg
TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8986 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793
-----
Current lbfgs
TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793
-----
Current liblinear
TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9238 |CV_Recall 0.8749 |CV_F1 0.8987 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1792
-----
Current sag
TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8986 |CV_AUROC 0.9489
Train roc_auc_score: 0.9499
1s in predictions: 1793
-----
Current saga
TR_Precision 0.9245 |TR_Recall 0.8761 |TR_F1 0.8997 |TR_AUROC 0.9500
CV_Precision 0.9237 |CV_Recall 0.8749 |CV_F1 0.8986 |CV_AUROC 0.9489
Tra

---

### Tree

In [14]:
# basic tree: 效果稍好, 但仍旧一般 better than lr
tree_basic = DecisionTreeClassifier(random_state=0)
single_test(tree_basic, training_data, training_label, testing_data)

TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in predictions: 950


In [28]:
DecisionTreeClassifier(random_state=0, max_depth=17)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=17,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [18]:
# max_depth: 17 to avoid overfitting, better performance, the max deep of tree
# if the depth is too deep, it will lead to overfitting
param_grid = [None]+ list(range(1,21)) + list(range(25,151,5))
for para in param_grid:
    print("Current {}".format(para))
    estimator = DecisionTreeClassifier(random_state=0, max_depth=para)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current None
TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in predictions: 950
-----
Current 1
TR_Precision 0.7122 |TR_Recall 0.7166 |TR_F1 0.7144 |TR_AUROC 0.7153
CV_Precision 0.7122 |CV_Recall 0.7166 |CV_F1 0.7144 |CV_AUROC 0.7153
Train roc_auc_score: 0.7153
1s in predictions: 2628
-----
Current 2
TR_Precision 0.8522 |TR_Recall 0.7864 |TR_F1 0.8150 |TR_AUROC 0.8600
CV_Precision 0.8533 |CV_Recall 0.7854 |CV_F1 0.8152 |CV_AUROC 0.8596
Train roc_auc_score: 0.8618
1s in predictions: 2880
-----
Current 3
TR_Precision 0.7902 |TR_Recall 0.8840 |TR_F1 0.8332 |TR_AUROC 0.8863
CV_Precision 0.7911 |CV_Recall 0.8828 |CV_F1 0.8332 |CV_AUROC 0.8858
Train roc_auc_score: 0.8877
1s in predictions: 1255
-----
Current 4
TR_Precision 0.8536 |TR_Recall 0.8720 |TR_F1 0.8627 |TR_AUROC 0.9070
CV_Precision 0.8514 |CV_Recall 0.8712 |CV_F1 0.8612 |CV_AUROC 0.9057
Train roc_auc_score: 0.908

TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in predictions: 950
-----
Current 125
TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in predictions: 950
-----
Current 130
TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in predictions: 950
-----
Current 135
TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in predictions: 950
-----
Current 140
TR_Precision 0.9997 |TR_Recall 1.0000 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9413 |CV_Recall 0.9339 |CV_F1 0.9376 |CV_AUROC 0.9374
Train roc_auc_score: 1.0000
1s in 

In [23]:
# min_samples_split: 2 default, the min number of samples required to split an internal node
param_grid = list(range(2,21)) + list(range(25,101,5))
for para in param_grid:
    print("Current {}".format(para))
    estimator = DecisionTreeClassifier(random_state=0, max_depth=17, min_samples_split=para)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current 2
TR_Precision 0.9818 |TR_Recall 0.9753 |TR_F1 0.9785 |TR_AUROC 0.9961
CV_Precision 0.9461 |CV_Recall 0.9348 |CV_F1 0.9404 |CV_AUROC 0.9476
Train roc_auc_score: 0.9959
1s in predictions: 1014
-----
Current 3
TR_Precision 0.9799 |TR_Recall 0.9751 |TR_F1 0.9775 |TR_AUROC 0.9960
CV_Precision 0.9450 |CV_Recall 0.9350 |CV_F1 0.9399 |CV_AUROC 0.9478
Train roc_auc_score: 0.9958
1s in predictions: 1015
-----
Current 4
TR_Precision 0.9788 |TR_Recall 0.9740 |TR_F1 0.9764 |TR_AUROC 0.9958
CV_Precision 0.9449 |CV_Recall 0.9351 |CV_F1 0.9400 |CV_AUROC 0.9500
Train roc_auc_score: 0.9956
1s in predictions: 1018
-----
Current 5
TR_Precision 0.9776 |TR_Recall 0.9735 |TR_F1 0.9755 |TR_AUROC 0.9957
CV_Precision 0.9439 |CV_Recall 0.9352 |CV_F1 0.9395 |CV_AUROC 0.9517
Train roc_auc_score: 0.9955
1s in predictions: 1016
-----
Current 6
TR_Precision 0.9763 |TR_Recall 0.9727 |TR_F1 0.9745 |TR_AUROC 0.9955
CV_Precision 0.9443 |CV_Recall 0.9359 |CV_F1 0.9401 |CV_AUROC 0.9536
Train roc_auc_score: 0.9954


In [24]:
# min_samples_leaf: default 1, the min number of samples required to be at a leaf node
param_grid = list(range(1,21))
for para in param_grid:
    print("Current {}".format(para))
    estimator = DecisionTreeClassifier(random_state=0, max_depth=17, min_samples_leaf=para)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current 1
TR_Precision 0.9818 |TR_Recall 0.9753 |TR_F1 0.9785 |TR_AUROC 0.9961
CV_Precision 0.9461 |CV_Recall 0.9348 |CV_F1 0.9404 |CV_AUROC 0.9476
Train roc_auc_score: 0.9959
1s in predictions: 1014
-----
Current 2
TR_Precision 0.9689 |TR_Recall 0.9739 |TR_F1 0.9714 |TR_AUROC 0.9956
CV_Precision 0.9379 |CV_Recall 0.9391 |CV_F1 0.9385 |CV_AUROC 0.9550
Train roc_auc_score: 0.9954
1s in predictions: 989
-----
Current 3
TR_Precision 0.9682 |TR_Recall 0.9683 |TR_F1 0.9682 |TR_AUROC 0.9951
CV_Precision 0.9396 |CV_Recall 0.9367 |CV_F1 0.9381 |CV_AUROC 0.9612
Train roc_auc_score: 0.9949
1s in predictions: 988
-----
Current 4
TR_Precision 0.9617 |TR_Recall 0.9677 |TR_F1 0.9647 |TR_AUROC 0.9947
CV_Precision 0.9346 |CV_Recall 0.9389 |CV_F1 0.9367 |CV_AUROC 0.9661
Train roc_auc_score: 0.9946
1s in predictions: 931
-----
Current 5
TR_Precision 0.9627 |TR_Recall 0.9628 |TR_F1 0.9628 |TR_AUROC 0.9943
CV_Precision 0.9389 |CV_Recall 0.9350 |CV_F1 0.9369 |CV_AUROC 0.9686
Train roc_auc_score: 0.9942
1s 

In [26]:
# max_leaf_nodes, none, relation reduction in impurity
param_grid = [None]+list(range(5,101,5))
for para in param_grid:
    print("Current {}".format(para))
    estimator = DecisionTreeClassifier(random_state=0, max_depth=17, max_leaf_nodes=para)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current None
TR_Precision 0.9818 |TR_Recall 0.9753 |TR_F1 0.9785 |TR_AUROC 0.9961
CV_Precision 0.9461 |CV_Recall 0.9348 |CV_F1 0.9404 |CV_AUROC 0.9476
Train roc_auc_score: 0.9959
1s in predictions: 1014
-----
Current 5
TR_Precision 0.7755 |TR_Recall 0.8696 |TR_F1 0.8185 |TR_AUROC 0.8704
CV_Precision 0.7766 |CV_Recall 0.8682 |CV_F1 0.8183 |CV_AUROC 0.8699
Train roc_auc_score: 0.8730
1s in predictions: 1299
-----
Current 10
TR_Precision 0.8684 |TR_Recall 0.8614 |TR_F1 0.8647 |TR_AUROC 0.9057
CV_Precision 0.8669 |CV_Recall 0.8616 |CV_F1 0.8641 |CV_AUROC 0.9047
Train roc_auc_score: 0.9055
1s in predictions: 1845
-----
Current 15
TR_Precision 0.9037 |TR_Recall 0.8654 |TR_F1 0.8840 |TR_AUROC 0.9307
CV_Precision 0.9034 |CV_Recall 0.8661 |CV_F1 0.8842 |CV_AUROC 0.9297
Train roc_auc_score: 0.9302
1s in predictions: 1889
-----
Current 20
TR_Precision 0.8970 |TR_Recall 0.8910 |TR_F1 0.8937 |TR_AUROC 0.9427
CV_Precision 0.8972 |CV_Recall 0.8898 |CV_F1 0.8933 |CV_AUROC 0.9421
Train roc_auc_score: 0

---

## 4. RF

In [18]:
# the basic result is boosted in all metrics
rf_basic1 = RandomForestClassifier(n_estimators=10, random_state=0)
rf_basic1

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
# basic 1: n_estimators=10, all others default, decent result! 后面用的130 和128, 
rf_basic1 = RandomForestClassifier(n_estimators=10, random_state=0)
single_test(rf_basic1, training_data, training_label, testing_data)

TR_Precision 0.9969 |TR_Recall 0.9989 |TR_F1 0.9979 |TR_AUROC 1.0000
CV_Precision 0.9514 |CV_Recall 0.9606 |CV_F1 0.9560 |CV_AUROC 0.9911
Train roc_auc_score: 1.0000
1s in predictions: 706


In [16]:
# basic 2: n_estimators=100, all others default, decent result!
rf_basic2 = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=3)
single_test(rf_basic2, training_data, training_label, testing_data)

TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9657 |CV_Recall 0.9575 |CV_F1 0.9616 |CV_AUROC 0.9952
Train roc_auc_score: 1.0000
1s in predictions: 796


In [30]:
# result of round2， 130和 128 都很好, n_estimators: the number of trees in the forest
estimator = RandomForestClassifier(n_estimators=130, random_state=0, n_jobs=3)
single_test(estimator, training_data, training_label, testing_data)

TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9666 |CV_Recall 0.9574 |CV_F1 0.9620 |CV_AUROC 0.9953
Train roc_auc_score: 1.0000
1s in predictions: 802


In [29]:
estimator = RandomForestClassifier(n_estimators=128, random_state=0, n_jobs=3)
single_test(estimator, training_data, training_label, testing_data)

TR_Precision 0.9999 |TR_Recall 0.9998 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9664 |CV_Recall 0.9576 |CV_F1 0.9620 |CV_AUROC 0.9953
Train roc_auc_score: 1.0000
1s in predictions: 800


In [29]:
# max_features: 随着max_features增加, precision增加, recall下降， log, 2,3,6, 
# the number of features to consider when looking for the best split
# auto --> sqrt(197) = 14
# log2 --> log2(197) = 7
# param_grid = ['auto'] + list(range(3,26,2)) + list(range(30,151,5))
param_grid = ['auto','log2'] + list(range(1,16))
for para in param_grid:
    print("Current {}".format(para))
    estimator = RandomForestClassifier(n_estimators=128, 
                                       random_state=0, 
                                       max_depth=66,
                                       max_features=para,
                                       n_jobs=3)
    single_test(estimator, training_data, training_label, testing_data)
    print("-----")

Current auto
TR_Precision 0.9999 |TR_Recall 0.9998 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9664 |CV_Recall 0.9577 |CV_F1 0.9620 |CV_AUROC 0.9953
Train roc_auc_score: 1.0000
1s in predictions: 800
-----
Current log2
TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9642 |CV_Recall 0.9639 |CV_F1 0.9641 |CV_AUROC 0.9959
Train roc_auc_score: 1.0000
1s in predictions: 669
-----
Current 1
TR_Precision 0.9999 |TR_Recall 0.9998 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9576 |CV_Recall 0.9698 |CV_F1 0.9636 |CV_AUROC 0.9961
Train roc_auc_score: 1.0000
1s in predictions: 500
-----
Current 2
TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9595 |CV_Recall 0.9710 |CV_F1 0.9652 |CV_AUROC 0.9962
Train roc_auc_score: 1.0000
1s in predictions: 508
-----
Current 3
TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9612 |CV_Recall 0.9702 |CV_F1 0.9657 |CV_AUROC 0.9963
Train roc_auc_score: 1.000

In [30]:
# sqrt = 14, max_features, auto
estimator = RandomForestClassifier(n_estimators=128, 
                                       random_state=0, 
                                       max_depth=66,
                                       max_features='auto',
                                       n_jobs=3)
single_test(estimator, training_data, training_label, testing_data, save_path='result/999_rf_001.csv')

Save to result/999_rf_001.csv
TR_Precision 0.9999 |TR_Recall 0.9998 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9664 |CV_Recall 0.9577 |CV_F1 0.9620 |CV_AUROC 0.9953
Train roc_auc_score: 1.0000
1s in predictions: 800


In [31]:
# log2 about 7
estimator = RandomForestClassifier(n_estimators=128, 
                                       random_state=0, 
                                       max_depth=66,
                                       max_features='log2',
                                       n_jobs=3)
single_test(estimator, training_data, training_label, testing_data, save_path='result/999_rf_002.csv')

Save to result/999_rf_002.csv
TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9642 |CV_Recall 0.9639 |CV_F1 0.9641 |CV_AUROC 0.9959
Train roc_auc_score: 1.0000
1s in predictions: 669


In [38]:
# max_features=3
estimator = RandomForestClassifier(n_estimators=128, 
                                       random_state=0, 
                                       max_depth=66,
                                       max_features=3,
                                       n_jobs=3)
single_test(estimator, training_data, training_label, testing_data, save_path='result/999_rf_003.csv')

Save to result/999_rf_003.csv
TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9612 |CV_Recall 0.9702 |CV_F1 0.9657 |CV_AUROC 0.9963
Train roc_auc_score: 1.0000
1s in predictions: 528


In [43]:
# max_features=6
estimator = RandomForestClassifier(n_estimators=128, 
                                       random_state=0, 
                                       max_depth=66,
                                       max_features=6,
                                       n_jobs=3)
single_test(estimator, training_data, training_label, testing_data, save_path='result/999_rf_004.csv')

Save to result/999_rf_004.csv
TR_Precision 0.9998 |TR_Recall 0.9999 |TR_F1 0.9999 |TR_AUROC 1.0000
CV_Precision 0.9637 |CV_Recall 0.9659 |CV_F1 0.9648 |CV_AUROC 0.9960
Train roc_auc_score: 1.0000
1s in predictions: 630


In [44]:
# finally I choose 4 rf models and submit
# Here "different results" is my previous test submission, achieved ~ 0.91745
compare_with_prev_best('result/999_rf_001_91041.csv') # 0.91041
compare_with_prev_best('result/999_rf_002_91381.csv') # 0.91381 # log 2
compare_with_prev_best('result/999_rf_003_91114.csv') # 0.91114
compare_with_prev_best('result/999_rf_004.csv')       # 0.91114

predicted 1s: prev_best 796| curr 800
different results 302
predicted 1s: prev_best 796| curr 669
different results 371
predicted 1s: prev_best 796| curr 528
different results 480
predicted 1s: prev_best 796| curr 630
different results 386


In [31]:
# Best random forest model: rf_002, 0.91381
estimator = RandomForestClassifier(n_estimators=128, 
                                       random_state=0, 
                                       max_depth=66,
                                       max_features='log2',
                                       n_jobs=3)
estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=66, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=3,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

---

## 5. LGBM

In [17]:
# basic: the basic result is worse than RF but better than LR and tree
lgbm_basic1 = LGBMClassifier(random_state=0)
single_test(lgbm_basic1, training_data, training_label, testing_data)

TR_Precision 0.9576 |TR_Recall 0.9639 |TR_F1 0.9608 |TR_AUROC 0.9953
CV_Precision 0.9509 |CV_Recall 0.9562 |CV_F1 0.9536 |CV_AUROC 0.9933
Train roc_auc_score: 0.9950
1s in predictions: 899


In [15]:
lgbm_basic1

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

---

### Round 1: n_estimators

这里有两个版本: n_estimators = 123 and 285

In [64]:
# two versions
estimator = LGBMClassifier(random_state=0, n_estimators=123, learning_rate=0.1)
single_test(estimator, training_data, training_label, testing_data, save_path="result/999_lgbm_001_123.csv")
estimator = LGBMClassifier(random_state=0, n_estimators=285, learning_rate=0.1)
single_test(estimator, training_data, training_label, testing_data, save_path="result/999_lgbm_002_285.csv")

Save to result/999_lgbm_001_123.csv
TR_Precision 0.9590 |TR_Recall 0.9677 |TR_F1 0.9634 |TR_AUROC 0.9959
CV_Precision 0.9504 |CV_Recall 0.9574 |CV_F1 0.9539 |CV_AUROC 0.9934
Train roc_auc_score: 0.9955
1s in predictions: 873
Save to result/999_lgbm_002_285.csv
TR_Precision 0.9712 |TR_Recall 0.9835 |TR_F1 0.9773 |TR_AUROC 0.9982
CV_Precision 0.9470 |CV_Recall 0.9597 |CV_F1 0.9533 |CV_AUROC 0.9931
Train roc_auc_score: 0.9977
1s in predictions: 818


---

### Round 2: max_depth, num_leaves

max_depth

num_leaves

---
### Round 3: min_child_samples, min_child_weight
min_child_samples

min_child_weight

#### Build 2 submission tests: lgbm003, lgbm004

In [102]:
# 构建模型 123 version --> 91599, the 123 one has better f1 score, 285 one recall is good
# TR_Precision 0.9669 |TR_Recall 0.9770 |TR_F1 0.9719 |TR_AUROC 0.9974
# CV_Precision 0.9514 |CV_Recall 0.9585 |CV_F1 0.9549 |CV_AUROC 0.9935
# Train roc_auc_score: 0.9971
# 1s in predictions: 824
estimator = LGBMClassifier(random_state=0, 
                               n_estimators=123,
                               max_depth=10,
                               num_leaves=66,
                               min_child_samples=26)
# single_test(estimator, training_data, training_label, testing_data, save_path="result/999_lgbm_003_123.csv")
compare_with_prev_best("result/999_lgbm_003_123_91599.csv")

predicted 1s: prev_best 796| curr 824
different results 254


In [117]:
# 构建模型 285 version --> 91357
# TR_Precision 0.9687 |TR_Recall 0.9835 |TR_F1 0.9760 |TR_AUROC 0.9980
# CV_Precision 0.9463 |CV_Recall 0.9616 |CV_F1 0.9539 |CV_AUROC 0.9932
# Train roc_auc_score: 0.9978
# 1s in predictions: 792
estimator = LGBMClassifier(random_state=0, 
                               n_estimators=285,
                               max_depth=7,
                               num_leaves=68)
# single_test(estimator, training_data, training_label, testing_data, save_path="result/999_lgbm_004_285_91357.csv")
compare_with_prev_best("result/999_lgbm_004_285_91357.csv")

predicted 1s: prev_best 796| curr 792
different results 282


#### it seemed that 123 version is better,  in next sections i will tune it only

---

### Round 4: max_bin, min_data_in_leaf
max_bin: max number of bins

min_data_in_leaf: 默认20, 过拟合时用, min number of data in one leaf

---

### Round 5: feature_fraction, bagging_fraction, bagging_freq

+ **123 version only**

----

### Round 6: lambda_l1, lambda_l2: handle overfitting

---

### Round 7: min_split_gain

### submit test
The performance is not good on public leader board. But it finally achieves 4th place in private leaderboard

In [127]:
# 91211
estimator = LGBMClassifier(random_state=0, 
                               n_estimators=123,
                               max_depth=10,
                               num_leaves=66,
                               min_child_samples=26,
                               min_data_in_leaf=101)
single_test(estimator, training_data, training_label, testin
            g_data, save_path="result/999_lgbm_005_123.csv")
compare_with_prev_best("result/999_lgbm_005_123.csv")

Save to result/999_lgbm_005_123.csv
TR_Precision 0.9595 |TR_Recall 0.9717 |TR_F1 0.9656 |TR_AUROC 0.9961
CV_Precision 0.9498 |CV_Recall 0.9598 |CV_F1 0.9548 |CV_AUROC 0.9934
Train roc_auc_score: 0.9959
1s in predictions: 841
predicted 1s: prev_best 796| curr 841
different results 225


## Final one
+ This is my best submission, where only n_estimators is tunes
+ 0.91915: 2nd place on public leaderboard

#### In private leaderboard, it achieves 9th place

In [130]:
estimator = LGBMClassifier(random_state=0, n_estimators=123, learning_rate=0.1)
estimator

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=123, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)