In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

**Salary** : Reflects items such as base salary, executive cash allowances, and benefits payments.

**Bonus** : Reflects annual cash incentives paid based upon company performance. Also may include other retention payments.

**Long Term Incentive** :
Reflects long-term incentive cash payments from various long-term incentive programs designed to tie executive compensation to long-term success as measured against key performance drivers and business objectives over a multi-year period, generally 3 to 5 years.

**Deferred Income** :
Reflects voluntary executive deferrals of salary, annual cash incentives, and long-term cash incentives as well as cash fees deferred by non-employee directors under a deferred compensation arrangement. May also reflect deferrals under a stock option or phantom stock unit in lieu of cash arrangement.

**Deferral Payments** :
Reflects distributions from a deferred compensation arrangement due to termination of employment or due to in-service withdrawals as per plan provisions.

**Loan Advances** : Reflects total amount of loan advances, excluding repayments, provided by the Debtor in return for a promise of repayment. In certain instances, the terms of the promissory notes allow for the option to repay with stock of the company.

**Other** : Reflects items such as payments for severance, consulting services, relocation costs, tax advances and allowances for employees on international assignment (i.e. housing allowances, cost of living allowances, payments under Enron’s Tax Equalization Program, etc.). May also include payments provided with respect to employment agreements, as well as imputed income amounts for such things as use of corporate aircraft.

**Expenses** : Reflects reimbursements of business expenses. May include fees paid for consulting services.

**Director Fees** : Reflects cash payments and/or value of stock grants made in lieu of cash payments to non-employee directors.

**Exercised Stock Options** : Reflects amounts from exercised stock options which equal the market value in excess of the exercise price on the date the options were exercised either through cashless (same-day sale), stock swap or cash exercises. The reflected gain may differ from that realized by the insider due to fluctuations in the market price and the timing of any subsequent sale of the securities.

**Restricted Stock** :Reflects the gross fair market value of shares and accrued dividends (and/or phantom units and dividend equivalents) on the date of release due to lapse of vesting periods, regardless of whether deferred.

**Restricted Stock Deferred** : Reflects value of restricted stock voluntarily deferred prior to release under a deferred compensation arrangement.

**Stock Value** : In 1998, 1999 and 2000, Debtor and non-debtor affiliates were charged for options granted. The Black-Scholes method was used to determine the amount to be charged. Any amounts charged to Debtor and non-debtor affiliates associated with the options exercised related to these three years have not been subtracted from the share value amounts shown.


In [51]:
data_dir = '/Users/bananaiselite/2021-ml100marathon-midterm/'
train_data = pd.read_csv(data_dir+'train_data.csv')
test_features = pd.read_csv(data_dir+'test_features.csv')
train_data.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,RICE KENNETH D,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,...,1617011.0,174839.0,True,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,SKILLING JEFFREY K,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,...,1920000.0,22122.0,True,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,SHELBY REX,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,...,,1573324.0,True,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,KOPPER MICHAEL J,800000.0,,,,michael.kopper@enron.com,,118134.0,,,...,602671.0,907502.0,True,985032.0,,224305.0,,,2652612.0,985032.0
4,CALGER CHRISTOPHER F,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,...,375304.0,486.0,True,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [52]:
test_features.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,BELDEN TIMOTHY N,5249999.0,2144013.0,-2334434.0,,tim.belden@enron.com,953136.0,17355.0,484.0,228.0,...,,,210698.0,157569.0,,213999.0,5521.0,7991.0,5501630.0,1110705.0
1,BOWEN JR RAYMOND M,1350000.0,,-833.0,,raymond.bowen@enron.com,,65907.0,27.0,140.0,...,,974293.0,1621.0,252055.0,,278601.0,1593.0,1858.0,2669589.0,252055.0
2,HANNON KEVIN P,1500000.0,,-3117011.0,,kevin.hannon@enron.com,5538001.0,34039.0,32.0,32.0,...,,1617011.0,11350.0,853064.0,,243293.0,1035.0,1045.0,288682.0,6391065.0
3,DELAINEY DAVID W,3000000.0,,,,david.delainey@enron.com,2291113.0,86174.0,3069.0,66.0,...,,1294981.0,1661.0,1323148.0,,365163.0,2097.0,3093.0,4747979.0,3614261.0
4,CAUSEY RICHARD A,1000000.0,,-235000.0,,richard.causey@enron.com,,30674.0,49.0,58.0,...,,350000.0,307895.0,2502063.0,,415189.0,1585.0,1892.0,1868758.0,2502063.0


In [53]:
train_data.shape,test_features.shape

((113, 22), (33, 21))

In [54]:
#target
y = train_data['poi']
train_data =train_data.drop(['poi'], axis=1)
train = pd.concat([train_data,test_features],axis=0)
train.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,RICE KENNETH D,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,...,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,SKILLING JEFFREY K,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,...,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,SHELBY REX,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,...,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,KOPPER MICHAEL J,800000.0,,,,michael.kopper@enron.com,,118134.0,,,...,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,CALGER CHRISTOPHER F,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,...,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [55]:
float_col = []
obj_col = []
for i, dtypes in enumerate(train.dtypes):
    if dtypes == 'float64':
        float_col.append(train.columns[i])
    if dtypes == 'object':
        obj_col.append(train.columns[i])
print(f'數字類型的欄位有{len(float_col)}個\n字符類型的欄位有{len(obj_col)}')

數字類型的欄位有19個
字符類型的欄位有2


**缺失值處理**

In [56]:
#檢查缺失值
def na_check(df_data):
    data_na = df_data.isnull().sum()
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing number' :data_na})
    
    return missing_data
na_check(train)

Unnamed: 0,Missing number
loan_advances,142
director_fees,129
restricted_stock_deferred,128
deferral_payments,107
deferred_income,97
long_term_incentive,80
bonus,64
from_messages,60
from_poi_to_this_person,60
from_this_person_to_poi,60


In [57]:
obj_col

['name', 'email_address']

In [58]:
#字符類型資料缺失值處理
'''

判斷這兩項都不會對預測造成影響

'''
train = train.drop(['name', 'email_address'], axis = 1)
test =test_features.drop(['name', 'email_address'], axis = 1)
train.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [59]:
float_col

['bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'loan_advances',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

In [60]:
#數值類型資料缺失值處理
#把缺失值過多的欄位去掉
train = train.drop(['loan_advances', 'director_fees', 'restricted_stock_deferred', 'deferral_payments', 'deferred_income','long_term_incentive'], axis = 1)
test = test.drop(['loan_advances', 'director_fees', 'restricted_stock_deferred', 'deferral_payments', 'deferred_income','long_term_incentive'], axis = 1)
na_check(train)

Unnamed: 0,Missing number
bonus,64
to_messages,60
shared_receipt_with_poi,60
from_this_person_to_poi,60
from_poi_to_this_person,60
from_messages,60
other,53
salary,51
expenses,51
exercised_stock_options,44


In [61]:
#用回歸隨機森林補上預測值
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

value_missing = train.copy()


sortindex = np.argsort(value_missing .isnull().sum(axis=0)).values
sortindex

array([12, 11,  7,  1,  2,  8,  6,  3,  4,  5,  9, 10,  0])

In [62]:
for i in sortindex:
    df = value_missing
    
    df = df.iloc[:,df.columns != i]
    
    fill_target = df.iloc[:, i]
    
    #空缺處補零，隨機森林遇到空值會出錯
    df_0 = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 0).fit_transform(df)
    
    Y_train = fill_target[fill_target.notnull()]
    Y_test = fill_target[fill_target.isnull()]
    X_train = df_0[Y_train.index, :]
    X_test = df_0[Y_test.index, :]

    
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(X_train, Y_train)
    Ypredict = rfc.predict(X_test)
    value_missing.loc[value_missing.iloc[:,i].isnull(), value_missing.columns[i]] = Ypredict

In [63]:
test_missing = test.copy()
sortindex = np.argsort(value_missing .isnull().sum(axis=0)).values
sortindex

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [64]:
#測試集也要填上
for i in sortindex:
    df = test_missing
    
    df = df.iloc[:,df.columns != i]
    
    fill_target = df.iloc[:, i]
    
    #空缺處補零，隨機森林遇到空值會出錯
    df_0 = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 0).fit_transform(df)
    
    Y_train = fill_target[fill_target.notnull()]
    Y_test = fill_target[fill_target.isnull()]
    X_train = df_0[Y_train.index, :]
    X_test = df_0[Y_test.index, :]

    
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(X_train, Y_train)
    Ypredict = rfc.predict(X_test)
    test_missing.loc[test_missing.iloc[:,i].isnull(), test_missing.columns[i]] = Ypredict

In [14]:
na_check(value_missing)

Unnamed: 0,Missing number


In [65]:
na_check(test_missing)

Unnamed: 0,Missing number


# 建立模型

In [15]:
#資料切割
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = value_missing[:y.shape[0]]
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=44)
from sklearn.preprocessing import StandardScaler

#normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
print("X train size: {}".format(X_train.shape))
print("X test size: {}".format(X_test.shape))

X train size: (84, 13)
X test size: (29, 13)


In [17]:
#寫出一個函數來尋找最佳參數
from sklearn.model_selection import GridSearchCV

def get_best_parameter(model, params, X, y):
    grid = GridSearchCV(model, params, cv=5, verbose=1, scoring="accuracy")  
    grid.fit(X, y)
    print(f"Best accuracy: {grid.best_score_}")
    print(f"Best parameters: {grid.best_params_}")
    print(f"Average time(s) to fit: {round(grid.cv_results_['mean_fit_time'].mean(), 3)}")
    print(f"Average time(s) to score: {round(grid.cv_results_['mean_score_time'].mean(), 3)}")
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    #  這邊依設置的參數列印出所有參數的得分狀況
    for mean, std, params in zip(means, stds, grid.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    return grid

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

**Logistic Regression**

In [23]:
lr = LogisticRegression(max_iter = 5000)
lr_params = {"C": [0.01,0.02,0.03,0.04,0.05]
             , "penalty": ["l1", "l2"]
             , "solver" :['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
             , "random_state" :[200]}

lr = get_best_parameter(lr, lr_params, X_train, Y_train)

Y_pred_lr = lr.predict(X_test)
print(f"Accuracy score: {accuracy_score(Y_test, Y_pred_lr)}")
print(f"F1 score: {f1_score(Y_test, Y_pred_lr)}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best accuracy: 0.9051470588235293
Best parameters: {'C': 0.02, 'penalty': 'l2', 'random_state': 200, 'solver': 'newton-cg'}
Average time(s) to fit: 0.003
Average time(s) to score: 0.0
0.893 (+/-0.044) for {'C': 0.01, 'penalty': 'l1', 'random_state': 200, 'solver': 'liblinear'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'l1', 'random_state': 200, 'solver': 'newton-cg'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'l1', 'random_state': 200, 'solver': 'lbfgs'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'l1', 'random_state': 200, 'solver': 'sag'}
0.893 (+/-0.044) for {'C': 0.01, 'penalty': 'l1', 'random_state': 200, 'solver': 'saga'}
0.881 (+/-0.182) for {'C': 0.01, 'penalty': 'l2', 'random_state': 200, 'solver': 'liblinear'}
0.893 (+/-0.044) for {'C': 0.01, 'penalty': 'l2', 'random_state': 200, 'solver': 'newton-cg'}
0.893 (+/-0.044) for {'C': 0.01, 'penalty': 'l2', 'random_state': 200, 'solver': 'lbfgs'}
0.893 (+/-0.044) for {'C': 0.01, 

**Decision Tree**

In [21]:
tree_params = {"criterion":['gini', 'entropy']
               ,"max_depth": [*range(5,10)]
               , "min_samples_leaf": [*range(4,10)]
               , "min_samples_split": [*range(3,9)]
               , "random_state" :[200]}

tree = DecisionTreeClassifier()
tree = get_best_parameter(tree, tree_params, X_train, Y_train)

Y_pred_tree = tree.predict(X_test)

print(f"Accuracy score: {accuracy_score(Y_test, Y_pred_tree)}")
print(f"F1 score: {f1_score(Y_test, Y_pred_tree)}")

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best accuracy: 0.8786764705882353
Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 8, 'random_state': 200}
Average time(s) to fit: 0.001
Average time(s) to score: 0.001
0.855 (+/-0.189) for {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 200}
0.855 (+/-0.189) for {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 4, 'random_state': 200}
0.855 (+/-0.189) for {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 5, 'random_state': 200}
0.855 (+/-0.189) for {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 6, 'random_state': 200}
0.855 (+/-0.189) for {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'mi

**Randon Forest**

In [22]:
forest_params = {'n_estimators': [20 , 40, 60, 80, 100]
                 ,'criterion' :['gini', 'entropy']
                 , "max_depth": [None, 1 ,3, 5, 7]
                 , "min_samples_split": [2, 3, 4, 5]
                 , "min_samples_leaf": [*range(3,9)]
                 , "random_state" :[200]
                 ,"n_jobs":[-1]}
forest = RandomForestClassifier(n_jobs = -1)
forest = get_best_parameter(forest, forest_params, X_train, Y_train)

Y_pred_forest = forest.predict(X_test)

print(f"Accuracy score: {accuracy_score(Y_test, Y_pred_forest)}")
print(f"F1 score: {f1_score(Y_test, Y_pred_forest)}")

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
Best accuracy: 0.9169117647058824
Best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 60, 'n_jobs': -1, 'random_state': 200}
Average time(s) to fit: 0.099
Average time(s) to score: 0.016
0.893 (+/-0.044) for {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 20, 'n_jobs': -1, 'random_state': 200}
0.905 (+/-0.056) for {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 40, 'n_jobs': -1, 'random_state': 200}
0.917 (+/-0.056) for {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 60, 'n_jobs': -1, 'random_state': 200}
0.917 (+/-0.056) for {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 80, 'n_jobs': -1, 'random_state': 200}
0.905 (+/-0.056) for {'

**KNN**

In [24]:
knn_params = {"n_neighbors": [1, 2, 3, 4, 5, 6, 7]
             ,"algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
             ,"n_jobs":[-1]}

knn = KNeighborsClassifier(n_jobs = -1)

knn = get_best_parameter(knn, knn_params, X_train, Y_train)

Y_pred_knn = knn.predict(X_test)

print(f"Accuracy score: {accuracy_score(Y_test, Y_pred_knn)}")
print(f"F1 score: {f1_score(Y_test, Y_pred_knn)}")

Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best accuracy: 0.9169117647058824
Best parameters: {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 2}
Average time(s) to fit: 0.001
Average time(s) to score: 0.008
0.882 (+/-0.128) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 1}
0.917 (+/-0.056) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 2}
0.904 (+/-0.060) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 3}
0.905 (+/-0.056) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 4}
0.905 (+/-0.056) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 5}
0.893 (+/-0.044) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 6}
0.893 (+/-0.044) for {'algorithm': 'auto', 'n_jobs': -1, 'n_neighbors': 7}
0.882 (+/-0.128) for {'algorithm': 'ball_tree', 'n_jobs': -1, 'n_neighbors': 1}
0.917 (+/-0.056) for {'algorithm': 'ball_tree', 'n_jobs': -1, 'n_neighbors': 2}
0.904 (+/-0.060) for {'algorithm': 'ball_tree', 'n_jobs': -1, 'n_neighbors': 3}
0.905 

In [25]:
gradient_params = {"learning_rate": [0.05, 0.08, 0.11, 0.14, 0.17, 0.2]
                   ,"n_estimators": [20, 40 ,60 , 80, 100]
                   , "max_depth": [None, 1, 3, 5, 7]
                   , 'eval_metric' : ["logloss"]
                   , 'min_child_weight':[*range(1,6,1)]
                   , "random_state" :[200]}

gradient_tree = XGBClassifier()
gradient_tree = get_best_parameter(gradient_tree , gradient_params, X_train, Y_train)

Y_pred_gradient_tree = gradient_tree.predict(X_test)

print(f"Accuracy score: {accuracy_score(Y_test, Y_pred_gradient_tree)}")
print(f"F1 score: {f1_score(Y_test, Y_pred_gradient_tree)}")

Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Best accuracy: 0.9051470588235293
Best parameters: {'eval_metric': 'logloss', 'learning_rate': 0.05, 'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 40, 'random_state': 200}
Average time(s) to fit: 0.023
Average time(s) to score: 0.002
0.893 (+/-0.049) for {'eval_metric': 'logloss', 'learning_rate': 0.05, 'max_depth': None, 'min_child_weight': 1, 'n_estimators': 20, 'random_state': 200}
0.893 (+/-0.049) for {'eval_metric': 'logloss', 'learning_rate': 0.05, 'max_depth': None, 'min_child_weight': 1, 'n_estimators': 40, 'random_state': 200}
0.893 (+/-0.049) for {'eval_metric': 'logloss', 'learning_rate': 0.05, 'max_depth': None, 'min_child_weight': 1, 'n_estimators': 60, 'random_state': 200}
0.881 (+/-0.075) for {'eval_metric': 'logloss', 'learning_rate': 0.05, 'max_depth': None, 'min_child_weight': 1, 'n_estimators': 80, 'random_state': 200}
0.881 (+/-0.075) for {'eval_metric': 'logloss', 'learning_rate': 0.05, 'max_d

In [26]:
from mlxtend.classifier import StackingClassifier

meta_estimator = XGBClassifier(tol=100, subsample=0.7, n_estimators=100, max_depth=2, max_features="sqrt", learning_rate=0.2)
stacking = StackingClassifier(classifiers=[knn, lr, tree, forest, gradient_tree], meta_classifier=meta_estimator, use_probas=True, average_probas=False)
stacking.fit(X_train, Y_train)
Y_pred_stacking = stacking.predict_proba(X_test)[:, 1]
Y_pred_stacking

Fitting 5 folds for each of 28 candidates, totalling 140 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Parameters: { max_features, tol } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




array([0.02506422, 0.04684221, 0.01312932, 0.81064713, 0.01312932,
       0.09318562, 0.01312932, 0.01312932, 0.02019185, 0.81064713,
       0.7461553 , 0.02019185, 0.01312932, 0.01312932, 0.7461553 ,
       0.01312932, 0.07345118, 0.01312932, 0.01312932, 0.01312932,
       0.01312932, 0.01312932, 0.86896425, 0.03829756, 0.01535344,
       0.07345118, 0.01312932, 0.01312932, 0.01312932], dtype=float32)

In [28]:
#留下我們所需的feature
feature_to_predict =['bonus','to_messages', 'shared_receipt_with_poi'
 , 'from_this_person_to_poi', 'from_poi_to_this_person'
 , 'from_poi_to_this_person', 'from_messages'
 ,'other', 'salary', 'expenses', 'exercised_stock_options'
 ,'restricted_stock', 'total_payments', 'total_stock_value']

test_features = test_features[feature_to_predict]

In [75]:
test_missing

Unnamed: 0,bonus,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,other,restricted_stock,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,5249999.0,953136.0,17355.0,484.0,228.0,108.0,210698.0,157569.0,213999.0,5521.0,7991.0,5501630.0,1110705.0
1,1350000.0,886403.3,65907.0,27.0,140.0,15.0,1621.0,252055.0,278601.0,1593.0,1858.0,2669589.0,252055.0
2,1500000.0,5538001.0,34039.0,32.0,32.0,21.0,11350.0,853064.0,243293.0,1035.0,1045.0,288682.0,6391065.0
3,3000000.0,2291113.0,86174.0,3069.0,66.0,609.0,1661.0,1323148.0,365163.0,2097.0,3093.0,4747979.0,3614261.0
4,1000000.0,1435552.0,30674.0,49.0,58.0,12.0,307895.0,2502063.0,415189.0,1585.0,1892.0,1868758.0,2502063.0
5,1700000.0,875445.3,98849.0,27.0,40.0,1.0,1936.0,441096.0,211788.0,900.0,1320.0,2081796.0,441096.0
6,2000000.0,10433520.0,86987.0,21.0,242.0,6.0,7427621.0,4188667.0,1060932.0,2979.0,3275.0,17252530.0,14622180.0
7,323499.99,895016.8,9081.98,15.18,2.1,0.78,43298.4,32460.0,144852.46,98.68,166.7,254857.2,188115.3
8,800000.0,765920.0,96268.0,22.0,188.0,11.0,891.0,315068.0,278601.0,772.0,865.0,875760.0,1080988.0
9,100000.0,4160672.0,8409.0,56.81,8.39,0.89,202052.0,201483.0,76399.0,271.95,412.17,394475.0,4221891.0


In [66]:
test_data = test_missing

scaler = StandardScaler()
test_data = scaler.fit_transform(test_data)
test_data

array([[ 7.87415183e-02, -1.98927409e-01, -2.10405987e-01,
         4.95926242e-01,  2.95188452e+00,  7.38621270e-01,
        -1.99286863e-01, -1.95488235e-01, -1.88440720e-01,
         4.11983775e+00,  4.35030500e+00, -1.18451956e-01,
        -1.93384526e-01],
       [-1.57000232e-01, -2.00183581e-01, -1.55862142e-01,
        -3.15915943e-01,  1.52948786e+00, -1.51759253e-01,
        -2.27695727e-01, -1.91244632e-01, -1.74195251e-01,
         5.62828449e-01,  3.36369790e-01, -1.72047573e-01,
        -2.04965843e-01],
       [-1.47933239e-01, -1.12622246e-01, -1.91663000e-01,
        -3.07033643e-01, -2.16180783e-01, -9.43153484e-02,
        -2.26373775e-01, -1.64251811e-01, -1.81981062e-01,
         5.75302843e-02, -1.95723692e-01, -2.17105600e-01,
        -1.22163975e-01],
       [-5.72633122e-02, -1.73741428e-01, -1.33093973e-01,
         5.08807514e+00,  3.33381566e-01,  5.53518731e+00,
        -2.27690292e-01, -1.43139160e-01, -1.55107354e-01,
         1.01922679e+00,  1.14465448e

In [67]:
#第一列為False 第二列為True
y_pred_lr = lr.predict_proba(test_data)[:, 1]
y_pred_knn = knn.predict_proba(test_data)[:, 1]
y_pred_tree = tree.predict_proba(test_data)[:, 1]
y_pred_forest = forest.predict_proba(test_data)[:, 1]
y_pred_gradient_tree = gradient_tree.predict_proba(test_data)[:, 1]
y_pred_stacking = stacking.predict_proba(test_data)[:, 1]

In [77]:
name = test_features['name']

In [80]:
for model_name, model in zip(['lr', 'knn', 'tree', 'forest', 'gradient_tree', 'stacking'], [y_pred_lr, y_pred_knn, y_pred_tree, y_pred_forest, y_pred_gradient_tree, y_pred_stacking]):
    sub = pd.DataFrame({'name': name, 'poi': model})
    sub.to_csv(f'submission_{model_name}.csv', index=False)