In [1]:
import os
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn import datasets, metrics, linear_model
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd

In [2]:
dir_data = '../data/ml100marathon'
train_app = os.path.join(dir_data, 'train_data.csv')
test_app = os.path.join(dir_data, 'test_features.csv')

In [3]:
df_train = pd.read_csv(train_app)
#df_train.head()
df_test = pd.read_csv(test_app)
#df_test.head()

In [4]:
df_train['deferred_income']*=-1
df_test['deferred_income']*=-1
df_train['restricted_stock_deferred']*=-1
df_test['restricted_stock_deferred']*=-1
train_Y = df_train['poi']
test_name = df_test['name']
#result_back = df_test[df_test['name']=='TOTAL']
df_train_d = df_train.drop(['director_fees','loan_advances','name', 'email_address','restricted_stock_deferred','poi'] , axis=1)
df_test_d = df_test.copy()
#df_test_d[df_test_d['name'] == 'TOTAL'] = np.nan
df_test_d = df_test_d.drop(['director_fees','loan_advances','name','restricted_stock_deferred','email_address'] , axis=1)
#result_back_d = result_back.drop(['director_fees','loan_advances','name','restricted_stock_deferred','email_address'] , axis=1)

print(df_train_d.values.shape)
print(df_test_d.values.shape)
df = pd.concat([df_train_d,df_test_d])
print(df.values.shape)
train_num = train_Y.shape[0]
#df

(113, 16)
(33, 16)
(146, 16)


In [5]:
def fill_and_scale(in_df,fill_by,scale_by):
    if fill_by == 'mean':
        df1 = in_df.fillna(in_df.mean())
    elif fill_by == 'median':
        df1 = in_df.fillna(in_df.median())
    elif fill_by == 'zero':
        df1 = in_df.fillna(0)
        
    if scale_by == 'std':
        df1 = StandardScaler().fit_transform(df1)
    elif scale_by == 'minmax':
        df1 = MinMaxScaler().fit_transform(df1)
    return df1

def run_and_predict(model,_train_X,_train_Y,_test_X,_test_Y):
    model.fit(_train_X,_train_Y)
    y_test_pred = model.predict(_test_X)
    ascore = accuracy_score(y_test_pred,_test_Y)
    prob_out = model.predict_proba(_test_X)[:,1];
    print("prob_out.shape",prob_out.shape)
    print("_test_Y.shape",_test_Y.shape)
    aucscore = roc_auc_score(_test_Y,prob_out)
    print("accuracy:", ascore)
    print("aucscore:",aucscore)
    return y_test_pred, prob_out, ascore, aucscore
def write_prob (filename, _test_name, _y_prob):
    y_pred_df = pd.DataFrame(data={'name':_test_name.values,'poi':_y_prob})
    y_pred_df.to_csv(filename,index=None)
def clip_outliers (in_df, col, th_low, th_high):
    out_df = in_df.copy()
    out_df[col] = in_df[col].clip(th_low,th_high)
    return out_df
def log1p (in_df, col):
    out_df = in_df.copy()
    out_df[col] = np.log1p(in_df[col])
    return out_df

In [6]:
df1 = fill_and_scale(df,'mean','std')
estimator1 = LogisticRegression()
df2 = fill_and_scale(df,'median','minmax')
estimator2 = GradientBoostingClassifier()
df3 = fill_and_scale(df,'median','minmax')
estimator3 = RandomForestClassifier()

In [7]:
#split train and test 
train_X = df1[:train_num]
test_X = df1[train_num:]
train_X1, test_X1, train_Y1, test_Y1 = train_test_split(train_X,train_Y,test_size=0.25,random_state=118)
print(cross_val_score(estimator1, train_X1, train_Y1, cv=5).mean())
print(cross_val_score(estimator2, train_X1, train_Y1, cv=5).mean())
print(cross_val_score(estimator3, train_X1, train_Y1, cv=5).mean())
print("X_train.shape = ",train_X1.shape)
print("X_test.shape = ",test_X1.shape)
print("y_train.shape = ",train_Y1.shape)
print("y_test.shape = ",test_Y1.shape)

0.882107843137
0.883496732026
0.918137254902
X_train.shape =  (84, 16)
X_test.shape =  (29, 16)
y_train.shape =  (84,)
y_test.shape =  (29,)


In [8]:
prob_out1, pred_out1, accu1, auc1 = run_and_predict(estimator1,train_X1, train_Y1, test_X1, test_Y1)
outcome1 = estimator1.predict_proba(test_X)[:,1]

prob_out2, pred_out2, accu2, auc2 = run_and_predict(estimator2,train_X1, train_Y1, test_X1, test_Y1)
outcome2 = estimator2.predict_proba(test_X)[:,1]

prob_out3, pred_out3, accu3, auc3 = run_and_predict(estimator3,train_X1, train_Y1, test_X1, test_Y1)
outcome3 = estimator3.predict_proba(test_X)[:,1]

prob_out.shape (29,)
_test_Y.shape (29,)
accuracy: 0.931034482759
aucscore: 0.388888888889
prob_out.shape (29,)
_test_Y.shape (29,)
accuracy: 0.689655172414
aucscore: 0.277777777778
prob_out.shape (29,)
_test_Y.shape (29,)
accuracy: 0.862068965517
aucscore: 0.592592592593


In [9]:
prob_out = (prob_out1*0.73+prob_out2*0.83+prob_out3*0.67)/(0.73+0.83+0.67)
aucout = roc_auc_score(test_Y1,prob_out)
print(aucout)

0.37037037037


In [10]:
outcome1

array([ 0.32000286,  0.1476849 ,  0.28623662,  0.12338059,  0.19930283,
        0.16333632,  0.58272571,  0.13370174,  0.08529486,  0.10853468,
        0.13564484,  0.15036349,  0.13871193,  0.13394383,  0.19922134,
        0.13895956,  0.10640126,  0.11315506,  0.10746148,  0.09938178,
        1.        ,  0.19447384,  0.16479864,  0.16284755,  0.1310931 ,
        0.13915429,  0.17311455,  0.10497622,  0.17275299,  0.20414885,
        0.12172335,  0.11727173,  0.22661347])

In [11]:
outcome2

array([  9.64717211e-01,   9.85930026e-01,   9.78442710e-01,
         7.83316319e-04,   1.12513862e-03,   7.87616114e-03,
         9.99160993e-01,   5.76371544e-03,   9.52699162e-04,
         6.83939822e-03,   1.12513862e-03,   4.69398252e-04,
         2.35545354e-03,   4.78339988e-03,   9.87652205e-01,
         4.69398252e-04,   2.22662381e-03,   1.03671952e-02,
         8.79174423e-01,   6.45313531e-04,   9.95162244e-01,
         1.12513862e-03,   7.87948391e-04,   1.04842388e-03,
         1.04754793e-03,   3.25308019e-03,   7.70080887e-04,
         9.27154457e-04,   3.53682767e-03,   1.83888653e-03,
         7.88607438e-04,   3.89139538e-03,   1.13295893e-02])

In [None]:
outcome3

array([ 0.3,  0.5,  0.6,  0.1,  0.1,  0.4,  0.6,  0.1,  0. ,  0.2,  0. ,
        0. ,  0. ,  0. ,  0.5,  0. ,  0.3,  0.3,  0.5,  0. ,  1. ,  0.2,
        0. ,  0.1,  0.2,  0.3,  0.1,  0.1,  0.2,  0.2,  0. ,  0.1,  0.1])

In [None]:
outcome = (outcome1*0.72875+outcome2*0.83571)/(0.72875+0.83571)
pirnt(roc_auc_score(_test_Y,outocme))

In [None]:
print(outcome)
print(outcome.shape)
write_prob('02_18_5.csv',test_name,outcome)

In [None]:
# df = log1p(df,'bonus')
# df = log1p(df,'deferral_payments')
# df = log1p(df,'deferred_income')
# df = log1p(df,'exercised_stock_options')
# df = log1p(df,'expenses')
# df = log1p(df,'long_term_incentive')
# df = log1p(df,'restricted_stock')
# df = log1p(df,'salary')
# df = log1p(df,'shared_receipt_with_poi')
# df = log1p(df,'to_messages')
# df = log1p(df,'total_payments')
# df = log1p(df,'total_stock_value')

In [None]:
# all_corr = df.corr()
# all_corr

In [None]:
# fig_num = 16
# plt.figure(figsize=(20,fig_num*3))
# plt.subplots_adjust(hspace=0.5)
# index = 1
# for col in df:
#     plt.subplot(fig_num,1,index)
#     plt.title("distribution of "+col)
#     sns.distplot(df[col].dropna())
#     index += 1
# plt.show()

In [None]:

# plt.figure(figsize=(15,15))
# heatmap = sns.heatmap(all_corr, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)

# plt.show()

In [None]:
# #import seaborn as sns
# fig_num = 8*15 
# plt.figure(figsize=(20,fig_num*3))
# plt.subplots_adjust(hspace=0.5)
# index = 1
# for i in range(len(list(df))):
#     for j in range(len(list(df))):
#         if j > i:
#             plt.subplot(fig_num,1,index)
#             plt.title(list(df)[i]+' vs '+list(df)[j]+' corr = '+str(all_corr.iloc[i,j]))
#             #sns.kdeplot(df.iloc[:,i],df.iloc[:,j])
#             plt.plot(df.iloc[:,i],df.iloc[:,j],'.')
#             index += 1
# plt.show()

In [None]:
# df1 = df.fillna(df.mean())
# df_temp = MinMaxScaler().fit_transform(df1)
# train_X = df_temp[:train_num]
# test_X = df_temp[train_num:]
# estimator = GradientBoostingClassifier()
# print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
# estimator.fit(train_X,train_Y)
# y_hat = estimator.predict(test_X)
# #print(test_X)
# print(y_hat)
# y_prob2 = estimator.predict_proba(test_X)[:,1]
# print(y_prob2)

In [None]:
# df1 = df.fillna(df.mean())
# df_temp = StandardScaler().fit_transform(df1)
# train_X = df_temp[:train_num]
# test_X = df_temp[train_num:]
# estimator = RandomForestClassifier()
# print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
# estimator.fit(train_X,train_Y)
# y_hat = estimator.predict(test_X)
# #print(test_X)
# print(y_hat)
# y_prob3 = estimator.predict_proba(test_X)[:,1]
# print(y_prob3)

# vocabulary
* repayment: 貸款的還款
* loan advances: 貸款預付款
* promissory note: 本票
* severance: 遣散