In [1]:
import os
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn import datasets, metrics, linear_model
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd

In [2]:
dir_data = '../data/ml100marathon'
train_app = os.path.join(dir_data, 'train_data.csv')
test_app = os.path.join(dir_data, 'test_features.csv')

In [3]:
df_train = pd.read_csv(train_app)
#df_train.head()
df_test = pd.read_csv(test_app)
#df_test.head()

In [4]:
df_train['deferred_income']*=-1
df_test['deferred_income']*=-1
df_train['restricted_stock_deferred']*=-1
df_test['restricted_stock_deferred']*=-1
train_Y = df_train['poi']
test_name = df_test['name']
#result_back = df_test[df_test['name']=='TOTAL']
df_train_d = df_train.drop(['director_fees','loan_advances','name', 'email_address','restricted_stock_deferred','poi'] , axis=1)
df_test_d = df_test.copy()
#df_test_d[df_test_d['name'] == 'TOTAL'] = np.nan
df_test_d = df_test_d.drop(['director_fees','loan_advances','name','restricted_stock_deferred','email_address'] , axis=1)
#result_back_d = result_back.drop(['director_fees','loan_advances','name','restricted_stock_deferred','email_address'] , axis=1)

print(df_train_d.values.shape)
print(df_test_d.values.shape)
df = pd.concat([df_train_d,df_test_d])
print(df.values.shape)
train_num = train_Y.shape[0]
#df

(113, 16)
(33, 16)
(146, 16)


In [5]:
def fill_and_scale(in_df,fill_by,scale_by):
    if fill_by == 'mean':
        df1 = in_df.fillna(in_df.mean())
    elif fill_by == 'median':
        df1 = in_df.fillna(in_df.median())
    elif fill_by == 'zero':
        df1 = in_df.fillna(0)
        
    if scale_by == 'std':
        df1 = StandardScaler().fit_transform(df1)
    elif scale_by == 'minmax':
        df1 = MinMaxScaler().fit_transform(df1)
    return df1

def run_and_predict(model,_train_X,_train_Y,_test_X,_test_Y):
    model.fit(_train_X,_train_Y)
    y_test_pred = model.predict(_test_X)
    ascore = accuracy_score(y_test_pred,_test_Y)
    prob_out = model.predict_proba(_test_X)[:,1];
    print("prob_out.shape",prob_out.shape)
    print("_test_Y.shape",_test_Y.shape)
    aucscore = roc_auc_score(_test_Y,prob_out)
    print("accuracy:", ascore)
    print("aucscore:",aucscore)
    return y_test_pred, prob_out, ascore, aucscore
def write_prob (filename, _test_name, _y_prob):
    y_pred_df = pd.DataFrame(data={'name':_test_name.values,'poi':_y_prob})
    y_pred_df.to_csv(filename,index=None)
def clip_outliers (in_df, col, th_low, th_high):
    out_df = in_df.copy()
    out_df[col] = in_df[col].clip(th_low,th_high)
    return out_df
def log1p (in_df, col):
    out_df = in_df.copy()
    out_df[col] = np.log1p(in_df[col])
    return out_df

In [14]:
df1 = fill_and_scale(df,'mean','std')
estimator1 = LogisticRegression()
df2 = fill_and_scale(df,'median','minmax')
estimator2 = GradientBoostingClassifier()
df3 = fill_and_scale(df,'median','minmax')
estimator3 = RandomForestClassifier()

In [18]:
#split train and test 
train_X = df1[:train_num]
test_X = df1[train_num:]
train_X1, test_X1, train_Y1, test_Y1 = train_test_split(train_X,train_Y,test_size=0.25,random_state=48)
print(cross_val_score(estimator1, train_X1, train_Y1, cv=5).mean())
print(cross_val_score(estimator2, train_X1, train_Y1, cv=5).mean())
print(cross_val_score(estimator3, train_X1, train_Y1, cv=5).mean())
print("X_train.shape = ",train_X1.shape)
print("X_test.shape = ",test_X1.shape)
print("y_train.shape = ",train_Y1.shape)
print("y_test.shape = ",test_Y1.shape)

0.869852941176
0.869852941176
0.905882352941
X_train.shape =  (84, 16)
X_test.shape =  (29, 16)
y_train.shape =  (84,)
y_test.shape =  (29,)


In [19]:
prob_out1, pred_out1, accu1, auc1 = run_and_predict(estimator1,train_X1, train_Y1, test_X1, test_Y1)
outcome1 = estimator1.predict_proba(test_X)[:,1]

prob_out2, pred_out2, accu2, auc2 = run_and_predict(estimator2,train_X1, train_Y1, test_X1, test_Y1)
outcome2 = estimator2.predict_proba(test_X)[:,1]

prob_out3, pred_out3, accu3, auc3 = run_and_predict(estimator3,train_X1, train_Y1, test_X1, test_Y1)
outcome3 = estimator3.predict_proba(test_X)[:,1]

prob_out.shape (29,)
_test_Y.shape (29,)
accuracy: 0.896551724138
aucscore: 0.79
prob_out.shape (29,)
_test_Y.shape (29,)
accuracy: 0.896551724138
aucscore: 0.845
prob_out.shape (29,)
_test_Y.shape (29,)
accuracy: 0.896551724138
aucscore: 0.94


In [20]:
prob_out = (prob_out1*0.73+prob_out2*0.83+prob_out3*0.67)/(0.73+0.83+0.67)
aucout = roc_auc_score(test_Y1,prob_out)
print(aucout)

0.735


In [21]:
outcome1

array([ 0.44329017,  0.23369275,  0.18946494,  0.01967682,  0.16357818,
        0.13007691,  0.75320833,  0.11769373,  0.23399485,  0.08702239,
        0.10957935,  0.1039527 ,  0.11612666,  0.11881007,  0.15747197,
        0.09524761,  0.09100527,  0.10516589,  0.09674904,  0.0919729 ,
        1.        ,  0.13322058,  0.1379518 ,  0.14201427,  0.11600742,
        0.10540028,  0.10655495,  0.09493193,  0.11114004,  0.11920474,
        0.07948757,  0.09505753,  0.17205611])

In [22]:
outcome2

array([  8.80192195e-03,   3.77242185e-01,   1.56484148e-02,
         9.15835408e-04,   1.44701428e-02,   7.45328885e-03,
         9.97845348e-01,   4.56050908e-03,   7.90540360e-04,
         9.34935528e-04,   7.92702325e-04,   7.92702325e-04,
         6.83282315e-04,   8.72763241e-04,   1.16797721e-01,
         1.43119386e-03,   6.22731872e-04,   1.20162244e-01,
         1.39515626e-01,   4.26549878e-04,   9.91682883e-01,
         9.65172746e-04,   1.13224515e-03,   8.99875607e-04,
         1.13224515e-03,   1.47382447e-03,   6.83282315e-04,
         4.94877423e-04,   1.40469476e-03,   3.91266798e-03,
         4.94877423e-04,   7.18161392e-04,   3.60596486e-01])

In [12]:
outcome3

array([ 0.3,  0.5,  0.6,  0.1,  0.1,  0.4,  0.6,  0.1,  0. ,  0.2,  0. ,
        0. ,  0. ,  0. ,  0.5,  0. ,  0.3,  0.3,  0.5,  0. ,  1. ,  0.2,
        0. ,  0.1,  0.2,  0.3,  0.1,  0.1,  0.2,  0.2,  0. ,  0.1,  0.1])

In [27]:
outcome = (outcome1*0.73+outcome2*0.83+outcome3*0.67)/(0.73+0.83+0.67)
#print(roc_auc_score(test_Y1,outcome))

In [28]:
print(outcome)
print(outcome.shape)
write_prob('02_18_5.csv',test_name,outcome)

[ 0.17843382  0.27699853  0.15798098  0.06687185  0.0889786   0.04535533
  0.73814068  0.04022495  0.07689345  0.05887997  0.096256    0.0343244
  0.03826887  0.03921782  0.18515545  0.0317124   0.03002274  0.16928509
  0.20377792  0.03026648  0.87672502  0.04396956  0.04558053  0.07686875
  0.03839694  0.03505179  0.03513553  0.03126056  0.03690499  0.10056815
  0.02620479  0.06142963  0.31071571]
(33,)


In [None]:
# df = log1p(df,'bonus')
# df = log1p(df,'deferral_payments')
# df = log1p(df,'deferred_income')
# df = log1p(df,'exercised_stock_options')
# df = log1p(df,'expenses')
# df = log1p(df,'long_term_incentive')
# df = log1p(df,'restricted_stock')
# df = log1p(df,'salary')
# df = log1p(df,'shared_receipt_with_poi')
# df = log1p(df,'to_messages')
# df = log1p(df,'total_payments')
# df = log1p(df,'total_stock_value')

In [None]:
# all_corr = df.corr()
# all_corr

In [None]:
# fig_num = 16
# plt.figure(figsize=(20,fig_num*3))
# plt.subplots_adjust(hspace=0.5)
# index = 1
# for col in df:
#     plt.subplot(fig_num,1,index)
#     plt.title("distribution of "+col)
#     sns.distplot(df[col].dropna())
#     index += 1
# plt.show()

In [None]:

# plt.figure(figsize=(15,15))
# heatmap = sns.heatmap(all_corr, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)

# plt.show()

In [None]:
# #import seaborn as sns
# fig_num = 8*15 
# plt.figure(figsize=(20,fig_num*3))
# plt.subplots_adjust(hspace=0.5)
# index = 1
# for i in range(len(list(df))):
#     for j in range(len(list(df))):
#         if j > i:
#             plt.subplot(fig_num,1,index)
#             plt.title(list(df)[i]+' vs '+list(df)[j]+' corr = '+str(all_corr.iloc[i,j]))
#             #sns.kdeplot(df.iloc[:,i],df.iloc[:,j])
#             plt.plot(df.iloc[:,i],df.iloc[:,j],'.')
#             index += 1
# plt.show()

In [None]:
# df1 = df.fillna(df.mean())
# df_temp = MinMaxScaler().fit_transform(df1)
# train_X = df_temp[:train_num]
# test_X = df_temp[train_num:]
# estimator = GradientBoostingClassifier()
# print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
# estimator.fit(train_X,train_Y)
# y_hat = estimator.predict(test_X)
# #print(test_X)
# print(y_hat)
# y_prob2 = estimator.predict_proba(test_X)[:,1]
# print(y_prob2)

In [None]:
# df1 = df.fillna(df.mean())
# df_temp = StandardScaler().fit_transform(df1)
# train_X = df_temp[:train_num]
# test_X = df_temp[train_num:]
# estimator = RandomForestClassifier()
# print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
# estimator.fit(train_X,train_Y)
# y_hat = estimator.predict(test_X)
# #print(test_X)
# print(y_hat)
# y_prob3 = estimator.predict_proba(test_X)[:,1]
# print(y_prob3)

# vocabulary
* repayment: 貸款的還款
* loan advances: 貸款預付款
* promissory note: 本票
* severance: 遣散