In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import warnings
warnings.simplefilter('ignore')

import time
from pprint import pprint

In [2]:
# Read the labelled training rows and the unlabelled test rows.
train = pd.read_csv('./data/train_data.csv')
test = pd.read_csv('./data/test_features.csv')
print('train shape: {}'.format(train.shape))
print('test shape: {}'.format(test.shape))

# Pull the target out of train and the row identifier out of test, then strip
# the remaining identifier columns from both frames.
poi = train.pop('poi')
train = train.drop(columns=['name', 'email_address'])
name = test.pop('name')
test = test.drop(columns=['email_address'])

# Stack train on top of test so later preprocessing touches both at once.
df = pd.concat((train, test), axis=0)

train shape: (113, 22)
test shape: (33, 21)


In [3]:
# Print the dtype and non-null count of every remaining training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 19 columns):
bonus                        61 non-null float64
deferral_payments            28 non-null float64
deferred_income              34 non-null float64
director_fees                13 non-null float64
exercised_stock_options      81 non-null float64
expenses                     73 non-null float64
from_messages                65 non-null float64
from_poi_to_this_person      65 non-null float64
from_this_person_to_poi      65 non-null float64
loan_advances                2 non-null float64
long_term_incentive          49 non-null float64
other                        69 non-null float64
restricted_stock             82 non-null float64
restricted_stock_deferred    10 non-null float64
salary                       73 non-null float64
shared_receipt_with_poi      65 non-null float64
to_messages                  65 non-null float64
total_payments               96 non-null float64
total_stock_va

In [4]:
# E-mail activity columns (email_address itself is ignored).
email_features = set((
    'to_messages',
    'from_poi_to_this_person',
    'from_messages',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
))
# Every other training column is treated as a financial feature.
financial_features = set(train.columns).difference(email_features)

In [5]:
# Impute missing values column by column on the combined train+test frame:
# e-mail columns get 0 in place of NaN, financial columns get the column median
# (computed over all rows of df).
#
# Each result is assigned back to df[col]. Calling fillna(..., inplace=True) on
# the df[col] selection is chained assignment: pandas deprecates it, and under
# Copy-on-Write it only changes a temporary object, leaving the NaNs in df.
for col in train.columns:
    if col in email_features:
        df[col] = np.nan_to_num(df[col])
    elif col in financial_features:
        df[col] = df[col].fillna(df[col].median())

### Supervised transformation based on gradient boosted trees

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics

# df is train stacked on top of test, so the first len(train) rows are the
# labelled rows and the remainder are the rows to predict for the submission.
data, target = df.iloc[:len(train)], poi
test = df.iloc[len(train):]

# Fixed seeds keep both stratified splits, and every score derived from them,
# repeatable across kernel restarts.
# First hold out 25% of the labelled rows, then split the rest 75/25 again.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.25, stratify=target, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, stratify=y_train, random_state=42)

  from numpy.core.umath_tests import inner1d


In [7]:
# Supervised transformation based on gradient boosted trees:
#   1. fit the boosted trees on the training split,
#   2. one-hot encode the leaf index each training row reaches in every tree,
#   3. fit a logistic regression on the encoded leaves of the validation split.
grd = GradientBoostingClassifier(n_estimators=300).fit(x_train, y_train)
grd_enc = OneHotEncoder().fit(grd.apply(x_train)[:, :, 0])
grd_log = LogisticRegression(solver='lbfgs', max_iter=10000)

# Kept as the final expression so the cell still displays the fitted model.
grd_log.fit(grd_enc.transform(grd.apply(x_val)[:, :, 0]), y_val)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# One-hot encode the leaf indices of the hold-out rows once and reuse them for
# both the probability and the hard-label predictions.
x_test_leaves = grd_enc.transform(grd.apply(x_test)[:, :, 0])
y_pred_prob = grd_log.predict_proba(x_test_leaves)[:, 1]
y_pred = grd_log.predict(x_test_leaves)

# NOTE(review): grd was fitted on x_train/y_train, so the encoded rows used for
# this cross-validation come from a model that already saw their labels. The
# resulting "val_score" is presumably optimistic — confirm this is intended.
cv_scores = cross_val_score(
    grd_log, grd_enc.transform(grd.apply(x_train)[:, :, 0]), y_train, cv=10)
print('val_score: {}'.format(np.mean(cv_scores)))
print('\naccuracy: {}'.format(metrics.accuracy_score(y_test, y_pred)))

val_score: 1.0

accuracy: 0.7931034482758621


### Parameter tuning

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

def tuning_parameters(alg, params, data, target, cv=10):
    """Run an exhaustive grid search and print a short summary of the result.

    Parameters
    ----------
    alg : estimator
        Estimator or pipeline handed to ``GridSearchCV``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    data, target : array-like
        Features and labels the search is fitted on.
    cv : int, default 10
        Value forwarded as ``GridSearchCV``'s ``cv`` argument.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can read ``best_params_`` /
        ``best_estimator_`` directly instead of copying values from the
        printed output. If the call is the last expression of a notebook cell
        the object's repr is displayed; end the line with ``;`` to hide it.
    """
    # error_score=0 is passed through unchanged from the original call; all
    # available cores are used via n_jobs=-1.
    grid_search = GridSearchCV(alg, params, error_score=0, cv=cv, n_jobs=-1)
    grid_search.fit(data, target)

    print(f'Best Accuracy: {grid_search.best_score_:.4f}')
    print(f'\nBest Parameters: {grid_search.best_params_}')
    print(f"\nAverage Time to fit: {grid_search.cv_results_['mean_fit_time'].mean():.4f}")
    print(f"\nAverage Time to predict: {grid_search.cv_results_['mean_score_time'].mean():.4f}")

    return grid_search

In [30]:
# Single-step pipeline; the step name 'gbc' is the prefix of the grid keys below.
gbc = GradientBoostingClassifier()
model = Pipeline(steps=[('gbc', gbc)])

# Candidate values for each GradientBoostingClassifier hyper-parameter.
params = dict(
    gbc__n_estimators=[100, 200, 300],
    gbc__max_depth=[20, 30, 40],
    gbc__min_samples_split=[2, 3, 4],
    gbc__min_samples_leaf=[1, 2, 3],
    gbc__max_features=[5, 7, 9, 11, 13, 15, 19],
    gbc__learning_rate=[0.7, 0.8, 0.9, 1.0],
    gbc__subsample=[0.75, 0.8, 0.85],
)

# Wall-clock the whole search, including the summary it prints.
start_time = time.time()
tuning_parameters(model, params, x_train, y_train)
print(f"Time elapsed = {time.time() - start_time} (sec)")

Best Accuracy: 0.9206

Best Parameters: {'gbc__learning_rate': 0.7, 'gbc__max_depth': 40, 'gbc__max_features': 5, 'gbc__min_samples_leaf': 2, 'gbc__min_samples_split': 3, 'gbc__n_estimators': 300, 'gbc__subsample': 0.75}

Average Time to fit: 0.1091

Average Time to predict: 0.0009
Time elapsed = 968.527872800827 (sec)


In [10]:
# Revised GBC using the best parameters reported by the grid search.
# subsample < 1 and max_features below the 19 available columns make fitting
# stochastic, so the seed is fixed to keep the submitted probabilities
# repeatable across runs.
grd_r = GradientBoostingClassifier(learning_rate=0.7,
                                   max_depth=40,
                                   max_features=5,
                                   min_samples_leaf=2,
                                   min_samples_split=3,
                                   n_estimators=300,
                                   subsample=0.75,
                                   random_state=42)
grd_r.fit(x_train, y_train)

# NOTE: this refits the shared grd_enc / grd_log objects in place, so from this
# point on they belong to grd_r and no longer match the earlier grd model.
grd_enc.fit(grd_r.apply(x_train)[:, :, 0])
grd_log.fit(grd_enc.transform(grd_r.apply(x_val)[:, :, 0]), y_val)

# Column 1 of predict_proba (the second class) is the value submitted as 'poi'.
submission = pd.DataFrame(grd_log.predict_proba(grd_enc.transform(grd_r.apply(test)[:, :, 0]))[:, 1])
print(submission.shape)
submission.columns = ['poi']
# Assign positionally: predictions are in test-row order, and .values avoids
# silently producing NaN names if the two indexes ever differ.
submission['name'] = name.values
submission = submission[['name', 'poi']]

(33, 1)


In [11]:
# Write the name/poi columns to submission.csv without the row index.
submission.to_csv('submission.csv', index=False)

In [12]:
# Display the submission frame as this cell's output.
submission

Unnamed: 0,name,poi
0,BELDEN TIMOTHY N,0.026251
1,BOWEN JR RAYMOND M,0.08213
2,HANNON KEVIN P,0.263583
3,DELAINEY DAVID W,0.041818
4,CAUSEY RICHARD A,0.075655
5,HICKERSON GARY J,0.071663
6,FREVERT MARK A,0.056452
7,CHAN RONNIE,0.046663
8,DONAHUE JR JEFFREY M,0.188594
9,REYNOLDS LAWRENCE,0.074992
