In [1]:
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

from sklearn.decomposition import PCA

# Read-in data

In [2]:
data_path = "data/"
raw_data = pd.read_csv(data_path+"PreprocessedData.csv")
raw_data.head()

Unnamed: 0,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,from_this_person_to_poi,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages,poi
0,604575.5,94299.0,68924.45,29.0,365788.0,38986.056989,1061827.0,585062.0,38.0,1.0,585062.0,600000.0,-193872.090134,384015.3,702.0,1740.0,-281517.3,664928.9,807.0,0
1,1586055.0,11200.0,6680544.0,205.905383,267102.0,-145258.783504,5634343.0,3942714.0,48.4706,27.97807,10623258.0,1200000.0,-940898.965987,-1667853.0,1102.11315,2660303.0,-1386055.0,1295738.0,1862.395226,0
2,709021.0,78552.0,4890344.0,353.540028,170941.0,-31622.031205,211725.0,1788391.0,21.318312,35.511657,6678735.0,350000.0,-652381.839975,-3087491.0,769.865004,12961.0,-400729.0,299473.8,1658.326109,0
3,652052.3,55029.796303,651850.0,12.0,254564.804739,72785.472422,2620577.0,386335.0,10.0,0.0,1038185.0,1025666.0,-109153.579009,1389209.0,58.0,323481.4,-604874.7,760233.3,764.0,0
4,1617011.0,34039.0,5538001.0,32.0,243293.0,148341.996637,288682.0,853064.0,32.0,21.0,6391065.0,1500000.0,-946394.118559,-6507245.0,1035.0,11350.0,-3117011.0,2065934.0,1045.0,1


In [3]:
features = raw_data.iloc[:,:-1]
targets  = raw_data.iloc[:,-1:]

In [4]:
features.head()

Unnamed: 0,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,from_this_person_to_poi,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages
0,604575.5,94299.0,68924.45,29.0,365788.0,38986.056989,1061827.0,585062.0,38.0,1.0,585062.0,600000.0,-193872.090134,384015.3,702.0,1740.0,-281517.3,664928.9,807.0
1,1586055.0,11200.0,6680544.0,205.905383,267102.0,-145258.783504,5634343.0,3942714.0,48.4706,27.97807,10623258.0,1200000.0,-940898.965987,-1667853.0,1102.11315,2660303.0,-1386055.0,1295738.0,1862.395226
2,709021.0,78552.0,4890344.0,353.540028,170941.0,-31622.031205,211725.0,1788391.0,21.318312,35.511657,6678735.0,350000.0,-652381.839975,-3087491.0,769.865004,12961.0,-400729.0,299473.8,1658.326109
3,652052.3,55029.796303,651850.0,12.0,254564.804739,72785.472422,2620577.0,386335.0,10.0,0.0,1038185.0,1025666.0,-109153.579009,1389209.0,58.0,323481.4,-604874.7,760233.3,764.0
4,1617011.0,34039.0,5538001.0,32.0,243293.0,148341.996637,288682.0,853064.0,32.0,21.0,6391065.0,1500000.0,-946394.118559,-6507245.0,1035.0,11350.0,-3117011.0,2065934.0,1045.0


In [5]:
targets.head()

Unnamed: 0,poi
0,0
1,0
2,0
3,0
4,1


In [6]:
# Flatten targets data
targets = np.ravel(targets)

# ML

In [9]:
# Scorer for GridSearchCV
scorer = make_scorer(accuracy_score) 

### RandomForrest

In [12]:
hyper_parameters_RF = {"n_estimators": [100,125,150,200,220,250,270,275],
                       "min_samples_split": [3,4,5]
                      }
#260,2
# Create GridSearchCV object
grid_RF = GridSearchCV(RandomForestClassifier(),cv=10, param_grid=hyper_parameters_RF, scoring=scorer)
fit_grid_RF = grid_RF.fit(features,targets) # fit grid to our training data
clf_RF = fit_grid_RF.best_estimator_ # save the best performing classifier to variable

print("Best Hyper-parameters: \n", grid_RF.best_params_)
print("\n--------------------------------------------------------------------------------------------------\n")
print("Model after hyper-parameter tuning: \n",grid_RF.best_estimator_)

Best Hyper-parameters: 
 {'min_samples_split': 4, 'n_estimators': 275}

--------------------------------------------------------------------------------------------------

Model after hyper-parameter tuning: 
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=275,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [13]:
# Cross-Validate the best classifier
scores_RF = cross_val_score(clf_RF, features, targets, cv=10)

print("\nAccuracy: ",scores_RF.mean())


Accuracy:  0.8766666666666667


### AdaBoost

In [14]:
# Set hyperparameters to test
hyper_parameters_AB = {"n_estimators": [25,50,100,125,150],
                       "base_estimator": [DecisionTreeClassifier(max_depth=2),DecisionTreeClassifier(max_depth=3),RandomForestClassifier(max_depth=2),RandomForestClassifier(max_depth=4)]
                      }
# Create GridSearchCV object
grid_AB = GridSearchCV(AdaBoostClassifier(), param_grid=hyper_parameters_AB, scoring=scorer)
fit_grid_AB = grid_AB.fit(features,targets) # fit grid to our training data
clf_AB = fit_grid_AB.best_estimator_ # save the best performing classifier to variable

print("Best Hyper-parameters: \n", grid_AB.best_params_)
print("\n--------------------------------------------------------------------------------------------------\n")
print("Model after hyper-parameter tuning: \n",grid_AB.best_estimator_)

Best Hyper-parameters: 
 {'base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), 'n_estimators': 125}

--------------------------------------------------------------------------------------------------

Model after hyper-parameter tuning: 
 AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                      

In [15]:
# Cross-Validate the best classifier
scores_AB = cross_val_score(clf_AB, features, targets, cv=10)

print("\nAccuracy: ",scores_AB.mean())


Accuracy:  0.8833333333333334


## PCA

Check scatter plots of features against features to see if they can be linearly combined by PCA.

We'll also look at the targets as two different colors and see if we can find a pattern

In [None]:
def PlotFeaturesFeatures(data_to_plot):
    fig = plt.figure(figsize=(25,300))
    
    features_to_plot = data_to_plot.iloc[:,:-1]
    targets_to_plot = data_to_plot.iloc[:,-1:]
    
    num_features = features_to_plot.shape[1]
    
    # Calculate combination. n = num_features : p = 2, since we want to plot 2 features against each other
    combinations_to_plot = list(combinations(features_to_plot, 2))
    # num_plots tells us how many plots to make
    num_plots = len(combinations_to_plot)

    # will have 3 cols, so to calculate number of rows we need:
    num_rows = np.ceil( num_plots/3. ) # Will round up
    
    # Palette for each target (0,1)
    colors = ['#fc8d59','#91bfdb'] # These are two distinct and color-blind friendly colors
    palette = sns.color_palette(colors)

    for i in range(num_plots):
        axis = fig.add_subplot(num_rows, 3, i+1)
        plot = sns.relplot(x=combinations_to_plot[i][0], y=combinations_to_plot[i][1], hue="poi", palette=palette,data=data_to_plot, ax=axis)
        plt.close(plot.fig) # close figure that seaborn creates when sns.relplot is called 
    plt.show()

In [None]:
PlotFeaturesFeatures(imputed_df)

It looks like some of the features are linearly related, like salary and total_payments (makes sense), so let's try to use PCA

In [34]:
pca = PCA(n_components=2)
pca_fit = pca.fit(features)
pca_features = pca_fit.transform(features)
# pca_test = pca_fit.transform(features_test)

In [35]:
# Set hyperparameters to test
hyper_parameters_pca_AB = {"n_estimators": [25,50,100,125,150],
                       "base_estimator": [DecisionTreeClassifier(max_depth=2),DecisionTreeClassifier(max_depth=3),RandomForestClassifier(max_depth=2),RandomForestClassifier(max_depth=4)]
                      }
# Create GridSearchCV object
grid_pca_AB = GridSearchCV(AdaBoostClassifier(), param_grid=hyper_parameters_pca_AB, scoring=scorer)
fit_grid_pca_AB = grid_pca_AB.fit(pca_features,targets) # fit grid to our training data
clf_pca_AB = fit_grid_pca_AB.best_estimator_ # save the best performing classifier to variable

print("Best Hyper-parameters: \n", grid_pca_AB.best_params_)
print("\n--------------------------------------------------------------------------------------------------\n")
print("Model after hyper-parameter tuning: \n",grid_pca_AB.best_estimator_)

Best Hyper-parameters: 
 {'base_estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'n_estimators': 50}

--------------------------------------------------------------------------------------------------

Model after hyper-parameter tuning: 
 AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=RandomForestClassifier(bootstrap=True,
                                                         ccp_alpha=0.0,
                                                         class_weig

In [36]:
# Cross-Validate the best classifier
scores_pca_AB = cross_val_score(clf_pca_AB, pca_features, targets, cv=10)

print("\nAccuracy: ",scores_pca_AB.mean())


Accuracy:  0.8761904761904761
