# Enron Person of Interest Classification

In [1]:
# Read and clean the data, add new features, split into testing/training subsets

# Import libraries
import pickle
from tester import dump_classifier_and_data
from dos_to_unix import d2ux
import pandas as pd
import numpy as np
from math import inf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, make_scorer, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 50

# Remove obvious outliers (These were discovered by reviewing enron61702insiderpay.pdf)
def remove_outliers(data):
    data.pop('TOTAL', 0 ) # Remove TOTAL 
    data.pop('THE TRAVEL AGENCY IN THE PARK', 0) # Remove THE TRAVEL AGENCY IN THE PARK due to lack of relevance
    print('Removed the following outliers:')
    print('TOTAL')
    print('THE TRAVEL AGENCY IN THE PARK')
    
# Read data to dataframe
def to_dataframe(data):
    """
    Create pandas dataframe from dictionary of enron data
    """
    dataframe = pd.DataFrame(data).T
    dataframe.index = dataframe.index.rename('name')
    return dataframe

def new_columns(dataframe):
    """
    Create the following new columns:
    
    total_restricted_stock_diff: total stock value - restricted stock
    to_poi_ratio - ratio of emails to poi
    from_poi_ratio - ratio of emails from poi
    payments_to_stock_ratio - ratio of total payments to total stock value
    """
    dataframe['total_restricted_stock_diff'] = dataframe.total_stock_value - dataframe.restricted_stock 
    dataframe['to_poi_ratio'] = dataframe.from_this_person_to_poi/ dataframe.to_messages
    dataframe['from_poi_ratio'] = dataframe.from_poi_to_this_person / dataframe.from_messages
    dataframe['payments_to_stock_ratio'] = dataframe.total_payments / dataframe.total_stock_value
    # fill inf or NaN ratios in new columns with 0.0
    dataframe.loc[dataframe.to_poi_ratio.isna(), 'to_poi_ratio'] = 0.0
    dataframe.loc[dataframe.from_poi_ratio.isna(), 'from_poi_ratio'] = 0.0
    dataframe.loc[(dataframe.payments_to_stock_ratio == inf) | 
                  (dataframe.payments_to_stock_ratio.isna()), 'payments_to_stock_ratio'] = 0.0 
    return dataframe

def clean_data(dataframe):
    """
    Add new columns, convert "poi" from boolean to binary, convert pandas objects to floats (fill NaN with 0.0), 
    split X and y data
    """
    nan_dict = {} # Create dictionary to store total NaN values by column label
    for col in dataframe.columns:
        # Adjust string "NaN" values to the numpy representation, count total nan values for each label and save to nan_dict
        dataframe.loc[dataframe[col] == 'NaN', col] = np.nan
        nan_dict.update({col : dataframe[col].isnull().sum()})
        if col == 'email_address':
            pass
        elif col == 'poi':
            dataframe[col] = dataframe[col].astype(int) # Convert poi to binary
        else:
            dataframe[col] = dataframe[col].astype(float) # All other numeric values as floats
            dataframe[col] = dataframe[col].fillna(0.0) # Data imputation (fill missing data)   
    # Sort nan dict, and print
    nan_dict = dict(sorted(nan_dict.items(), key = lambda item: item[1]))
    print('\nTOTAL NAN VALUES BY LABEL:\n')
    for k in nan_dict.keys():
        print(k, ':', nan_dict[k])
    dataframe = new_columns(dataframe) # Add new columns
    # Create separate variables for X and y
    print('\nDataset after cleaning data and adding new features:\n')
    dataframe.info()
    X_features = [feature for feature in dataframe.columns if feature not in ['poi', 'email_address']]
    X_data = dataframe[X_features]
    y_data = dataframe.poi
    return X_data, y_data

# Convert dos linefeed to unix, return new .pkl file name ("_unix" appended)
dataset_file = d2ux("final_project_dataset.pkl") 

# Load the dictionary containing the dataset
with open(dataset_file, "rb") as data_file:
    data_dict = pickle.load(data_file)

# Remove outliers
remove_outliers(data_dict)

# Read data to dataframe for easier data manipulation, and clean data
df = to_dataframe(data_dict)
# Print the total number of people in the dataset after removing outliers
print(f'\nThe total number of people in the dataset after removing outliers: {len(df)}')
# Print the total number of POIs, and non-POIs
print(f'Total Persons of Interest: {df.poi.sum()}\nTotal who are not Persons of Interest: {len(df) - df.poi.sum()}')
X, y = clean_data(df)

print('\nSample of data after data cleaning, and prior to final feature selection and standardization:')
pd.concat([y, X], axis = 1).head(5)

Removed the following outliers:
TOTAL
THE TRAVEL AGENCY IN THE PARK

The total number of people in the dataset after removing outliers: 144
Total Persons of Interest: 18
Total who are not Persons of Interest: 126

TOTAL NAN VALUES BY LABEL:

poi : 0
total_stock_value : 19
total_payments : 21
email_address : 33
restricted_stock : 35
exercised_stock_options : 43
salary : 50
expenses : 50
other : 53
to_messages : 58
from_poi_to_this_person : 58
from_messages : 58
from_this_person_to_poi : 58
shared_receipt_with_poi : 58
bonus : 63
long_term_incentive : 79
deferred_income : 96
deferral_payments : 106
restricted_stock_deferred : 127
director_fees : 128
loan_advances : 141

Dataset after cleaning data and adding new features:

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, METTS MARK to GLISAN JR BEN F
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   salary                     

Unnamed: 0_level_0,poi,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income,total_stock_value,expenses,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees,total_restricted_stock_diff,to_poi_ratio,from_poi_ratio,payments_to_stock_ratio
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
METTS MARK,0,365788.0,807.0,0.0,1061827.0,0.0,600000.0,0.0,0.0,585062.0,94299.0,38.0,0.0,29.0,1740.0,1.0,0.0,702.0,585062.0,0.0,0.0,0.001239,1.310345,1.814897
BAXTER JOHN C,0,267102.0,0.0,1295738.0,5634343.0,0.0,1200000.0,0.0,-1386055.0,10623258.0,11200.0,0.0,6680544.0,0.0,2660303.0,0.0,1586055.0,0.0,3942714.0,0.0,6680544.0,0.0,0.0,0.530378
ELLIOTT STEVEN,0,170941.0,0.0,0.0,211725.0,0.0,350000.0,0.0,-400729.0,6678735.0,78552.0,0.0,4890344.0,0.0,12961.0,0.0,0.0,0.0,1788391.0,0.0,4890344.0,0.0,0.0,0.031701
CORDES WILLIAM R,0,0.0,764.0,0.0,0.0,0.0,0.0,0.0,0.0,1038185.0,0.0,10.0,651850.0,12.0,0.0,0.0,0.0,58.0,386335.0,0.0,651850.0,0.0,0.833333,0.0
HANNON KEVIN P,1,243293.0,1045.0,0.0,288682.0,0.0,1500000.0,0.0,-3117011.0,6391065.0,34039.0,32.0,5538001.0,32.0,11350.0,21.0,1617011.0,1035.0,853064.0,0.0,5538001.0,0.020096,1.0,0.04517


In [2]:
# Test a variety of classifiers to achieve better than 0.3 precision and recall

# Choose if data is going to be scaled, and if/how many features will be selected
def scale_selectk(X_data, y_data, scale = True, selectk = True, k = 'all'):
    """
    Specifies whether to scale the data, and which best columns to keep
    """
    # Scale data (keep as dataframe) if scale = True
    if scale == True:
        scaler = MinMaxScaler()
        X_data = pd.DataFrame(data = scaler.fit_transform(X_data),
                              columns = X_data.columns,
                              index = X_data.index)
    # Keep selected X features if selectk = True
    if selectk == True:
        best = SelectKBest(f_classif, k = k).fit(X_data, y_data)
        X_mask = best.get_support()
        X_kept_cols = X_data.columns[X_mask].tolist()
        print('The kept features are:', X_kept_cols)
        print('The total number of kept features are:', len(X_kept_cols))
        print('The feature scores are:')
        for c, b in zip(X_data.columns, best.scores_):
            print(c, ':', b)
        X_data = X_data[X_kept_cols]
    return X_data

# Print different performance metrics for a classifier
def metrics(predictions, y_val, estimator_descr = 'estimator'):
    """
    Prints the accuracy, f1_score, recall, and precision for a given prediction
    """
    print(f'\nAccuracy Score for {estimator_descr}: {accuracy_score(predictions, y_val) * 100}%')
    print(f'F1 Score for {estimator_descr}: {f1_score(predictions, y_val) * 100}%')
    print(f'Precision Score for {estimator_descr}: {precision_score(predictions, y_val) * 100}%')
    print(f'Recall Score for {estimator_descr}: {recall_score(predictions, y_val) * 100}%\n')
    
# Define performance reporting function to print the best performing parameters when using GridSearchCV
def best_performance(classifier):
    """
    Reports performance and parameters of the best classifier of a parameter search/CV
    """
    print('Best Parameters: ' + str(classifier.best_params_) + '\n')
    print('Best Score: ' + str(classifier.best_score_))

# Optimize estimator using cross-validation
def EstimatorCV(X_data, y_data, pipeline, params): 
    # define parameters to test using grid search
    estimator = GridSearchCV(pipeline, 
                             param_grid = params, 
                             cv = 5, 
                             scoring = make_scorer(f1_score, pos_label = 1),
                             verbose = True, 
                             n_jobs = -1)
    # find best training parameters
    best_estimator = estimator.fit(X_data, y_data)
    best_performance(best_estimator)
    return best_estimator

In [3]:
# Naive Bayes Classifier

# Scale using Standard Scaler
nb_scaler = StandardScaler()
nb_X = nb_scaler.fit_transform(X)

# Split into training/testing sets (Best performance when there is no scaling/removing features)
X_train, X_test, y_train, y_test = train_test_split(nb_X, y, test_size = 0.2, random_state = 0)

pca = PCA()
nbc = GaussianNB()

nb_pipeline = Pipeline([('pca', pca),
                         ('nbc', nbc)])

# Naive Bayes scores with no parameter optimization, no PCA:
nbc = nbc.fit(X_train, y_train)
nbc_pred = nbc.predict(X_test)
metrics(nbc_pred, y_test, 'Naive Bayes')

# Naive Bayes scores with no parameter optimization, but with PCA
nbc_w_pca = nb_pipeline.fit(X_train, y_train)
nbc_w_pca_pred = nbc_w_pca.predict(X_test)
metrics(nbc_w_pca_pred, y_test, 'Naive Bayes with PCA')

# Naive Bayes cross-validation
nb_param_grid = {'pca__n_components' : [None, 2, 5, 10],
                 'pca__whiten' : [True, False]}

nb_clf = EstimatorCV(X_train, y_train, nb_pipeline, nb_param_grid)
nb_clf_pred = nb_clf.predict(X_test)
metrics(nb_clf_pred, y_test, 'Naive Bayes with optimized PCA parameters')

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, nb_clf_pred),'\n')


Accuracy Score for Naive Bayes: 65.51724137931035%
F1 Score for Naive Bayes: 0.0%
Precision Score for Naive Bayes: 0.0%
Recall Score for Naive Bayes: 0.0%


Accuracy Score for Naive Bayes with PCA: 58.620689655172406%
F1 Score for Naive Bayes with PCA: 0.0%
Precision Score for Naive Bayes with PCA: 0.0%
Recall Score for Naive Bayes with PCA: 0.0%

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'pca__n_components': 5, 'pca__whiten': True}

Best Score: 0.45999999999999996

Accuracy Score for Naive Bayes with optimized PCA parameters: 79.3103448275862%
F1 Score for Naive Bayes with optimized PCA parameters: 25.0%
Precision Score for Naive Bayes with optimized PCA parameters: 25.0%
Recall Score for Naive Bayes with optimized PCA parameters: 25.0%


Confusion Matrix:
[[22  3]
 [ 3  1]] 



In [4]:
# Random Forest Classifier

# Scale = True, select 5 features
rf_X = scale_selectk(X, y, scale = True, selectk = True, k = 5)
# Split into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(rf_X, y, test_size = 0.2, random_state = 0)

pca = PCA()
rfc = RandomForestClassifier(criterion = 'entropy',
                             random_state = 0,
                             max_features = 'auto',
                             warm_start = True)

rf_pipeline = Pipeline([('pca', pca), 
                        ('rfc', rfc)])

# Random Forest scores with no parameter optimization, no PCA
rfc = rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
metrics(rfc_pred, y_test, 'Random Forest')

# Random Forest scores with no parameter optimization, but with PCA
rfc_w_pca = rf_pipeline.fit(X_train, y_train)
rfc_w_pca_pred = rfc_w_pca.predict(X_test)
metrics(rfc_w_pca_pred, y_test, 'Random Forest with PCA')

# Random forest cross-validation
rf_param_grid = {'pca__n_components' : [None, 2, 5],
                 'pca__whiten' : [True, False],
                 'rfc__min_samples_leaf': [1, 2, 4, 6],
                 'rfc__class_weight' : [None, 'balanced']}

rf_clf = EstimatorCV(X_train, y_train, rf_pipeline, rf_param_grid)
rf_clf_pred = rf_clf.predict(X_test)
metrics(rf_clf_pred, y_test, 'Random Forest with PCA and optimized parameters')

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, rf_clf_pred),'\n')

The kept features are: ['salary', 'bonus', 'total_stock_value', 'exercised_stock_options', 'total_restricted_stock_diff']
The total number of kept features are: 5
The feature scores are:
salary : 18.575703268041824
to_messages : 1.6988243485808527
deferral_payments : 0.21705893033950568
total_payments : 8.873835255516232
loan_advances : 7.242730396536017
bonus : 21.06000170753659
restricted_stock_deferred : 0.06498431172370998
deferred_income : 11.595547659731226
total_stock_value : 24.467654047526405
expenses : 6.234201140506746
from_poi_to_this_person : 5.344941523147335
exercised_stock_options : 25.09754152873551
from_messages : 0.1641644982342872
other : 4.246153540676069
from_this_person_to_poi : 2.426508127242881
long_term_incentive : 10.072454529369448
shared_receipt_with_poi : 8.746485532129082
restricted_stock : 9.346700791051369
director_fees : 2.107655943276089
total_restricted_stock_diff : 25.661492064167202
to_poi_ratio : 4.1690838152893965
from_poi_ratio : 5.2096502205818

In [5]:
# SVM Classifier


# Scale = True, select all features
svm_X = scale_selectk(X, y, scale = True, selectk = True, k = 'all')
# Split into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(svm_X, y, test_size = 0.2, random_state = 0)

pca = PCA(whiten = True)
svc = SVC(random_state = 0,
          probability = True,
          decision_function_shape = 'ovo')

svc_pipeline = Pipeline([('pca', pca), 
                         ('svc', svc)])

# SVM scores with no parameter optimization, no PCA
svc = svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)
metrics(svc_pred, y_test, 'SVM')

# SVM scores with no parameter optimization, but with PCA
svc_w_pca = svc_pipeline.fit(X_train, y_train)
svc_w_pca_pred = svc_w_pca.predict(X_test)
metrics(svc_w_pca_pred, y_test, 'SVM with PCA')

# SVM cross-validation
svm_param_grid = {'pca__n_components' : [None, 2, 5],
                  'svc__C' : [0.1, 10, 50, 100],
                  'svc__kernel' : ['linear', 'rbf', 'sigmoid']}

svm_clf = EstimatorCV(X_train, y_train, svc_pipeline, svm_param_grid)
svm_clf_pred = svm_clf.predict(X_test)
metrics(svm_clf_pred, y_test, 'Random Forest with PCA and optimized parameters')

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, svm_clf_pred),'\n')

The kept features are: ['salary', 'to_messages', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'from_poi_to_this_person', 'exercised_stock_options', 'from_messages', 'other', 'from_this_person_to_poi', 'long_term_incentive', 'shared_receipt_with_poi', 'restricted_stock', 'director_fees', 'total_restricted_stock_diff', 'to_poi_ratio', 'from_poi_ratio', 'payments_to_stock_ratio']
The total number of kept features are: 23
The feature scores are:
salary : 18.575703268041824
to_messages : 1.6988243485808527
deferral_payments : 0.21705893033950568
total_payments : 8.873835255516232
loan_advances : 7.242730396536017
bonus : 21.06000170753659
restricted_stock_deferred : 0.06498431172370998
deferred_income : 11.595547659731226
total_stock_value : 24.467654047526405
expenses : 6.234201140506746
from_poi_to_this_person : 5.344941523147335
exercised_stock_options : 25.09754152873551
from_messages : 

In [6]:
# Gradient Boost Classifier


# Scale = True, select 4 features
gb_X = scale_selectk(X, y, scale = True, selectk = True, k = 4)

# Split into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(gb_X, y, test_size = 0.2, random_state = 0)

pca = PCA(n_components = None)
gbc = GradientBoostingClassifier(loss = 'exponential',
                                 random_state = 0, 
                                 max_features = 'auto',
                                 warm_start = True)

gbc_pipeline = Pipeline([('pca', pca), 
                         ('gbc', gbc)])

# Gradient Boost Classifier scores with no parameter optimization, no PCA
gbc = gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
metrics(gbc_pred, y_test, 'Gradient Booster')

# Gradient Boost Classifier scores with no parameter optimization, but with PCA
gbc_w_pca = gbc_pipeline.fit(X_train, y_train)
gbc_w_pca_pred = gbc_w_pca.predict(X_test)
metrics(gbc_w_pca_pred, y_test, 'Gradient Booster with PCA')

# Gradient Boost Classifier cross-validation
gbc_param_grid = {'gbc__n_estimators' : [100, 250, 300, 350, 400, 450],
                  'gbc__subsample' : [0.1, 0.25, 0.3, 0.4, 0.5, 1.0],
                  'gbc__max_depth' : [1, 2, 3]}

gb_clf = EstimatorCV(X_train, y_train, gbc_pipeline, gbc_param_grid)
gb_clf_pred = gb_clf.predict(X_test)
metrics(gb_clf_pred, y_test, 'Random Forest with PCA and optimized parameters')

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, gb_clf_pred))

The kept features are: ['bonus', 'total_stock_value', 'exercised_stock_options', 'total_restricted_stock_diff']
The total number of kept features are: 4
The feature scores are:
salary : 18.575703268041824
to_messages : 1.6988243485808527
deferral_payments : 0.21705893033950568
total_payments : 8.873835255516232
loan_advances : 7.242730396536017
bonus : 21.06000170753659
restricted_stock_deferred : 0.06498431172370998
deferred_income : 11.595547659731226
total_stock_value : 24.467654047526405
expenses : 6.234201140506746
from_poi_to_this_person : 5.344941523147335
exercised_stock_options : 25.09754152873551
from_messages : 0.1641644982342872
other : 4.246153540676069
from_this_person_to_poi : 2.426508127242881
long_term_incentive : 10.072454529369448
shared_receipt_with_poi : 8.746485532129082
restricted_stock : 9.346700791051369
director_fees : 2.107655943276089
total_restricted_stock_diff : 25.661492064167202
to_poi_ratio : 4.1690838152893965
from_poi_ratio : 5.209650220581801
payment

In [7]:
# Dump classifier, dataset, and feature list

# Save best parameters to clf variable for final model
pca = PCA(n_components = None)
gbc = GradientBoostingClassifier(loss = 'exponential',
                                 max_depth = 1,
                                 n_estimators = 250,
                                 random_state = 0, 
                                 max_features = 'auto',
                                 warm_start = True,
                                 subsample = 0.4)
gbc_pipeline = Pipeline([('pca', pca), 
                         ('gbc', gbc)])

clf = gbc_pipeline.fit(X_train, y_train)
# Format dataset and feature_list for final submission
dataset = pd.concat([y.astype(bool), gb_X], axis = 1).to_dict(orient = 'index')
feature_list = ['poi'] + gb_X.columns.tolist()

# Dump to pickles
dump_classifier_and_data(clf, dataset, feature_list)

In [8]:
# Final test of model (Goal is at least 0.3 for recall and accuracy scores)

from tester import test_classifier
test_classifier(clf, dataset, feature_list, folds = 1000)

Pipeline(steps=[('pca', PCA()),
                ('gbc',
                 GradientBoostingClassifier(loss='exponential', max_depth=1,
                                            max_features='auto',
                                            n_estimators=250, random_state=0,
                                            subsample=0.4, warm_start=True))])
	Accuracy: 0.91840	Precision: 0.71532	Recall: 0.64450	F1: 0.67806	F2: 0.65752
	Total predictions: 15000	True positives: 1289	False positives:  513	False negatives:  711	True negatives: 12487

