In [56]:
import numpy as np
import sys
import pickle
from pandas import DataFrame, Series
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, SelectPercentile

sys.path.append("tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
from auxiliary import computeFraction

# enron_data = pickle.load(open("final_project_dataset.pkl", "r"))
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [57]:
data_dict['SKILLING JEFFREY K']

{'bonus': 5600000,
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'jeff.skilling@enron.com',
 'exercised_stock_options': 19250000,
 'expenses': 29336,
 'from_messages': 108,
 'from_poi_to_this_person': 88,
 'from_this_person_to_poi': 30,
 'loan_advances': 'NaN',
 'long_term_incentive': 1920000,
 'other': 22122,
 'poi': True,
 'restricted_stock': 6843672,
 'restricted_stock_deferred': 'NaN',
 'salary': 1111258,
 'shared_receipt_with_poi': 2042,
 'to_messages': 3627,
 'total_payments': 8682716,
 'total_stock_value': 26093672}

In [58]:
features_list = data_dict['SKILLING JEFFREY K'].keys()
features_list.remove('email_address')
features_list.remove('poi')
features_list = ['poi'] + features_list

In [59]:
print features_list

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']


In [60]:
outliers = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']

for point in outliers:
    data_dict.pop(point)

In [61]:
### Store to my_dataset for easy export below.
my_dataset = data_dict

In [122]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list)
# data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [125]:
## count the NaNs and print percentage of NaNs for each feature

my_df = pd.DataFrame(my_dataset).transpose() # turn columns to rows
# my_df = my_df.drop('email_address')
nan_counts_dict = {}
for column in my_df.columns:
    my_df[column] = my_df[column].replace('NaN',np.nan)
    nan_counts = my_df[column].isnull().sum()
    nan_counts_dict[column] = round(float(nan_counts)/float(len(my_df[column])) * 100,1)
df = pd.DataFrame(nan_counts_dict,index = ['percent_of_NaN']).transpose()
df = df.drop('email_address')
df.reset_index(level=0,inplace=True)
df = df.rename(columns = {'index':'feature'}).sort_values('percent_of_NaN', ascending=False)
df

Unnamed: 0,feature,percent_of_NaN
11,loan_advances,97.9
3,director_fees,88.9
16,restricted_stock_deferred,88.2
1,deferral_payments,73.6
2,deferred_income,66.7
12,long_term_incentive,54.9
0,bonus,43.8
8,from_messages,40.3
9,from_poi_to_this_person,40.3
10,from_this_person_to_poi,40.3


add features based on lecture 11 ud120

In [126]:
submit_dict = {}
for name in my_dataset:

    data_point = my_dataset[name]

    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    fraction_from_poi = computeFraction( from_poi_to_this_person, to_messages )
    data_point["fraction_from_poi"] = fraction_from_poi

    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
    submit_dict[name]={"from_poi_to_this_person":fraction_from_poi,
                       "from_this_person_to_poi":fraction_to_poi}
    data_point["fraction_to_poi"] = fraction_to_poi

In [127]:
my_dataset['SKILLING JEFFREY K']

{'bonus': 5600000,
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'jeff.skilling@enron.com',
 'exercised_stock_options': 19250000,
 'expenses': 29336,
 'fraction_from_poi': 0.0242624758753791,
 'fraction_to_poi': 0.2777777777777778,
 'from_messages': 108,
 'from_poi_to_this_person': 88,
 'from_this_person_to_poi': 30,
 'loan_advances': 'NaN',
 'long_term_incentive': 1920000,
 'other': 22122,
 'poi': True,
 'restricted_stock': 6843672,
 'restricted_stock_deferred': 'NaN',
 'salary': 1111258,
 'shared_receipt_with_poi': 2042,
 'to_messages': 3627,
 'total_payments': 8682716,
 'total_stock_value': 26093672}

In [128]:
## count the NaNs and print percentage of NaNs for each feature

my_df = pd.DataFrame(my_dataset).transpose() # turn columns to rows
nan_counts_dict = {}
new_features = []
for column in my_df.columns:
    my_df[column] = my_df[column].replace('NaN',np.nan)
    nan_counts = my_df[column].isnull().sum()
    nan_counts_dict[column] = round(float(nan_counts)/float(len(my_df[column])) * 100,1)
    if nan_counts_dict[column] < 50:
        new_features.append(column)
df = pd.DataFrame(nan_counts_dict,index = ['percent_of_NaN']).transpose() # turn columns to rows
df = df.drop('email_address')
df.reset_index(level=0,inplace=True)
df = df.rename(columns = {'index':'feature'}).sort_values('percent_of_NaN', ascending=False)
df

Unnamed: 0,feature,percent_of_NaN
11,loan_advances,97.9
3,director_fees,88.9
16,restricted_stock_deferred,88.2
1,deferral_payments,73.6
2,deferred_income,66.7
12,long_term_incentive,54.9
0,bonus,43.8
8,from_messages,40.3
9,from_poi_to_this_person,40.3
10,from_this_person_to_poi,40.3


based on the percentage of NaNs, I will select the features with NaN percentage below 50%. I will exclude the features on which fraction_to_poi and fraction_from_poi are based, namely from_messages, from_poi_to_this_person, from_this_person_to_poi, to_messages.

In [129]:
exclude_features = ['from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages', 'email_address', 'poi']
for item in exclude_features:
    if item in new_features:
        new_features.remove(item)
# new_features.remove(['from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages', 'email_address', 'poi'])
print new_features

['bonus', 'exercised_stock_options', 'expenses', 'fraction_from_poi', 'fraction_to_poi', 'other', 'restricted_stock', 'salary', 'shared_receipt_with_poi', 'total_payments', 'total_stock_value']


In [130]:
# include 'poi' feature to appear first in list
new_features = ['poi'] + new_features
print new_features

['poi', 'bonus', 'exercised_stock_options', 'expenses', 'fraction_from_poi', 'fraction_to_poi', 'other', 'restricted_stock', 'salary', 'shared_receipt_with_poi', 'total_payments', 'total_stock_value']


In [80]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()

In [82]:
test_classifier(gnb_clf, my_dataset, features_list)

GaussianNB()
	Accuracy: 0.33467	Precision: 0.14746	Recall: 0.83450	F1: 0.25064	F2: 0.43198
	Total predictions: 15000	True positives: 1669	False positives: 9649	False negatives:  331	True negatives: 3351



In [83]:
test_classifier(gnb_clf, my_dataset, new_features)

GaussianNB()
	Accuracy: 0.83673	Precision: 0.32420	Recall: 0.20700	F1: 0.25267	F2: 0.22313
	Total predictions: 15000	True positives:  414	False positives:  863	False negatives: 1586	True negatives: 12137



In [84]:
### Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()

In [85]:
test_classifier(dt_clf, my_dataset, features_list)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
	Accuracy: 0.79847	Precision: 0.23511	Recall: 0.22700	F1: 0.23098	F2: 0.22858
	Total predictions: 15000	True positives:  454	False positives: 1477	False negatives: 1546	True negatives: 11523



In [86]:
test_classifier(dt_clf, my_dataset, new_features)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
	Accuracy: 0.82927	Precision: 0.35623	Recall: 0.34750	F1: 0.35181	F2: 0.34921
	Total predictions: 15000	True positives:  695	False positives: 1256	False negatives: 1305	True negatives: 11744



In [88]:
# test_classifier(dt_clf, my_dataset, new_features, scale_features = True)

In [93]:
from sklearn.ensemble import AdaBoostClassifier
adb_clf = AdaBoostClassifier(algorithm= 'SAMME')

In [92]:
test_classifier(adb_clf, my_dataset, new_features)

AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None)
	Accuracy: 0.84953	Precision: 0.41229	Recall: 0.30200	F1: 0.34863	F2: 0.31907
	Total predictions: 15000	True positives:  604	False positives:  861	False negatives: 1396	True negatives: 12139



PCA attempt below

In [113]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
# remove label from features_list
features_for_pca = features_list[1:]

# extract features
data_for_pca = featureFormat(my_dataset, features_for_pca, sort_keys = True)

# scale features
scale_pca_data = preprocessing.MinMaxScaler().fit_transform(data_for_pca)

# set up PCA to explain pre-selected % of variance (perc_var)
perc_var = .95
pca = PCA(n_components=8)

# fit and transform
pca_transform = pca.fit_transform(scale_pca_data)

# Starting features and ending components
num_features = len(features_for_pca)
components = pca_transform.shape[1]
print 'PCA\n'
print 'Explained Variance: {0}\n Original Number of Dimensions: {1}\n Final Dimensions: {2}\n'.format(perc_var,num_features,components)


PCA

Explained Variance: 0.95
 Original Number of Dimensions: 19
 Final Dimensions: 8



In [97]:
print features_list

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']


In [114]:
from sklearn.pipeline import Pipeline
print 'Evaluate Initial Classifiers using PCA\n'
gnb_pipe = Pipeline(steps=[('pca', pca), ('gaussian', gnb_clf)])
adb_pipe = Pipeline(steps=[('pca', pca), ('adaboost', adb_clf)])
dt_pipe = Pipeline(steps = [('pca',pca),('decision_tree', dt_clf)])

Evaluate Initial Classifiers using PCA



In [116]:
test_classifier(gnb_pipe, my_dataset, features_list)

Pipeline(steps=[('pca', PCA(copy=True, n_components=8, whiten=False)), ('gaussian', GaussianNB())])
	Accuracy: 0.85053	Precision: 0.42568	Recall: 0.34650	F1: 0.38203	F2: 0.35989
	Total predictions: 15000	True positives:  693	False positives:  935	False negatives: 1307	True negatives: 12065



In [117]:
test_classifier(adb_pipe, my_dataset, features_list)

Pipeline(steps=[('pca', PCA(copy=True, n_components=8, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
	Accuracy: 0.83400	Precision: 0.25152	Recall: 0.12400	F1: 0.16611	F2: 0.13799
	Total predictions: 15000	True positives:  248	False positives:  738	False negatives: 1752	True negatives: 12262



In [118]:
test_classifier(dt_pipe, my_dataset, features_list)

Pipeline(steps=[('pca', PCA(copy=True, n_components=8, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
	Accuracy: 0.79793	Precision: 0.25252	Recall: 0.26300	F1: 0.25765	F2: 0.26084
	Total predictions: 15000	True positives:  526	False positives: 1557	False negatives: 1474	True negatives: 11443



GridSearch

In [124]:
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
print "Tune Classifiers\n"

## Tune decision tree via gridsearch

# Set up cross validator (will be used for tuning all classifiers)
cv = cross_validation.StratifiedShuffleSplit(labels,
                                            n_iter = 10,
                                             random_state = 42)
# set up estimator and pipeline, using PCA for feature selection
estimators = [('reduce_dim', PCA()),('dec_tree',dt_clf)]
dtclf = Pipeline(estimators)

# set up paramaters dictionary
dt_params = dict(reduce_dim__n_components=[perc_var],
              dec_tree__criterion=("gini","entropy"),
                  dec_tree__min_samples_split=[1,2,4,8,16,32],
                   dec_tree__min_samples_leaf=[1,2,4,8,16,32],
                   dec_tree__max_depth=[None,1,2,4,8,16,32])

# set up gridsearch
dt_grid_search = GridSearchCV(dtclf, param_grid = dt_params,
                          scoring = 'f1', cv =cv)

# pass data into into the gridsearch via fit
dt_grid_search.fit(features, labels)

print 'Decision tree tuning\n Steps: {0}\n, Best Parameters: {1}\n '.format(dtclf.steps,dt_grid_search.best_params_,dt_grid_search.best_score_)
# print sep2
# pick a winner
best_dtclf = dt_grid_search.best_estimator_

Tune Classifiers



  'precision', 'predicted', average, warn_for)


Decision tree tuning
 Steps: [('reduce_dim', PCA(copy=True, n_components=None, whiten=False)), ('dec_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))]
, Best Parameters: {'dec_tree__max_depth': 8, 'dec_tree__criterion': 'entropy', 'dec_tree__min_samples_leaf': 2, 'reduce_dim__n_components': 0.95, 'dec_tree__min_samples_split': 4}
 
