In [1]:
import numpy as np
import sys
import pickle
from pandas import DataFrame, Series
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, SelectPercentile

sys.path.append("tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data, test_classifier_scaling
from auxiliary import computeFraction, evaluate_validate

# enron_data = pickle.load(open("final_project_dataset.pkl", "r"))
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [2]:
data_dict['SKILLING JEFFREY K']

{'bonus': 5600000,
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'jeff.skilling@enron.com',
 'exercised_stock_options': 19250000,
 'expenses': 29336,
 'from_messages': 108,
 'from_poi_to_this_person': 88,
 'from_this_person_to_poi': 30,
 'loan_advances': 'NaN',
 'long_term_incentive': 1920000,
 'other': 22122,
 'poi': True,
 'restricted_stock': 6843672,
 'restricted_stock_deferred': 'NaN',
 'salary': 1111258,
 'shared_receipt_with_poi': 2042,
 'to_messages': 3627,
 'total_payments': 8682716,
 'total_stock_value': 26093672}

In [3]:
features_list = data_dict['SKILLING JEFFREY K'].keys()
features_list.remove('email_address')
features_list.remove('poi')
features_list = ['poi'] + features_list

In [4]:
print features_list

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']


In [5]:
outliers = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']

for point in outliers:
    data_dict.pop(point)

In [6]:
### Store to my_dataset for easy export below.
my_dataset = data_dict

/// deleted below
Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list)
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

## add features based on lecture 11 ud120

In [8]:
submit_dict = {}
for name in my_dataset:

    data_point = my_dataset[name]

    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    fraction_from_poi = computeFraction( from_poi_to_this_person, to_messages )
    data_point["fraction_from_poi"] = fraction_from_poi

    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
    submit_dict[name]={"from_poi_to_this_person":fraction_from_poi,
                       "from_this_person_to_poi":fraction_to_poi}
    data_point["fraction_to_poi"] = fraction_to_poi

In [9]:
my_dataset['SKILLING JEFFREY K']

{'bonus': 5600000,
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'jeff.skilling@enron.com',
 'exercised_stock_options': 19250000,
 'expenses': 29336,
 'fraction_from_poi': 0.0242624758753791,
 'fraction_to_poi': 0.2777777777777778,
 'from_messages': 108,
 'from_poi_to_this_person': 88,
 'from_this_person_to_poi': 30,
 'loan_advances': 'NaN',
 'long_term_incentive': 1920000,
 'other': 22122,
 'poi': True,
 'restricted_stock': 6843672,
 'restricted_stock_deferred': 'NaN',
 'salary': 1111258,
 'shared_receipt_with_poi': 2042,
 'to_messages': 3627,
 'total_payments': 8682716,
 'total_stock_value': 26093672}

In [60]:
## count the NaNs and print percentage of NaNs for each feature

my_df = pd.DataFrame(my_dataset).transpose()
nan_counts_dict = {}
for column in my_df.columns:
    countit = (my_df[column]=='NaN').sum()
    nan_counts_dict[column] = round(float(countit) / float(len(my_df[column])) * 100,1)
nan_counts = pd.DataFrame(nan_counts_dict.items(), columns = ['feature', 'percentOfNaNs'])
nan_counts = nan_counts.sort_values('percentOfNaNs', ascending=False)
nan_counts

Unnamed: 0,feature,percentOfNaNs
10,loan_advances,97.9
13,director_fees,88.9
8,restricted_stock_deferred,88.2
1,deferral_payments,73.6
15,deferred_income,66.7
4,long_term_incentive,54.9
14,bonus,43.8
0,to_messages,40.3
16,from_this_person_to_poi,40.3
11,from_messages,40.3


based on the percentage of NaNs, I will select the features with NaN percentage below 50%. I will exclude the features on which fraction_to_poi and fraction_from_poi are based, namely from_messages, from_poi_to_this_person, from_this_person_to_poi, to_messages.

In [11]:
exclude_features = ['from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages', 'email_address', 'poi']
for item in exclude_features:
    if item in new_features:
        new_features.remove(item)
# new_features.remove(['from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages', 'email_address', 'poi'])
print new_features

['bonus', 'exercised_stock_options', 'expenses', 'fraction_from_poi', 'fraction_to_poi', 'other', 'restricted_stock', 'salary', 'shared_receipt_with_poi', 'total_payments', 'total_stock_value']


In [12]:
# include 'poi' feature to appear first in list
new_features = ['poi'] + new_features
print new_features

['poi', 'bonus', 'exercised_stock_options', 'expenses', 'fraction_from_poi', 'fraction_to_poi', 'other', 'restricted_stock', 'salary', 'shared_receipt_with_poi', 'total_payments', 'total_stock_value']


In [13]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()

In [14]:
test_classifier(gnb_clf, my_dataset, features_list)

GaussianNB()
	Accuracy: 0.33467	Precision: 0.14746	Recall: 0.83450	F1: 0.25064	F2: 0.43198
	Total predictions: 15000	True positives: 1669	False positives: 9649	False negatives:  331	True negatives: 3351



In [15]:
test_classifier(gnb_clf, my_dataset, new_features)

GaussianNB()
	Accuracy: 0.83673	Precision: 0.32420	Recall: 0.20700	F1: 0.25267	F2: 0.22313
	Total predictions: 15000	True positives:  414	False positives:  863	False negatives: 1586	True negatives: 12137



In [16]:
### Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()

In [17]:
test_classifier(dt_clf, my_dataset, features_list)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
	Accuracy: 0.79607	Precision: 0.22494	Recall: 0.21650	F1: 0.22064	F2: 0.21814
	Total predictions: 15000	True positives:  433	False positives: 1492	False negatives: 1567	True negatives: 11508



In [18]:
test_classifier(dt_clf, my_dataset, new_features)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
	Accuracy: 0.82973	Precision: 0.35766	Recall: 0.34800	F1: 0.35276	F2: 0.34989
	Total predictions: 15000	True positives:  696	False positives: 1250	False negatives: 1304	True negatives: 11750



In [19]:
from sklearn.ensemble import AdaBoostClassifier
adb_clf = AdaBoostClassifier(algorithm= 'SAMME')

In [20]:
test_classifier(adb_clf, my_dataset, new_features)

AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None)
	Accuracy: 0.84960	Precision: 0.41257	Recall: 0.30200	F1: 0.34873	F2: 0.31910
	Total predictions: 15000	True positives:  604	False positives:  860	False negatives: 1396	True negatives: 12140



## PCA attempt below

In [21]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
# data extraction using k_best features list
# data = featureFormat(my_dataset, new_features, sort_keys = True)

# data extraction using full features list, for pipe into PCA
#data = featureFormat(my_dataset, features_list, sort_keys = True)

# labels2, features2 = targetFeatureSplit(data)

## scale extracted features
# scaler = preprocessing.MinMaxScaler()
# trn = scaler.fit_transform(trn)

# remove label from features_list
features_for_pca = new_features[1:]

# extract features
data_for_pca = featureFormat(my_dataset, features_for_pca, sort_keys = True)

# scale features
scale_pca_data = preprocessing.MinMaxScaler().fit_transform(data_for_pca)

# set up PCA to explain pre-selected % of variance (perc_var)
perc_var = .95
pca = PCA(n_components=perc_var)

# fit and transform
pca_transform = pca.fit_transform(scale_pca_data)

# Starting features and ending components
num_features = len(features_for_pca)
components = pca_transform.shape[1]
print 'PCA\n'
print 'Explained Variance: {0}\n Original Number of Dimensions: {1}\n Final Dimensions: {2}\n'.format(perc_var,num_features,components)


PCA

Explained Variance: 0.95
 Original Number of Dimensions: 11
 Final Dimensions: 8



In [22]:
print features_list

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']


In [23]:
from sklearn.pipeline import Pipeline
print 'Evaluate Initial Classifiers using PCA\n'
gnb_pipe = Pipeline(steps=[('pca', pca), ('gaussian', gnb_clf)])
adb_pipe = Pipeline(steps=[('pca', pca), ('adaboost', adb_clf)])
dt_pipe = Pipeline(steps = [('pca',pca),('decision_tree', dt_clf)])

Evaluate Initial Classifiers using PCA



In [25]:
# using the hand picked feature list called new_features

evaluate_validate([gnb_pipe, adb_pipe, dt_pipe], my_dataset, new_features)

clf = Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('gaussian', GaussianNB())])
 Accuracy:0.906976744186
 Predicted Poi in test set:3.0
 Total Persons in test set:43
 Precision:0.666666666667
 Recall:0.4 
 F1 Score: 0.5 

clf = Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
 Accuracy:0.906976744186
 Predicted Poi in test set:3.0
 Total Persons in test set:43
 Precision:0.666666666667
 Recall:0.4 
 F1 Score: 0.5 

clf = Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
 Accuracy:0.860465116279
 P

In [27]:
# using the long feature list

evaluate_validate([gnb_pipe, adb_pipe, dt_pipe], my_dataset, features_list)

clf = Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('gaussian', GaussianNB())])
 Accuracy:0.883720930233
 Predicted Poi in test set:4.0
 Total Persons in test set:43
 Precision:0.5
 Recall:0.4 
 F1 Score: 0.444444444444 

clf = Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
 Accuracy:0.837209302326
 Predicted Poi in test set:6.0
 Total Persons in test set:43
 Precision:0.333333333333
 Recall:0.4 
 F1 Score: 0.363636363636 

clf = Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
 Accuracy:0.8604

## using test_classifier_scaling for PCA scale_features=true for scaled

In [28]:
test_classifier_scaling(gnb_pipe, my_dataset, features_list, scale_features = True, std_features = False)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('gaussian', GaussianNB())])
	Accuracy: 0.82927	Precision: 0.36700	Recall: 0.38700	F1: 0.37673	F2: 0.38283
	Total predictions: 15000	True positives:  774	False positives: 1335	False negatives: 1226	True negatives: 11665



0.3767339985397907

In [61]:
test_classifier(gnb_pipe, my_dataset, features_list)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('gaussian', GaussianNB())])
	Accuracy: 0.87373	Precision: 0.55311	Recall: 0.27600	F1: 0.36825	F2: 0.30673
	Total predictions: 15000	True positives:  552	False positives:  446	False negatives: 1448	True negatives: 12554



In [29]:
test_classifier_scaling(adb_pipe, my_dataset, features_list, scale_features = True, std_features = False)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
	Accuracy: 0.82513	Precision: 0.26312	Recall: 0.17300	F1: 0.20875	F2: 0.18572
	Total predictions: 15000	True positives:  346	False positives:  969	False negatives: 1654	True negatives: 12031



0.20874811463046758

In [63]:
test_classifier(adb_pipe, my_dataset, features_list)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
	Accuracy: 0.85333	Precision: 0.38962	Recall: 0.17650	F1: 0.24295	F2: 0.19818
	Total predictions: 15000	True positives:  353	False positives:  553	False negatives: 1647	True negatives: 12447



In [30]:
test_classifier_scaling(dt_pipe, my_dataset, features_list, scale_features = True, std_features = False)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
	Accuracy: 0.79640	Precision: 0.25511	Recall: 0.27450	F1: 0.26445	F2: 0.27039
	Total predictions: 15000	True positives:  549	False positives: 1603	False negatives: 1451	True negatives: 11397



0.26445086705202314

In [64]:
test_classifier(dt_pipe, my_dataset, features_list)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
	Accuracy: 0.77933	Precision: 0.19393	Recall: 0.20750	F1: 0.20048	F2: 0.20464
	Total predictions: 15000	True positives:  415	False positives: 1725	False negatives: 1585	True negatives: 11275



## standard PCA with hand picked features

In [37]:
test_classifier_scaling(gnb_pipe, my_dataset, new_features, scale_features = True, std_features = False)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('gaussian', GaussianNB())])
	Accuracy: 0.82593	Precision: 0.32769	Recall: 0.29050	F1: 0.30798	F2: 0.29725
	Total predictions: 15000	True positives:  581	False positives: 1192	False negatives: 1419	True negatives: 11808



0.3079777365491651

In [65]:
test_classifier(gnb_pipe, my_dataset, new_features)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('gaussian', GaussianNB())])
	Accuracy: 0.86847	Precision: 0.51235	Recall: 0.28000	F1: 0.36211	F2: 0.30793
	Total predictions: 15000	True positives:  560	False positives:  533	False negatives: 1440	True negatives: 12467



In [38]:
test_classifier_scaling(adb_pipe, my_dataset, new_features, scale_features = True, std_features = False)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
	Accuracy: 0.84240	Precision: 0.35601	Recall: 0.22500	F1: 0.27574	F2: 0.24288
	Total predictions: 15000	True positives:  450	False positives:  814	False negatives: 1550	True negatives: 12186



0.2757352941176471

In [66]:
test_classifier(adb_pipe, my_dataset, new_features)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('adaboost', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None))])
	Accuracy: 0.85887	Precision: 0.41485	Recall: 0.14250	F1: 0.21213	F2: 0.16404
	Total predictions: 15000	True positives:  285	False positives:  402	False negatives: 1715	True negatives: 12598



In [39]:
test_classifier_scaling(dt_pipe, my_dataset, new_features, scale_features = True, std_features = False)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
	Accuracy: 0.81147	Precision: 0.29785	Recall: 0.30500	F1: 0.30138	F2: 0.30354
	Total predictions: 15000	True positives:  610	False positives: 1438	False negatives: 1390	True negatives: 11562



0.30138339920948615

In [67]:
test_classifier(dt_pipe, my_dataset, new_features)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('decision_tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])
	Accuracy: 0.77473	Precision: 0.14330	Recall: 0.13850	F1: 0.14086	F2: 0.13943
	Total predictions: 15000	True positives:  277	False positives: 1656	False negatives: 1723	True negatives: 11344



## GridSearch attempt

In [45]:
from sklearn import grid_search
data = featureFormat(my_dataset, new_features, sort_keys = True)

labels, features = targetFeatureSplit(data)

parameters = {'min_samples_split':[2,4,6,8,10,12,50],
              'splitter': ('best','random'),
              'max_depth':[None,2,4,6,8,10,15,20]
              }
clf_s = grid_search.GridSearchCV(dt_clf, parameters).fit(features, labels)
print 'best estimator:'
print clf_s.best_estimator_

best estimator:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=12, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='random')


In [46]:
dt_pipe2 = Pipeline(steps=[('pca',pca),('dt', clf_s.best_estimator_)])

In [47]:
test_classifier(dt_pipe2, my_dataset, new_features)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.95, whiten=False)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=12, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='random'))])
	Accuracy: 0.87467	Precision: 0.66575	Recall: 0.12050	F1: 0.20406	F2: 0.14410
	Total predictions: 15000	True positives:  241	False positives:  121	False negatives: 1759	True negatives: 12879

