In [1]:
#!/usr/bin/python

import sys
import pickle
import matplotlib.pyplot as plt
import numpy as np
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

r = 42

data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

feature_list = ['poi',
               'bonus',
               'salary',
               'deferral_payments',
               'deferred_income',
               'director_fees',
               'exercised_stock_options',
               'expenses',
               'total_payments',
               'total_stock_value',
               'from_messages',
               'from_poi_to_this_person',
               'from_this_person_to_poi',
               'loan_advances',
               'long_term_incentive',
               'other',
               'restricted_stock',
               'restricted_stock_deferred',
               'salary',
               'shared_receipt_with_poi',
               'to_messages'
               ]

data = featureFormat(data_dict, feature_list)

In [2]:
import pprint
pp = pprint.PrettyPrinter(depth=6)

import copy
my_dataset = copy.deepcopy(data_dict)
my_feature_list = copy.deepcopy(feature_list)

for k in my_dataset.keys():
    my_dataset[k]['ratio_to_poi_to_all_sent']  = 0
    if (my_dataset[k]['from_poi_to_this_person'] != 'NaN') and (my_dataset[k]['from_messages'] != 'NaN') and (my_dataset[k]['from_messages'] != 0):
        my_dataset[k]['ratio_to_poi_to_all_sent'] = float(my_dataset[k]['from_this_person_to_poi'])/float(my_dataset[k]['from_messages'])

    my_dataset[k]['ratio_from_poi_to_all_received']  = 0
    if (my_dataset[k]['from_this_person_to_poi'] != 'NaN') and (my_dataset[k]['to_messages'] != 'NaN') and (my_dataset[k]['to_messages'] != 0):
        my_dataset[k]['ratio_from_poi_to_all_received'] = float(my_dataset[k]['from_poi_to_this_person'])/float(my_dataset[k]['to_messages'])


for i in ['ratio_to_poi_to_all_sent','ratio_from_poi_to_all_received']:
    if i not in my_feature_list:
        my_feature_list.append(i)

#source: https://discussions.udacity.com/t/nan-values-not-removed-by-featureformat/179405/2

### Gaussian Naive Bayes

In [4]:
## Gaussian Naive Bayes

from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = GaussianNB()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[3,5,9,15,19,21,'all'])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

#source: https://discussions.udacity.com/t/different-accuracy-score-in-gridsearchcv/240608/6

gnb_classifier = grid.best_estimator_

# use test_classifier to evaluate
test_classifier(gnb_classifier, my_dataset, my_feature_list)

Fitting 1000 folds for each of 7 candidates, totalling 7000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1340 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done 3340 tasks      | elapsed:   16.9s
[Parallel(n_jobs=4)]: Done 7000 out of 7000 | elapsed:   31.5s finished


The best parameters are {'select_features__k': 'all'} with a score of 0.2584
Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k='all', score_func=<function f_classif at 0x115e429b0>)), ('my_classifier', GaussianNB())])
	Accuracy: 0.36180	Precision: 0.15169	Recall: 0.82450	F1: 0.25623	F2: 0.43691
	Total predictions: 15000	True positives: 1649	False positives: 9222	False negatives:  351	True negatives: 3778



In [7]:
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit

gnb_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k='all')),
    ('my_classifier', GaussianNB())
    ]

gnb_classifier = Pipeline(gnb_pipeline_steps)

test_classifier(gnb_classifier, my_dataset, my_feature_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k='all', score_func=<function f_classif at 0x116624aa0>)), ('my_classifier', GaussianNB())])
	Accuracy: 0.36180	Precision: 0.15169	Recall: 0.82450	F1: 0.25623	F2: 0.43691
	Total predictions: 15000	True positives: 1649	False positives: 9222	False negatives:  351	True negatives: 3778



### K Nearest Neighbors

In [3]:
## KNeighborsClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = KNeighborsClassifier()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[1,2,3,4,5,6,7,9,11,13,15,17,19,21], 
              my_classifier__n_neighbors=[1,2,3,4,5,6,7,8,9,13,15,20])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 1000 folds for each of 168 candidates, totalling 168000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 224 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 1424 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 3424 tasks      | elapsed:   20.8s
[Parallel(n_jobs=4)]: Done 6224 tasks      | elapsed:   37.6s
[Parallel(n_jobs=4)]: Done 9824 tasks      | elapsed:   58.8s
[Parallel(n_jobs=4)]: Done 14224 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 19424 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 25424 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 32224 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 39824 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 48224 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done 57424 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 67424 tasks      | elapsed:  7.2min

The best parameters are {'my_classifier__n_neighbors': 1, 'select_features__k': 6} with a score of 0.30


In [None]:
knn_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=6)),
    ('my_classifier', KNeighborsClassifier(n_neighbors=1))
    ]

knn_classifier = Pipeline(knn_pipeline_steps)

test_classifier(knn_classifier, my_dataset, my_feature_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=6, score_func=<function f_classif at 0x115f53de8>)), ('my_classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform'))])
	Accuracy: 0.80733	Precision: 0.30187	Recall: 0.33900	F1: 0.31936	F2: 0.33086
	Total predictions: 15000	True positives:  678	False positives: 1568	False negatives: 1322	True negatives: 11432



### Decision Tree

In [5]:
## DecisionTreeClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = DecisionTreeClassifier(random_state=r)
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[3,5,9,15,19,21,'all'],
                  my_classifier__max_features=[None, 'auto', 'log2'],
                  my_classifier__criterion=['gini', 'entropy'],
                  my_classifier__min_samples_split=[2, 3, 4, 5, 10]
                 )

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

Fitting 1000 folds for each of 210 candidates, totalling 210000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 1344 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 3344 tasks      | elapsed:   23.1s
[Parallel(n_jobs=4)]: Done 6144 tasks      | elapsed:   42.1s
[Parallel(n_jobs=4)]: Done 9744 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 14144 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 19344 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 25344 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 32144 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 39744 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 48144 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done 57344 tasks      | elapsed:  7.0min
[Parallel(n_jobs=4)]: Done 67344 tasks      | elapsed:  8.3min

The best parameters are {'my_classifier__min_samples_split': 10, 'select_features__k': 19, 'my_classifier__criterion': 'entropy', 'my_classifier__max_features': None} with a score of 0.3578


In [7]:
dt_classifier = grid.best_estimator_

test_classifier(dt_classifier, my_dataset, my_feature_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=19, score_func=<function f_classif at 0x119eeb0c8>)), ('my_classifier', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'))])
	Accuracy: 0.85120	Precision: 0.42961	Recall: 0.35400	F1: 0.38816	F2: 0.36692
	Total predictions: 15000	True positives:  708	False positives:  940	False negatives: 1292	True negatives: 12060



In [6]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit

dt_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=19)),
    ('my_classifier', DecisionTreeClassifier(random_state=r,min_samples_split=10,criterion='entropy',max_features=None))
    ]

dt_classifier = Pipeline(dt_pipeline_steps)

test_classifier(dt_classifier, my_dataset, my_feature_list)


Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=19, score_func=<function f_classif at 0x119eeb0c8>)), ('my_classifier', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'))])
	Accuracy: 0.85120	Precision: 0.42961	Recall: 0.35400	F1: 0.38816	F2: 0.36692
	Total predictions: 15000	True positives:  708	False positives:  940	False negatives: 1292	True negatives: 12060



### Random Forest Classifier

In [3]:
## RandomForestClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = RandomForestClassifier(random_state=r)
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[3,5,9,15,19,21,'all'],
                  my_classifier__max_features=[None, 'auto', 'log2'],
                  my_classifier__criterion=['gini', 'entropy'],
                  my_classifier__min_samples_split=[2, 3, 4, 5, 10]
                 )

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))


# The best parameters are {'my_classifier__min_samples_split': 10, 
#                          'select_features__k': 3, 
#                          'my_classifier__criterion': 'entropy', 
#                          'my_classifier__max_features': None} with a score of 0.2985


Fitting 1000 folds for each of 210 candidates, totalling 210000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   19.2s
[Parallel(n_jobs=4)]: Done 1576 tasks      | elapsed:   35.7s
[Parallel(n_jobs=4)]: Done 2476 tasks      | elapsed:   54.4s
[Parallel(n_jobs=4)]: Done 3576 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 4876 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 6376 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 8076 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done 9976 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done 12076 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 14376 tasks      | elapsed:  5.7min
[Parallel(n_jobs=4)]: Done 16876 tasks      | elapsed:  6.7min
[Paral

The best parameters are {'my_classifier__min_samples_split': 10, 'select_features__k': 3, 'my_classifier__criterion': 'entropy', 'my_classifier__max_features': None} with a score of 0.2985


In [3]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit

rf_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=3)),
    ('my_classifier', RandomForestClassifier(random_state=r,min_samples_split=10,criterion='entropy',max_features=None))
    ]

rf_classifier = Pipeline(rf_pipeline_steps)

test_classifier(rf_classifier, my_dataset, my_feature_list)


Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=3, score_func=<function f_classif at 0x10e6ee140>)), ('my_classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=None, max_...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])
	Accuracy: 0.85233	Precision: 0.42316	Recall: 0.29600	F1: 0.34834	F2: 0.31493
	Total predictions: 15000	True positives:  592	False positives:  807	False negatives: 1408	True negatives: 12193

