In [3]:
#!/usr/bin/python

import sys
import pickle
import matplotlib.pyplot as plt
import numpy as np
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

r = 42

data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

feature_list = ['poi',
               'bonus',
               'salary',
               'deferral_payments',
               'deferred_income',
               'director_fees',
               'exercised_stock_options',
               'expenses',
               'total_payments',
               'total_stock_value',
               'from_messages',
               'from_poi_to_this_person',
               'from_this_person_to_poi',
               'loan_advances',
               'long_term_incentive',
               'other',
               'restricted_stock',
               'restricted_stock_deferred',
               'salary',
               'shared_receipt_with_poi',
               'to_messages'
               ]

data = featureFormat(data_dict, feature_list)

In [4]:
import pprint
pp = pprint.PrettyPrinter(depth=6)

import copy
my_dataset = copy.deepcopy(data_dict)
my_feature_list = copy.deepcopy(feature_list)

for k in my_dataset.keys():
    my_dataset[k]['ratio_to_poi_to_all_sent']  = 0
    if (my_dataset[k]['from_poi_to_this_person'] != 'NaN') and (my_dataset[k]['from_messages'] != 'NaN') and (my_dataset[k]['from_messages'] != 0):
        my_dataset[k]['ratio_to_poi_to_all_sent'] = float(my_dataset[k]['from_this_person_to_poi'])/float(my_dataset[k]['from_messages'])

    my_dataset[k]['ratio_from_poi_to_all_received']  = 0
    if (my_dataset[k]['from_this_person_to_poi'] != 'NaN') and (my_dataset[k]['to_messages'] != 'NaN') and (my_dataset[k]['to_messages'] != 0):
        my_dataset[k]['ratio_from_poi_to_all_received'] = float(my_dataset[k]['from_poi_to_this_person'])/float(my_dataset[k]['to_messages'])


for i in ['ratio_to_poi_to_all_sent','ratio_from_poi_to_all_received']:
    if i not in my_feature_list:
        my_feature_list.append(i)

#source: https://discussions.udacity.com/t/nan-values-not-removed-by-featureformat/179405/2

### K Nearest Neighbors

In [3]:
## KNeighborsClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = KNeighborsClassifier()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[1,2,3,4,5,6,7,9,11,13,15,17,19,21], 
              my_classifier__n_neighbors=[1,2,3,4,5,6,7,8,9,13,15,20])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 1000 folds for each of 168 candidates, totalling 168000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 224 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 1424 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 3424 tasks      | elapsed:   20.8s
[Parallel(n_jobs=4)]: Done 6224 tasks      | elapsed:   37.6s
[Parallel(n_jobs=4)]: Done 9824 tasks      | elapsed:   58.8s
[Parallel(n_jobs=4)]: Done 14224 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 19424 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 25424 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 32224 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 39824 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 48224 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done 57424 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 67424 tasks      | elapsed:  7.2min

The best parameters are {'my_classifier__n_neighbors': 1, 'select_features__k': 6} with a score of 0.30


In [None]:
knn_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=6)),
    ('my_classifier', KNeighborsClassifier(n_neighbors=1))
    ]

knn_classifier = Pipeline(knn_pipeline_steps)

test_classifier(knn_classifier, my_dataset, my_feature_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=6, score_func=<function f_classif at 0x115f53de8>)), ('my_classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform'))])
	Accuracy: 0.80733	Precision: 0.30187	Recall: 0.33900	F1: 0.31936	F2: 0.33086
	Total predictions: 15000	True positives:  678	False positives: 1568	False negatives: 1322	True negatives: 11432



### Gaussian Naive Bayes
https://discussions.udacity.com/t/different-accuracy-score-in-gridsearchcv/240608/6


In [8]:
## Gaussian Naive Bayes

from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = GaussianNB()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[3,5,9,15,19,21,'all'])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

Fitting 1000 folds for each of 7 candidates, totalling 7000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 212 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 1832 tasks      | elapsed:    8.4s
[Parallel(n_jobs=4)]: Done 4832 tasks      | elapsed:   21.6s


The best parameters are {'select_features__k': 'all'} with a score of 0.2584


[Parallel(n_jobs=4)]: Done 7000 out of 7000 | elapsed:   30.7s finished


In [7]:
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit

gnb_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k='all')),
    ('my_classifier', GaussianNB())
    ]

gnb_classifier = Pipeline(gnb_pipeline_steps)

test_classifier(gnb_classifier, my_dataset, my_feature_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k='all', score_func=<function f_classif at 0x116624aa0>)), ('my_classifier', GaussianNB())])
	Accuracy: 0.36180	Precision: 0.15169	Recall: 0.82450	F1: 0.25623	F2: 0.43691
	Total predictions: 15000	True positives: 1649	False positives: 9222	False negatives:  351	True negatives: 3778



### Decision Tree

In [10]:
## DecisionTreeClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = DecisionTreeClassifier()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

parameters = dict(select_features__k=[3,5,9,15,19,21,'all'])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

Fitting 1000 folds for each of 7 candidates, totalling 7000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 212 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 1112 tasks      | elapsed:    8.0s
[Parallel(n_jobs=4)]: Done 2612 tasks      | elapsed:   20.7s
[Parallel(n_jobs=4)]: Done 4712 tasks      | elapsed:   36.3s


The best parameters are {'select_features__k': 3} with a score of 0.3107


[Parallel(n_jobs=4)]: Done 7000 out of 7000 | elapsed:   52.9s finished


In [12]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit

gnb_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=3)),
    ('my_classifier', DecisionTreeClassifier())
    ]

gnb_classifier = Pipeline(gnb_pipeline_steps)

test_classifier(gnb_classifier, my_dataset, my_feature_list)

## why is F1 0.35 here but 0.31 above ??  ## are there other parameters to vary?

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=3, score_func=<function f_classif at 0x116624aa0>)), ('my_classifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.84840	Precision: 0.40756	Recall: 0.30200	F1: 0.34693	F2: 0.31850
	Total predictions: 15000	True positives:  604	False positives:  878	False negatives: 1396	True negatives: 12122



### Clustering

In [None]:
## KMeans

from sklearn.grid_search import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = KMeans()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

# maybe add a parameter for n_init ?
parameters = dict(select_features__k=[3,5,9,15,19,21], 
              my_classifier__n_clusters=[3,5,9],
              my_classifier__n_init=[10,30,50])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 1000 folds for each of 168 candidates, totalling 168000 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 104 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 704 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done 2600 tasks      | elapsed:   19.8s
[Parallel(n_jobs=4)]: Done 5400 tasks      | elapsed:   38.7s


In [4]:
##The best parameters are {'select_features__k': 2, 'my_classifier__n_clusters': 4} with a score of 0.51

##The best parameters are {'select_features__k': 2, 'my_classifier__n_clusters': 3} with a score of 0.51

from sklearn.grid_search import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit

km_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=5)),
    ('my_classifier', KMeans(n_clusters=2))
    ]

km_classifier = Pipeline(km_pipeline_steps)

test_classifier(km_classifier, my_dataset, my_feature_list)

Pipeline(steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select_features', SelectKBest(k=5, score_func=<function f_classif at 0x112c632a8>)), ('my_classifier', KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0))])
	Accuracy: 0.56953	Precision: 0.15487	Recall: 0.50000	F1: 0.23649	F2: 0.34585
	Total predictions: 15000	True positives: 1000	False positives: 5457	False negatives: 1000	True negatives: 7543



### Logistic Regression

In [None]:
## KMeans

from sklearn.grid_search import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.cross_validation import StratifiedShuffleSplit
my_data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, feature_values = targetFeatureSplit(my_data)
folds = 1000
cv = StratifiedShuffleSplit(
     labels, folds, random_state=r)

clf = KMeans()
steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif)),
    ('my_classifier', clf)
    ]

# maybe add a parameter for n_init ?
parameters = dict(select_features__k=[3,5,9,15,19,21], 
              my_classifier__n_clusters=[3,5,9],
              my_classifier__n_init=[10,30,50])

pipe = Pipeline(steps)

grid = GridSearchCV(pipe, param_grid=parameters, cv=cv, verbose=1, scoring='f1', n_jobs=4)

grid.fit(feature_values, labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [None]:
r_pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('select_features',SelectKBest(f_classif,k=)),
    ('my_classifier', KMeans(n_clusters=))
    ]

r_classifier = Pipeline(r_pipeline_steps)

test_classifier(r_classifier, my_dataset, my_feature_list)