In [1]:
# %load poi_id.py
#!/usr/bin/python
import warnings
warnings.filterwarnings('ignore')
import sys
import pickle
sys.path.append("tools/")
import matplotlib.pyplot as plt
from feature_format import featureFormat, targetFeatureSplit
import tester
from tester import dump_classifier_and_data
import pprint
import pandas as pd
from IPython.display import display, HTML
import numpy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

##################################################################################################################### 
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# Candidate financial features, referenced by their dataset key names.
financial_features = [
    'salary',
    'deferral_payments',
    'total_payments',
    'loan_advances',
    'bonus',
    'restricted_stock_deferred',
    'deferred_income',
    'total_stock_value',
    'expenses',
    'exercised_stock_options',
    'other',
    'long_term_incentive',
    'restricted_stock',
    'director_fees',
]

# Candidate e-mail features, referenced by their dataset key names.
email_features = [
    'to_messages',
    'from_poi_to_this_person',
    'from_messages',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
]

### Load the dictionary containing the dataset
# A pickle stream is binary data, so the file is opened with "rb"; text mode
# makes loading depend on the platform's newline handling.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that come from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Dump the raw records for a first visual inspection.
pprint.pprint(data_dict)
##################################################################################################################### 
### Task 2: Remove outliers

# Finding employees with 18 or more NaNs in their feature set
for i in data_dict:
    count = 0
    for j in data_dict[i]:
        if data_dict[i][j] == 'NaN':
            count += 1
    if count > 17:
        print 'Name of individual: ',i
        print 'Number of NaN values in the corresponding feature set: ',count

# not the name of a person
data_dict.pop('TOTAL',0) 
# not the name of a person
data_dict.pop('THE TRAVEL AGENCY IN THE PARK',0) 
# all 20 features have NaN values
data_dict.pop('LOCKHART EUGENE E',0) 

# Convert data from dictionary to dataframe
df = pd.DataFrame.from_dict(data_dict, orient='index', dtype=None)

# Remove NaN
salary = df['salary'].replace(['NaN'], 0)
from_poi = df['from_poi_to_this_person'].replace(['NaN'], 0)
bonus = df['bonus'].replace(['NaN'], 0)

# Re shape the data
salary = numpy.reshape( numpy.array(salary), (len(salary), 1))
from_poi = numpy.reshape( numpy.array(from_poi), (len(from_poi), 1))
# bonus = numpy.reshape( numpy.array(bonus), (len(bonus), 1))

# Split the data into training and testing sets to generate a regression line
salary_train, salary_test, from_poi_train, from_poi_test = train_test_split(salary, 
                                                                            from_poi, 
                                                                            test_size=0.1, 
                                                                            random_state=42)

# Using a regression line to view outliers
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg = reg.fit(salary_train, from_poi_train)
reg_pred = reg.predict(salary_test)

# Check accuracy of regression
from sklearn.metrics import r2_score
r = r2_score(from_poi_test, reg_pred)
print 'R score of predicting emails from poi to person in question using the salary: ', r

## No real information gain from plot.
# Plot salary against from_poi
# import matplotlib.pyplot as plt
# %matplotlib inline  
# try:
#     plt.plot(ages, reg.predict(salary), color="blue")
# except NameError:
#     pass
# plt.scatter(salary, from_poi)
# plt.xlabel('Salary')
# plt.ylabel('From POI to this person')
# plt.show()

#####################################################################################################################
### Task 3: Create new feature(s)

### Store to my_dataset for easy export below.
my_dataset = data_dict
my_df = pd.DataFrame.from_dict(my_dataset, orient='index', dtype=None)
my_features_list = ['poi'] + financial_features + email_features

# Clean the data to be used to set up new feature.
salary = my_df['salary'].replace(['NaN'], 0,inplace = True)
total_payments = my_df['total_payments'].replace(['NaN'], 0,inplace = True)
bonus = my_df['bonus'].replace(['NaN'], 0,inplace = True)
total_stock_value = my_df['total_stock_value'].replace(['NaN'], 0,inplace = True)

# Create new column with total monetary assets -> ['net_worth'] using cleaned column data above.
my_df['net_worth'] = my_df['salary'] + my_df['total_payments'] + my_df['bonus'] + my_df['total_stock_value']
new_features_list = my_features_list + ['net_worth']

# Convert dataframe back to dictionary
my_dataset = my_df.to_dict(orient='index')


### Extract features(email and financial) and labels(poi or not) from dataset for local testing

# Takes a list of features ('features_list'), searches the data dictionary for those features, 
# and returns those features in the form of a data list.
data = featureFormat(my_dataset, new_features_list, sort_keys = True)
# Splits the data list, created by the previous statement, into poi(labels) and features
labels, features = targetFeatureSplit(data)

# Use feature selection to select k best features
kbest = SelectKBest(k = 10)
kbest.fit(features, labels)
scores = kbest.scores_

# Combine features with their scores
features_scores = zip(new_features_list[1:], scores)

# Top features
features_scores = dict(features_scores[:21])
sorted_features_scores = sorted(features_scores.items(), key=lambda x: x[1], reverse=True)
best_features = dict(sorted_features_scores[:4]).keys()
best_features = ['poi'] + best_features

# Scale the features                                                             
#MinMax Scaler
scaler = preprocessing.MinMaxScaler()
features = MinMaxScaler().fit_transform(features)

#print new_features_list
print 'POI followed by the best features ', best_features

#####################################################################################################################
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

classifiers = [
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(gamma=5, C=2),
    LDA(),
    LogisticRegression(),
    KNeighborsClassifier(),
    KMeans(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    QDA()
    ]

for clf in classifiers:
    accuracy, precision, recall = [], [], []
    for i in range(500):
        features = MinMaxScaler().fit_transform(features)
        features_train, features_test, labels_train, labels_test = train_test_split(features, 
                                                                                    labels, 
                                                                                    test_size=0.3) 
        clf.fit(features_train, labels_train)
        prediction = clf.predict(features_test)
        # Append scores
        accuracy.append(accuracy_score(labels_test, prediction))
        precision.append(precision_score(labels_test, prediction, average="weighted"))
        recall.append(recall_score(labels_test, prediction, average="weighted"))
    print "Classifier details: ", clf
    print "Accuracy: ", numpy.mean(accuracy)
    print "Precision: ", numpy.mean(precision)
    print "Recall: ", numpy.mean(recall)
    print '\n'
 
    
#####################################################################################################################
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

#Tune AdaBoost
adaboost = GridSearchCV(AdaBoostClassifier(),
                     param_grid = {'n_estimators' : [10,20,30,40,50,60,70,80,90,100],
                                  'algorithm' : ['SAMME', 'SAMME.R']},
                     cv = cv,
                     scoring = 'f1')
adaboost.fit(features, labels)
print 'adaboost best estimator: ', adaboost.best_estimator_
print 'adaboost best parameters: ', adaboost.best_params_
print 'adaboost best score: ', adaboost.best_score_
print '\n'

# Tune RandomForestClassifier
rfc = GridSearchCV(RandomForestClassifier(),
                     param_grid = {'n_estimators': [200, 700],
                                   'max_features': ['auto', 'sqrt', 'log2']},
                     cv = cv,
                     scoring = 'f1')
rfc.fit(features, labels)
print 'RandomForestClassifier best estimator: ', rfc.best_estimator_
print 'RandomForestClassifier best parameters: ', rfc.best_params_
print 'RandomForestClassifier best score: ', rfc.best_score_
print '\n'


# Tune DecisionTreeClassifier
dtc = GridSearchCV(DecisionTreeClassifier(),
                     param_grid = {'criterion': ['gini', 'entropy'],
                                   'splitter': ['best' ,'random'],
                                   'max_features': ['sqrt', 'auto', 'log2'],
                                   'presort' : ['True', 'False'],
                                   'random_state' : [None, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                                  },
                     cv = cv,
                     scoring = 'f1')
dtc.fit(features, labels)
print 'DecisionTreeClassifier best estimator: ', dtc.best_estimator_
print 'DecisionTreeClassifier best parameters: ', dtc.best_params_
print 'DecisionTreeClassifier best score: ', dtc.best_score_
print '\n'

# Tune K Nearest Neighbors
cv = StratifiedShuffleSplit(labels, 10, random_state = 42)
knn = GridSearchCV(KNeighborsClassifier(), 
                   param_grid = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                                 'metric': ['manhattan','minkowski', 'euclidean'], 
                                 'weights': ['distance', 'uniform']}, 
                   cv = cv,
                   scoring = 'f1')
knn.fit(features, labels)
print 'K Nearest Neighbors best estimator: ', knn.best_estimator_
print 'K Nearest Neighbors best parameters: ', knn.best_params_
print 'K Nearest Neighbors best score: ', knn.best_score_
# tester.test_classifier(knn.best_estimator_, my_dataset, best_features)

# Pipeline 
print "Pipelining..."
pipeline = Pipeline([('normalization', scaler),
                     ('classifier', knn.best_estimator_)
                    ])
tester.test_classifier(pipeline, my_dataset, best_features)

# Tune K Means
kmeans = GridSearchCV(KMeans(),
                     param_grid = {'n_clusters' : [2],
                                  'tol' : [0.00000001,0.0000001,0.000001, 0.00001, 0.0001, 0.001, 0.01],
                                  'max_iter' : [300,200,400,500,600,700],
                                  'init' : ['k-means++', 'random'],
                                  'copy_x' : [True, False]},
                     cv = cv,
                     scoring = 'f1')
kmeans.fit(features, labels)
print 'K Means best estimator: ', kmeans.best_estimator_
print 'K Means best parameters: ', kmeans.best_params_
print 'K Means best score: ', kmeans.best_score_
print '\n'

# Best classifier being tested in tester.py
clf =  pipeline
features_list = best_features

#####################################################################################################################
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
print 'Dumping classifier, dataset, and features_list.....'
dump_classifier_and_data(clf, my_dataset, features_list)
print 'Program end.'



{'ALLEN PHILLIP K': {'bonus': 4175000,
                     'deferral_payments': 2869717,
                     'deferred_income': -3081055,
                     'director_fees': 'NaN',
                     'email_address': 'phillip.allen@enron.com',
                     'exercised_stock_options': 1729541,
                     'expenses': 13868,
                     'from_messages': 2195,
                     'from_poi_to_this_person': 47,
                     'from_this_person_to_poi': 65,
                     'loan_advances': 'NaN',
                     'long_term_incentive': 304805,
                     'other': 152,
                     'poi': False,
                     'restricted_stock': 126027,
                     'restricted_stock_deferred': -126027,
                     'salary': 201955,
                     'shared_receipt_with_poi': 1407,
                     'to_messages': 2902,
                     'total_payments': 4484442,
                     'total_stock_value': 17295



Classifier details:  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Accuracy:  0.836511627907
Precision:  0.316693578644
Recall:  0.274675757576






Classifier details:  GaussianNB()
Accuracy:  0.562837209302
Precision:  0.195098196394
Recall:  0.652798412698






Classifier details:  SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy:  0.865627906977
Precision:  0.0187333333333
Recall:  0.00998333333333






Classifier details:  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
Accuracy:  0.834930232558
Precision:  0.267876623377
Recall:  0.190196248196






Classifier details:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy:  0.874558139535
Precision:  0.317666666667
Recall:  0.0862262626263






Classifier details:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Accuracy:  0.854697674419
Precision:  0.0924666666667
Recall:  0.0305722222222






Classifier details:  KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
Accuracy:  0.27111627907
Precision:  0.701515492023
Recall:  0.272023255814






Classifier details:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Accuracy:  0.803395348837
Precision:  0.259128693529
Recall:  0.2546998557






Classifier details:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy:  0.858325581395
Precision:  0.291357142857
Recall:  0.119016666667






Classifier details:  QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariances=False, tol=0.0001)
Accuracy:  0.860372093023
Precision:  0.0127688995215
Recall:  0.00927142857143




NameError: name 'cv' is not defined

In [56]:
# %load tester.py
#!/usr/bin/pickle

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."

# Files exchanged between poi_id.py (writer) and this tester (reader).
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their .pkl files.

    The files are written to the current working directory under the names
    held in the *_FILENAME constants. They are opened in binary mode because
    a pickle stream is binary data; text mode corrupts it on platforms that
    translate newlines.

    Args:
        clf: the classifier object to store.
        dataset: the data dictionary to store.
        feature_list: the list of feature names to store.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    """Load what dump_classifier_and_data wrote.

    Returns:
        A (clf, dataset, feature_list) tuple read from the three .pkl files
        in the current working directory.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only load
    # files produced by a trusted poi_id.py run.
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def main():
    ### load up student's classifier, dataset, and feature_list
    print 'Loading classifier, dataset, and feature_list....'
    clf, dataset, feature_list = load_classifier_and_data()
    print 'Finished loading.'
    ### Run testing script
    print 'Running testing script.....'
    test_classifier(clf, dataset, feature_list)

if __name__ == '__main__':
    main()


Loading classifier, dataset, and feature_list....
Finished loading.
Running testing script.....


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of labels for any class cannot be less than 2.

In [51]:
# https://discussions.udacity.com/t/using-pipeline-precision-recall-have-been-decreased/45992

##############my code##############
#!/usr/bin/python
import pandas
import sys
import pickle
import csv
import matplotlib.pyplot as plt
import numpy
import tester
from pprint import pprint
import matplotlib.pyplot as plt
plt.interactive(False)

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.feature_selection import SelectKBest
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import grid_search
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

print 'Task 1: Select what features you will use'

target_label = 'poi'
email_features_list = [
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
    'to_messages',
    #'email_address'
    ]
financial_features_list = [
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'total_payments',
    'total_stock_value',
]

features_list = [target_label] + financial_features_list + email_features_list

### Load the dictionary containing the dataset

data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
my_dataset = data_dict


print 'Task 2: Remove outliers'
from pprint import pprint
bonus_outliers = []
salary_outliers = []
for key in data_dict:
    val = data_dict[key]['bonus']
    val = data_dict[key]['salary']
    if val == 'NaN':
        continue
    bonus_outliers.append((key,int(val)))
    salary_outliers.append((key,int(val)))
print "\tDetected the outlier of 'TOTAL'"
pprint(sorted(bonus_outliers,key=lambda x:x[1],reverse=True)[:2])
pprint(sorted(salary_outliers,key=lambda x:x[1],reverse=True)[:2])
print ""


def remove_outlier(keys):
    """Drop the entry stored under `keys` from my_dataset; absent keys are ignored."""
    if keys in my_dataset:
        del my_dataset[keys]

# Entries removed from the dataset before any feature is computed
outliers_list = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']
for outlier in outliers_list:
    remove_outlier(outlier)



print "Task 3: create new features"
print ""


def compute_fraction(poi_messages, all_messages):
    """Return poi_messages / all_messages as a float.

    Args:
        poi_messages: message count involving a POI, or the string 'NaN'.
        all_messages: total message count, or the string 'NaN'.

    Returns:
        The ratio as a float, or 0. when either input is 'NaN' or when
        all_messages is 0 (the ratio is undefined, and dividing would raise
        ZeroDivisionError).
    """
    if poi_messages == 'NaN' or all_messages == 'NaN':
        return 0.
    if all_messages == 0:
        return 0.
    # float() forces true division even for integer inputs under Python 2
    return float(poi_messages) / all_messages


# Attach the two ratio features to every record, in place
for data_point in my_dataset.values():
    # share of this person's received mail that came from a POI
    data_point["fraction_from_poi"] = compute_fraction(
        data_point["from_poi_to_this_person"], data_point["to_messages"])
    # share of this person's sent mail that went to a POI
    data_point["fraction_to_poi"] = compute_fraction(
        data_point["from_this_person_to_poi"], data_point["from_messages"])

# Extended feature list: the originals plus the two engineered ratios
new_features_list = features_list + ['fraction_from_poi', 'fraction_to_poi']


### Extract features and labels from dataset for local testing

#MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
# Build the data list for the full candidate feature list (label first),
# then separate it into labels and features.
data = featureFormat(my_dataset, new_features_list)
labels, features = targetFeatureSplit(data)
# `scaler` stays unfitted here; it is passed to the Pipeline further down.
scaler = preprocessing.MinMaxScaler()
# The feature matrix used by the grid search is scaled with a separate,
# throw-away MinMaxScaler instance.
features = MinMaxScaler().fit_transform(features)


def get_k_best(features_list, k):
    data = featureFormat(my_dataset, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    print "\t k_best.scores ", sorted_pairs
    k_best_features = dict(sorted_pairs[:k])
    return k_best_features.keys()

# get K-best features
best_features_list = get_k_best(new_features_list, 4)
best_features_list.insert(0, 'poi')

#print new_features_list
print '\t'"POI + Best_features_list: ", best_features_list
### Tune KNeighborsClassifier
print "###Tuned KNeigborsClassier###"
cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
metrics = ['minkowski', 'euclidean', 'manhattan']
weights = ['uniform', 'distance']
n_neighbors = [1,2,3,4,5,6,7,8,9,10]
param_grid_knc = dict(metric=metrics, weights=weights, n_neighbors=n_neighbors)
clf_knc = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid_knc, cv=cv)
clf_knc.fit(features, labels)
print "Tuned KNeighborsClassier estimator is: \n", clf_knc.best_estimator_
print clf_knc.best_score_
print ""
clf_knc = clf_knc.best_estimator_
tester.test_classifier(clf_knc, my_dataset, best_features_list)
## Pipeline

print "###Pipeline###"

pipeline = Pipeline([('normalization', scaler),
                     ('classifier', clf_knc)
])
clf = pipeline
features_list = best_features_list
tester.test_classifier(clf, my_dataset, features_list)



### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
print "Task 6: Dump your classifier"
features_list = best_features_list
dump_classifier_and_data(clf, my_dataset, features_list)

print "######end of code.########"

Task 1: Select what features you will use
Task 2: Remove outliers
	Detected the outlier of 'TOTAL'
[('TOTAL', 26704229), ('SKILLING JEFFREY K', 1111258)]
[('TOTAL', 26704229), ('SKILLING JEFFREY K', 1111258)]

Task 3: create new features

	 k_best.scores  [('exercised_stock_options', 24.815079733218194), ('total_stock_value', 24.182898678566879), ('bonus', 20.792252047181535), ('salary', 18.289684043404513), ('fraction_to_poi', 16.409712548035799), ('deferred_income', 11.458476579280369), ('long_term_incentive', 9.9221860131898225), ('restricted_stock', 9.2128106219771002), ('total_payments', 8.7727777300916792), ('shared_receipt_with_poi', 8.589420731682381), ('loan_advances', 7.1840556582887247), ('expenses', 6.0941733106389453), ('from_poi_to_this_person', 5.2434497133749582), ('other', 4.1874775069953749), ('fraction_from_poi', 3.1280917481567374), ('from_this_person_to_poi', 2.3826121082276739), ('director_fees', 2.1263278020077054), ('to_messages', 1.6463411294420076), ('deferral

In [59]:
# %load poi_id.py
#!/usr/bin/python
import warnings
warnings.filterwarnings('ignore')
import sys
import pickle
sys.path.append("tools/")
import matplotlib.pyplot as plt
from feature_format import featureFormat, targetFeatureSplit
import tester
from tester import dump_classifier_and_data
import pprint
import pandas as pd
from IPython.display import display, HTML
import numpy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

##################################################################################################################### 
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# Names of the financial features considered for POI identification.
financial_features = [
    'salary',
    'deferral_payments',
    'total_payments',
    'loan_advances',
    'bonus',
    'restricted_stock_deferred',
    'deferred_income',
    'total_stock_value',
    'expenses',
    'exercised_stock_options',
    'other',
    'long_term_incentive',
    'restricted_stock',
    'director_fees',
]

# Names of the email features considered for POI identification.
email_features = [
    'to_messages',
    'from_poi_to_this_person',
    'from_messages',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
]

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

##################################################################################################################### 
### Task 2: Remove outliers

# Finding employees with 18 or more NaNs in their feature set
for i in data_dict:
    count = 0
    for j in data_dict[i]:
        if data_dict[i][j] == 'NaN':
            count += 1
    if count > 17:
        print 'Name of individual: ',i
        print 'Number of NaN values in the corresponding feature set: ',count

# not the name of a person
data_dict.pop('TOTAL',0) 
# not the name of a person
data_dict.pop('THE TRAVEL AGENCY IN THE PARK',0) 
# all 20 features have NaN values
data_dict.pop('LOCKHART EUGENE E',0) 

# Convert data from dictionary to dataframe
df = pd.DataFrame.from_dict(data_dict, orient='index', dtype=None)

# Remove NaN
salary = df['salary'].replace(['NaN'], 0)
from_poi = df['from_poi_to_this_person'].replace(['NaN'], 0)
bonus = df['bonus'].replace(['NaN'], 0)

# Re shape the data
salary = numpy.reshape( numpy.array(salary), (len(salary), 1))
from_poi = numpy.reshape( numpy.array(from_poi), (len(from_poi), 1))
# bonus = numpy.reshape( numpy.array(bonus), (len(bonus), 1))

# Split the data into training and testing sets to generate a regression line
salary_train, salary_test, from_poi_train, from_poi_test = train_test_split(salary, 
                                                                            from_poi, 
                                                                            test_size=0.1, 
                                                                            random_state=42)

# Using a regression line to view outliers
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg = reg.fit(salary_train, from_poi_train)
reg_pred = reg.predict(salary_test)

# Check accuracy of regression
from sklearn.metrics import r2_score
r = r2_score(from_poi_test, reg_pred)
print 'R score of predicting emails from poi to person in question using the salary: ', r

## No real information gain from plot.
# Plot salary against from_poi
# import matplotlib.pyplot as plt
# %matplotlib inline  
# try:
#     plt.plot(ages, reg.predict(salary), color="blue")
# except NameError:
#     pass
# plt.scatter(salary, from_poi)
# plt.xlabel('Salary')
# plt.ylabel('From POI to this person')
# plt.show()

#####################################################################################################################
### Task 3: Create new feature(s)

### Store to my_dataset for easy export below.
my_dataset = data_dict
my_df = pd.DataFrame.from_dict(my_dataset, orient='index', dtype=None)
my_features_list = ['poi'] + financial_features + email_features

# Columns that are summed into the new 'net_worth' feature.
net_worth_columns = ['salary', 'total_payments', 'bonus', 'total_stock_value']

# Replace the 'NaN' placeholder with 0 so the columns can be added together.
# The cleaned column is assigned back to the dataframe explicitly.  Calling
# replace(..., inplace=True) returns None, so binding its result to a name
# (salary, bonus, ...) only overwrote earlier variables with None, and the
# dataframe update depended on an in-place edit of a column selection
# propagating back to my_df.
for column in net_worth_columns:
    my_df[column] = my_df[column].replace(['NaN'], 0)

# Create new column with total monetary assets -> ['net_worth'].
# NOTE(review): if 'total_payments' already includes salary and bonus, those
# amounts are counted twice here -- confirm this is intended.
my_df['net_worth'] = (my_df['salary']
                      + my_df['total_payments']
                      + my_df['bonus']
                      + my_df['total_stock_value'])
new_features_list = my_features_list + ['net_worth']

# Convert dataframe back to a dictionary keyed by the dataframe index.
my_dataset = my_df.to_dict(orient='index')


### Extract features(email and financial) and labels(poi or not) from dataset for local testing

# Takes a list of features ('features_list'), searches the data dictionary for those features, 
# and returns those features in the form of a data list.
data = featureFormat(my_dataset, new_features_list, sort_keys = True)
# Splits the data list, created by the previous statement, into poi(labels) and features
labels, features = targetFeatureSplit(data)

# Use feature selection to select k best features
kbest = SelectKBest(k = 10)
kbest.fit(features, labels)
scores = kbest.scores_

# Combine features with their scores
features_scores = zip(new_features_list[1:], scores)

# Top features
features_scores = dict(features_scores[:21])
sorted_features_scores = sorted(features_scores.items(), key=lambda x: x[1], reverse=True)
best_features = dict(sorted_features_scores[:4]).keys()
best_features = ['poi'] + best_features

# Scale the features                                                             
#MinMax Scaler
scaler = preprocessing.MinMaxScaler()
features = MinMaxScaler().fit_transform(features)

#print new_features_list
print 'POI + best features ', best_features


#####################################################################################################################
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


clf = KNeighborsClassifier()
accuracy, precision, recall = [], [], []
for i in range(1000):
    features = MinMaxScaler().fit_transform(features)
    features_train, features_test, labels_train, labels_test = train_test_split(features, 
                                                                                labels, 
                                                                                test_size=0.3) 
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    # Append scores
    accuracy.append(accuracy_score(labels_test, prediction))
    precision.append(precision_score(labels_test, prediction, average="weighted"))
    recall.append(recall_score(labels_test, prediction, average="weighted"))
print "Classifier details: ", clf
print "Accuracy: ", numpy.mean(accuracy)
print "Precision: ", numpy.mean(precision)
print "Recall: ", numpy.mean(recall)
print '\n'
 
    
#####################################################################################################################
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Tune K Nearest Neighbors
cv = StratifiedShuffleSplit(labels, 10, random_state = 42)
knn = GridSearchCV(KNeighborsClassifier(), 
                   param_grid = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                                 'metric': ['manhattan','minkowski', 'euclidean'], 
                                 'weights': ['distance', 'uniform']}, 
                   cv = cv,
                   scoring = 'f1')
knn.fit(features, labels)
print 'K Nearest Neighbors best estimator: ', knn.best_estimator_
print 'K Nearest Neighbors best parameters: ', knn.best_params_
print 'K Nearest Neighbors best score: ', knn.best_score_
# tester.test_classifier(knn.best_estimator_, my_dataset, best_features)

# Pipeline module to run PCA to find features with maximum variance
print "Pipelining..."
pipeline = Pipeline([('normalization', scaler),
                     ('classifier', knn.best_estimator_)
                    ])
tester.test_classifier(pipeline, my_dataset, best_features)

# Best classifier being tested in tester.py
clf =  pipeline
features_list = best_features

#####################################################################################################################
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)

Name of individual:  WHALEY DAVID A
Number of NaN values in the corresponding feature set:  18
Name of individual:  WROBEL BRUCE
Number of NaN values in the corresponding feature set:  18
Name of individual:  LOCKHART EUGENE E
Number of NaN values in the corresponding feature set:  20
Name of individual:  THE TRAVEL AGENCY IN THE PARK
Number of NaN values in the corresponding feature set:  18
Name of individual:  GRAMM WENDY L
Number of NaN values in the corresponding feature set:  18
R score of predicting emails from poi to person in question using the salary:  0.0805558457316
POI + best features  ['poi', 'bonus', 'exercised_stock_options', 'net_worth', 'total_stock_value']
Classifier details:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Accuracy:  0.859558139535
Precision:  0.123677777778
Recall:  0.037746031746


K Nearest Neighbors best estimator:  KNeighborsClass