In [80]:
#!/usr/bin/python
%matplotlib inline

import sys
import pickle
sys.path.append("../tools/")

import pandas as pd
from IPython.display import display

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Storing data in a dataframe to simplify data manipulation
enron_df = pd.DataFrame.from_dict(data_dict, orient='index')
display(enron_df.head(2))
enron_df.info()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,...,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955.0,2902.0,2869717,4484442,1729541,4175000.0,126027.0,1407.0,-126027.0,1729541,...,,2195.0,152.0,65.0,False,,-3081055.0,304805.0,phillip.allen@enron.com,47.0
BADUM JAMES P,,,178980,182466,257817,,,,,257817,...,,,,,False,,,,,


<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 21 columns):
salary                       146 non-null object
to_messages                  146 non-null object
deferral_payments            146 non-null object
total_payments               146 non-null object
exercised_stock_options      146 non-null object
bonus                        146 non-null object
restricted_stock             146 non-null object
shared_receipt_with_poi      146 non-null object
restricted_stock_deferred    146 non-null object
total_stock_value            146 non-null object
expenses                     146 non-null object
loan_advances                146 non-null object
from_messages                146 non-null object
other                        146 non-null object
from_this_person_to_poi      146 non-null object
poi                          146 non-null bool
director_fees                146 non-null object
deferred_income              146 non-null object


In [81]:
### Splitting data into Train x Test
from sklearn import cross_validation

### Replacing NaN's
enron_df.replace(to_replace={'NaN': 0}, inplace=True)

features = enron_df.drop(['poi', 'email_address'], axis=1)
labels = enron_df['poi']

features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, labels, test_size=0.1, random_state=42)

In [82]:
### Columns with negative values
(enron_df.T < 0).any(1)

salary                       False
to_messages                  False
deferral_payments             True
total_payments               False
exercised_stock_options      False
bonus                        False
restricted_stock              True
shared_receipt_with_poi      False
restricted_stock_deferred     True
total_stock_value             True
expenses                     False
loan_advances                False
from_messages                False
other                        False
from_this_person_to_poi      False
poi                          False
director_fees                False
deferred_income               True
long_term_incentive          False
email_address                False
from_poi_to_this_person      False
dtype: bool

In [83]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(features_train)

features_train = scaler.transform(features_train)
features_test = scaler.transform(features_test)

In [102]:
### Task 1: Select what features you'll use.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### Creates and fits selector
selector = SelectKBest(chi2, k=10)
selector.fit(features_train, labels_train)
### Get idxs of columns to keep
selected_idx = selector.get_support(indices=True)
### Applies selection over features
features_train_selected = selector.transform(features_train)
features_test_selected = selector.transform(features_test)

enron_df[selected_idx].columns.tolist()

['to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'shared_receipt_with_poi',
 'loan_advances',
 'from_messages',
 'from_this_person_to_poi',
 'poi',
 'long_term_incentive']

---
### Looking for *Outliers*

---

(15L, 10L)

In [79]:
### Task 2: Remove outliers
enron_df.drop('TOTAL', inplace=True)

In [None]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [None]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)