#Intro to Machine Learning - Project log 
## Andrew Lavers - June 6, 2015

This python notebook documents the exploration of the data, algorithms and results.

In [426]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [428]:
import pickle
import sys
from feature_format import featureFormat, targetFeatureSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pprint import pprint

In [429]:
%%javascript
IPython.load_extensions('calico-spell-check')

<IPython.core.display.Javascript object>

##Question 1
>Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it.  As part     of your answer, give some background on the dataset and how it can be used to answer the project question.  Were       there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data           exploration”, “outlier investigation”]


### Goal
The analytical goal of this project is to investigate whether a “Person of Interest” in the Enron Fraud investigation, can be systematically identified from certain financial data points and/or from their email messages. The technical goal is to use machine-learning techniques for this investigation.

### Dataset background

The data set comprises person, financial, stock and email information and related email messages for persons that could be "persons of interest" in the Enron fraud investigation. The boolean variable `poi` with value `True` identifies persons that are poi's. 

Here is a sample row.


In [391]:
### read in data dictionary, convert to numpy array
data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
# print an example row
print "'METTS MARK',"
pprint(data_dict['METTS MARK'])
# the totals should be skipped
xx = data_dict.pop( 'TOTAL', 0 ) 

# build a pandas dataframe for numerics to ease the investigation. Dont drop any zeros
data = featureFormat(data_dict, features, remove_all_zeroes = False)
df = pd.DataFrame(data, columns=features)
df['name'] = data_dict.keys()
df = df[['name'] + features ]
                  
# build a second dataframe with all values
dfc = pd.DataFrame(data_dict).transpose()

'METTS MARK',
{'bonus': 600000,
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'mark.metts@enron.com',
 'exercised_stock_options': 'NaN',
 'expenses': 94299,
 'from_messages': 29,
 'from_poi_to_this_person': 38,
 'from_this_person_to_poi': 1,
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 1740,
 'poi': False,
 'restricted_stock': 585062,
 'restricted_stock_deferred': 'NaN',
 'salary': 365788,
 'shared_receipt_with_poi': 702,
 'to_messages': 807,
 'total_payments': 1061827,
 'total_stock_value': 585062}


In [392]:
money_features =['total_payments', 'salary', 'bonus', 
                 'deferral_payments', 'deferred_income', 'long_term_incentive',
                 'expenses', 'loan_advances', 
                 'other', 'director_fees', 
stock_features = ['restricted_stock', 'restricted_stock_deferred', 'exercised_stock_options', 'total_stock_value']
email_to_from_features =['to_messages', 'from_messages'] 
email_poi_features = ['shared_receipt_with_poi', 'from_this_person_to_poi', 'from_poi_to_this_person']
email_features = email_to_from_features + email_poi_features
id_features = ['name', 'email_address']
class_features = ['poi']

# put them together for a full list
features = class_features + email_features + money_features + stock_features
all_features = features

### Features 

There are different groups of features that are possibly closely related. For example, `bonus` is often directly related to `salary` and `total_payments` is the sum of other values. This lack of independence must be considered.

In [393]:
print '\n- Class labels'
pprint (class_features)
print '\n- Money features - Payments'
pprint (money_features)
print '\n- Stock features - Value of stock items'
pprint (stock_features)
print '\n- Email features'
pprint (email_features)
print '\n- Identification features'
pprint (id_features)







- Class labels
['poi']

- Money features - Payments
['total_payments',
 'salary',
 'bonus',
 'deferral_payments',
 'deferred_income',
 'long_term_incentive',
 'expenses',
 'loan_advances',
 'other',
 'director_fees']

- Stock features - Value of stock items
['restricted_stock',
 'restricted_stock_deferred',
 'exercised_stock_options',
 'total_stock_value']

- Email features
['to_messages',
 'from_messages',
 'shared_receipt_with_poi',
 'from_this_person_to_poi',
 'from_poi_to_this_person']

- Identification features
['name', 'email_address']


###Basic dataset facts



In [394]:
print "Number of persons: {}".format(len(df))
print "Number of persons of interest (POI): {:3}".format(int(sum(df['poi'])))
print "Number of persons with email addresses: {}".format(int(sum(dfc['email_address']!='NaN')))
print "Total email messages: {}".format(int(sum(df['to_messages']+df['from_messages'])))

Number of persons: 145
Number of persons of interest (POI):  18
Number of persons with email addresses: 111
Total email messages: 230708


###Basic dataset statistics

Reviewing the basic statistics below shows a very broad variation for most items. For example, the maximum bonus of $8million is ten times that of the 75th percentile. Outliers should be investigated. 

<style>
dataframe {
font-size:50%;
}
</style>

In [395]:
df[money_features].describe()

Unnamed: 0,total_payments,salary,bonus,deferral_payments,deferred_income,long_term_incentive,expenses,loan_advances,other,director_fees
count,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0
mean,2243477.0,184167.096552,671335.303448,220557.903448,-192347.524138,334633.986207,35131.372414,578793.103448,295210.02069,9911.489655
std,8817819.0,196959.768365,1230147.632511,751704.629341,604117.425636,685363.855952,45247.175705,6771011.748312,1127404.270001,31202.71294
min,0.0,0.0,0.0,-102500.0,-3504386.0,0.0,0.0,0.0,0.0,0.0
25%,91093.0,0.0,0.0,0.0,-36666.0,0.0,0.0,0.0,0.0,0.0
50%,916197.0,210500.0,300000.0,0.0,0.0,0.0,18834.0,0.0,947.0,0.0
75%,1934359.0,269076.0,800000.0,7961.0,0.0,374347.0,53122.0,0.0,150458.0,0.0
max,103559800.0,1111258.0,8000000.0,6426990.0,0.0,5145434.0,228763.0,81525000.0,10359729.0,137864.0


In [396]:
df[stock_features].describe()

Unnamed: 0,restricted_stock,restricted_stock_deferred,exercised_stock_options,total_stock_value
count,145.0,145.0,145.0,145.0
mean,862546.386207,72911.572414,2061486.103448,2889718.124138
std,2010852.212383,1297469.064327,4781941.261994,6172223.035654
min,-2604490.0,-1787380.0,0.0,-44093.0
25%,0.0,0.0,0.0,221141.0
50%,360528.0,0.0,607837.0,955873.0
75%,698920.0,0.0,1668260.0,2282768.0
max,14761694.0,15456290.0,34348384.0,49110078.0


In [397]:
dfc[email_features].describe()

Unnamed: 0,to_messages,from_messages,shared_receipt_with_poi,from_this_person_to_poi,from_poi_to_this_person
count,145.0,145.0,145.0,145.0,145.0
unique,87.0,65.0,84.0,42.0,58.0
top,,,,,
freq,59.0,59.0,59.0,59.0,59.0


### Missing Values

There are many missing values that can be seen from the below counts. However, for all variables, except `email_address`, replacing with zero seems perfectly consistent because having no directors fees, for example, is the same as being paid zero for directors fees.

In [398]:
print "Number of NaN values:\n"
for c in dfc.columns: print c, sum(dfc[c]=='NaN')

Number of NaN values:

bonus 64
deferral_payments 107
deferred_income 97
director_fees 129
email_address 34
exercised_stock_options 44
expenses 51
from_messages 59
from_poi_to_this_person 59
from_this_person_to_poi 59
loan_advances 142
long_term_incentive 80
other 53
poi 0
restricted_stock 36
restricted_stock_deferred 128
salary 51
shared_receipt_with_poi 59
to_messages 59
total_payments 21
total_stock_value 20


### Outliers

There are a number of variables above, with very large maximums that are much greater than the 75th percentile which suggests there are outliers. 

It's clear from the `total_payments`ranking below, that KENNETH LAY, the chairman is an outlier with total payments about 5 times that of the second highest, MARK FREVERT.


In [399]:
# print out top 5 for each feature to check for outliers
for ff in ['total_payments']:
    print df[['name', 'poi', ff]].sort_index(by=ff, ascending = False)[0:5]

                   name  poi  total_payments
65        LAY KENNETH L    1       103559793
128      FREVERT MARK A    0        17252530
117    BHATNAGAR SANJAY    0        15456290
43      LAVORATO JOHN J    0        10425757
95   SKILLING JEFFREY K    1         8682716


['loan_advances', 'restricted_stock_deferred'] occur for only a few persons, so eliminate them from the feature set.

In [400]:
# print out top 5 for each feature to check for outliers
for ff in ['loan_advances', 'restricted_stock_deferred']:
    print df[['name', 'poi', ff]].sort_index(by=ff, ascending = False)[0:5]

                        name  poi  loan_advances
65             LAY KENNETH L    1       81525000
128           FREVERT MARK A    0        2000000
100         PICKERING MARK R    0         400000
99               GRAY RODNEY    0              0
93   PEREIRA PAULO V. FERRAZ    0              0
                   name  poi  restricted_stock_deferred
117    BHATNAGAR SANJAY    0                   15456290
24        BELFER ROBERT    0                      44093
0            METTS MARK    0                          0
91            COX DAVID    0                          0
92   OVERDYKE JR JERE C    0                          0


In summary, the outliers are:

- KENNETH LAY, the chairman - outlier on most financial values.

- FREVERT MARK A - loan_advances 

- BHATNAGAR SANJAY - restricted_stock_deferred 

The `loan_advances` and `restricted_stock_deferred` only appear for a few people. These may be interesting 
but they won't be common POI indicators. We will drop them from our interesting features which require dropping the second and third names above.

In [401]:
data_dict.pop("LAY KENNETH L", 0)
data_dict.pop("FREVERT MARK A", 0)
data_dict.pop("BHATNAGAR SANJAY", 0)

try:
    money_features.remove("loan_advances")
except ValueError:
    pass

try:
    money_features.remove("restricted_stock_deferred")
except ValueError:
    pass

In [402]:
## rebuild data frames based on new feature list
data = featureFormat(data_dict, features, remove_all_zeroes = False)
df = pd.DataFrame(data, columns=features)
df['name'] = data_dict.keys()
df = df[['name'] + features ]

## Question 2 Feature Selection


### What features did you end up using in your POI identifier, and what selection process did you use to pick them?  

The features from the dataset used were :

In [403]:
top_features

['shared_receipt_with_poi',
 'deferred_income',
 'salary',
 'long_term_incentive',
 'expenses',
 'exercised_stock_options']

The feature selection was made by reviewing correlation matrices for feature groups - money features, stock features and email features. Here are the details

#### Correlation of financial features
Independence of variables is an important assumption in many of the machine learning algorithms, so use correlation to identify
- possible dependent variables
- variables with greatest correlation to the class identifier (poi)

In [404]:
df_money_corr1 = df[ ['poi'] + money_features].corr().apply(abs).sort(columns='poi', ascending=False)
df_money_corr1

Unnamed: 0,poi,total_payments,salary,bonus,deferral_payments,deferred_income,long_term_incentive,expenses,other,director_fees
poi,1.0,0.206612,0.324657,0.305041,0.0253,0.32523,0.197602,0.188498,0.053743,0.114397
deferred_income,0.32523,0.160506,0.210001,0.352805,0.36937,1.0,0.272131,0.002818,0.054666,0.065057
salary,0.324657,0.632595,1.0,0.59498,0.024279,0.210001,0.457967,0.334952,0.181254,0.320729
bonus,0.305041,0.802158,0.59498,1.0,0.17141,0.352805,0.384315,0.209364,0.033491,0.173695
total_payments,0.206612,1.0,0.632595,0.802158,0.313233,0.160506,0.687409,0.167229,0.438909,0.229963
long_term_incentive,0.197602,0.687409,0.457967,0.384315,0.016748,0.272131,1.0,0.039198,0.423743,0.151872
expenses,0.188498,0.167229,0.334952,0.209364,0.120961,0.002818,0.039198,1.0,0.007211,0.116632
director_fees,0.114397,0.229963,0.320729,0.173695,0.101317,0.065057,0.151872,0.116632,0.120698,1.0
other,0.053743,0.438909,0.181254,0.033491,0.00477,0.054666,0.423743,0.007211,1.0,0.120698
deferral_payments,0.0253,0.313233,0.024279,0.17141,1.0,0.36937,0.016748,0.120961,0.00477,0.101317


As may be expected, there is a lot of correlation amongst many values. For example bonus is often a function of salary and this indicated by the 0.65 correlation coefficient.  'total_payments' is a sum of the others so there will definitely be a non-meaningful correlation. So we will

- drop `bonus` and keep `salary`
- drop `total_payments` because it is a total of others

In [405]:
top_money_features = list(df_money_corr1.index[1:3]) + list(df_money_corr1.index[5:7])
print top_money_features
df_money_corr2 = df[ ['poi'] + top_money_features].corr().apply(abs).sort(columns='poi', ascending=False)
df_money_corr2

['deferred_income', 'salary', 'long_term_incentive', 'expenses']


Unnamed: 0,poi,deferred_income,salary,long_term_incentive,expenses
poi,1.0,0.32523,0.324657,0.197602,0.188498
deferred_income,0.32523,1.0,0.210001,0.272131,0.002818
salary,0.324657,0.210001,1.0,0.457967,0.334952
long_term_incentive,0.197602,0.272131,0.457967,1.0,0.039198
expenses,0.188498,0.002818,0.334952,0.039198,1.0


#### Correlation of stock features

In [406]:
df_stock_corr1 = df[['poi'] + stock_features].corr().apply(abs).sort(columns='poi', ascending=False)
df_stock_corr1

Unnamed: 0,poi,restricted_stock,restricted_stock_deferred,exercised_stock_options,total_stock_value
poi,1.0,0.157551,0.074473,0.339236,0.336074
exercised_stock_options,0.339236,0.377307,0.122813,1.0,0.948783
total_stock_value,0.336074,0.648689,0.07797,0.948783,1.0
restricted_stock,0.157551,1.0,0.036967,0.377307,0.648689
restricted_stock_deferred,0.074473,0.036967,1.0,0.122813,0.07797


The two top correlations with `poi`, `exercised_stock_options` and `total_stock_value`  are themselves strongly correlated so we will choose only one stock feature.

In [407]:
top_stock_features = ['exercised_stock_options']
df_stock_corr2 = df[['poi'] + top_stock_features].corr().apply(abs).sort(columns='poi', ascending=False)
df_stock_corr2

Unnamed: 0,poi,exercised_stock_options
poi,1.0,0.339236
exercised_stock_options,0.339236,1.0


#### Correlation of email features

In [408]:
df_email_corr1 = df[['poi'] + email_features].corr().apply(abs).sort(columns='poi', ascending=False)
df_email_corr1

Unnamed: 0,poi,to_messages,from_messages,shared_receipt_with_poi,from_this_person_to_poi,from_poi_to_this_person
poi,1.0,0.089488,0.031135,0.228966,0.134489,0.186784
shared_receipt_with_poi,0.228966,0.883171,0.315983,1.0,0.543413,0.720117
from_poi_to_this_person,0.186784,0.615348,0.26878,0.720117,0.519917,1.0
from_this_person_to_poi,0.134489,0.613922,0.609422,0.543413,1.0,0.519917
to_messages,0.089488,1.0,0.517079,0.883171,0.613922,0.615348
from_messages,0.031135,0.517079,1.0,0.315983,0.609422,0.26878


The strongest is `shared_receipt_with_poi` at 0.22

The higher correlation between `to_messages` and `from_this_person_to_poi` of 0.61 is intuitive and not that useful. If a person sent many emails then many went to the poi. The 'correlation between `from_poi_to_this_person` and 
`shared_receipt_with_poi` forces use of only the latter.

Selecting the most meaningful email features

In [409]:
top_email_features = list(df_email_corr1.index[1:2])
print top_email_features
df_email_corr2 = df[['poi'] + top_email_features].corr().apply(abs).sort(columns='poi', ascending=False)
df_email_corr2

['shared_receipt_with_poi']


Unnamed: 0,poi,shared_receipt_with_poi
poi,1.0,0.228966
shared_receipt_with_poi,0.228966,1.0


#### Final Correlation of selected features

A quick sanity check of the selected features 

In [410]:
top_features = top_email_features + top_money_features + top_stock_features
df[class_features + top_features].corr().sort(columns='poi', ascending=False)

Unnamed: 0,poi,shared_receipt_with_poi,deferred_income,salary,long_term_incentive,expenses,exercised_stock_options
poi,1.0,0.228966,-0.32523,0.324657,0.197602,0.188498,0.339236
exercised_stock_options,0.339236,0.033446,-0.268103,0.267087,0.207501,0.054372,1.0
salary,0.324657,0.480924,-0.210001,1.0,0.457967,0.334952,0.267087
shared_receipt_with_poi,0.228966,1.0,-0.17943,0.480924,0.226764,0.266557,0.033446
long_term_incentive,0.197602,0.226764,-0.272131,0.457967,1.0,0.039198,0.207501
expenses,0.188498,0.266557,0.002818,0.334952,0.039198,1.0,0.054372
deferred_income,-0.32523,-0.17943,1.0,-0.210001,-0.272131,0.002818,-0.268103


### Did you have to do any scaling?  

Scaling was attempted but this yielded sightly poorer results. Code for scaling is in `poi_id.py` and the detailed results will be presented below. 

### Did you create new features?

A new feature, based on email text analysis was created. The code is in `filter_email2.py`

The new feature, named email_poi, is built as follows:

- Email sent by all persons is munged into a single dictionary. Trials showed that using the sent emails only produced better predictions than all the email
- Each message is split to remove forwarded/replied text using "-----" as delimiter. This makes the text more personal to the sender.
- Email text is pipelined through a CountVectorizer, TfidTransformer, and SGDClassifier. Trials showed the SGD Classifier producing better results than Naive Bayes.
- GridSearchCV is used to tune the classifier and find an optimal result

This proved to be a reasonably effective classifier producing F1 scores of 0.74 on the full dataset.

A new feature is derived from the predicted label from this text classifier.The new feature is:

poi_email:  -1: no email, 0: no poi signal, 1: poi signal

The revised dataset is `final_project_dataset_augmented_email_poi.pkl`


## Question 3 Algorithm

###What algorithm did you end up using? 

For the poi prediction K-Means produced the most accurate results based on F1 scores. 

For initial, untuned algorithm selection, the following results were obtained.

Algorithm | F1 score
------------ | -------------
DecisionTreeClassifier | 0.33351
Guassian NB | 0.37913
K-Means | 0.91141

SVC, K-neighbors and Adaboost were lower but the training datasets produced limited positives and f1 score calculation failed. 

These results were sensitive to the particular training sample. Choosing different `random_state` when splitting the test/train sets produced substantial variation, most likely due to sampling 25% from what is a very small dataset of only 145 observations with 18 poi'sa

For the DecisionTreeClassifier the Feature Importances were:

feature  |importance
------- | ----------
4                 expenses    |0.395600
5  exercised_stock_options    |0.287234
1          deferred_income    |0.109534
0  shared_receipt_with_poi    |0.098594
2                   salary    |0.055435
3      long_term_incentive    |0.053603

###What algorithms did you try?
 
I explored many algorithms and feature combinations -- See Appendix A for details.



##Question 4  Tuning

### What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?

Tuning involves choosing values for parameters that influence the operation of the algorithm, while measuring the effectiveness of prediction. During tuning one is also interested in understanding the sensitivity of the parameter changes, and which ones produce the greatest influence. bad parameter choices can lead to sub-optimal results in some cases, the algorithm failing to predict anything at all. 

### How did you tune the parameters of your particular algorithm?  

I did some manual experimentation, but ultimately used a grid search to explore many parameter settings. The grid search tries every parameter value combination and identifies the set that produces the best score. I used the F1 score in scikit learn during grid search. Gridsearch was used on the poi identifier and on the new feature text classifier pipeline.

## Question 5 Validation

###What is validation, and what’s a classic mistake you can make if you do it wrong?

Validation is the process of testing the model for effective prediction on other data that has not been used to train the model. A classic mistake is fitting and testing the model on the same or full dataset. The essence of a predictive model is to predict good results when presented with data that it has never seen before. Effective validation must follow this principle. 


###How did you validate your analysis?

Training samples were 25% of the full dataset, and the remaining 75% was used for validation. The `scikit.metrics.f1_score` scorer was used as well as the `tester.py` that was provided for the project. Because the dataset is small there was substantial variation on specific samples.  Some samples yielded zero poi's which make it impossible to build a model. 

##Question 6 

###Give at least 2 evaluation metrics, and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance


The optimal identifer achieved the following scores
Accuracy: 0.87800       Precision: 0.87427      Recall: 0.92337 F1: 0.89815     F2: 0.91311
Total predictions: 14000        True positives: 7531    False positives: 1083   False negatives:  625   True negatives: 4761

The precision score of 0.87 indicates that 87% of the actual pois were identified. Only 13% were missed. If we assume for sake of discussion that all poi's were criminals then we can think of this set as the pois that "got away". The recall score of 0.92 indicates that 92% of identified pois were really poi's and that 8% were "falsely accused" by the model.



# APPENDIX A - Algorithm trials

In [427]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

### Define a recorder function to format and log results

Results are written to 
- `results.txt'
- `results.csv`

In [414]:
def recorder(classifier, data_dict, features_used, \
             setup, options="",  random_state=None, notes=None):
    '''
    Returns a formatted message which is also written to results.txt. Unformatted data
    is written to results.csv
    
    BUT only if the F1 score exceeds a set constant
    '''
    import datetime
    import csv
    import os.path

    MIN_F1 = 0.3
    
    # get scores
    accuracy, precision, recall, f1, f2 = test_classifier(classifier, data_dict, features_used)
    scores = [accuracy, precision, recall, f1, f2]
    
    if f1 < MIN_F1:
      return None
    
    # build the message           
    msg = \
    "\n\n------ {0}".format(datetime.datetime.today()) + \
    "\nScores --  Accuracy: {:>0.5f}  Precision: {:>0.5f}  Recall: {:>0.5f}  F1: {:>0.5f} F2:{:>0.5f}".format(accuracy, precision, recall, f1, f2) + \
    "\nSetup: {0}  Random: {3}  Classifier {1}  Options set: {2}".format(setup, classifier, options, random_state) + \
    "\n\nFeatures: {0}".format(features_used) + \
    "\n" + \
    "Notes: {0}".format(notes) + \
    "\n---\n"

    # write a  text log ...
    with open("results.txt","a") as f:
        f.write(msg)
    
    # write a csv 
    csv_name = "results.csv"
    exists = os.path.isfile(csv_name)

    with open(csv_name, "ab") as fc:
        fcw = csv.writer(fc)
        # write a header
        if not exists:
            fcw.writerow(["datetime", "accuracy", "precision", "recall", "f1", "f2", \
                      "random_state", "setup", "classifier", "options", "feature_count","features_used"])        
        fcw.writerow([str(datetime.datetime.today()), accuracy, precision, recall, f1, f2, \
                      random_state, setup, classifier, options, len(features_used), features_used])
              
    return msg

## Important global settings.

The random_state can be set globally here as well as the test size. This facilitate testing multiple
sets. The samples are quite small so there is some variation across samples.

In [415]:
tst_state = 42
tst_size = 0.25

###Set up test/train data

In [416]:
features_list = class_features + top_features
data = featureFormat(data_dict, features_list)
poi, feature_data = targetFeatureSplit( data )
features_train, features_test, labels_train, labels_test = \
    train_test_split(feature_data, poi, test_size=tst_size, random_state=tst_state)

## Naive Bayes



In [417]:
# get the data and features
feature_list =  ["poi"] + money_features + email_features[2:5]
####[3:4] + email_features[2:5]

feature_list = class_features + top_features


data = featureFormat(data_dict, feature_list)
poi, feature_data = targetFeatureSplit( data )
features_train, features_test, labels_train, labels_test = \
    train_test_split(feature_data, poi, test_size=tst_size, random_state=tst_state)
    
# instantiate the classifier
from sklearn.naive_bayes import GaussianNB
opts = {}
clf = GaussianNB(**opts)
clf.fit(features_train, labels_train)

#predict
pred_labels_test = clf.predict(features_test)
my_notes = ""
print recorder(clf, data_dict, features_list,
    "GuassianNB", options = opts,  random_state=tst_state, notes = my_notes)



------ 2015-06-06 13:04:38.440779
Scores --  Accuracy: 0.84279  Precision: 0.43495  Recall: 0.33600  F1: 0.37913 F2:0.35202
Setup: GuassianNB  Random: 42  Classifier GaussianNB()  Options set: {}

Features: ['poi', 'shared_receipt_with_poi', 'deferred_income', 'salary', 'long_term_incentive', 'expenses', 'exercised_stock_options']
Notes: 
---



###Decision Tree Classifier 

In [None]:
# instantiate the classifier
from sklearn import tree
opts = {'random_state' : 4222}
clf = tree.DecisionTreeClassifier(**opts)

# fit the classifier on the training features and labels
clf.fit(features_train, labels_train)

# use the feature list to name the importances in the dataframe
imps = pd.DataFrame({'feature' : features_list[1:],  
                     'importance' : clf.feature_importances_}).sort(columns='importance', ascending=False)

# record results
my_notes = "\nFeature importances:\n{0}".format(imps)
print recorder(clf, data_dict, features_list,
    "Decision Tree", options = opts,  random_state=tst_state, notes = my_notes)



------ 2015-06-06 13:04:40.408770
Scores --  Accuracy: 0.81700  Precision: 0.34761  Recall: 0.32050  F1: 0.33351 F2:0.32558
Setup: Decision Tree  Random: 42  Classifier DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=4222, splitter='best')  Options set: {'random_state': 4222}

Features: ['poi', 'shared_receipt_with_poi', 'deferred_income', 'salary', 'long_term_incentive', 'expenses', 'exercised_stock_options']
Notes: 
Feature importances:
                   feature  importance
4                 expenses    0.395600
5  exercised_stock_options    0.287234
1          deferred_income    0.109534
0  shared_receipt_with_poi    0.098594
2                   salary    0.055435
3      long_term_incentive    0.053603
---



### K-means

In [None]:
# instantiate the classifier
from sklearn import cluster
opts = {}
clf = cluster.KMeans(**opts)

# fit the classifier on the training features and labels
clf.fit(features_train, labels_train)

from sklearn.metrics import f1_score  
labels_test_pred = clf.predict(features_test)
print "F1 score on test: {}".format(f1_score(labels_test, labels_test_pred, average=None))

my_notes = ""
print recorder(clf, data_dict, features_list,
    "K-means", options = opts,  random_state=tst_state, notes = my_notes)

### Support Vector Classifier

In [None]:
# instantiate the classifier
from sklearn import svm
opts = {'random_state' : 42}
clf = svm.SVC(**opts)

# fit the classifier on the training features and labels
clf.fit(features_train, labels_train)

from sklearn.metrics import f1_score  
labels_test_pred = clf.predict(features_test)
print "F1 score on test: {}".format(f1_score(labels_test, labels_test_pred, average=None))

my_notes = ""
print recorder(clf, data_dict, features_list,
    "SVC", options = opts,  random_state=tst_state, notes = my_notes)

### K-neighbors

In [None]:
# instantiate the classifier
from sklearn import neighbors

opts = {}
clf = neighbors.KNeighborsClassifier(**opts)

# fit the classifier on the training features and labels
clf.fit(features_train, labels_train)

from sklearn.metrics import f1_score  
labels_test_pred = clf.predict(features_test)
print "F1 score on test: {}".format(f1_score(labels_test, labels_test_pred))

print recorder(clf, data_dict, features_list,
    "K-neighbors", options = opts,  random_state=tst_state, notes = my_notes)

### Adaboost

In [None]:
# instantiate the classifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
opts = {}

# fit the classifier on the training features and labels
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
     algorithm="SAMME",
     n_estimators=10)
clf.fit(features_train, labels_train)

from sklearn.metrics import f1_score  
labels_test_pred = clf.predict(features_test)
print "F1 score on test: {}".format(f1_score(labels_test, labels_test_pred))

imps = pd.DataFrame({'feature' : features_list[1:],  'importance' : clf.feature_importances_})
my_notes = "\nFeature importances:\n{0}".format(imps)
print recorder(clf, data_dict, features_list,
    "Adaboost/Decision Tree", options = opts,  random_state=tst_state, notes = my_notes)

### tester

In [None]:
#!/usr/bin/pickle

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000, print_results=False, 
                    print_false_positives=False, print_false_negatives=False ):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        pre_i = 0
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
                if print_false_negatives : print "False negative: ", data_dict[pre_i]
            elif prediction == 1 and truth == 0:
                false_positives += 1
                if print_false_negatives : print "False positive: ", data_dict[pre_i]
            else:
                true_positives += 1
            pre_i += 1

    try:
        accuracy = 0; precision=0; recall=0; f1=0; f2=0      
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        if print_results:
            print clf
            print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
            print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
            print ""
    except:
        print "Got a divide by zero when trying out:", clf
    return accuracy, precision, recall, f1, f2

CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    pickle.dump(clf, open(CLF_PICKLE_FILENAME, "w") )
    pickle.dump(dataset, open(DATASET_PICKLE_FILENAME, "w") )
    pickle.dump(feature_list, open(FEATURE_LIST_FILENAME, "w") )

def load_classifier_and_data():
    clf = pickle.load(open(CLF_PICKLE_FILENAME, "r") )
    dataset = pickle.load(open(DATASET_PICKLE_FILENAME, "r") )
    feature_list = pickle.load(open(FEATURE_LIST_FILENAME, "r"))
    return clf, dataset, feature_list
