# Finding Enron persons of interest from email and financial data 

In [None]:
# standard library imports
import pickle
import sys
sys.path.append("../tools/")

# package imports
from ggplot import *
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import scale, StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# project imports
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [None]:
# Load the pickled data and transform it into a DataFrame.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only point this at a dataset file that comes from a trusted source.
# The file is opened in binary mode ("rb"): that is the mode pickle expects,
# and text mode makes pickle.load fail on Python 3.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# As soon as you print the data frame you see the 'TOTAL'
# row is present from the spreadsheet and needs to be removed.
# The default of 0 keeps pop() from raising if the key is already gone.
data_dict.pop('TOTAL', 0)

# One row per dictionary key, one column per feature.
df = pd.DataFrame.from_dict(data_dict, orient='index')

## Understanding the Dataset and Question

In [None]:
# Summarise the size of the dataset and how many rows are flagged as poi.
n_observations, n_features = df.shape
poi_count = sum(df['poi'])
# Scale to a percentage *before* rounding so the printed value does not pick
# up a floating-point tail (e.g. 12.330000000000002).
poi_percent = round(100.0 * poi_count / n_observations, 2)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("In our dataset we have")
print('Observations(Enron employees): %s' % n_observations)
print('Features: %s' % n_features)
print("With %s of them (%s%%) being persons of interest" % (poi_count, poi_percent))

When we preview our dataset, it looks like there will be a lot of data missing throughout it:

In [None]:
# Preview the first rows of the frame to get a feel for the raw values.
df.head()

 Let's get a handle on how many missing observations there are for each column:

In [None]:
def percent_missing(df, cutoff_percent = 0.):
    """Print the share of 'NaN' placeholder values in each column.

    Missing values in this dataset are stored as the string 'NaN', so a
    column's missing share is the fraction of its rows equal to that string.
    One line "<column> <fraction rounded to 2 places>" is printed for every
    column whose share is at least ``cutoff_percent``.

    Parameters:
        df: DataFrame to inspect.
        cutoff_percent: minimum missing fraction (0-1) a column needs in
            order to be printed; the default of 0 prints every column.

    Returns:
        None. Results are printed.
    """
    n_rows = float(df.shape[0])
    if not n_rows:
        # No rows means there is nothing to measure (and nothing to divide by).
        return
    for col in df.columns:
        try:
            missing_percent = sum(df[col] == 'NaN') / n_rows
        except TypeError:
            # Per the original author's note this is just the boolean poi
            # column. Presumably the string comparison does not yield an
            # iterable result for it on older pandas/numpy -- TODO confirm.
            continue
        if missing_percent >= cutoff_percent:
            # Formatting into one string keeps the output identical on
            # Python 2 and Python 3.
            print("%s %s" % (col, round(missing_percent, 2)))

# With the default cutoff of 0, every column that can be measured is listed.
percent_missing(df)

So it looks like we are missing some values in every field we have (besides poi, which is not listed here), but we expected as much with messy data. Let's focus on the most worrisome ones.

In [None]:
# Only list the columns where at least half of the rows are 'NaN'.
percent_missing(df, .5)

With these values missing in over half of the observations, we will want to be especially cautious about how we use them. However, since we are hunting for very specific people who make up only 12% of the observations, it is possible some of them could be very useful.

With loan_advances appearing in only 2% of the observations, I'll be removing it.

In [None]:
# Remove the loan_advances column (axis=1 selects columns, not rows).
df = df.drop('loan_advances',axis=1)

In [None]:
# Build a second frame in which every 'NaN' placeholder is replaced by 0.
nan_as_zero = df.replace('NaN', 0)
# email_address holds text, so its filled-in zeros are relabelled 'None'.
df_filled = nan_as_zero.assign(
    email_address=nan_as_zero['email_address'].replace(0, 'None'))

Outliers are going to be tricky with this data set. I wanted to find the most egregious outliers, but with so much missing data filled in as 0s and the insanely high financial benefits some of these people were receiving, I had to go all the way up to features that have a max value seven standard deviations above the mean to narrow the focus to half of the features.

It looks like most of the features with big outliers are compensation items other than normal salary and bonuses. This kind of makes sense, though, with there being both heavily compensated execs and normal employees, so we don't know that any of these are actually problematic and will have to look into them more individually.

In [None]:
# Show floats in plain fixed-point form; scientific notation made the
# summary table harder to read.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

df_description = df_filled.describe()

# A feature counts as having a large outlier when its maximum lies more than
# seven standard deviations above its mean.
outlier_cutoff = df_description.loc['mean'] + df_description.loc['std'] * 7
largest_outliers = [feature for feature in df_description
                    if df_description.loc['max', feature] > outlier_cutoff[feature]]

df_description[largest_outliers]

In [None]:
# Draw one box plot per summarised feature, split by the poi flag.
for col in df_description:
    # The original cell printed the plot object to draw it inside the loop;
    # single-argument print(...) does the same on Python 2 and Python 3.
    print(ggplot(aes('poi', col), data=df_filled) + geom_boxplot())

I looked through the box plots of the features with the largest outliers and made a list of those where the outlier seems especially far away, which might be problematic. It turns out, though, that most of them belong to Kenneth Lay, who we know made out like a bandit from the fraud, so for now I will be leaving all the outliers alone.

In [None]:
# Features whose box plots showed an especially distant outlier.
aggressive_outliers = ['total_payments', 'restricted_stock',
                       'shared_receipt_with_poi', 'restricted_stock_deferred',
                       'other']

# For each of those features, record who holds the maximum value and whether
# that person is flagged as a poi.
outlier_list = []
for outlier in aggressive_outliers:
    outlier_row = df_filled[df_filled[outlier] == df_filled[outlier].max()]
    # The frame is indexed by labels, so take the first matching row
    # explicitly by position (.index[0] / .iloc[0]) instead of relying on
    # `series[0]` falling back to positional lookup.
    outlier_list.append({'feature': outlier,
                         'person': outlier_row.index[0],
                         'poi': outlier_row['poi'].iloc[0]})

pd.DataFrame(outlier_list)

## Optimize Feature Selection/Engineering

People who receive a director fee seem to be different from regular company employees: they don't draw a salary or bonuses and instead are compensated in stock and director fees. I would like to see if using director as a boolean is more informative than the specific values.

In [None]:
# Flag everyone whose director_fees entry is an actual value rather than the
# 'NaN' placeholder, then display exactly those rows.
has_director_fees = df['director_fees'] != 'NaN'
df['director'] = has_director_fees
df[has_director_fees]

I am also making features of what percent of emails were to/from pois instead of the straight counts that we have in the data set.

In [None]:
def email_percents(poi_emails, all_emails, frame=None):
    """Return, row by row, the share of e-mails that involved a poi.

    Each entry is ``frame[poi_emails] / frame[all_emails]`` for one row,
    expressed as a fraction and rounded to 4 decimal places. Rows where the
    ratio cannot be computed (either count is the 'NaN' placeholder or
    otherwise non-numeric, or the total is zero) get the string 'NaN', the
    same missing-value marker used in the rest of the data.

    Parameters:
        poi_emails: name of the column holding the poi-related e-mail count.
        all_emails: name of the column holding the total e-mail count.
        frame: DataFrame to read both columns from; defaults to the
            module-level ``df`` so existing two-argument calls keep working.

    Returns:
        A list with one entry (float or 'NaN') per row of ``frame``.
    """
    if frame is None:
        frame = df
    percent_emails_to_poi = []
    for poi_count, total_count in zip(frame[poi_emails], frame[all_emails]):
        try:
            ratio = float(poi_count) / float(total_count)
        except (TypeError, ValueError, ZeroDivisionError):
            percent_emails_to_poi.append('NaN')
            continue
        # float('NaN') parses successfully, so a placeholder can still reach
        # this point as a float nan; nan is the only value unequal to itself.
        if ratio != ratio:
            percent_emails_to_poi.append('NaN')
        else:
            percent_emails_to_poi.append(round(ratio, 4))
    return percent_emails_to_poi

# Going by the column names: the share of each person's sent messages that
# went to a poi ...
df['percent_from_emails_to_poi'] = email_percents('from_this_person_to_poi', 'from_messages')
# ... and the share of each person's received messages that came from a poi.
df['percent_emails_from_poi'] = email_percents('from_poi_to_this_person', 'to_messages')

From the plots below, it looks like the percents will be a more helpful parameter since it at least gives us some decent sized areas with only non pois.

In [None]:
# Scatter plot of the raw poi e-mail counts, coloured by the poi flag.
count_axes = aes(x = 'from_this_person_to_poi', y = 'from_poi_to_this_person',
                 color = 'poi')
(ggplot(count_axes, data = df)
 + geom_point()
 + labs(title = "Count of emails to/from poi"))

In [None]:
# Scatter plot of the engineered poi e-mail shares, coloured by the poi flag.
percent_axes = aes(x = 'percent_from_emails_to_poi', y = 'percent_emails_from_poi',
                   color = 'poi')
(ggplot(percent_axes, data = df)
 + geom_point()
 + labs(title = "Percent of emails to/from poi"))

I was initially concerned about the person who sent 100% of their emails to a poi, but Gene Humphrey seems to be accurate, so we won't toss it as an outlier.

In [None]:
# Drop one of the features from all sets with over .75 correlation.
# The list is named once so the train and test frames cannot drift apart.
correlated_features = ['director_fees', 'to_messages',
                       'total_payments', 'total_stock_value']
features_less_corr_train = features_train.drop(correlated_features, axis=1)
features_less_corr_test = features_test.drop(correlated_features, axis=1)

# Train a decision tree on the less correlated frame and keep its six most
# important features.
dt_lc_clf = DecisionTreeClassifier(random_state=1809)
dt_lc_clf.fit(features_less_corr_train, labels_train)
importance_lc_df = pd.DataFrame({'features': features_less_corr_train.columns,
                                 'importance': dt_lc_clf.feature_importances_})
dt_lc_important_features = importance_lc_df.sort_values(
    'importance', ascending=False)['features'][0:6]

# Scale the less correlated features; the scaler is fitted on the training
# split only and then applied to both splits.
scaler = StandardScaler().fit(features_less_corr_train)
scaled_less_corr_features_train = scaler.transform(features_less_corr_train)
scaled_less_corr_features_test = scaler.transform(features_less_corr_test)

# Do logistic regression RFE on the less correlated features.
# n_features_to_select is passed by keyword: newer scikit-learn releases no
# longer accept it positionally.
lc_model = LogisticRegression()
lc_rfe = RFE(lc_model, n_features_to_select=5)
lc_rfe.fit(scaled_less_corr_features_train, labels_train)

# Set the features selected based on the less correlated features.
lc_rfe_selected = features_less_corr_train.columns[lc_rfe.support_]

importance_lc_df['lc_rfe_selected'] = lc_rfe.support_

Since we removed two of the features previously selected by the decision tree, there are obviously a couple of new ones in the most important list. Interestingly, though, the logistic regression RFE selects the same features, and I would have expected it to be the most affected by the correlation change.

In [None]:
# Check if RFE is the same in both feature sets. The comparison is
# element-wise, so one True/False is shown per selected feature.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(lc_rfe_selected == rfe_selected)

importance_lc_df.sort_values('importance', ascending=False)

In [None]:
# Turn the scaled arrays back into DataFrames so they can be indexed by
# column name like the unscaled frames during feature selection below.
def _with_columns_of(values, template):
    '''Wrap values in a DataFrame that reuses the column labels of template.'''
    return pd.DataFrame(values, columns=template.columns)

scaled_less_corr_features_train = _with_columns_of(
    scaled_less_corr_features_train, features_less_corr_train)
scaled_less_corr_features_test = _with_columns_of(
    scaled_less_corr_features_test, features_less_corr_test)
scaled_features_train = _with_columns_of(scaled_features_train, features_train)
scaled_features_test = _with_columns_of(scaled_features_test, features_test)

### Pick and Tune an Algorithm

We have some parameters to work with now, so let's see how they perform with different algorithms!

In [None]:
# Running log of every algorithm / dataset combination that has been scored.
algos_tried = []

def add_results_to_dict(algo_name, data_used, pred, less_corr, scaled):
    '''Score pred against labels_test and append one result row to algos_tried.

    The row records which algorithm and dataset were used, the accuracy,
    precision and recall of the predictions, and the less_corr / scaled
    flags describing the data.
    '''
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    algos_tried.append({'algo': algo_name,
                        'data_used': data_used,
                        'accuracy': accuracy,
                        'precision': precision,
                        'recall': recall,
                        'less_corr': less_corr,
                        'scaled': scaled})

def test_all_datasets(classifier, algo, scaled = False):
    '''Try an algorithm with all of the datasets that have been prepared
    and the results to algos_tried'''
    if scaled:
        train_features = scaled_features_train
        test_features = scaled_features_test
        train_less_corr = scaled_less_corr_features_train
        test_less_corr = scaled_less_corr_features_test
    else:
        train_features = features_train
        test_features = features_test
        train_less_corr = features_less_corr_train
        test_less_corr = features_less_corr_test
    classifier(train_features, features_test, algo, 'All', False, scaled) 
    classifier(train_less_corr, test_less_corr,
               algo, 'All Less Corr', True, scaled) 
    classifier(train_features[dt_important_features], 
                 features_test[dt_important_features],
                 algo, 'DT Important', False, scaled) 
    classifier(train_less_corr[dt_lc_important_features], 
                 test_less_corr[dt_lc_important_features],
                 algo, 'DT Important', True, scaled)
    classifier(train_features[rfe_selected], 
                 features_test[rfe_selected],
                 algo, 'RFE Selected', False, scaled)
# this is pointless since they are the same values in both datasets
#     classifier(train_less_corr[lc_rfe_selected], 
#                  test_less_corr[lc_rfe_selected],
#                  algo, 'RFE Selected', True, scaled)
    classifier(features_train_pca, features_test_pca,
                 algo, 'PCA features', False, True) 

In [None]:
def nb_classify(train_features, test_features, algo_name, data_used, less_corr, scaled):
    '''Fit Gaussian Naive Bayes on the training features and record how its
    predictions for the test features score.'''
    model = GaussianNB()
    model.fit(train_features, labels_train)
    predictions = model.predict(test_features)
    add_results_to_dict(algo_name, data_used, predictions, less_corr, scaled)

# Run on the unscaled versions of every prepared dataset.
test_all_datasets(nb_classify, 'Naive Bayes', scaled=False)

In [None]:
def svm_classify(train_features, test_features, algo_name, data_used, less_corr, scaled):
    '''Fit a default Support Vector classifier on the training features and
    record how its predictions for the test features score.'''
    model = SVC()
    model.fit(train_features, labels_train)
    predictions = model.predict(test_features)
    add_results_to_dict(algo_name, data_used, predictions, less_corr, scaled)

# Run on the scaled versions of every prepared dataset.
test_all_datasets(svm_classify, 'SVM', scaled=True)

In [None]:
def dt_classify(train_features, test_features, algo_name, data_used, less_corr, scaled):
    '''Fit a decision tree (fixed random_state) on the training features and
    record how its predictions for the test features score.'''
    model = DecisionTreeClassifier(random_state=1809)
    model.fit(train_features, labels_train)
    predictions = model.predict(test_features)
    add_results_to_dict(algo_name, data_used, predictions, less_corr, scaled)

# Run on the unscaled versions of every prepared dataset.
test_all_datasets(dt_classify, 'Decision Tree', scaled=False)

In [None]:
def rf_classify(train_features, test_features, algo_name, data_used, less_corr, scaled):
    '''Train and predict passed features with a Random Forest and record
    the scores of its predictions.'''
    # A random forest is randomised; without a fixed seed every run of this
    # cell reports different scores. The seed matches the one used for the
    # decision trees in this file.
    rf_clf = RandomForestClassifier(random_state=1809)
    rf_clf.fit(train_features, labels_train)
    rf_pred = rf_clf.predict(test_features)
    add_results_to_dict(algo_name, data_used, rf_pred, less_corr, scaled)

test_all_datasets(rf_classify, 'Random Forest', False)

In [None]:
def ada_classify(train_features, test_features, algo_name, data_used, less_corr, scaled):
    '''Train and predict passed features with AdaBoost and record the
    scores of its predictions.'''
    # Fix the seed so reruns of this cell report the same scores; the value
    # matches the one used for the decision trees in this file.
    ada_clf = AdaBoostClassifier(random_state=1809)
    ada_clf.fit(train_features, labels_train)
    ada_pred = ada_clf.predict(test_features)
    add_results_to_dict(algo_name, data_used, ada_pred, less_corr, scaled)

test_all_datasets(ada_classify, 'AdaBoost', False)

After running all of the algorithms I selected there is a lot to look at! Unfortunately our SVM's best fit with all data sets was to return false for all predictions, this gave it decent accuracy since people of interest are fairly rare... but it's completely useless to us. Let's focus on the best performers.

In [None]:
# One row per recorded run; the dictionary keys become the columns.
algos_tried_df = pd.DataFrame(algos_tried)
algos_tried_df

Interestingly, AdaBoost had the same accuracy (tied for the highest), precision, and recall with all of the data, the less correlated decision tree importance data, and the all-features decision tree importance data. I was surprised it achieved the same results with 3 different sets of data, but maybe this shows how powerful a tool AdaBoost can be even with fairly small datasets.

The random forest model with all data is also tied for highest accuracy, but was right 100% of the time it predicted the person to be a poi, it also made incorrect predictions more often though. This is a good contender for best algorithm if we are planning to just look closer at the people it predicts to be a poi. We may waste a lot of time researching bad leads though since it incorrectly predicts poi so often.

The Naive Bayes results from the decision tree important features have good accuracy with somewhat balanced precision and recall. We have a pretty small dataset so far, so we can go with one of the more computationally intense algorithms if we think it serves us better. But this could be a good pick if all of a sudden we were looking for these traits across an entire industry or some other massive dataset.

I really like the decision tree for this situation. It doesn't have quite as high accuracy or precision as the rest, but unlike the others we can easily understand why it is making the decisions it is making. This could be invaluable in an investigation because we could look at why people are being picked as a poi and decide if it makes sense to investigate them further. For example, we might see that Kenneth Lay's executive assistant gets tagged as a poi because of a high percent of emails to them from other pois, but we would expect them to receive a lot of emails from Kenneth Lay.

In [None]:
# Show the counts behind every row whose to-poi share equals 1.
sent_only_to_poi = df['percent_from_emails_to_poi'] == 1
df.loc[sent_only_to_poi,
       ['from_this_person_to_poi', 'from_messages', 'percent_from_emails_to_poi']]

Now that we have created our new features, let's split the data into training and test sets.

In [None]:
# The poi flag is the label; everything else (minus e-mail addresses) is a
# feature. E-mail addresses are not used for anything and cause type issues.
# NOTE(review): in the cells visible here df_filled is built from df before
# the engineered columns are added to df, so they may be missing from
# `features` -- confirm against the real execution order.
non_feature_columns = ['poi', 'email_address']
labels = df_filled['poi']
features = df_filled.drop(non_feature_columns, axis=1)

# Hold out a quarter of the rows for testing; the seed keeps the split stable.
split_parts = train_test_split(features, labels,
                               test_size=0.25, random_state=1809)
features_train, features_test, labels_train, labels_test = split_parts


I am really interested in learning about feature selection and seeing what different results we get from different 
strategies. I started with feature importance from a decision tree. 

In [None]:
# Fit a decision tree (fixed seed) on the training split; its
# feature_importances_ are read when the importance table is built.
dt_clf = DecisionTreeClassifier(random_state=1809)
dt_clf.fit(features_train, labels_train)

It looks like a LOT of our features are useless, but we can see that 3 features are really useful for this approach and the importance plateaus around the 5th or 6th most important feature.

In [None]:
# Pair every training column with the importance the fitted tree gave it.
importance_df = pd.DataFrame({'features': features_train.columns,
                              'importance': dt_clf.feature_importances_})
# Rank once, keep the six most important feature names, and show the table.
ranked_importance = importance_df.sort_values('importance', ascending=False)
dt_important_features = ranked_importance['features'][0:6]
ranked_importance

Next I want to fit a logistic regression and do a recursive feature elimination. For this we first need to scale the data since we are using regression.



In [None]:
# Scale the training and testing features in a consistent way: the scaler is
# fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(features_train)
scaled_features_train = scaler.transform(features_train)
scaled_features_test = scaler.transform(features_test)

# Set up a logistic regression model and run RFE with it, keeping 5 features.
# n_features_to_select is passed by keyword: newer scikit-learn releases no
# longer accept it positionally.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)
rfe.fit(scaled_features_train, labels_train)

# The features that RFE has selected, to be used with models later
rfe_selected = features_train.columns[rfe.support_]

# Checking the order is consistent for my own sanity: the support mask must
# line up row by row with importance_df before it is stored as a column.
if (rfe.support_ == [x in rfe_selected for x in importance_df['features'].values]).all():
    importance_df['rfe_selected'] = rfe.support_
else:
    # Say so instead of silently leaving the column out.
    print("Warning: RFE support mask does not line up with importance_df; "
          "'rfe_selected' column not added")


The recursive feature elimination has very little overlap with the top features from the decision tree. This is surprising to me, but it likely has to do with the very different ways in which regression and decision trees use features. It will be interesting to see how this affects their performance with different algorithms.