# Enron Dataset

##  Data Cleaning

####  Load dataset

In [None]:
import sys
import pickle
import pandas as pd
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
# Pickle data is a byte stream, so the file is opened in binary mode ("rb");
# pickle rejects a text-mode ("r") handle under Python 3.
# NOTE(review): pickle.load can execute arbitrary code -- only load a dataset
# file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Transpose so that each top-level key of data_dict becomes a row and each
# field of its record becomes a column.
df = pd.DataFrame(data_dict).T

####  Peek into Dataset

In [None]:
# Summarize every column: dtype and non-null count.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Preview the first rows (pandas shows five by default).
df.head()

####  Number of POI ,  Non-POI

In [None]:
from collections import Counter

# Tally how many rows carry each distinct value of the `poi` column.
Counter(df["poi"])

#### Missing values

In [None]:
import numpy as np

# The frame marks missing entries with the string 'NaN'; swap in a real NaN
# so that pandas' null checks can see them.
df = df.replace(to_replace='NaN', value=np.nan)

# Number of missing entries per column.
df.isnull().sum()

***There are too many missing values in `restricted_stock_deferred`, `director_fees`, `deferral_payments`, and `loan_advances`.***
##  Select Features

First, I removed features which have a lot of missing values.

In [None]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# The selection is assembled from three groups; concatenation order fixes the
# final column order, with the "poi" label first.
label_feature = ['poi']
financial_features = [
    'salary', 'bonus', 'total_payments', 'long_term_incentive',
    'deferred_income', 'total_stock_value', 'restricted_stock',
    'exercised_stock_options', 'expenses', 'other',
]
email_features = [
    'to_messages', 'from_poi_to_this_person',
    'from_messages', 'from_this_person_to_poi',
]
features_list = label_feature + financial_features + email_features

# Independent copy restricted to the selected columns, so later edits to
# `enron` leave `df` untouched.
enron = df[features_list].copy()

##  Remove outliers

In [None]:
### Task 2: Remove outliers
import matplotlib.pyplot as plt
%matplotlib inline 
fig, ax = plt.subplots(figsize=(10,6))
plt.scatter(enron.salary, enron.bonus)
plt.xlabel('Salary')
plt.ylabel('Bonus')
plt.title('Scatter plot ( Salary vs Bonus)')
plt.show()

As the scatter plot shows, there is one huge outlier.  
Let's check what it is.

In [None]:
# idxmax() returns the index *label* of the largest salary, i.e. the name of
# the offending row. Series.argmax() returns an integer position in current
# pandas, which does not identify the row by name.
enron.salary.idxmax()

In [None]:
# errors='ignore' keeps this cell re-runnable: once 'TOTAL' has been removed,
# running the cell again is a no-op instead of raising KeyError.
enron = enron.drop('TOTAL', errors='ignore')

Drop 'TOTAL' and then draw the scatter plot again.

In [None]:
# Same salary/bonus scatter, split on the poi flag so each class gets its own
# colour and legend entry. POIs are drawn first, then non-POIs.
fig, ax = plt.subplots(figsize=(9,6))
for flag, colour, tag in ((1, 'red', 'poi'), (0, 'skyblue', 'non-poi')):
    mask = enron.poi == flag
    ax.scatter(enron.salary[mask], enron.bonus[mask], c=colour, label=tag)
ax.set_xlabel('Salary')
ax.set_ylabel('Bonus')
ax.set_title('Scatter plot ( Salary vs Bonus)')
ax.legend(loc='lower right')
plt.show()

##  Create New Features

I created two new features: the fraction of a person's received messages that came from a POI, and the fraction of a person's sent messages that went to a POI.  
Let's check the new features by drawing a scatter plot.

In [None]:
# New ratio features:
#   fraction_from_poi = from_poi_to_this_person / to_messages
#   fraction_to_poi   = from_this_person_to_poi / from_messages
enron['fraction_from_poi'] = enron.from_poi_to_this_person / enron.to_messages
enron['fraction_to_poi'] = enron.from_this_person_to_poi / enron.from_messages

# Plot the two ratios against each other, coloured by the poi flag.
fig, ax = plt.subplots(figsize=(9,6))
is_poi = enron.poi == 1
not_poi = enron.poi == 0
ax.scatter(enron.fraction_to_poi[is_poi], enron.fraction_from_poi[is_poi],
           c='red', alpha=0.8, label='poi')
ax.scatter(enron.fraction_to_poi[not_poi], enron.fraction_from_poi[not_poi],
           c='skyblue', alpha=0.5, label='non-poi')
# Label the axes and title the figure so it can be read on its own.
ax.set_xlabel('Fraction of messages to POI')
ax.set_ylabel('Fraction of messages from POI')
ax.set_title('Scatter plot ( Fraction to POI vs Fraction from POI)')
ax.legend(loc='upper right')
# plt.show() keeps the Legend object's repr out of the cell output.
plt.show()


There is no distinct pattern, but the POIs' fractions tend to be higher than the others'.   
***Before extracting features, let's clean some features.***

In [None]:
def _fraction(part, whole):
    """Return ``part / whole`` as a float, or the string "NaN" if undefined.

    The ratio is undefined when either argument is the dataset's missing-value
    marker 'NaN', or when ``whole`` is zero (which would otherwise raise
    ZeroDivisionError).
    """
    if part == 'NaN' or whole == 'NaN':
        return "NaN"
    whole = float(whole)
    if whole == 0:
        return "NaN"
    return float(part) / whole


# Add the two ratio features to every record of data_dict, in place.
for name in data_dict:
    data_point = data_dict[name]

    data_point["fraction_from_poi"] = _fraction(
        data_point["from_poi_to_this_person"], data_point["to_messages"])
    data_point["fraction_to_poi"] = _fraction(
        data_point["from_this_person_to_poi"], data_point["from_messages"])

# Remove the 'TOTAL' record. The None default keeps the cell re-runnable:
# a second run finds no 'TOTAL' key and does nothing instead of raising KeyError.
data_dict.pop('TOTAL', None)

Now, let's extract features.  
***Add the new features and remove the features that became redundant.***

In [None]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.

# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# (available since 0.18) is its replacement. Fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

my_dataset = data_dict

new_features = ['fraction_from_poi','fraction_to_poi']
remove_list = ['to_messages','from_messages','from_poi_to_this_person','from_this_person_to_poi']

# Rebuild the list instead of mutating it with += / .remove(): the result on
# the first run is the same (kept features in their original order, then the
# new ones), but re-running the cell neither appends duplicates nor raises
# ValueError for names that were already removed.
features_list = [
    feature for feature in features_list
    if feature not in remove_list and feature not in new_features
] + new_features

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

***Now, let's split the dataset into training and test sets.***