# Identify Fraud from Enron Data - Further Exploration and Analysis

**By: Christian Guzman**

This report is on the Enron data that was processed as part of the Udacity final project. The goal of this report is to use different techniques in the data exploration and analysis phases to draw new insights from the data, as well as to further develop machine learning techniques. I will attempt to apply many of the techniques I learned throughout the course.

In [81]:
import sys
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



In [82]:
# Financial features -- every value is an amount in US dollars.
financial_features = [
    'salary',
    'deferral_payments',
    'total_payments',
    'loan_advances',
    'bonus',
    'restricted_stock_deferred',
    'deferred_income',
    'total_stock_value',
    'expenses',
    'exercised_stock_options',
    'other',
    'long_term_incentive',
    'restricted_stock',
    'director_fees',
]

# Email features -- generally counts of email messages. The notable
# exception, 'email_address' (a text string), is not part of this list.
email_features = [
    'to_messages',
    'from_poi_to_this_person',
    'from_messages',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
]

# Target label: boolean, 1 for person of interest, 0 for not.
poi_label = ['poi']

# Candidate features: the financial ones first, then the email ones.
features_list = list(financial_features)
features_list.extend(email_features)

### Load the dictionary containing the dataset
# NOTE(review): text mode "r" is only accepted by pickle on Python 2;
# a Python 3 port would need "rb" -- confirm before migrating.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [83]:
# Number of candidate features (financial + email).
len(features_list)

19

There are 19 features in `features_list`.

In [84]:
# Inspect the raw record of a single person to see the available keys.
data_dict['ALLEN PHILLIP K']

{'bonus': 4175000,
 'deferral_payments': 2869717,
 'deferred_income': -3081055,
 'director_fees': 'NaN',
 'email_address': 'phillip.allen@enron.com',
 'exercised_stock_options': 1729541,
 'expenses': 13868,
 'from_messages': 2195,
 'from_poi_to_this_person': 47,
 'from_this_person_to_poi': 65,
 'loan_advances': 'NaN',
 'long_term_incentive': 304805,
 'other': 152,
 'poi': False,
 'restricted_stock': 126027,
 'restricted_stock_deferred': -126027,
 'salary': 201955,
 'shared_receipt_with_poi': 1407,
 'to_messages': 2902,
 'total_payments': 4484442,
 'total_stock_value': 1729541}

The above is an example data point of employee Phillip K Allen.

In [85]:
# One row per person, one column per key of the per-person dictionaries.
df = pd.DataFrame.from_dict(data_dict, orient='index')
# The dataset marks missing values with the string 'NaN'; swap in a real NaN.
df = df.replace(to_replace='NaN', value=np.nan)

In [86]:
# Keep only the candidate feature columns. The explicit float cast makes the
# numeric dtype independent of whether replace() downcast the object columns,
# so the NaN checks (np.isnan) and float64 tests further down work on every
# pandas version.
df = df[features_list].astype(float)
df.head()

Unnamed: 0,salary,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income,total_stock_value,expenses,exercised_stock_options,other,long_term_incentive,restricted_stock,director_fees,to_messages,from_poi_to_this_person,from_messages,from_this_person_to_poi,shared_receipt_with_poi
ALLEN PHILLIP K,201955.0,2869717.0,4484442.0,,4175000.0,-126027.0,-3081055.0,1729541.0,13868.0,1729541.0,152.0,304805.0,126027.0,,2902.0,47.0,2195.0,65.0,1407.0
BADUM JAMES P,,178980.0,182466.0,,,,,257817.0,3486.0,257817.0,,,,,,,,,
BANNANTINE JAMES M,477.0,,916197.0,,,-560222.0,-5104.0,5243487.0,56301.0,4046157.0,864523.0,,1757552.0,,566.0,39.0,29.0,0.0,465.0
BAXTER JOHN C,267102.0,1295738.0,5634343.0,,1200000.0,,-1386055.0,10623258.0,11200.0,6680544.0,2660303.0,1586055.0,3942714.0,,,,,,
BAY FRANKLIN R,239671.0,260455.0,827696.0,,400000.0,-82782.0,-201641.0,63014.0,129142.0,,69.0,,145796.0,,,,,,


In [103]:
### Report, for every column, how many values are missing and what
### percentage of all rows that represents
# ex: np.isnan(df['salary']).values.sum()
print("Number of null values")
for col in df.columns:
    missing = np.isnan(df[col]).values.sum()
    share = round(float(missing) / float(len(df)) * 100, 2)
    print("%s: %s ; %s %%" % (col.upper(), missing, share))

Number of null values
SALARY: 51 ; 34.93 %
DEFERRAL_PAYMENTS: 107 ; 73.29 %
TOTAL_PAYMENTS: 21 ; 14.38 %
LOAN_ADVANCES: 142 ; 97.26 %
BONUS: 64 ; 43.84 %
RESTRICTED_STOCK_DEFERRED: 128 ; 87.67 %
DEFERRED_INCOME: 97 ; 66.44 %
TOTAL_STOCK_VALUE: 20 ; 13.7 %
EXPENSES: 51 ; 34.93 %
EXERCISED_STOCK_OPTIONS: 44 ; 30.14 %
OTHER: 53 ; 36.3 %
LONG_TERM_INCENTIVE: 80 ; 54.79 %
RESTRICTED_STOCK: 36 ; 24.66 %
DIRECTOR_FEES: 129 ; 88.36 %
TO_MESSAGES: 60 ; 41.1 %
FROM_POI_TO_THIS_PERSON: 60 ; 41.1 %
FROM_MESSAGES: 60 ; 41.1 %
FROM_THIS_PERSON_TO_POI: 60 ; 41.1 %
SHARED_RECEIPT_WITH_POI: 60 ; 41.1 %


In [106]:
# Report, for every column, how many values are below zero and what
# percentage of all rows that represents (NaN never compares as negative).
print("Number of negative values")
for col in df.columns:
    ctr = (df[col] < 0).sum()
    share = round(float(ctr) / float(len(df)) * 100, 2)
    print("%s: %s ; %s %%" % (col.upper(), ctr, share))

Number of negative values
SALARY: 0 ; 0.0 %
DEFERRAL_PAYMENTS: 1 ; 0.68 %
TOTAL_PAYMENTS: 0 ; 0.0 %
LOAN_ADVANCES: 0 ; 0.0 %
BONUS: 0 ; 0.0 %
RESTRICTED_STOCK_DEFERRED: 16 ; 10.96 %
DEFERRED_INCOME: 49 ; 33.56 %
TOTAL_STOCK_VALUE: 1 ; 0.68 %
EXPENSES: 0 ; 0.0 %
EXERCISED_STOCK_OPTIONS: 0 ; 0.0 %
OTHER: 0 ; 0.0 %
LONG_TERM_INCENTIVE: 0 ; 0.0 %
RESTRICTED_STOCK: 1 ; 0.68 %
DIRECTOR_FEES: 0 ; 0.0 %
TO_MESSAGES: 0 ; 0.0 %
FROM_POI_TO_THIS_PERSON: 0 ; 0.0 %
FROM_MESSAGES: 0 ; 0.0 %
FROM_THIS_PERSON_TO_POI: 0 ; 0.0 %
SHARED_RECEIPT_WITH_POI: 0 ; 0.0 %


In [15]:
# Remove negative numbers by replacing every float column with its absolute
# values. The column dtype is inspected directly instead of the type of the
# first element: that avoids looking up a string-labelled Series with an
# integer key and also works when the frame has no rows.
# NOTE(review): this also flips columns where negatives are common (a third of
# deferred_income is negative) -- confirm that discarding the sign is intended.
for col in df.columns:
    if df[col].dtype == np.float64:
        df[col] = df[col].abs()

In [9]:
# Summary statistics of the salary column (missing values are not counted).
df['salary'].describe()

count    9.500000e+01
mean     5.621943e+05
std      2.716369e+06
min      4.770000e+02
25%      2.118160e+05
50%      2.599960e+05
75%      3.121170e+05
max      2.670423e+07
Name: salary, dtype: float64

In [13]:
# Number of missing salary entries.
df['salary'].isnull().sum()

51

In [61]:
# Remove NaN values.
def impute(column, strat='median'):
    """Return a copy of ``column`` with its missing values filled in.

    Parameters
    ----------
    column : pandas.Series
        Numeric series that may contain NaN. It is not modified.
    strat : {'median', 'mean', 'mode'}
        Statistic of the non-missing values used as the fill value.
        With 'mode' and several equally frequent values, the smallest
        one is used.

    Returns
    -------
    pandas.Series
        New series with the same index in which every NaN is replaced by
        the chosen statistic. If the statistic itself is undefined (the
        column holds no values at all) the NaNs are left in place.

    Raises
    ------
    KeyError
        If ``strat`` is not one of the supported strategy names.
    """
    strategies = {
        'mean': column.mean,
        'median': column.median,
        'mode': column.mode,
    }
    # Only the requested statistic is computed; unknown names raise KeyError.
    fill_value = strategies[strat]()

    if strat == 'mode':
        # Series.mode() returns a Series (sorted, possibly empty), not a
        # scalar, so reduce it to a single value before filling.
        if fill_value.empty:
            return column.copy()
        fill_value = fill_value.iloc[0]

    # fillna works on labels, so a non-integer index is handled correctly,
    # and it returns a new Series rather than writing into the caller's data.
    return column.fillna(fill_value)

In [58]:
# Sanity check: np.isnan should flag a salary that is known to be missing
# (the second row, BADUM JAMES P, has no salary). The value is read from df
# directly so the cell does not depend on a variable defined further down.
print(np.isnan(df['salary'].iloc[1]))

True

In [62]:
# Use the salary column as a trial input for impute(); show its median and
# the number of missing entries before anything is filled in.
test = df['salary']
print("%s %s" % (test.median(), test.isnull().sum()))

259996.0 51


In [64]:
# Fill the missing salaries using the median strategy (impute's default)
# and look at the first few rows of the result.
test = impute(test, 'median')
test.head()

ALLEN PHILLIP K       201955.0
BADUM JAMES P         259996.0
BANNANTINE JAMES M       477.0
BAXTER JOHN C         267102.0
BAY FRANKLIN R        239671.0
Name: salary, dtype: float64

It works!

In [66]:
# Apply impute() with its default strategy to every column of the frame.
for elem in df.columns:
    filled = impute(df[elem])
    df[elem] = filled

salary
deferral_payments
total_payments
loan_advances
bonus
restricted_stock_deferred
deferred_income
total_stock_value
expenses
exercised_stock_options
other
long_term_incentive
restricted_stock
director_fees
to_messages
from_poi_to_this_person
from_messages
from_this_person_to_poi
shared_receipt_with_poi


In [68]:
# Per-person flag: is the salary still missing after imputation?
pd.isnull(df['salary'])

ALLEN PHILLIP K                  False
BADUM JAMES P                    False
BANNANTINE JAMES M               False
BAXTER JOHN C                    False
BAY FRANKLIN R                   False
BAZELIDES PHILIP J               False
BECK SALLY W                     False
BELDEN TIMOTHY N                 False
BELFER ROBERT                    False
BERBERIAN DAVID                  False
BERGSIEKER RICHARD P             False
BHATNAGAR SANJAY                 False
BIBI PHILIPPE A                  False
BLACHMAN JEREMY M                False
BLAKE JR. NORMAN P               False
BOWEN JR RAYMOND M               False
BROWN MICHAEL                    False
BUCHANAN HAROLD G                False
BUTTS ROBERT H                   False
BUY RICHARD B                    False
CALGER CHRISTOPHER F             False
CARTER REBECCA C                 False
CAUSEY RICHARD A                 False
CHAN RONNIE                      False
CHRISTODOULOU DIOMEDES           False
CLINE KENNETH W          