In [146]:
import sys
import pickle
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

pickle_file = "data.pkl"

with open(pickle_file, "rb") as all_pickle:
    raw_dict = pickle.load(all_pickle,encoding="latin1")

feature_names = []
features = []

first_iter = True
for person_name, values in raw_dict.items():
    # Array to hold current feature values
    temp_array = np.array([])

    # Only run on first iteration
    if (first_iter == True):
        # Put feature names in list
        feature_names = values.keys()

    first_iter = False

    # Get feature values for current person
    for feature_name, feature_value in values.items():
        # Append feature_value to temporary array
        temp_array = np.append(temp_array, feature_value)
    # Add temporary array to features
    features.append(temp_array)

# Order of columns (Want poi at end)
cols = ["email_address",
        "long_term_incentive",
        "expenses",
        "exercised_stock_options",
        "from_messages",
        "salary",
        "director_fees",
        "total_payments",
        "restricted_stock",
        "from_poi_to_this_person",
        "from_this_person_to_poi",
        "total_stock_value",
        "bonus",
        "restricted_stock_deferred",
        "loan_advances",
        "shared_receipt_with_poi",
        "other",
        "deferred_income",
        "deferral_payments",
        "to_messages",
        "poi"]

# Save to pandas dataframe
df_raw = pd.DataFrame(data=features,columns=feature_names)
df_raw = df_raw[cols] # rearrange columns so POI is last

df_raw.head()


Unnamed: 0,email_address,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,...,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages,poi
0,,,,1753766.0,,,,,264013.0,,...,1945360,,-72419.0,,,,,,,False
1,michael.kopper@enron.com,602671.0,118134.0,,,224305.0,,2652612.0,985032.0,,...,985032,800000.0,,,,907502.0,,,,True
2,mark.metts@enron.com,,94299.0,,29.0,365788.0,,1061827.0,585062.0,38.0,...,585062,600000.0,,,702.0,1740.0,,,807.0,False
3,,,33785.0,1030329.0,,,,87410.0,,,...,1030329,,,,,,,53625.0,,False
4,james.derrick@enron.com,484000.0,51124.0,8831913.0,909.0,492375.0,,550981.0,1787380.0,64.0,...,8831913,800000.0,-1787380.0,,1401.0,7482.0,-1284000.0,,2181.0,False


In [184]:
df = df_raw.copy()

We'll remove the email addresses since they won't provide preictive power

In [185]:
df = df.drop(["email_address"],axis=1)
df.head()

Unnamed: 0,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,from_this_person_to_poi,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages,poi
0,,,1753766.0,,,,,264013.0,,,1945360,,-72419.0,,,,,,,False
1,602671.0,118134.0,,,224305.0,,2652612.0,985032.0,,,985032,800000.0,,,,907502.0,,,,True
2,,94299.0,,29.0,365788.0,,1061827.0,585062.0,38.0,1.0,585062,600000.0,,,702.0,1740.0,,,807.0,False
3,,33785.0,1030329.0,,,,87410.0,,,,1030329,,,,,,,53625.0,,False
4,484000.0,51124.0,8831913.0,909.0,492375.0,,550981.0,1787380.0,64.0,20.0,8831913,800000.0,-1787380.0,,1401.0,7482.0,-1284000.0,,2181.0,False


Now, we'll convert the column values from strings to numberical values

In [203]:
df["poi"] = df["poi"].map({"False":0, "True":1})

In [198]:
list_to_convert = df.columns.values[:-1]
for col in list_to_convert:
    df = df.replace({col:"NaN"}, np.nan) # Replace String "NaN" with numpy data type "nan"
    df[col] = pd.to_numeric(df[col])
df.head()

Unnamed: 0,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,from_this_person_to_poi,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages,poi
0,,,1753766.0,,,,,264013.0,,,1945360.0,,-72419.0,,,,,,,0.0
1,602671.0,118134.0,,,224305.0,,2652612.0,985032.0,,,985032.0,800000.0,,,,907502.0,,,,1.0
2,,94299.0,,29.0,365788.0,,1061827.0,585062.0,38.0,1.0,585062.0,600000.0,,,702.0,1740.0,,,807.0,0.0
3,,33785.0,1030329.0,,,,87410.0,,,,1030329.0,,,,,,,53625.0,,0.0
4,484000.0,51124.0,8831913.0,909.0,492375.0,,550981.0,1787380.0,64.0,20.0,8831913.0,800000.0,-1787380.0,,1401.0,7482.0,-1284000.0,,2181.0,0.0


In [None]:
float_list = ["long_term_incentive","expenses","exercised_stock_options","salary","director_fees","total_payments","restricted_stock","total_stock_value","bonus","restricted_stock_deferred","loan_advances","other","deferred_income","deferral_payments"]
int_list = ["from_messages","from_poi_to_this_person","from_this_person_to_poi","shared_receipt_with_poi","to_messages"]

In [128]:
# Turn "NaN" into numpy nan instead of string
for col in df.columns.values:
    col_str = str(col)
    df = df.replace({col:"NaN"}, np.nan)
#     df[col] = df[col].replace({"NaN":np.nan})
df.head()

Unnamed: 0,email_address,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,...,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages,poi
0,,,,1753766.0,,,,,264013.0,,...,1945360,,-72419.0,,,,,,,False
1,michael.kopper@enron.com,602671.0,118134.0,,,224305.0,,2652612.0,985032.0,,...,985032,800000.0,,,,907502.0,,,,True
2,mark.metts@enron.com,,94299.0,,29.0,365788.0,,1061827.0,585062.0,38.0,...,585062,600000.0,,,702.0,1740.0,,,807.0,False
3,,,33785.0,1030329.0,,,,87410.0,,,...,1030329,,,,,,,53625.0,,False
4,james.derrick@enron.com,484000.0,51124.0,8831913.0,909.0,492375.0,,550981.0,1787380.0,64.0,...,8831913,800000.0,-1787380.0,,1401.0,7482.0,-1284000.0,,2181.0,False


In [129]:
type(df["to_messages"][0])

float

In [136]:
type(df["long_term_incentive"][0])

numpy.float64

In [115]:
# Function to convert column values to specified data type
def convert_column(col_list,data_type):
    for col in col_list:
        temp_vals = []             # list to overwrite column w/appropriate data type
        col_vals = df[col].values  # Values in columns
        # For each val, convert to desired data type
        for val in col_vals:       
            if not(val == np.nan): 
                if(data_type == int):
                    val = int(val)
                elif(val == float):
                    val = float(val)
            # Append either NaN or converted value to temp_vals
            temp_vals.append(val)
        # Overwrite column with the values that have the correct data type
        df[col] = temp_vals

The reason I converted the values manually is because df.astype returned an error due to the NaN's. 

In [116]:
float_list = ["long_term_incentive","expenses","exercised_stock_options","salary","director_fees","total_payments","restricted_stock","total_stock_value","bonus","restricted_stock_deferred","loan_advances","other","deferred_income","deferral_payments"]
int_list = ["from_messages","from_poi_to_this_person","from_this_person_to_poi","shared_receipt_with_poi","to_messages"]

In [118]:
convert_column(float_list,float)
convert_column(int_list,int)

In [119]:
type(df["long_term_incentive"][4])

numpy.float64

In [120]:
df.head()

Unnamed: 0,email_address,long_term_incentive,expenses,exercised_stock_options,from_messages,salary,director_fees,total_payments,restricted_stock,from_poi_to_this_person,...,total_stock_value,bonus,restricted_stock_deferred,loan_advances,shared_receipt_with_poi,other,deferred_income,deferral_payments,to_messages,poi
0,,,,1753766.0,,,,,264013.0,,...,1945360.0,,-72419.0,,,,,,,False
1,michael.kopper@enron.com,602671.0,118134.0,,,224305.0,,2652612.0,985032.0,,...,985032.0,800000.0,,,,907502.0,,,,True
2,mark.metts@enron.com,,94299.0,,29.0,365788.0,,1061827.0,585062.0,38.0,...,585062.0,600000.0,,,702.0,1740.0,,,807.0,False
3,,,33785.0,1030329.0,,,,87410.0,,,...,1030329.0,,,,,,,53625.0,,False
4,james.derrick@enron.com,484000.0,51124.0,8831913.0,909.0,492375.0,,550981.0,1787380.0,64.0,...,8831913.0,800000.0,-1787380.0,,1401.0,7482.0,-1284000.0,,2181.0,False


In [62]:
df.astype({"long_term_incentive": np.double,
           "expenses": np.double,
           "exercised_stock_options": np.double,
           "from_messages": np.int16,
           "salary": np.double,
           "director_fees": np.double,
           "total_payments": np.double,
           "restricted_stock": np.double,
           "from_poi_to_this_person": np.int16,
           "from_this_person_to_poi": np.int16,
           "total_stock_value": np.double,
           "bonus": np.double,
           "restricted_stock_deferred": np.double,
           "loan_advances": np.double,
           "shared_receipt_with_poi": np.int16,
           "other": np.double,
           "deferred_income": np.double,
           "deferral_payments": np.double,
           "to_messages": np.int16})

ValueError: cannot convert float NaN to integer

In [2]:
# Choose which features to use imputer
features_for_imputer = []

In [None]:
# Use iterative imputer to handle missing values
# At each iteration, the Imputer uses one feature column as the output Y and the other feature columns as the inputs X 
# It then fits a regressor on (X,Y) for known Y.
# Then missing values are predicted based on the regressor
Imputer = IterativeImputer(max_iter=10, random_state=42)
Imputer.fit()

In [29]:
type(df["long_term_incentive"][0])

numpy.str_

In [22]:
df["poi"]

0      False
1      False
2      False
3      False
4      False
5        0.0
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16       0.0
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28      True
29     False
       ...  
116    False
117    False
118    False
119    False
120    False
121    False
122    False
123    False
124    False
125    False
126    False
127    False
128    False
129    False
130    False
131    False
132      0.0
133     True
134    False
135    False
136    False
137    False
138     True
139    False
140    False
141      0.0
142    False
143    False
144    False
145      1.0
Name: poi, Length: 146, dtype: object

In [23]:
# Convert POI's to 0's and 1's
df["poi"] = df["poi"].map({"True":1, "False":0, "0.0":0, "1.0":1})
print(df["poi"])

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     1
29     0
      ..
116    0
117    0
118    0
119    0
120    0
121    0
122    0
123    0
124    0
125    0
126    0
127    0
128    0
129    0
130    0
131    0
132    0
133    1
134    0
135    0
136    0
137    0
138    1
139    0
140    0
141    0
142    0
143    0
144    0
145    1
Name: poi, Length: 146, dtype: int64


In [27]:
# Let's analyze all the missing features
df.isnull().sum()

email_address                0
long_term_incentive          0
expenses                     0
exercised_stock_options      0
from_messages                0
salary                       0
director_fees                0
total_payments               0
restricted_stock             0
from_poi_to_this_person      0
from_this_person_to_poi      0
total_stock_value            0
bonus                        0
restricted_stock_deferred    0
loan_advances                0
shared_receipt_with_poi      0
other                        0
deferred_income              0
deferral_payments            0
to_messages                  0
poi                          0
dtype: int64