In [10]:
import sys
import pickle
import numpy as np
import pandas as pd

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

import sklearn
import scipy

from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

print("I have used the following Versions:")
print("Numpy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("SkLearn Version:", sklearn.__version__)
print("Scipy Version:", scipy.__version__)

I have used the following Versions:
('Numpy Version:', '1.16.2')
('Pandas Version:', u'0.24.2')
('SkLearn Version:', '0.20.3')
('Scipy Version:', '1.2.1')


In [3]:
# Conversion code

# content = ''
# outsize = 0
# with open('final_project_dataset.pkl', 'rb') as infile:
#     content = infile.read()
# with open('final_project_dataset_new.pkl', 'wb') as output:
#     for line in content.splitlines():
#         outsize += len(line) + 1
#         output.write(line + str.encode('\n'))

# print("Done. Saved %s bytes." % (len(content)-outsize))

### Task 1: Select what features you'll use.

In [11]:
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary', 'to_messages', 'total_payments', 'bonus', 'total_stock_value', 'expenses', 'from_poi_to_this_person', 'exercised_stock_options', 'from_messages', 'other', 'from_this_person_to_poi', 'shared_receipt_with_poi', 'restricted_stock']

### Load the dictionary containing the dataset
# with open("final_project_dataset.pkl", "r") as data_file:
#     data_dict = pickle.load(data_file)
# data_dict

with open('final_project_dataset_new.pkl', 'rb') as f:
    data_dict = pickle.load(f)

In [12]:
# Converting Dictionary to Numpy Array

name_keys = sorted(list(data_dict.keys()))
rows = len(name_keys)

data_keys = list(data_dict[name_keys[0]].keys())
cols = len(data_keys) + 1

print(rows, cols)

dataset = {}
for c in range(cols):
    col = []
    for r in range(rows):
        if c == 0:
            col.append(name_keys[r])
        else:
            value = data_dict[name_keys[r]][data_keys[c - 1]]
            if value == "NaN":
                col.append(np.nan)
            else:
                col.append(value)
    if c == 0:
        dataset["poi_name"] = col
    else:
        dataset[data_keys[c - 1]] = col



data_frame = pd.DataFrame(dataset)

data_frame.head(10)

(146, 22)


Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,other,poi,poi_name,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868.0,2195.0,47.0,65.0,...,152.0,False,ALLEN PHILLIP K,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442.0,1729541.0
1,,178980.0,,,,257817.0,3486.0,,,,...,,False,BADUM JAMES P,,,,,,182466.0,257817.0
2,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301.0,29.0,39.0,0.0,...,864523.0,False,BANNANTINE JAMES M,1757552.0,-560222.0,477.0,465.0,566.0,916197.0,5243487.0
3,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200.0,,,,...,2660303.0,False,BAXTER JOHN C,3942714.0,,267102.0,,,5634343.0,10623258.0
4,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142.0,,,,...,69.0,False,BAY FRANKLIN R,145796.0,-82782.0,239671.0,,,827696.0,63014.0
5,,684694.0,,,,1599641.0,,,,,...,874.0,False,BAZELIDES PHILIP J,,,80818.0,,,860136.0,1599641.0
6,700000.0,,,,sally.beck@enron.com,,37172.0,4343.0,144.0,386.0,...,566.0,False,BECK SALLY W,126027.0,,231330.0,2639.0,7315.0,969068.0,126027.0
7,5249999.0,2144013.0,-2334434.0,,tim.belden@enron.com,953136.0,17355.0,484.0,228.0,108.0,...,210698.0,True,BELDEN TIMOTHY N,157569.0,,213999.0,5521.0,7991.0,5501630.0,1110705.0
8,,-102500.0,,3285.0,,3285.0,,,,,...,,False,BELFER ROBERT,,44093.0,,,,102500.0,-44093.0
9,,,,,david.berberian@enron.com,1624396.0,11892.0,,,,...,,False,BERBERIAN DAVID,869220.0,,216582.0,,,228474.0,2493616.0


In [13]:
class_counts = data_frame["poi"].value_counts()
class_priors = class_counts / rows
print(class_counts)
print(class_priors)

False    128
True      18
Name: poi, dtype: int64
False    0.876712
True     0.123288
Name: poi, dtype: float64


In [14]:
# Removing unrequired columns
poi_names = data_frame.pop('poi_name')
poi_labels = data_frame.pop('poi')
emails = data_frame.pop('email_address')

### Task 2: Remove outliers

In [15]:
# Counting NaN Values for each Column
nan_vals = data_frame.isnull().sum(axis = 0)
print(nan_vals)

bonus                         64
deferral_payments            107
deferred_income               97
director_fees                129
exercised_stock_options       44
expenses                      51
from_messages                 60
from_poi_to_this_person       60
from_this_person_to_poi       60
loan_advances                142
long_term_incentive           80
other                         53
restricted_stock              36
restricted_stock_deferred    128
salary                        51
shared_receipt_with_poi       60
to_messages                   60
total_payments                21
total_stock_value             20
dtype: int64


In [16]:
# Removing Columns with 50% or more Nan Values
nan_thresh = 0.5
nan_percents = np.array(nan_vals) / float(rows)
print(nan_percents)

required_features = list(np.array(data_frame.columns)[nan_thresh - nan_percents > 0])
data_frame = data_frame[required_features]
print(data_frame.columns)

[0.43835616 0.73287671 0.66438356 0.88356164 0.30136986 0.34931507
 0.4109589  0.4109589  0.4109589  0.97260274 0.54794521 0.3630137
 0.24657534 0.87671233 0.34931507 0.4109589  0.4109589  0.14383562
 0.1369863 ]
Index([u'bonus', u'exercised_stock_options', u'expenses', u'from_messages',
       u'from_poi_to_this_person', u'from_this_person_to_poi', u'other',
       u'restricted_stock', u'salary', u'shared_receipt_with_poi',
       u'to_messages', u'total_payments', u'total_stock_value'],
      dtype='object')


In [17]:
# Replacing NaNs with the Medians of Respective Features
for df_col in data_frame.columns:
    data_frame[df_col].fillna(data_frame[df_col].median(), inplace=True)

print(data_frame.head(5))

       bonus  exercised_stock_options  expenses  from_messages  \
0  4175000.0                1729541.0   13868.0         2195.0   
1   769375.0                 257817.0    3486.0           41.0   
2   769375.0                4046157.0   56301.0           29.0   
3  1200000.0                6680544.0   11200.0           41.0   
4   400000.0                1310813.5  129142.0           41.0   

   from_poi_to_this_person  from_this_person_to_poi      other  \
0                     47.0                     65.0      152.0   
1                     35.0                      8.0    52382.0   
2                     39.0                      0.0   864523.0   
3                     35.0                      8.0  2660303.0   
4                     35.0                      8.0       69.0   

   restricted_stock    salary  shared_receipt_with_poi  to_messages  \
0          126027.0  201955.0                   1407.0       2902.0   
1          451740.0  259996.0                    740.5       121

In [18]:
z_scores = np.abs(stats.zscore(data_frame))
print(z_scores)

[[0.3122757  0.10986752 0.17077072 ... 0.58672028 0.00091724 0.11821485]
 [0.1123973  0.16663765 0.19496459 ... 0.25213206 0.16132395 0.15898283]
 [0.1123973  0.02050661 0.07188625 ... 0.57209644 0.13396551 0.02087561]
 ...
 [0.1123973  0.17121587 0.09367752 ... 0.25213206 0.12706015 0.16227056]
 [0.1123973  0.14391058 0.07737194 ... 0.25213206 0.1546931  0.16309313]
 [0.1123973  0.16914723 0.07469201 ... 0.25213206 0.16607313 0.16078502]]


In [19]:
# Datapoints with 3 Std Devs more/less
threshold = 3
z_thresh = np.where(z_scores > 3)
z_thresh

(array([  6,   7,   7,  31,  34,  72,  73,  73,  73,  75,  78,  78,  78,
         79, 117, 117, 130, 130, 130, 130, 130, 130, 130, 130, 139],
       dtype=int64),
 array([ 5,  9, 10,  5,  4,  3,  3,  5, 10, 10,  4,  5,  9, 11,  9, 10,  0,
         1,  2,  6,  7,  8, 11, 12,  9], dtype=int64))

In [20]:
outliers, outlier_counts = np.unique(z_thresh[0], return_counts=True)
outlier_thresh = cols / 2
outlier_indices = np.where(outlier_counts > outlier_thresh)
outlier_idx = -1
for oi in outlier_indices:
    outlier_idx = oi
data_frame.drop(outlier_idx)

Unnamed: 0,bonus,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,other,restricted_stock,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,4175000.0,1729541.0,13868.0,2195.0,47.0,65.0,152.0,126027.0,201955.0,1407.0,2902.0,4484442.0,1729541.0
1,769375.0,257817.0,3486.0,41.0,35.0,8.0,52382.0,451740.0,259996.0,740.5,1211.0,182466.0,257817.0
2,769375.0,4046157.0,56301.0,29.0,39.0,0.0,864523.0,1757552.0,477.0,465.0,566.0,916197.0,5243487.0
3,1200000.0,6680544.0,11200.0,41.0,35.0,8.0,2660303.0,3942714.0,267102.0,740.5,1211.0,5634343.0,10623258.0
4,400000.0,1310813.5,129142.0,41.0,35.0,8.0,69.0,145796.0,239671.0,740.5,1211.0,827696.0,63014.0
5,769375.0,1599641.0,46950.0,41.0,35.0,8.0,874.0,451740.0,80818.0,740.5,1211.0,860136.0,1599641.0
6,700000.0,1310813.5,37172.0,4343.0,144.0,386.0,566.0,126027.0,231330.0,2639.0,7315.0,969068.0,126027.0
7,5249999.0,953136.0,17355.0,484.0,228.0,108.0,210698.0,157569.0,213999.0,5521.0,7991.0,5501630.0,1110705.0
8,769375.0,3285.0,46950.0,41.0,35.0,8.0,52382.0,451740.0,259996.0,740.5,1211.0,102500.0,-44093.0
9,769375.0,1624396.0,11892.0,41.0,35.0,8.0,52382.0,869220.0,216582.0,740.5,1211.0,228474.0,2493616.0


In [21]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

In [22]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [23]:
# Train Test Split
x_train, x_test, y_train, y_test = train_test_split(data_frame, poi_labels, random_state = 100)

In [24]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

clf = GaussianNB(class_priors)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, preds))
print("Gaussian Naive Bayes CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Gaussian Naive Bayes Precision:", precision_score(y_test, preds, average="weighted"))
print("Gaussian Naive Bayes Recall:", recall_score(y_test, preds, average="weighted"))
print("Gaussian Naive Bayes F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("KNN Accuracy:", accuracy_score(y_test, preds))
print("KNN CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("KNN Precision:", precision_score(y_test, preds, average="weighted"))
print("KNN Recall:", recall_score(y_test, preds, average="weighted"))
print("KNN F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = LogisticRegression()
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Logistic Regression Accuracy:", accuracy_score(y_test, preds))
print("Logistic Regression CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Logistic Regression Precision:", precision_score(y_test, preds, average="weighted"))
print("Logistic Regression Recall:", recall_score(y_test, preds, average="weighted"))
print("Logistic Regression F1-Score:", f1_score(y_test, preds, average="weighted"))

# clf = SVC(gamma='scale', degree=3)
# clf.fit(x_train, y_train, dtype=np.float)
# preds = clf.predict(x_test)
# print("---------------------------------------------------")
# print("SVM Accuracy:", accuracy_score(y_test, preds))
# print("SVM CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
# print("SVM Precision:", precision_score(y_test, preds, average="weighted"))
# print("SVM Recall:", recall_score(y_test, preds, average="weighted"))
# print("SVM F1-Score:", f1_score(y_test, preds, average="weighted"))


clf = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=5, min_samples_leaf=5)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Decision Tree Accuracy:", accuracy_score(y_test, preds))
print("Decision Tree CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Decision Tree Precision:", precision_score(y_test, preds, average="weighted"))
print("Decision Tree Recall:", recall_score(y_test, preds, average="weighted"))
print("Decision Tree F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = RandomForestClassifier(n_estimators = 20, criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Random Forrest Accuracy:", accuracy_score(y_test, preds))
print("Random Forrest CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Random Forrest Precision:", precision_score(y_test, preds, average="weighted"))
print("Random Forrest Recall:", recall_score(y_test, preds, average="weighted"))
print("Random Forrest F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = AdaBoostClassifier(n_estimators = 20, random_state = 100)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("AdaBoost Accuracy:", accuracy_score(y_test, preds))
print("AdaBoost CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("AdaBoost Precision:", precision_score(y_test, preds, average="weighted"))
print("AdaBoost Recall:", recall_score(y_test, preds, average="weighted"))
print("AdaBoost F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = GradientBoostingClassifier(n_estimators = 20)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Gradient Boosting Accuracy:", accuracy_score(y_test, preds))
print("Gradient Boosting CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Gradient Boosting Precision:", precision_score(y_test, preds, average="weighted"))
print("Gradient Boosting Recall:", recall_score(y_test, preds, average="weighted"))
print("Gradient Boosting F1-Score:", f1_score(y_test, preds, average="weighted"))

---------------------------------------------------
('Gaussian Naive Bayes Accuracy:', 0.1891891891891892)
('Gaussian Naive Bayes CV Score:', 0.2795238095238095)
('Gaussian Naive Bayes Precision:', 0.8841698841698842)
('Gaussian Naive Bayes Recall:', 0.1891891891891892)
('Gaussian Naive Bayes F1-Score:', 0.13553259141494436)
---------------------------------------------------
('KNN Accuracy:', 0.8648648648648649)
('KNN CV Score:', 0.8771428571428572)
('KNN Precision:', 0.747991234477721)
('KNN Recall:', 0.8648648648648649)
('KNN F1-Score:', 0.8021934978456717)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


---------------------------------------------------
('Logistic Regression Accuracy:', 0.8378378378378378)
('Logistic Regression CV Score:', 0.8357142857142856)
('Logistic Regression Precision:', 0.7447447447447448)
('Logistic Regression Recall:', 0.8378378378378378)
('Logistic Regression F1-Score:', 0.7885532591414945)
---------------------------------------------------
('Decision Tree Accuracy:', 0.8108108108108109)
('Decision Tree CV Score:', 0.8157142857142858)
('Decision Tree Precision:', 0.7938165438165438)
('Decision Tree Recall:', 0.8108108108108109)
('Decision Tree F1-Score:', 0.8017556017556017)
---------------------------------------------------
('Random Forrest Accuracy:', 0.8648648648648649)
('Random Forrest CV Score:', 0.8771428571428572)
('Random Forrest Precision:', 0.747991234477721)
('Random Forrest Recall:', 0.8648648648648649)
('Random Forrest F1-Score:', 0.8021934978456717)
---------------------------------------------------
('AdaBoost Accuracy:', 0.8648648648648649

In [25]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# I am choosing the AdaBoost Classifier

# Fine Tuning
num_estimators = range(1, 31, 2)
learning_rates = list(np.linspace(0.1, 2, 20, dtype=np.float32))

scores = np.zeros((len(num_estimators), len(learning_rates)))

for ni, ne in enumerate(num_estimators):
    for li, lr in enumerate(learning_rates):
        # Classifier with the Changing Parameters
        clf = AdaBoostClassifier(n_estimators = ne, learning_rate = lr, random_state = 100)
        clf.fit(x_train, y_train)
        preds = clf.predict(x_test)
        
        # Computing Cusomized Score
        score = accuracy_score(y_test, preds) + \
                cross_val_score(clf, data_frame, poi_labels, cv=5).mean() + \
                precision_score(y_test, preds, average="weighted") + \
                recall_score(y_test, preds, average="weighted") + \
                f1_score(y_test, preds, average="weighted")
        scores[ni][li] = score / 5.0

max_score = np.max(scores)
max_idxs = np.where(scores == max_score)
best_ne = num_estimators[max_idxs[0][0]]
best_lr = learning_rates[max_idxs[1][0]]
print("Maximum Score =", max_score, " with n_estimators =", best_ne, "and learning rate =", best_lr)


('Maximum Score =', 0.8804874958992606, ' with n_estimators =', 15, 'and learning rate =', 1.0)


In [26]:
clf = AdaBoostClassifier(n_estimators = best_ne, learning_rate=best_lr, random_state = 100)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("AdaBoost Accuracy:", accuracy_score(y_test, preds))
print("AdaBoost CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("AdaBoost Precision:", precision_score(y_test, preds, average="weighted"))
print("AdaBoost Recall:", recall_score(y_test, preds, average="weighted"))
print("AdaBoost F1-Score:", f1_score(y_test, preds, average="weighted"))

('AdaBoost Accuracy:', 0.8918918918918919)
('AdaBoost CV Score:', 0.8557142857142856)
('AdaBoost Precision:', 0.9039039039039038)
('AdaBoost Recall:', 0.8918918918918919)
('AdaBoost F1-Score:', 0.8590355060943297)


In [27]:
# Example starting point. Try investigating other evaluation techniques!
# from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [21]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)