In [33]:
import sys
import pickle
import numpy as np
import pandas as pd

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
import tester 
import sklearn
import scipy

from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

print("I have used the following Versions:")
print("Numpy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("SkLearn Version:", sklearn.__version__)
print("Scipy Version:", scipy.__version__)

I have used the following Versions:
('Numpy Version:', '1.16.2')
('Pandas Version:', u'0.24.2')
('SkLearn Version:', '0.20.3')
('Scipy Version:', '1.2.1')


In [2]:
# Conversion code

# content = ''
# outsize = 0
# with open('final_project_dataset.pkl', 'rb') as infile:
#     content = infile.read()
# with open('final_project_dataset_new.pkl', 'wb') as output:
#     for line in content.splitlines():
#         outsize += len(line) + 1
#         output.write(line + str.encode('\n'))

# print("Done. Saved %s bytes." % (len(content)-outsize))

### Task 1: Select what features you'll use.

In [34]:
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi',
                'salary',
                'bonus', 
                'long_term_incentive', 
                'deferred_income', 
                'deferral_payments',
                'loan_advances', 
                'other',
                'expenses', 
                'director_fees',
                'total_payments',
                'exercised_stock_options',
                'restricted_stock',
                'restricted_stock_deferred',
                'total_stock_value',
                'to_messages',
                'from_messages',
                'from_this_person_to_poi',
                'from_poi_to_this_person']
### Load the dictionary containing the dataset
# with open("final_project_dataset.pkl", "r") as data_file:
#     data_dict = pickle.load(data_file)
# data_dict

with open('final_project_dataset_new.pkl', 'rb') as f:
    data_dict = pickle.load(f)

name_keys = sorted(list(data_dict.keys()))
rows = len(name_keys)
class_counts = data_frame["poi"].value_counts()
class_priors = class_counts / rows
print(class_counts)
print(class_priors)

False    126
True      17
Name: poi, dtype: int64
False    0.863014
True     0.116438
Name: poi, dtype: float64


In [4]:
# Converting Dictionary to Numpy Array
data_frame = pd.DataFrame.from_dict(data_dict, orient = 'index')
#Order columns in DataFrame, exclude email column
data_frame = data_frame[features_list]
data_frame = data_frame.replace('NaN', np.nan)
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 19 columns):
poi                          146 non-null bool
salary                       95 non-null float64
bonus                        82 non-null float64
long_term_incentive          66 non-null float64
deferred_income              49 non-null float64
deferral_payments            39 non-null float64
loan_advances                4 non-null float64
other                        93 non-null float64
expenses                     95 non-null float64
director_fees                17 non-null float64
total_payments               125 non-null float64
exercised_stock_options      102 non-null float64
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
total_stock_value            126 non-null float64
to_messages                  86 non-null float64
from_messages                86 non-null float64
from_this_person_to_poi      86 non-null float

In [5]:
#split of POI and non-POI in the dataset
poi_non_poi = data_frame.poi.value_counts()
poi_non_poi.index=['non-POI', 'POI']
print "POI / non-POI split"
poi_non_poi

POI / non-POI split


non-POI    128
POI         18
Name: poi, dtype: int64

In [6]:
# Removing unrequired columns
# poi_names = data_frame.pop('poi_name')
# poi_labels = data_frame.pop('poi')
# emails = data_frame.pop('email_address')

### Task 2: Remove outliers

In [7]:
# Counting NaN Values for each Column
nan_vals = data_frame.isnull().sum(axis = 0)
print(nan_vals)

poi                            0
salary                        51
bonus                         64
long_term_incentive           80
deferred_income               97
deferral_payments            107
loan_advances                142
other                         53
expenses                      51
director_fees                129
total_payments                21
exercised_stock_options       44
restricted_stock              36
restricted_stock_deferred    128
total_stock_value             20
to_messages                   60
from_messages                 60
from_this_person_to_poi       60
from_poi_to_this_person       60
dtype: int64


In [8]:
from sklearn.preprocessing import Imputer

# Replacing 'NaN' in financial features with 0
data_frame.iloc[:,:15] = data_frame.iloc[:,:15].fillna(0)

email_features = ['to_messages', 'from_messages', 'from_this_person_to_poi', 'from_poi_to_this_person']

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

#impute missing values of email features 
data_frame.loc[data_frame[data_frame.poi == 1].index,email_features] = imp.fit_transform(data_frame[email_features][data_frame.poi == 1])
data_frame.loc[data_frame[data_frame.poi == 0].index,email_features] = imp.fit_transform(data_frame[email_features][data_frame.poi == 0])



In [9]:
outliers = data_frame.quantile(.5) + 1.5 * (data_frame.quantile(.75)-data_frame.quantile(.25))
pd.DataFrame((data_frame[1:] > outliers[1:]).sum(axis = 1), columns = ['# of outliers']).\
    sort_values('# of outliers',  ascending = [0]).head(7)

# exclude 3 outliers from the data set
data_frame = data_frame.drop(['TOTAL', 'SKILLING JEFFREY K', 'FREVERT MARK A'],0)

In [40]:
### Task 3: Create new feature(s)
#Create new feature(s)
data_frame["fraction_from_poi"] = data_frame["from_poi_to_this_person"].\
divide(data_frame["to_messages"], fill_value = 0)

data_frame["fraction_to_poi"] = data_frame["from_this_person_to_poi"].\
divide(data_frame["from_messages"], fill_value = 0)

data_frame["fraction_from_poi"] = data_frame["fraction_from_poi"].fillna(0.0)
data_frame["fraction_to_poi"] = data_frame["fraction_to_poi"].fillna(0.0)

### Store to my_dataset for easy export below.
my_dataset = data_frame.to_dict('index')

In [41]:
nan_vals = data_frame.isnull().sum(axis = 0)
print(nan_vals)

poi                          0
salary                       0
bonus                        0
long_term_incentive          0
deferred_income              0
deferral_payments            0
loan_advances                0
other                        0
expenses                     0
director_fees                0
total_payments               0
exercised_stock_options      0
restricted_stock             0
restricted_stock_deferred    0
total_stock_value            0
to_messages                  0
from_messages                0
from_this_person_to_poi      0
from_poi_to_this_person      0
fraction_from_poi            0
fraction_to_poi              0
dtype: int64


In [53]:
#Decision tree using features with non-null importance
clf = DecisionTreeClassifier(random_state = 75)
clf.fit(data_frame.iloc[:,1:], data_frame.iloc[:,:1])

# show the features with non null importance, sorted and create features_list of features for the model
features_importance = []
for i in range(len(clf.feature_importances_)):
    if clf.feature_importances_[i] > 0:
        features_importance.append([data_frame.columns[i+1], clf.feature_importances_[i]])
features_importance.sort(key=lambda x: x[1], reverse = True)
for f_i in features_importance:
    print f_i
features_list = [x[0] for x in features_importance]
features_list.insert(0, 'poi')

['fraction_to_poi', 0.3474206349206347]
['expenses', 0.32768459235546066]
['to_messages', 0.16170357031701577]
['total_stock_value', 0.08901338313103019]
['deferred_income', 0.07417781927585848]


In [57]:
# Train Test Split
x_train, x_test, y_train, y_test = train_test_split(data_frame, labels, random_state = 100)
clf = GaussianNB()
clf.fit(x_train, y_train)
poi_labels = data_frame.pop('poi')

In [58]:

preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, preds))
print("Gaussian Naive Bayes CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Gaussian Naive Bayes Precision:", precision_score(y_test, preds, average="weighted"))
print("Gaussian Naive Bayes Recall:", recall_score(y_test, preds, average="weighted"))
print("Gaussian Naive Bayes F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("KNN Accuracy:", accuracy_score(y_test, preds))
print("KNN CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("KNN Precision:", precision_score(y_test, preds, average="weighted"))
print("KNN Recall:", recall_score(y_test, preds, average="weighted"))
print("KNN F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = LogisticRegression()
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Logistic Regression Accuracy:", accuracy_score(y_test, preds))
print("Logistic Regression CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Logistic Regression Precision:", precision_score(y_test, preds, average="weighted"))
print("Logistic Regression Recall:", recall_score(y_test, preds, average="weighted"))
print("Logistic Regression F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=5, min_samples_leaf=5)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Decision Tree Accuracy:", accuracy_score(y_test, preds))
print("Decision Tree CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Decision Tree Precision:", precision_score(y_test, preds, average="weighted"))
print("Decision Tree Recall:", recall_score(y_test, preds, average="weighted"))
print("Decision Tree F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = RandomForestClassifier(n_estimators = 20, criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Random Forrest Accuracy:", accuracy_score(y_test, preds))
print("Random Forrest CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Random Forrest Precision:", precision_score(y_test, preds, average="weighted"))
print("Random Forrest Recall:", recall_score(y_test, preds, average="weighted"))
print("Random Forrest F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = AdaBoostClassifier(n_estimators = 20, random_state = 100)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("AdaBoost Accuracy:", accuracy_score(y_test, preds))
print("AdaBoost CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("AdaBoost Precision:", precision_score(y_test, preds, average="weighted"))
print("AdaBoost Recall:", recall_score(y_test, preds, average="weighted"))
print("AdaBoost F1-Score:", f1_score(y_test, preds, average="weighted"))

clf = GradientBoostingClassifier(n_estimators = 20)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("---------------------------------------------------")
print("Gradient Boosting Accuracy:", accuracy_score(y_test, preds))
print("Gradient Boosting CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("Gradient Boosting Precision:", precision_score(y_test, preds, average="weighted"))
print("Gradient Boosting Recall:", recall_score(y_test, preds, average="weighted"))
print("Gradient Boosting F1-Score:", f1_score(y_test, preds, average="weighted"))


---------------------------------------------------
('Gaussian Naive Bayes Accuracy:', 0.8888888888888888)
('Gaussian Naive Bayes CV Score:', 0.7314614121510674)
('Gaussian Naive Bayes Precision:', 0.8380952380952381)
('Gaussian Naive Bayes Recall:', 0.8888888888888888)
('Gaussian Naive Bayes F1-Score:', 0.8627450980392157)
---------------------------------------------------
('KNN Accuracy:', 0.9166666666666666)
('KNN CV Score:', 0.8743185550082101)
('KNN Precision:', 0.8402777777777778)
('KNN Recall:', 0.9166666666666666)
('KNN F1-Score:', 0.8768115942028986)
---------------------------------------------------
('Logistic Regression Accuracy:', 0.7777777777777778)
('Logistic Regression CV Score:', 0.8324137931034483)
('Logistic Regression Precision:', 0.8279569892473118)
('Logistic Regression Recall:', 0.7777777777777778)
('Logistic Regression F1-Score:', 0.8020833333333334)
---------------------------------------------------
('Decision Tree Accuracy:', 1.0)
('Decision Tree CV Score:',

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


('Random Forrest CV Score:', 0.9164532019704434)
('Random Forrest Precision:', 1.0)
('Random Forrest Recall:', 1.0)
('Random Forrest F1-Score:', 1.0)
---------------------------------------------------
('AdaBoost Accuracy:', 1.0)
('AdaBoost CV Score:', 0.9086042692939245)
('AdaBoost Precision:', 1.0)
('AdaBoost Recall:', 1.0)
('AdaBoost F1-Score:', 1.0)
---------------------------------------------------
('Gradient Boosting Accuracy:', 1.0)
('Gradient Boosting CV Score:', 0.902183908045977)
('Gradient Boosting Precision:', 1.0)
('Gradient Boosting Recall:', 1.0)
('Gradient Boosting F1-Score:', 1.0)


In [60]:
'''
Source:https://towardsdatascience.com/metrics-to-evaluate-your-machine-learning-algorithm-f10ba6e38234
Classification Accuracy is what we usually mean, when we use the term accuracy.
It is the ratio of number of correct predictions to the total number of input samples.

Precision : It is the number of correct positive results divided by the number of positive results
predicted by the classifier.

Recall : It is the number of correct positive results divided by the number of all relevant samples 
(all samples that should have been identified as positive).

'''
# I am choosing the AdaBoost Classifier

# Fine Tuning
num_estimators = range(1, 31, 2)
learning_rates = list(np.linspace(0.1, 2, 20, dtype=np.float32))

scores = np.zeros((len(num_estimators), len(learning_rates)))

for ni, ne in enumerate(num_estimators):
    for li, lr in enumerate(learning_rates):
        # Classifier with the Changing Parameters
        clf = AdaBoostClassifier(n_estimators = ne, learning_rate = lr, random_state = 100)
        clf.fit(x_train, y_train)
        preds = clf.predict(x_test)
        
        # Computing Cusomized Score
        score = accuracy_score(y_test, preds) + \
                cross_val_score(clf, data_frame, poi_labels, cv=5).mean() + \
                precision_score(y_test, preds, average="weighted") + \
                recall_score(y_test, preds, average="weighted") + \
                f1_score(y_test, preds, average="weighted")
        scores[ni][li] = score / 5.0

max_score = np.max(scores)
max_idxs = np.where(scores == max_score)
best_ne = num_estimators[max_idxs[0][0]]
best_lr = learning_rates[max_idxs[1][0]]
print("Maximum Score =", max_score, " with n_estimators =", best_ne, "and learning rate =", best_lr)

clf = AdaBoostClassifier(n_estimators = best_ne, learning_rate=best_lr, random_state = 100)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
print("AdaBoost Accuracy:", accuracy_score(y_test, preds))
print("AdaBoost CV Score:", cross_val_score(clf, data_frame, poi_labels, cv=5).mean())
print("AdaBoost Precision:", precision_score(y_test, preds, average="weighted"))
print("AdaBoost Recall:", recall_score(y_test, preds, average="weighted"))
print("AdaBoost F1-Score:", f1_score(y_test, preds, average="weighted"))

('Maximum Score =', 0.9874351395730706, ' with n_estimators =', 11, 'and learning rate =', 0.7)
('AdaBoost Accuracy:', 1.0)
('AdaBoost CV Score:', 0.9371756978653532)
('AdaBoost Precision:', 1.0)
('AdaBoost Recall:', 1.0)
('AdaBoost F1-Score:', 1.0)


In [61]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)