In [3]:
#!/usr/bin/python
import numpy as np
import sys
import pickle
sys.path.append("../tools/")
import pandas

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 1: Remove outliers

# due to the samll dataset, it is hard to identify the outliers
# the only known outlier is "total" in the table, so it should be removed.
# remove the outlier called "total"
df = pandas.DataFrame.from_records(list(data_dict.values()))
employees = pandas.Series(list(data_dict.keys()))
df.set_index(employees, inplace=True)

# substitue the string NaN to np.nan
df.replace(to_replace = "NaN", value = np.nan, inplace = True) 

# drop 2 meanless row
df = df.drop("TOTAL", axis = 0)
df = df.drop("THE TRAVEL AGENCY IN THE PARK", axis =0)    
    
# drop the people who have less info
for i in df.index:
        if df.ix[i].count() < 8:
            df = df.drop(i, axis = 0)

# 'if there is no stock, set the stock relevant data to 0
df['exercised_stock_options'].replace(to_replace = np.nan, value = 0, inplace = True)
df['restricted_stock'].replace(to_replace = np.nan, value = 0, inplace = True)
df['restricted_stock_deferred'].replace(to_replace = np.nan, value = 0, inplace = True)
df['total_stock_value'].replace(to_replace = np.nan, value = 0, inplace = True)

#total payment will refer to data before, cannot be median
df['total_payments'].replace(to_replace = np.nan, value = 0, inplace = True)
    
# drop the feature with so many NaN --over 100 NaN
df = df.drop('loan_advances', axis = 1)
df = df.drop('restricted_stock_deferred', axis = 1)
#df = df.drop('director_fees', axis = 1)
df = df.drop('deferral_payments', axis = 1)
#df = df.drop('deferred_income', axis = 1)


# drop the useless feature - email_address
df = df.drop('email_address', axis = 1)
 


# fill the NaN to median
df.replace(to_replace = np.nan, value = df.median(), inplace = True)


### Task 2: Create new feature(s) and adjust feature(s)

# add new variable poi msgs and receipt number
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

n_from_poi = scaler.fit_transform(df.from_poi_to_this_person.values.reshape(-1,1))
n_to_poi = scaler.fit_transform(df.from_this_person_to_poi.values.reshape(-1,1))
n_shared_receipt_poi = scaler.fit_transform(df.shared_receipt_with_poi .values.reshape(-1,1))
poi_email_receipt = n_from_poi + n_to_poi+n_shared_receipt_poi
# change list to series
poi_email_receipt = [n[0] for n in list(poi_email_receipt)]
# add new feature to df
df['poi_email_receipt'] = pandas.Series(poi_email_receipt, index = df.index)

# add new feature poi_mail_ratio
n_to_msgs = scaler.fit_transform(df.to_messages.values.reshape(-1,1))
n_from_msgs = scaler.fit_transform(df.from_messages.values.reshape(-1,1))

poi_mails = n_from_poi + n_to_poi
poi_mails = [n[0] for n in list(poi_mails)]
poi_mails = pandas.Series(poi_mails)

all_mails = n_to_msgs + n_from_msgs
all_mails = [n[0] for n in list(all_mails)]
all_mails = pandas.Series(all_mails)

poi_mail_ratio =  list((poi_mails+1)/(all_mails+1))
df['poi_mail_ratio'] = pandas.Series(poi_mail_ratio, index = df.index)

df['pct_from_poi'] = df['from_poi_to_this_person']/(df['from_messages'] + 1)
df['pct_to_poi'] = df['from_this_person_to_poi']/(df['from_messages'] + 1)

# bonus & stk feature
n_bonus = scaler.fit_transform(df.bonus.values.reshape(-1,1))
n_total_stock_value = scaler.fit_transform(df.total_stock_value.values.reshape(-1,1))

pct_bonus_w_stk = n_bonus + n_total_stock_value
pct_bonus_w_stk = [n[0] for n in list(pct_bonus_w_stk)]
df['pct_bonus_w_stk'] = pandas.Series(pct_bonus_w_stk, index = df.index)


### Store to my_dataset for easy export below.
data_dict = dict(df.to_dict('index'))
my_dataset = data_dict



### Task 3: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# in this part, i'll going to select the feature by deciesion tree or selectKbest 
# in part 4, so i put all feautre except loan_advances.
features = list(df.columns.values)
features.remove('poi')
features_list = ['poi'] + features

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
#from sklearn.svm import SVC # poor
#from sklearn import tree #DecisionTreeClassifier()
#from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

#make pipelines
scaler = MinMaxScaler()
skb = SelectKBest()
pca = PCA()
gbclf = GradientBoostingClassifier(loss = 'exponential')

pipeline = Pipeline([
        ('scaler', scaler),
        ('skb', skb),
        ('pca', pca),
        ('gbclf', gbclf)
        ])


### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

from pprint import pprint

# Build StratifiedShuffleSplit cv
from sklearn.cross_validation import StratifiedShuffleSplit
cv_sss = StratifiedShuffleSplit(labels, 10, random_state=42)

# Build GridCV parameter
# pre-processing
from sklearn.grid_search import GridSearchCV

# parameter for select k best
k = [k for k in range(10,17)]
# parameter for pca
c = [x for x in range(2,5)]
# parameter for gbclf
gbclf_lr =[0.1, 0.2, 0.3, 0.4]
gbclf_msl = [gbclf_msl for gbclf_msl in range(2,5)]
gbclf_md = [gbclf_md for gbclf_md in range(6,8)]

param_grid = {'skb__k': k,
              'pca__n_components': [3],
              'gbclf__n_estimators':[100],
              'gbclf__learning_rate': [0.4],
              'gbclf__max_depth': [7],
              'gbclf__min_samples_leaf': [3],
             }

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

gridCV_object = GridSearchCV(estimator = pipeline, 
                             param_grid = param_grid,
                             scoring = 'f1',
                             cv = StratifiedShuffleSplit(labels_train, 10, random_state=42))



gridCV_object.fit(features_train, labels_train)
clf = gridCV_object.best_estimator_


### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

### Task 1: Remove outliers

In [160]:
### !/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")
import pandas
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 1: Remove outliers

# due to the samll dataset, it is hard to identify the outliers
# the only known outlier is "total" in the table, so it should be removed.
# remove the outlier called "total"
df = pandas.DataFrame.from_records(list(data_dict.values()))
employees = pandas.Series(list(data_dict.keys()))
df.set_index(employees, inplace=True)

# substitue the string NaN to np.nan
df.replace(to_replace = "NaN", value = np.nan, inplace = True) 

# drop 2 meanless row
df = df.drop("TOTAL", axis = 0)
df = df.drop("THE TRAVEL AGENCY IN THE PARK", axis =0)    
    
# drop the people who have less info
for i in df.index:
        if df.ix[i].count() < 8:
            df = df.drop(i, axis = 0)

# 'if there is no stock, set the stock relevant data to 0
df['exercised_stock_options'].replace(to_replace = np.nan, value = 0, inplace = True)
df['restricted_stock'].replace(to_replace = np.nan, value = 0, inplace = True)
df['restricted_stock_deferred'].replace(to_replace = np.nan, value = 0, inplace = True)
df['total_stock_value'].replace(to_replace = np.nan, value = 0, inplace = True)

#total payment will refer to data before, cannot be median
df['total_payments'].replace(to_replace = np.nan, value = 0, inplace = True)
    
# drop the feature with so many NaN --over 100 NaN
df = df.drop('loan_advances', axis = 1)
df = df.drop('restricted_stock_deferred', axis = 1)
#df = df.drop('director_fees', axis = 1)
df = df.drop('deferral_payments', axis = 1)
#df = df.drop('deferred_income', axis = 1)


# drop the useless feature - email_address
df = df.drop('email_address', axis = 1)
 


# fill the NaN to median
df.replace(to_replace = np.nan, value = df.median(), inplace = True)

### Task 2: Create new feature(s) and adjust feature(s)

In [161]:
### Task 2: Create new feature(s) and adjust feature(s)

# add new variable poi msgs and receipt number
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

n_from_poi = scaler.fit_transform(df.from_poi_to_this_person.values.reshape(-1,1))
n_to_poi = scaler.fit_transform(df.from_this_person_to_poi.values.reshape(-1,1))
n_shared_receipt_poi = scaler.fit_transform(df.shared_receipt_with_poi .values.reshape(-1,1))
poi_email_receipt = n_from_poi + n_to_poi+n_shared_receipt_poi
# change list to series
poi_email_receipt = [n[0] for n in list(poi_email_receipt)]
# add new feature to df
df['poi_email_receipt'] = pandas.Series(poi_email_receipt, index = df.index)

# add new feature poi_mail_ratio
n_to_msgs = scaler.fit_transform(df.to_messages.values.reshape(-1,1))
n_from_msgs = scaler.fit_transform(df.from_messages.values.reshape(-1,1))

poi_mails = n_from_poi + n_to_poi
poi_mails = [n[0] for n in list(poi_mails)]
poi_mails = pandas.Series(poi_mails)

all_mails = n_to_msgs + n_from_msgs
all_mails = [n[0] for n in list(all_mails)]
all_mails = pandas.Series(all_mails)

poi_mail_ratio =  list((poi_mails+1)/(all_mails+1))
df['poi_mail_ratio'] = pandas.Series(poi_mail_ratio, index = df.index)

df['pct_from_poi'] = df['from_poi_to_this_person']/(df['from_messages'] + 1)
df['pct_to_poi'] = df['from_this_person_to_poi']/(df['from_messages'] + 1)

# bonus & stk feature
n_bonus = scaler.fit_transform(df.bonus.values.reshape(-1,1))
n_total_stock_value = scaler.fit_transform(df.total_stock_value.values.reshape(-1,1))

pct_bonus_w_stk = n_bonus + n_total_stock_value
pct_bonus_w_stk = [n[0] for n in list(pct_bonus_w_stk)]
df['pct_bonus_w_stk'] = pandas.Series(pct_bonus_w_stk, index = df.index)


### Store to my_dataset for easy export below.
data_dict = dict(df.to_dict('index'))

In [162]:
print df.head()

                      bonus  deferred_income  exercised_stock_options  \
METTS MARK         600000.0        -151927.0                1297049.0   
BAXTER JOHN C     1200000.0       -1386055.0                6680544.0   
ELLIOTT STEVEN     350000.0        -400729.0                4890344.0   
CORDES WILLIAM R   750000.0        -151927.0                 651850.0   
HANNON KEVIN P    1500000.0       -3117011.0                5538001.0   

                  expenses  from_messages  from_poi_to_this_person  \
METTS MARK         94299.0           29.0                     38.0   
BAXTER JOHN C      11200.0           41.0                     35.0   
ELLIOTT STEVEN     78552.0           41.0                     35.0   
CORDES WILLIAM R   46547.5           12.0                     10.0   
HANNON KEVIN P     34039.0           32.0                     32.0   

                  from_this_person_to_poi  long_term_incentive      other  \
METTS MARK                            1.0             422158.0 

In [54]:
print df.columns.values
print len(df.columns.values) - 1

['bonus' 'deferral_payments' 'deferred_income' 'director_fees'
 'exercised_stock_options' 'expenses' 'from_messages'
 'from_poi_to_this_person' 'from_this_person_to_poi' 'long_term_incentive'
 'other' 'poi' 'restricted_stock' 'restricted_stock_deferred' 'salary'
 'shared_receipt_with_poi' 'to_messages' 'total_payments'
 'total_stock_value' 'poi_email_receipt' 'poi_mail_ratio' 'pct_from_poi'
 'pct_to_poi']
22


### Task 3: Select what features you'll use.

In [55]:
### Task 3: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# in this part, i'll going to select the feature by deciesion tree or selectKbest 
# in part 4, so i put all feautre except loan_advances.
features = list(df.columns.values)
features.remove('poi')
features_list = ['poi'] + features

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [21]:
# code from vivek_29420285151271 to replace f1 as scoring criterion

def precision_recall(labels,predictions):
    ind_true_pos = [i for i in range(0,len(labels)) if (predictions[i]==1) & (labels[i]==1)]
    ind_false_pos = [i for i in range(0,len(labels)) if ((predictions[i]==1) & (labels[i]==0))]
    ind_false_neg = [i for i in range(0,len(labels)) if ((predictions[i]==0) & (labels[i]==1))]
    ind_true_neg = [i for i in range(0,len(labels)) if ((predictions[i]==0) & (labels[i]==0))]
    precision = 0
    recall = 0
    
    
    ind_labels = [i for i in range(0,len(labels)) if labels[i]==1]
    
    if len(ind_labels) !=0:
        if float( len(ind_true_pos) + len(ind_false_pos))!=0:
            precision = float(len(ind_true_pos))/float( len(ind_true_pos) + len(ind_false_pos))
        if float( len(ind_true_pos) + len(ind_false_neg))!=0:
            recall = float(len(ind_true_pos))/float( len(ind_true_pos) + len(ind_false_neg))
        return precision, recall
    else:
        return -1,-1

def custom_scorer(labels, predictions):
    precision,recall = precision_recall(labels,predictions)
    min_score = min(precision, recall)
    return min_score

### Task 4: Try a varity of classifiers

In [13]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
#from sklearn.svm import SVC # poor
#from sklearn import tree #DecisionTreeClassifier()
#from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

#make pipelines
scaler = MinMaxScaler()
skb = SelectKBest()
pca = PCA()
gbclf = GradientBoostingClassifier(loss = 'exponential')

pipeline = Pipeline([
        ('scaler', scaler),
        ('skb', skb),
        ('pca', pca),
        ('gbclf', gbclf)
        ])


### Task 5: Tune your classifier

In [32]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

from pprint import pprint

# Build StratifiedShuffleSplit cv
from sklearn.cross_validation import StratifiedShuffleSplit
cv_sss = StratifiedShuffleSplit(labels, 10, random_state=42)

# Build GridCV parameter
# pre-processing
from sklearn.grid_search import GridSearchCV

# parameter for select k best
k = [k for k in range(10,17)]
# parameter for pca
c = [x for x in range(2,5)]
# parameter for gbclf
gbclf_lr =[0.1, 0.2, 0.3, 0.4]
gbclf_msl = [gbclf_msl for gbclf_msl in range(2,5)]
gbclf_md = [gbclf_md for gbclf_md in range(6,8)]

param_grid = {'skb__k': [10],
              'pca__n_components': [3],
              'gbclf__n_estimators':[100],
              'gbclf__learning_rate': [0.1,0.2,0.3,0.4],
              'gbclf__max_depth': [6,7],
              'gbclf__min_samples_leaf': [2,3,4],
             }

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

gridCV_object = GridSearchCV(estimator = pipeline, 
                             param_grid = param_grid,
                             scoring = 'f1',
                             cv = StratifiedShuffleSplit(labels_train, 10, random_state=42))

gridCV_object.fit(features_train, labels_train)
clf = gridCV_object.best_estimator_

### Check scores

In [2]:
print gridCV_object.best_params_

{'pca__n_components': 3, 'gbclf__learning_rate': 0.4, 'skb__k': 10, 'gbclf__n_estimators': 100, 'gbclf__max_depth': 7, 'gbclf__min_samples_leaf': 3}


In [125]:
k_bestfeatures = clf.named_steps['skb']
kbest_feautures_list = list(df.columns.values)
kbest_feautures_list.remove('poi')
print kbest_feautures_list
print k_bestfeatures.scores_



['bonus', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'long_term_incentive', 'other', 'restricted_stock', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value', 'poi_email_receipt', 'poi_mail_ratio', 'pct_from_poi', 'pct_to_poi', 'pct_bonus_w_stk']
[ 15.6545407    6.61322022   0.17721519   6.89673177   0.02370564
   0.21372759   4.44827252   2.04978159   0.64425933   0.07093984
   5.00725905   6.5145399    5.56763188   0.29109633   5.18520557
   7.30340226   6.00890174   7.21247411   1.65296169   4.09017607
  17.4141019 ]


In [115]:

from sklearn import metrics 
from sklearn.metrics import classification_report

pred = clf.predict(features_test)
print "\nBest estimator accuracy score:"
print clf.score(features_test, pred)
print "Best estimator precision score: "
print metrics.precision_score(labels_test, pred)
print "Best estimator recall score "
print metrics.recall_score(labels_test, pred,)

print "\n", classification_report(labels_test, pred)


Best estimator accuracy score:
1.0
Best estimator precision score: 
0.363636363636
Best estimator recall score 
0.666666666667

             precision    recall  f1-score   support

        0.0       0.92      0.76      0.83        29
        1.0       0.36      0.67      0.47         6

avg / total       0.82      0.74      0.77        35

