In [29]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import seaborn as sns
from seaborn import plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.learning_curve import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

### Classification Error Metric Challenges

**Settings:  Where applicable, use test_size=0.30, random_state=4444.  This will permit comparison of results across users.

*These reference the Classification Challenges.*

#### Challenge 1

For the house representatives data set, calculate the accuracy, precision, recall and f1 scores of each classifier you built (on the test set).

In [30]:
column_names = ['party', 'handicap', 'water', 'budget', 'physician', 'elsavador', 'religion', 'satellite', 'nicaraguan',
               'missile', 'immigration', 'snyfuels', 'education', 'superfund', 'crime', 'dutyfree', 'export']
cols = np.char.lower(cols)
cols = np.char.replace(cols,' ','_')
cols = np.char.replace(cols,'-','_')
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data',names = cols)

In [31]:
df.head()

Unnamed: 0,class_name,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [32]:
def f(x):
    if(x == 'y'):
        return 1
    elif(x == 'n'):
        return 0
    elif(x == '?'):
        return np.nan
    else:
        return x

In [7]:
df = (df.applymap(f)).dropna(how='any')
df.head()

Unnamed: 0,class_name,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa
5,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
8,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
19,democrat,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
23,democrat,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25,democrat,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [8]:
X = (df.iloc[:,1:])
y = df.iloc[:,0].to_frame()
X_test, X_train, y_test, y_train = train_test_split(X,y,test_size=0.3)

###### K Neighbors Classifier
4 had the highest accuracy score from challenge 7 so we will only use this

In [34]:
models = [KNeighborsClassifier(n_neighbors=4),
          LogisticRegression(),
          GaussianNB(),
          SVC(probability=True),
          DecisionTreeClassifier(),
          RandomForestClassifier()]
model_names = ['KNN', 'Logistic', 'Naive Bayes', 'SVM', 'Decision Tree', 'Random Forest']

In [37]:
accuracy = []
precision_rep = []
precision_dem = []
recall_rep = []
recall_dem = []
f1_rep = []
f1_dem = []

for model in models:
    model.fit(X_train,y_train)
    y_hat = model.predict(X_test)
    
    accuracy.append(accuracy_score(y_test, y_hat))
    precision_rep.append(precision_score(y_test, y_hat, pos_label='democrat'))
    precision_dem.append(precision_score(y_test, y_hat, pos_label='republican') )
    recall_rep.append(recall_score(y_test, y_hat, pos_label='democrat'))
    recall_dem.append(recall_score(y_test, y_hat, pos_label='republican'))
    f1_rep.append(f1_score(y_test, y_hat, pos_label='democrat'))
    f1_dem.append(f1_score(y_test, y_hat, pos_label='republican'))

  # Remove the CWD from sys.path while we load stuff.
  y = column_or_1d(y, warn=True)
  # Remove the CWD from sys.path while we load stuff.


In [40]:
accuracy_df = pd.DataFrame({'accuracy':accuracy, 
                            'precision_rep':precision_rep, 
                            'precision_dem':precision_dem,
                            'recall_rep':recall_rep, 
                            'recall_dem':recall_dem, 
                            'f1_rep':f1_rep, 
                            'f1_dem':f1_dem})
accuracy_df.set_index([model_names])

Unnamed: 0,accuracy,f1_dem,f1_rep,precision_dem,precision_rep,recall_dem,recall_rep
KNN,0.919753,0.913907,0.924855,0.873418,0.963855,0.958333,0.888889
Logistic,0.950617,0.945946,0.954545,0.921053,0.976744,0.972222,0.933333
Naive Bayes,0.950617,0.946667,0.954023,0.910256,0.988095,0.986111,0.922222
SVM,0.938272,0.934211,0.94186,0.8875,0.987805,0.986111,0.9
Decision Tree,0.882716,0.867133,0.895028,0.873239,0.89011,0.861111,0.9
Random Forest,0.95679,0.95302,0.96,0.922078,0.988235,0.986111,0.933333


#### Challenge 3

Calculate the same metrics you did in challenge 1, but this time in a cross validation scheme with the `cross_val_score` function (like in Challenge 9).

In [44]:
fpr_list = []
tpr_list = []
auc_list = []
for model in models:
    y_hat_prob = model.predict_proba(X_test)[:,1]
    fpr,tpr,_ = roc_curve(y_test,y_hat_prob,pos_label='democrat')
    fpr_list.append(fpr)
    tpr_list.append(tpr)
    auc_list.append(auc(fpr, tpr))

In [45]:
plt.figure(figsize=(16,10))
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.title('ROC Curve for All Models')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
for i in range(len(fpr_list)):
    plt.plot(fpr_list[i], tpr_list[i], label = model_names[i] + ' - AUC: ' + str(auc_list[i]))
plt.legend(loc = 0, fontsize = 15)

<matplotlib.legend.Legend at 0x7f153b4c8c88>

#### Challenge 4

For your movie classifiers, calculate the precision and recall for each class.

#### Challenge 5

Draw the ROC curve (and calculate AUC) for the logistic regression classifier from challenge 12.