In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn import preprocessing

# Exercises. Part 2: Poisonous Mushrooms

In [None]:
# The CSV has no header row, so the 23 column labels are supplied explicitly.
column_names = [
    'edible',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
mush = pd.read_csv('agaricus-lepiota.data.csv', header=None, names=column_names)
mush.head()

In [None]:
# Analyze data: how many rows fall into each 'stalk-root' category
stalk_root_counts = mush['stalk-root'].value_counts()
stalk_root_counts

In [None]:
# Print the frame summary (column names, non-null counts, dtypes)
mush.info()

In [None]:
# Build a de-duplicated copy (mush itself is left unchanged) and print its summary;
# its entry count can be compared with the one reported by mush.info().
duplicated_values = mush.drop_duplicates(keep='first')
duplicated_values.info()  # author's observation: NO DUPLICATED VALUES

In [None]:
# Dependent variable: the 'edible' label column.
y = mush['edible']
# Independent variables: every column except the label and 'stalk-root'.
X = mush.drop(columns=['edible', 'stalk-root'])

In [None]:
# Count missing values (NaN) per feature column
X.isnull().sum()

In [None]:
# Encode a categorical variable as numeric indicator (one-hot) columns
def one_hot(cat, frame=None):
    """Append one-hot indicator columns for column ``cat`` to a DataFrame.

    Parameters
    ----------
    cat : str
        Name of the categorical column to encode.
    frame : pandas.DataFrame, optional
        Frame to encode. When omitted, the notebook-level ``X`` is used, which
        is the original behaviour of this helper.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of the input followed by one
        indicator column per distinct value of ``cat``, named
        ``<cat>_<value>``. The source column ``cat`` is kept, and the input
        frame is not modified.
    """
    if frame is None:
        # Backwards-compatible fallback: read the notebook global at call time.
        frame = X
    dummies = pd.get_dummies(frame[cat], prefix=cat)
    return pd.concat([frame, dummies], axis=1)

In [None]:
# Split the feature columns by dtype: numeric (int/float) vs. categorical (object).
# Both results are DataFrames holding the selected columns.
num_features = X.select_dtypes(include=['int', 'float'])
cat_features = X.select_dtypes(include=['object'])

In [None]:
# One-hot encode every categorical column in turn. one_hot() reads the current
# X on each call, so the indicator columns accumulate across iterations.
for cat_var in cat_features.columns:
    X = one_hot(cat_var)

In [None]:
# Drop the original categorical columns now that their indicator columns exist.
# X is rebound to a new frame instead of being mutated with inplace=True, and
# the labels are passed as an explicit list of column names rather than as a
# whole DataFrame.
X = X.drop(columns=list(cat_features))

In [None]:
# Preview the first five rows of the encoded feature matrix
X.head(5)

In [None]:
# Encode the target as integers: poisonous ('p') -> 0, edible ('e') -> 1.
# The labels are mapped from the source column instead of being overwritten
# through boolean-mask assignment on a Series taken from mush: depending on the
# pandas version, that assignment can write through to mush['edible'] and emit
# SettingWithCopyWarning. Mapping also makes the cell safe to re-run.
# Any label other than 'p'/'e' becomes NaN and will surface in the integer cast.
y = mush['edible'].map({'p': 0, 'e': 1})
y.head()

In [None]:
# Cast the encoded target to an integer dtype
y = y.astype(int)

In [None]:
# Sanity check: features and target should have the same number of rows
feature_shape, target_shape = X.shape, y.shape
print(feature_shape, target_shape)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

# KNN Model

In [None]:
# Build KNN model
# Unfitted k-nearest-neighbours classifier using 10 neighbours.
knn = KNeighborsClassifier(n_neighbors=10)

In [None]:
# Candidate neighbour counts K = 1 .. 40 inclusive. range() excludes its stop
# value, so the stop must be 41 for K = 40 to be evaluated.
k_range = range(1, 41)
k_scores = []

# For each K, run 5-fold cross-validation on the training split and record the
# mean ROC AUC across the folds.
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores_knn = cross_val_score(knn, X_train, y_train, cv=5, scoring='roc_auc')
    k_scores.append(scores_knn.mean())

In [None]:
# Zero-based position of the highest mean score in k_scores
index_max = np.asarray(k_scores).argmax()
index_max

In [None]:
# Best mean cross-validated score found for KNN
best_knn_score = k_scores[index_max]
best_knn_score

In [None]:
# Author's finding: index 0 (the first entry of k_range, i.e. K = 1) is the
# best K, with a ROC_AUC of 1.

# Plot the mean cross-validated score against K
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validated ROC_AUC')
plt.show()

# Naive Bayes Model

In [None]:
# Smoothing strengths to compare for Bernoulli Naive Bayes
alpha = [0.01, 0.1, 1, 10, 100]
n_scores = []

for a in alpha:
    # alpha is passed by keyword: BernoulliNB's constructor parameters are
    # keyword-only in current scikit-learn, so BernoulliNB(a) raises TypeError.
    bnl = BernoulliNB(alpha=a)
    # 5-fold cross-validation on the training split, scored by ROC AUC
    scores_nb = cross_val_score(bnl, X_train, y_train, cv=5, scoring='roc_auc')
    mean_score = scores_nb.mean()
    n_scores.append(mean_score)
    print(mean_score)

In [None]:
# Best mean cross-validated score found for Naive Bayes
best_nb_score = max(n_scores)
best_nb_score

In [None]:
# Plot the mean cross-validated score against alpha (log-scaled x axis)
fig, ax = plt.subplots()
ax.semilogx(alpha, n_scores)
ax.set_xlabel('Value of alpha for Naive Bayes')
ax.set_ylabel('Cross-Validated ROC_AUC')
plt.show()

In [None]:
# alpha = 0.01 is the best of the values tried, giving a cross-validated ROC_AUC of 0.99898

# SVM Model

In [None]:
# Candidate values for SVC's regularisation parameter C. The list keeps the
# name `alpha` because the plotting cell that follows reads it under that name.
alpha = [0.001, 0.1, 1, 10, 100, 1000]
s_scores = []

for a in alpha:
    # The estimator is named `svc` so it does not shadow the `svm` module
    # imported from sklearn at the top of the notebook.
    # probability=True is omitted here: the 'roc_auc' scorer ranks samples with
    # decision_function, so the extra probability calibration would only slow
    # down every fit without changing the score.
    svc = SVC(gamma='scale', C=a)
    scores_svm = cross_val_score(svc, X_train, y_train, cv=5, scoring='roc_auc')
    mean_score = scores_svm.mean()
    s_scores.append(mean_score)
    print(mean_score)

In [None]:
# Best mean cross-validated score found for the SVM
best_svm_score = max(s_scores)
best_svm_score

In [None]:
# Plot the mean cross-validated score against C (log-scaled x axis).
# The values in `alpha` are the ones passed to SVC as C, so the axis is
# labelled C rather than alpha.
plt.semilogx(alpha, s_scores)
plt.xlabel('Value of C for SVM')
plt.ylabel('Cross-Validated ROC_AUC')
plt.show()

In [None]:
# Every value of C (stored in the list named `alpha`) greater than 0.1 is equally good: each gives a ROC_AUC of 1

# Best overall Model

In [None]:
# Applying best model: SVC with C = 10, fitted on the full training split.
# probability=True is required because predict_proba is called on this model
# afterwards. random_state is fixed because the probability calibration
# shuffles the data internally; without a seed the predicted probabilities
# (and the ROC curve built from them) can differ between runs.
best_model = SVC(gamma='scale', C=10, probability=True, random_state=0)
# fit() returns the estimator itself, so `train` is another name for best_model.
train = best_model.fit(X_train, np.ravel(y_train))

In [None]:
# Predictions: take the second probability column (index 1) for every test row,
# then compute the ROC curve points from those scores.
proba_test = best_model.predict_proba(X_test)
predictions_test = proba_test[:, 1]
fpr_test, tpr_test, _ = metrics.roc_curve(y_test, predictions_test)

In [None]:
# Plot the test-set ROC curve (true positive rate against false positive rate)
fig, ax = plt.subplots()
ax.plot(fpr_test, tpr_test, 'r', linewidth=2, markersize=4)
ax.set_title('ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
plt.show()