In [2]:
from __future__ import division
import numpy as np
import scipy as sp
import pandas as pd
import csv
import random as rn

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import KernelPCA, PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm, cross_validation, grid_search
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif, RFECV, SelectKBest

In [8]:
ls ../datasets/mushroom

Index                   agaricus-lepiota.names  index.html
agaricus-lepiota.data   expanded.Z


In [31]:
fields = []
bank_csv = "../datasets/mushroom/agaricus-lepiota.data"
with open(bank_csv, 'rb') as bcsv:
    reader = csv.reader(bcsv, delimiter=',')
    for row in reader:
        fields.append(row)

In [32]:
mucks_df = pd.DataFrame(fields)

In [33]:
mucks_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [34]:
indices = mucks_df.columns

In [35]:
# Numeric labeling of features
attrs = []

for col in indices:
    attr_vals = set(mucks_df[col])
    attrs.append(list(attr_vals))
    val_map = {val:i for val, i in zip(attrs[-1], range(len(attr_vals)))}
    mucks_df[col] = mucks_df[col].replace(val_map)

In [36]:
# 1-hot encoding
for attr, old_col_name in zip(attrs, indices):
    if len(attr) > 2:
        for label, col_name in zip(range(len(attr)), attr):
            mucks_df[col_name+str(old_col_name)] = \
                mucks_df[[old_col_name]].applymap(lambda x: 1 if x == label else 0)

        mucks_df.drop(old_col_name, axis=1, inplace=True)

In [40]:
mucks_df.head()

Unnamed: 0,0,4,6,7,8,10,16,c1,b1,f1,...,s21,v21,y21,d22,g22,m22,l22,p22,u22,w22
0,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [42]:
y = mucks_df[0].as_matrix()
mucks_df.drop(0, axis=1, inplace=True)

X = mucks_df.as_matrix()
X.shape

(8124, 112)

In [44]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
 X,y, train_size=5000, random_state=20)

# Mushroom Classifier

In [45]:
# Logit
logit_clf = LogisticRegressionCV(solver='liblinear', Cs=[.001,.01,.1,1,10,100], penalty='l1')
logit_clf.fit(X_train, y_train)
logit_clf.score(X_test, y_test)

1.0

In [46]:
# Logit
logit_clf = LogisticRegressionCV(solver='newton-cg', Cs=[.001,.01,.1,1,10,100], penalty='l2')
logit_clf.fit(X_train, y_train)
logit_clf.score(X_test, y_test)

1.0

In [48]:
# SVM 
svm_params = {'C':[.01,1,10,100,500], 'degree':[1,2,3]}
clf = svm.SVC(kernel='poly')
svm_clf = grid_search.GridSearchCV(clf, svm_params, cv=3, n_jobs=4)
svm_clf.fit(X_train,y_train)
print svm_clf.best_score_
svm_clf.best_params_

1.0


{'C': 100, 'degree': 1}

In [49]:
# Random Forests
rf_params = {"max_features":[1,2,4,6,8,12,16,20], 
             "n_estimators":[1024]}
clf = RandomForestClassifier()
rf_clf = grid_search.GridSearchCV(clf, rf_params, cv=3, n_jobs=4)
rf_clf.fit(X_train,y_train)
print rf_clf.best_score_
rf_clf.best_params_

1.0


{'max_features': 1, 'n_estimators': 1024}

In [50]:
# KNN
knn_params = {"n_neighbors":[1,2,3,5,7,10,15,25,50,100,500]}
clf = KNeighborsClassifier()
knn_clf = grid_search.GridSearchCV(clf, knn_params, cv=3, n_jobs=4)
knn_clf.fit(X_train,y_train);
print knn_clf.best_score_
knn_clf.best_params_

1.0


{'n_neighbors': 1}

In [51]:
# Gradient Boosting
gb_params = {"n_estimators":[64,128,512,1024],
            "learning_rate":[.01,.1]}
clf = GradientBoostingClassifier()
gb_clf = grid_search.GridSearchCV(clf, gb_params, cv=3, n_jobs=4)
gb_clf.fit(X_train,y_train);
print gb_clf.best_score_
gb_clf.best_params_

1.0


{'learning_rate': 0.01, 'n_estimators': 1024}