In [58]:
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from numpy.random import permutation
from numpy import array_split, concatenate
from sklearn.metrics import roc_curve, auc, mean_squared_error
import pandas as pd
import numpy as np

In [59]:
class MushroomProblem:
    """K-fold cross-validation harness for a labelled categorical data set.

    Subclasses must implement ``train(X, y)`` and return a fitted object that
    exposes ``predict(X)``.
    """

    def __init__(self, data_file):
        """Load the data and integer-encode every feature column.

        Args:
            data_file: Path or file-like object accepted by ``pd.read_csv``.
                The data must have a header row that names a ``class`` column.
        """
        self.df = pd.read_csv(data_file)
        # Every column except the label is a feature.
        self.features = self.df.columns[self.df.columns != 'class']

        # Encode exactly the feature columns instead of "everything after the
        # first column", so the label is left untouched (and no feature is
        # skipped) even when 'class' is not the first column.
        for k in self.features:
            self.df[k], _ = pd.factorize(self.df[k])

        # Sorted distinct labels; position in this array is the label's code.
        sorted_cats = sorted(pd.Categorical(self.df['class']).categories)
        self.classes = np.array(sorted_cats)

    def __factorize(self, data):
        """Return integer label codes for ``data['class']``.

        Codes index into ``self.classes``, so the same label always maps to
        the same integer, even for a subset in which some label never occurs.
        Deriving codes from the subset alone would shift them in that case.

        Args:
            data: DataFrame (typically a slice of ``self.df``) with a
                ``class`` column.

        Returns:
            A 1-D integer array with one code per row of ``data``.
        """
        codes = pd.Categorical(data['class'], categories=self.classes).codes
        return np.asarray(codes, dtype=np.intp)

    def validation_data(self, fold):
        """Run k-fold cross-validation and collect the per-fold results.

        Args:
            fold: Number of folds. Must be at least 2 and smaller than the
                number of rows.

        Returns:
            A list with one ``[predictions, expected]`` pair per fold, where
            ``predictions`` is what the model trained on the other folds
            predicted for the held-out rows and ``expected`` holds the true
            label codes of those rows.

        Raises:
            AssertionError: If there are not more rows than folds.
            ValueError: If fewer than two folds are requested.
        """
        # Use the argument itself. The parameter keeps its historical name
        # ``fold`` so existing keyword callers continue to work.
        folds = fold
        df = self.df
        assert len(df) > folds, 'need more rows than folds'
        if folds < 2:
            raise ValueError('at least 2 folds are required, got %r' % (folds,))

        # Shuffle the row positions and split them into ``folds`` chunks.
        perms = array_split(permutation(len(df)), folds)

        response = []
        for i in range(folds):
            # Fold i is the test set; all remaining folds form the training set.
            test_idx = perms[i]
            train_idx = concatenate(
                [perms[j] for j in range(folds) if j != i]
            )

            # Select the rows at the given positions.
            training = df.iloc[train_idx]
            test_data = df.iloc[test_idx]

            y = self.__factorize(training)
            classifier = self.train(training[self.features], y)
            predictions = classifier.predict(test_data[self.features])

            expected = self.__factorize(test_data)
            response.append([predictions, expected])

        return response

In [60]:
class MushroomRegression(MushroomProblem):
    """Cross-validated decision-tree regression on the integer label codes."""

    def train(self, X, y):
        """Fit a ``DecisionTreeRegressor`` on ``X``/``y`` and return it."""
        return DecisionTreeRegressor().fit(X, y)

    def validate(self, folds):
        """Return the mean squared error of each cross-validation fold.

        Args:
            folds: Number of folds, forwarded to ``validation_data``.

        Returns:
            A list with one mean-squared-error value per fold.
        """
        # validation_data yields [predictions, expected] pairs; pass them to
        # mean_squared_error in its documented (y_true, y_pred) order.
        return [
            mean_squared_error(expected, predictions)
            for predictions, expected in self.validation_data(folds)
        ]

In [61]:
class MushroomClassifier(MushroomProblem):
    """Cross-validated classifier scored with one confusion matrix per fold."""

    def validate(self, folds):
        """Return a confusion matrix for each cross-validation fold.

        Args:
            folds: Number of folds, forwarded to ``validation_data``.

        Returns:
            A list of DataFrames, one per fold, with true labels on the rows
            ('actual') and predicted labels on the columns ('preds').
        """
        # validation_data yields [predictions, expected] pairs. Unpack them in
        # that order so the true labels end up on the 'actual' axis and the
        # model output on the 'preds' axis.
        return [
            self.confusion_matrix(predictions, expected)
            for predictions, expected in self.validation_data(folds)
        ]

    def confusion_matrix(self, train, test):
        """Cross-tabulate true labels against predicted labels.

        Args:
            train: Labels predicted by the trained model (column axis,
                labelled 'preds').
            test: True labels of the held-out rows (row axis, labelled
                'actual').

        Returns:
            The ``pd.crosstab`` count table of ``test`` versus ``train``.
        """
        return pd.crosstab(test, train, rownames=['actual'], colnames=['preds'])

In [62]:
class MushroomForest(MushroomClassifier):
    """Mushroom classifier backed by a random forest."""

    def train(self, X, y):
        """Fit a ``RandomForestClassifier`` (``n_jobs=2``) on ``X``/``y`` and return it."""
        forest = RandomForestClassifier(n_jobs=2)
        return forest.fit(X, y)

In [63]:
class MushroomTree(MushroomClassifier):
    """Mushroom classifier backed by a single decision tree."""

    def train(self, X, y):
        """Fit a ``DecisionTreeClassifier`` on ``X``/``y`` and return it."""
        model = DecisionTreeClassifier()
        return model.fit(X, y)

In [64]:
# CSV file to evaluate. MushroomProblem.__init__ loads it with pd.read_csv and
# looks up a 'class' column, so the file needs a header row that names it.
data = './data/decision_tree/agaricus-lepiota.data'
# Number of cross-validation folds passed to validate() in the cells below.
folds = 5

# Cross-validate a single decision tree. The cell's last expression displays
# the list of per-fold confusion matrices.
print('Calculating score for decision tree')
tree = MushroomTree(data)
tree.validate(folds)

Calculating score for decision tree


[preds     0    1
 actual          
 0       853    0
 1         0  772, preds     0    1
 actual          
 0       794    0
 1         0  831, preds     0    1
 actual          
 0       867    0
 1         0  758, preds     0    1
 actual          
 0       854    0
 1         0  771, preds     0    1
 actual          
 0       840    0
 1         0  784]

In [66]:
# Cross-validate a random forest using `data` and `folds` from the earlier cell.
# NOTE(review): `tree` is rebound to a MushroomForest here; a distinct name
# (e.g. `forest`) would be clearer.
print('Calculating score for random forest')
tree = MushroomForest(data)
tree.validate(folds)

Calculating score for random forest




[preds     0    1
 actual          
 0       835    0
 1         0  790, preds     0    1
 actual          
 0       828    0
 1         0  797, preds     0    1
 actual          
 0       846    0
 1         0  779, preds     0    1
 actual          
 0       856    0
 1         0  769, preds     0    1
 actual          
 0       843    0
 1         0  781]

In [67]:
# Cross-validate the decision-tree regressor using `data` and `folds` from the
# earlier cell; the displayed list holds one mean squared error per fold.
# NOTE(review): `tree` is rebound to a MushroomRegression here; a distinct
# name would be clearer.
print('Calculating score for regression')
tree = MushroomRegression(data)
tree.validate(folds)

Calculating score for regression


[0.0, 0.0, 0.0, 0.0, 0.0]