Kaggle: https://www.kaggle.com/jiuzhang/ninechapter-rf

Tutorial: https://www.kaggle.com/jiuzhang/aicamp-ensemble-exercise-2-complete-version

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

In [2]:
# Location of the mushroom dataset CSV. Despite the `_DIR` suffix, this is a
# path to a single file, not a directory.
TRAIN_DIR = './input/mushrooms.csv'

# Exploratory Data Analysis - Take a Glance at the Data

In [3]:
# Read the comma-separated dataset; row 0 of the file supplies the column names.
train = pd.read_csv(TRAIN_DIR, header=0, sep=',')

In [4]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
train.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# (number of rows, number of columns) of the loaded frame.
train.shape

(8124, 23)

# Prepare Data

In [6]:
def label_encode_data(data):
    """One-hot encode every column of a categorical DataFrame.

    Each column gets its own ``LabelEncoder`` fitted on the column's values
    plus an extra ``'missing'`` category, which also replaces null entries.
    The per-column indicator blocks are laid side by side in column order,
    so a column without nulls still contributes one all-zero ``'missing'``
    indicator.

    Args:
        data: pandas DataFrame of categorical values. It is left unmodified.

    Returns:
        A float NumPy array of shape ``(len(data), total_classes)``, where
        ``total_classes`` is the sum of the class counts of all encoders.
        Every row holds exactly one ``1`` per input column.
    """
    # Fill nulls on a private copy so the caller's frame (which may itself be
    # a slice of a larger frame) is never written to.
    data = data.copy()

    # Create Encoders
    encoders = {}
    for col in data.columns:
        data.loc[data[col].isnull(), col] = 'missing'

        encoder = LabelEncoder()
        # Always register 'missing' so the category exists even when the
        # column contains no nulls.
        values = data[col].tolist() + ['missing']
        encoder.fit(values)
        encoders[col] = encoder

    # Calculate Dimension
    dim = sum(len(encoder.classes_) for encoder in encoders.values())

    # Create X
    num_sample = data.shape[0]
    X = np.zeros((num_sample, dim))
    rows = np.arange(num_sample)
    col_num = 0  # offset of the current column's indicator block inside X
    for col, encoder in encoders.items():
        X[rows, encoder.transform(data[col]) + col_num] = 1
        col_num += len(encoder.classes_)

    return X

In [7]:
# Features: every column after the first, one-hot encoded.
X = label_encode_data(train.iloc[:, 1:])
# Target: first column, encoded as 0 for 'e' and 1 for any other value.
y = train.iloc[:, 0].ne('e').astype('int64').to_numpy()
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (8124, 139)
y shape: (8124,)


# Models

In [8]:
from sklearn.model_selection import cross_val_score

1.Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with default settings, scored by 8-fold
# cross-validation and averaged over the folds.
logreg = LogisticRegression()
np.mean(cross_val_score(logreg, X, y, cv=8))

0.9260029188161825

2.Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree capped at depth 6, scored by 8-fold cross-validation
# and averaged over the folds.
dt = DecisionTreeClassifier(max_depth=6)
np.mean(cross_val_score(dt, X, y, cv=8))

0.9655492416896163

3.Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 trees. A fixed random_state makes the bootstrap samples
# and feature draws, and therefore the reported score, repeatable across runs.
rf = RandomForestClassifier(n_estimators=20, random_state=0)
cross_val_score(rf, X, y, cv=8).mean()

0.9475670067103681

4.Bagged Tree

In [12]:
from sklearn.base import BaseEstimator
class BaggedTreeClassifier(BaseEstimator):
    """Bagging ensemble of depth-5 decision trees.

    Every tree is trained on a bootstrap sample (rows drawn with replacement)
    of the training data. Predictions are the rounded average of the trees'
    predictions, so the labels are assumed to be encoded as 0 and 1.

    NOTE(review): this class does not derive from ``ClassifierMixin``, so
    scikit-learn helpers may not treat it as a classifier (for example when
    choosing between stratified and plain K-fold for an integer ``cv``).
    Confirm before comparing its scores with built-in classifiers.

    Args:
        M: number of trees in the ensemble; must be at least 1.
    """

    def __init__(self, M):
        self.M = M

    def fit(self, X, y):
        """Fit ``M`` trees, each on its own bootstrap sample of ``(X, y)``.

        Args:
            X: 2-D NumPy array of features, one row per sample.
            y: 1-D NumPy array of labels aligned with the rows of ``X``.

        Returns:
            self, following the scikit-learn estimator convention so calls
            can be chained.

        Raises:
            ValueError: if ``M`` is smaller than 1.
        """
        if self.M < 1:
            raise ValueError(f'M must be a positive integer, got {self.M!r}')

        N = len(X)
        self.models = []
        for _ in range(self.M):
            # Bootstrap: N row indices drawn with replacement.
            idx = np.random.choice(N, size=N, replace=True)
            model = DecisionTreeClassifier(max_depth=5)
            model.fit(X[idx], y[idx])
            self.models.append(model)
        return self

    def predict(self, X):
        """Return the rounded average of the trees' predictions as floats.

        ``np.round`` rounds halves to the nearest even value, so an exact
        tie (average of 0.5) resolves to 0.
        """
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        # Average over the trees that were actually fitted, so the result
        # stays correct even if ``M`` is modified after ``fit``.
        return np.round(predictions / len(self.models))

    def score(self, X, y):
        """Return the fraction of samples whose prediction equals ``y``."""
        result = self.predict(X)
        return np.mean(result == y)

In [13]:
# The ensemble draws its bootstrap samples from NumPy's global RNG, so seed it
# here to make the reported score repeatable.
np.random.seed(0)
btree = BaggedTreeClassifier(M=20)
cross_val_score(btree, X, y, cv=8).mean()

0.9990147783251231

5.Fake Random Forest

In [14]:
class FakeRandomForest(BaseEstimator):
    """Ensemble of unrestricted decision trees on random rows and columns.

    Every tree is trained on a bootstrap sample of the rows and on one fixed
    random subset of the feature columns, chosen once per tree. A standard
    random forest instead re-draws its candidate features at every split,
    which is presumably why this variant is called "fake". Predictions are
    the rounded average of the trees' predictions, so the labels are assumed
    to be encoded as 0 and 1.

    NOTE(review): this class does not derive from ``ClassifierMixin``, so
    scikit-learn helpers may not treat it as a classifier (for example when
    choosing between stratified and plain K-fold for an integer ``cv``).
    Confirm before comparing its scores with built-in classifiers.

    Args:
        M: number of trees in the ensemble; must be at least 1.
    """

    def __init__(self, M):
        self.M = M

    def fit(self, X, y, num_features=None):
        """Fit ``M`` trees, each on a bootstrap sample and a column subset.

        Args:
            X: 2-D NumPy array of features, one row per sample.
            y: 1-D NumPy array of labels aligned with the rows of ``X``.
            num_features: number of columns given to each tree. Defaults to
                the integer part of the square root of the column count.

        Returns:
            self, following the scikit-learn estimator convention so calls
            can be chained.

        Raises:
            ValueError: if ``M`` is smaller than 1.
        """
        if self.M < 1:
            raise ValueError(f'M must be a positive integer, got {self.M!r}')

        num_samples, dim = X.shape
        if num_features is None:
            num_features = int(np.sqrt(dim))

        self.models = []
        self.features = []

        for _ in range(self.M):
            # Select Samples
            idx = np.random.choice(num_samples, size=num_samples, replace=True)
            X_curr, y_curr = X[idx], y[idx]

            # Select Features
            features = np.random.choice(dim, size=num_features, replace=False)

            tree = DecisionTreeClassifier()
            tree.fit(X_curr[:, features], y_curr)
            # Remember the columns so predict() can feed the tree the same ones.
            self.features.append(features)
            self.models.append(tree)
        return self

    def predict(self, X):
        """Return the rounded average of the trees' predictions as floats.

        Each tree only sees the columns it was trained on. ``np.round``
        rounds halves to the nearest even value, so an exact tie (average
        of 0.5) resolves to 0.
        """
        num_samples = len(X)
        results = np.zeros(num_samples)
        for features, tree in zip(self.features, self.models):
            results += tree.predict(X[:, features])
        # Average over the trees that were actually fitted, so the result
        # stays correct even if ``M`` is modified after ``fit``.
        return np.round(results / len(self.models))

    def score(self, X, y):
        """Return the fraction of samples whose prediction equals ``y``."""
        prediction = self.predict(X)
        return np.mean(prediction == y)

In [15]:
# The ensemble draws its row and column samples from NumPy's global RNG, so
# seed it here to make the reported score repeatable.
np.random.seed(0)
fakerf = FakeRandomForest(M=20)
cross_val_score(fakerf, X, y, cv=8).mean()

0.9274899635390403