In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

In [2]:
df = pd.read_csv('./datasets/train.csv')

In [3]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
df['SalePrice'].median()

162500.0

In [5]:
df['BiggerThanMedianSale'] = np.where(df['SalePrice'] > df['SalePrice'].median(), 1, 0)

In [6]:
X = df[['Lot Area', 'Overall Qual', 'Bedroom AbvGr']]
y = df['BiggerThanMedianSale']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

## Standard CART

In [7]:
tree = DecisionTreeClassifier() # Classic Decision Tree
tree.fit(X_train, y_train)

tree.tree_.max_depth # how many steps from the root node?

29

In [8]:
tree.score(X_test, y_test)

0.8362573099415205

## Let's try Bagging!

In [9]:
bagged = BaggingClassifier(base_estimator=DecisionTreeClassifier())
bagged.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [10]:
# All our 10 trees
bagged.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=78775797, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=469241619, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_frac

In [11]:
bagged.estimators_[0].tree_.max_depth # our first tree had 24 levels 

22

In [13]:
bagged.estimators_[1].tree_.max_depth # our second tree had 24 levels 

17

In [21]:
bagged.score(X_train, y_train)

0.9681404421326398

In [15]:
bagged.score(X_test, y_test)

0.8401559454191033

## Let's try Random Forest!

In [17]:
rand_forest = RandomForestClassifier(max_features=2)
rand_forest.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
temp = rand_forest.estimators_[0]
temp.max_features_

2

In [19]:
rand_forest.score(X_train, y_train)

0.9622886866059818

In [22]:
rand_forest.score(X_test, y_test)

0.8499025341130604

## Let's try ExtraTrees!

In [23]:
extra = ExtraTreesClassifier(max_features=2)
extra.fit(X_train, y_train)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [24]:
extra.score(X_train, y_train)

0.9817945383615084

In [25]:
extra.score(X_test, y_test)

0.8382066276803118

## Let's optimize RandomForest!

In [34]:
from sklearn.model_selection import GridSearchCV
my_params = {
#     'n_estimators': [, 45, 47],
#     'max_depth': [None, 10, 15, 20],
#     'criterion': ['gini', 'entropy'],
#     'max_features': ['auto']

# Attempt 2:
    'max_depth': [None, 5,6,7,8,9,10],
    'criterion': ['gini'],
    'max_features': ['auto']
}
grid = GridSearchCV(RandomForestClassifier(), my_params, cv=5)
grid.fit(X_train, y_train)
grid.best_params_



{'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto'}

In [36]:
grid.score(X_train, y_train)

0.8576072821846554

In [35]:
grid.score(X_test, y_test)

0.8654970760233918