In [58]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

import warnings
warnings.simplefilter(action='ignore')

In [59]:
df = pd.read_csv('./datasets/train.csv')

In [60]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [61]:
df['SalePrice'].median()

162500.0

In [62]:
df['BiggerThanMedSale'] = df.SalePrice.map(lambda x: 0 if x <= 162500 else 1)

In [63]:
df.isnull().sum()

Id                      0
PID                     0
MS SubClass             0
MS Zoning               0
Lot Frontage          330
Lot Area                0
Street                  0
Alley                1911
Lot Shape               0
Land Contour            0
Utilities               0
Lot Config              0
Land Slope              0
Neighborhood            0
Condition 1             0
Condition 2             0
Bldg Type               0
House Style             0
Overall Qual            0
Overall Cond            0
Year Built              0
Year Remod/Add          0
Roof Style              0
Roof Matl               0
Exterior 1st            0
Exterior 2nd            0
Mas Vnr Type           22
Mas Vnr Area           22
Exter Qual              0
Exter Cond              0
                     ... 
Bedroom AbvGr           0
Kitchen AbvGr           0
Kitchen Qual            0
TotRms AbvGrd           0
Functional              0
Fireplaces              0
Fireplace Qu         1000
Garage Type 

In [64]:
X = df[['Lot Area', 'Overall Qual', 'Bedroom AbvGr']]
y = df['BiggerThanMedSale']

In [65]:
from sklearn.model_selection import train_test_split

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [67]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [68]:
tree.tree_.max_depth

# .tree_ lets us access info about the tree

29

In [69]:
tree.score(X_train, y_train)

0.9817945383615084

In [70]:
tree.score(X_test, y_test)

0.8362573099415205

In [71]:
bagged = BaggingClassifier(base_estimator=DecisionTreeClassifier())
bagged.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [72]:
bagged.estimators_[0].tree_.max_depth

# bagged.estimators_ this displays all of our models, this is a list, if we want to look at a single one we can index []

22

In [73]:
bagged.score(X_train, y_train)

0.9648894668400521

In [74]:
bagged.score(X_test, y_test)

0.8187134502923976

## Let's try a Random Forest

In [75]:
np.sqrt(3)

1.7320508075688772

In [76]:
rf = RandomForestClassifier(max_features=2)

In [77]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [78]:
temp = rf.estimators_[0]

# assigning this to a temp variable allows us to further tabulate after indexing

In [79]:
temp.max_features_

2

In [80]:
rf.score(X_train, y_train)

0.9674902470741222

In [81]:
rf.score(X_test, y_test)

0.8460038986354775

## Let's try ExtraTress!

In [82]:
xtra = ExtraTreesClassifier(max_features=2)

In [83]:
xtra.fit(X_train, y_train)

#Extra trees is diff from random forests bc it's random splitting and not optimizing Gini

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [84]:
xtra.score(X_train, y_train)

0.9817945383615084

In [85]:
xtra.score(X_test, y_test)

0.8323586744639376

## Let's optimize RandomForest

In [86]:
from sklearn.model_selection import GridSearchCV

In [99]:
params = {'n_estimators': [4, 5, 6, 7, 8],
          'criterion': ['gini', 'entropy'],
          'max_features': ['auto', 2, 3],
          'max_depth': [5, 6, 7, 8, 9, 10]}

In [100]:
grid = GridSearchCV(RandomForestClassifier(), params, cv=5)

In [101]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy'], 'max_features': ['auto', 2, 3], 'max_depth': [5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [102]:
grid.score(X_train, y_train)

0.8758127438231469

In [103]:
grid.score(X_test, y_test)

0.8752436647173489

In [104]:
grid.best_params_

{'criterion': 'entropy', 'max_depth': 7, 'max_features': 2, 'n_estimators': 7}