In [127]:
import pandas as pd
import numpy as np

# Directory holding the Iowa housing CSVs. The default is the original author's
# location; set the IOWA_HOUSING_DIR environment variable to run the notebook
# on another machine without editing this cell.
import os

DATA_DIR = os.environ.get(
    'IOWA_HOUSING_DIR',
    '/Users/aoifeduna/AoifeRepo/aoiferepo/Lectures/Unit3/data/iowa_housing',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test  = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))


In [128]:
# Target: model log(SalePrice); pop() removes the column from the features in place.
y = np.log(train.pop('SalePrice'))

# Keep the test-set Ids, then remove the Id column from both frames.
test_id = test.pop('Id')
del train['Id']

In [129]:
# Training columns that contain at least one missing value, and their names.
train_empty = train.loc[:, train.isnull().sum() > 0]
cols = train_empty.columns.tolist()

# Categorical garage columns: a missing value is filled with the explicit
# label 'None' ('NA' or 'Other' would work equally well).
train[['GarageType', 'GarageFinish']] = train[['GarageType', 'GarageFinish']].fillna('None')
test[['GarageType', 'GarageFinish']]  = test[['GarageType', 'GarageFinish']].fillna('None')

# GarageYrBlt is numeric, so use 0 as the fill value.
# Assign the result back instead of calling fillna(..., inplace=True) on a
# selected column: that chained in-place form raises a FutureWarning in
# pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)
test['GarageYrBlt']  = test['GarageYrBlt'].fillna(0)

# Fill values are learned from the training set only...
ms_mode   = train['MSZoning'].mode()[0]
gcarsmean = train['GarageCars'].mean()

# ...and then applied to the test set.
test['MSZoning']   = test['MSZoning'].fillna(ms_mode)
test['GarageCars'] = test['GarageCars'].fillna(gcarsmean)

In [130]:
# Cast MSSubClass to string in both frames so it is handled as a categorical
# label rather than as a number.
for _frame in (train, test):
    _frame['MSSubClass'] = _frame['MSSubClass'].astype(str)

In [131]:
# your answer here
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
# Ordinal scale for GarageFinish, from no garage up to a finished garage.
garage_mapping = {
    'None': 0,  # no garage
    'Unf': 1,   # unfinished garage
    'RFn': 2,   # partially finished garage
    'Fin': 3,   # finished garage
}

# category_encoders expects one {'col', 'mapping'} dict per ordinal column.
mapping = {'col': 'GarageFinish', 'mapping': garage_mapping}

# Pipeline steps. OneHotEncoder is given no `cols`, so it chooses the columns
# to encode itself; GarageFinish is already numeric by the time it runs.
ore   = OrdinalEncoder(cols=['GarageFinish'], mapping=[mapping])
ohe   = OneHotEncoder()
sc    = StandardScaler()
ridge = Ridge(alpha=100)  # single definition; the earlier default Ridge() was dead code

# ordinal encode -> one-hot encode -> standardize -> ridge regression
pipe = make_pipeline(ore, ohe, sc, ridge)

In [132]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with the estimator's default score:
# one R2 value per fold.
cross_val_score(pipe, train, y, cv=10)

array([0.87788159, 0.91038498, 0.91084233, 0.82906485, 0.86482861,
       0.86753863, 0.87662249, 0.89085373, 0.74298285, 0.88002388])

In [133]:
# Fit on the full training set, then score on that same data (in-sample R2).
pipe.fit(train, y)
pipe.score(train, y)

0.8814865300749261

In [134]:
from sklearn.metrics import mean_squared_error, make_scorer
import category_encoders as ce

# Wrap mean squared error as a scorer so cross_val_score reports MSE per fold
# instead of the default R2.
scorer = make_scorer(mean_squared_error)

# Per-fold RMSE: the square root of each fold's MSE. Because y is
# log(SalePrice), this is an RMSE on the log-price scale.
# (Course note: this is part of what we were evaluated on for the Boston
# Housing dataset.)
# The cross-validation is run once only; an earlier duplicate call whose
# result was discarded has been removed.
scores = np.sqrt(cross_val_score(estimator=pipe, X=train, y=y, scoring=scorer, cv=10))

In [135]:
scores
# Per-fold RMSE values; this is the same metric as the Kaggle score

array([0.13197657, 0.11734106, 0.12674218, 0.17603549, 0.16704623,
       0.13068589, 0.13633373, 0.12306828, 0.19083807, 0.13923503])

In [136]:
# Average RMSE across the folds: an indication of how the model would
# perform against the test set.
scores.mean()

0.14393025367074608

In [137]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the fitted forest, and the feature importances read from it
# below, are reproducible on Restart & Run All.
rf = RandomForestRegressor(random_state=42)

In [138]:
# Random-forest pipeline: same two encoders as `pipe`, but no StandardScaler step.
# NOTE(review): `ore` and `ohe` are the same instances used by `pipe`, so fitting
# either pipeline refits the shared encoders.
rf_pipe = make_pipeline(ore, ohe, rf)

In [139]:
# Fit the encoders and the random forest on the full training set.
rf_pipe.fit(train, y)



Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['GarageFinish'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'GarageFinish',
                                          'mapping': {'Fin': 3, 'None': 0,
                                                      'RFn': 2, 'Unf': 1}}],
                                return_df=True, verbose=0)),
                ('onehotencoder',
                 OneHotEncoder(cols=['MSSubClass', 'MSZoning', 'Neighborhood',
                                     'GarageType'],
                               drop_...
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_d

In [140]:
# Look up the fitted forest by its pipeline step name.
# Random forests have no coefficients; they expose feature importances
# instead, with one value for every column the forest was trained on.
rf_pipe.named_steps['randomforestregressor'].feature_importances_

array([1.33293488e-03, 1.38313200e-03, 3.85532612e-04, 4.82379393e-04,
       1.24212516e-04, 6.41442673e-05, 3.86076290e-04, 1.83191799e-04,
       2.10508660e-03, 3.51175292e-05, 3.17834349e-04, 8.38066475e-06,
       1.94528494e-04, 4.20108855e-06, 3.85472368e-04, 9.58736143e-04,
       5.36984095e-03, 4.75967149e-03, 1.66160493e-04, 2.63858310e-04,
       3.43455931e-02, 6.71686884e-04, 1.94484258e-04, 1.55280046e-03,
       4.22434931e-04, 9.78368570e-04, 4.56995571e-04, 1.45754914e-03,
       7.39749833e-04, 3.91476839e-04, 1.05452570e-03, 8.44902010e-04,
       8.98351117e-04, 1.68126287e-04, 1.23938996e-03, 2.14535744e-05,
       2.93995438e-03, 4.21010990e-04, 1.12908727e-03, 3.62357711e-04,
       4.79108436e-04, 2.16846714e-04, 7.75765074e-06, 2.88409630e-05,
       2.09770106e-04, 8.74906555e-06, 5.68020317e-01, 2.01622945e-02,
       3.19583382e-02, 5.46279250e-02, 6.04848368e-02, 9.69021864e-03,
       8.05986502e-02, 2.53230766e-03, 1.51188953e-03, 4.19813235e-03,
      

In [141]:
# Feature names as reported by the fitted one-hot encoder. After one-hot
# encoding the number of columns is very different from the raw frame.
columns = rf_pipe.named_steps['onehotencoder'].get_feature_names()

In [142]:
# Importance values from the fitted forest, to be paired with `columns`
# in a DataFrame.
importances = rf_pipe.named_steps['randomforestregressor'].feature_importances_

In [143]:
# One row per encoded column with its importance, most important first.
features = pd.DataFrame({'Column': columns, 'Importance': importances})
features = features.sort_values(by='Importance', ascending=False)

In [144]:
features.head()
# The five most important features and their relative importance

Unnamed: 0,Column,Importance
46,OverallQual,0.56802
52,GrLivArea.1,0.080599
50,1stFlrSF,0.060485
64,GarageCars,0.058278
49,GrLivArea,0.054628


In [145]:
features['Importance'].sum()
# Sanity check: the importances should add up to 1

1.0

In [146]:
features.head(25)
# The importance values get small pretty quickly as you move down the ranking

Unnamed: 0,Column,Importance
46,OverallQual,0.56802
52,GrLivArea.1,0.080599
50,1stFlrSF,0.060485
64,GarageCars,0.058278
49,GrLivArea,0.054628
20,LotArea,0.034346
48,YearBuilt,0.031958
62,GarageYrBlt,0.022764
47,OverallCond,0.020162
51,2ndFlrSF,0.00969


In [147]:
features.tail(25)['Importance'].sum()
# The bottom 25 features together account for under half of one percent of the total importance
# You could probably get rid of these entirely
# There's a pretty obvious cutoff point beyond which features stop having additional impact

0.004476840689252015

In [148]:
import pickle
# pickle serializes Python objects to bytes, so a variable from this notebook can be saved to disk and loaded elsewhere

# Save the fitted random-forest pipeline to disk so it can be deployed elsewhere.
# 'wb': w = write to the hard drive, b = binary mode (pickle produces bytes, not text).
# The file handle is deliberately not named `pipe`: that name already refers to
# the Ridge pipeline, and rebinding it to a (closed) file object would break
# any later re-run of the cells that cross-validate `pipe`.
with open('pipe.pkl', 'wb') as pkl_file:
    # open() creates (or truncates) pipe.pkl; dump() writes rf_pipe into it.
    # The whole fitted pipeline is wrapped up in one file, like a box in shrink wrap.
    pickle.dump(rf_pipe, pkl_file)
    

In [149]:
# Bare name: displays the function and its signature (inspection only, nothing is computed)
mean_squared_error

<function sklearn.metrics.regression.mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')>

In [150]:
# Encoders only (no model), used to look at the transformed feature matrix.
encoding_pipe = make_pipeline(ore, ohe)

In [151]:
# Replace `train` with its ordinal + one-hot encoded version.
# NOTE(review): this overwrites the raw frame, so the cell is not idempotent;
# re-running it (or earlier cells that expect raw `train`) needs the data reloaded first.
train = encoding_pipe.fit_transform(train)

In [152]:
# First rows of the encoded training frame
train.head()

Unnamed: 0,MSSubClass_1,MSSubClass_2,MSSubClass_3,MSSubClass_4,MSSubClass_5,MSSubClass_6,MSSubClass_7,MSSubClass_8,MSSubClass_9,MSSubClass_10,...,GarageType_1,GarageType_2,GarageType_3,GarageType_4,GarageType_5,GarageType_6,GarageType_7,GarageYrBlt,GarageFinish,GarageCars
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2003.0,2,2
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1976.0,2,2
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2001.0,2,2
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1998.0,1,3
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2000.0,2,3


In [189]:
def mse(y_true, y_pred):
    """Return the mean squared error between `y_true` and `y_pred`.

    Both arguments only need to support subtraction with each other, so
    `y_pred` may be a scalar (e.g. a mean) that is compared against every
    element of `y_true`.
    """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

In [192]:
# Error at the root node, before any split: MSE of predicting the overall mean
# of y for every row ("root" as in root of the tree, not square root).
root_error = mse(y, y.mean())

In [193]:
# Display the root-node MSE
root_error

0.15945250615661058

In [194]:
# Candidate split condition: True for rows with GarageYrBlt greater than 2003
train.GarageYrBlt > 2003

0       False
1       False
2       False
3       False
4       False
        ...  
1455    False
1456    False
1457    False
1458    False
1459    False
Name: GarageYrBlt, Length: 1460, dtype: bool

In [195]:
# Evaluate the split condition once and reuse it for both children.
_built_after_2003 = train.GarageYrBlt > 2003

# Left child: rows where the condition is true.
left = train[_built_after_2003]
# Right child: rows where the condition is not true.
right = train[~_built_after_2003]

In [196]:
left.head()
# The left split: rows where the condition (GarageYrBlt > 2003) is true

Unnamed: 0,MSSubClass_1,MSSubClass_2,MSSubClass_3,MSSubClass_4,MSSubClass_5,MSSubClass_6,MSSubClass_7,MSSubClass_8,MSSubClass_9,MSSubClass_10,...,GarageType_1,GarageType_2,GarageType_3,GarageType_4,GarageType_5,GarageType_6,GarageType_7,GarageYrBlt,GarageFinish,GarageCars
6,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2004.0,2,2
11,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,2005.0,3,3
13,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2006.0,2,3
18,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,2004.0,1,2
20,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,2005.0,2,3


In [197]:
# The right split: rows where the condition (GarageYrBlt > 2003) is not true
right.head()

Unnamed: 0,MSSubClass_1,MSSubClass_2,MSSubClass_3,MSSubClass_4,MSSubClass_5,MSSubClass_6,MSSubClass_7,MSSubClass_8,MSSubClass_9,MSSubClass_10,...,GarageType_1,GarageType_2,GarageType_3,GarageType_4,GarageType_5,GarageType_6,GarageType_7,GarageYrBlt,GarageFinish,GarageCars
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2003.0,2,2
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1976.0,2,2
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2001.0,2,2
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1998.0,1,3
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2000.0,2,3


In [198]:
# Target values that fall in each child, selected by the child's row index.
_y_left, _y_right = y[left.index], y[right.index]

# MSE of each child around its own mean.
left_error = mse(_y_left, _y_left.mean())
right_error = mse(_y_right, _y_right.mean())

In [199]:
# Display the MSE of the right child
right_error

0.13780060493057458

In [200]:
# Weight the left child's error by the share of samples that landed on that side.
weighted_left_error = left_error * (len(left) / len(train))

In [201]:
# Weight the right child's error by the share of samples that landed on that side.
weighted_right_error = right_error * (len(right) / len(train))

In [202]:
root_error - weighted_left_error - weighted_right_error
# Error reduction from this split: root error minus the sample-weighted child errors
# The closer this number gets to zero, the less the split helps and the more likely it is rejected
# The tree takes the candidate split that returns the largest number and splits the data there
# It then keeps doing this over and over again on the resulting subsets
# So the value used to split should make this number as big as possible

0.02657429804255934