In [4]:
import pandas as pd
import numpy as np

# Directory holding the Iowa housing CSVs. It is defined once so the notebook
# can be pointed at a different checkout by editing a single line instead of
# every read_csv call.
DATA_DIR = '/Users/aoifeduna/AoifeRepo/aoiferepo/Lectures/Unit3/data/iowa_housing'

# `train` carries the SalePrice target that later cells split off;
# `test` is loaded separately from the same directory.
train = pd.read_csv(DATA_DIR + '/train.csv')
test  = pd.read_csv(DATA_DIR + '/test.csv')


In [5]:
# Pull the target out of the training frame (removing the column in place)
# and model it on the log scale.
y = np.log(train.pop('SalePrice'))

# Keep the test-set ids aside, then make sure neither frame still carries
# the 'Id' column as a feature.
test_id = test.pop('Id')
del train['Id']

In [6]:
# Sub-frame of the training columns that contain at least one missing value.
train_empty = train.loc[:, train.isnull().sum() > 0]
# grab the columns
cols = train_empty.columns.tolist()

# Missing garage descriptors become their own 'None' category
# (NA or Other could also work as the label).
train[['GarageType', 'GarageFinish']] = train[['GarageType', 'GarageFinish']].fillna('None')
test[['GarageType', 'GarageFinish']]  = test[['GarageType', 'GarageFinish']].fillna('None')

# GarageYrBlt is numeric, so use 0 as the filler.
# Assign the filled column back rather than calling fillna(inplace=True) on
# `frame[col]`: the in-place form acts on a temporary selection, which
# triggers chained-assignment warnings and leaves the frame untouched when
# pandas Copy-on-Write is active.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)
test['GarageYrBlt']  = test['GarageYrBlt'].fillna(0)

# Fill values are learned from the training set only ...
ms_mode   = train['MSZoning'].mode()[0]
gcarsmean = train['GarageCars'].mean()

# ... and then applied to the test set, so no test statistics leak in.
test['MSZoning']   = test['MSZoning'].fillna(ms_mode)
test['GarageCars'] = test['GarageCars'].fillna(gcarsmean)

In [7]:
# MSSubClass is cast to string in both frames so that it is handled as a
# category label rather than as a number.
train = train.astype({'MSSubClass': str})
test  = test.astype({'MSSubClass': str})

In [11]:
# Build the ridge-regression pipeline:
# ordinal-encode GarageFinish -> one-hot encode -> standardise -> Ridge.
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# GarageFinish has a natural order, so it is mapped to integers by hand.
garage_mapping = {
    'None': 0, # no garage
    'Unf' : 1, # unfinished garage
    'RFn' : 2, # partially finished garage
    'Fin' : 3  # finished garage
}

# mapping for the ordinal column, in the list-of-dicts form passed to OrdinalEncoder
mapping = {
    'col': 'GarageFinish',
    'mapping': garage_mapping
}

# initialize everything
# (each step is created exactly once; the earlier default `Ridge()` that was
# immediately overwritten has been removed)
ore   = OrdinalEncoder(cols=['GarageFinish'], mapping=[mapping])
ohe   = OneHotEncoder()
sc    = StandardScaler()
ridge = Ridge(alpha=100)

# make the pipeline -- the ordinal encoder runs first so GarageFinish is
# already numeric by the time the one-hot encoder sees the frame
pipe = make_pipeline(ore, ohe, sc, ridge)

In [13]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the ridge pipeline. No `scoring` is given, so
# each fold is evaluated with the pipeline's own score: one R2 value per fold.
cross_val_score(pipe, train, y, cv=10)

array([0.87788159, 0.91038498, 0.91084233, 0.82906485, 0.86482861,
       0.86753863, 0.87662249, 0.89085373, 0.74298285, 0.88002388])

In [14]:
# Fit the pipeline on the whole training set and score it on that same data.
pipe.fit(train, y).score(train, y)
# R2 as well -- but measured on the data the model was fitted to, so it is an
# in-sample figure rather than a held-out estimate like the per-fold scores.

0.8814865300749261

In [20]:
from sklearn.metrics import mean_squared_error, make_scorer
import category_encoders as ce

# Wrap mean_squared_error so cross_val_score reports it for each fold
# instead of the default R2.
scorer = make_scorer(mean_squared_error)

# Run the 10-fold cross-validation once and keep the per-fold MSE; the
# pipeline is refitted for every fold, so repeating the call only to discard
# its result doubles the cost.
mse_scores = cross_val_score(estimator=pipe, X=train, y=y, scoring=scorer, cv=10)

# The score is now no longer R2, it's the mean squared error.
# `y` is the log of SalePrice, so the square root of each fold's MSE is the
# RMSE on log prices for this Iowa housing data -- the number these notes
# compare against the Kaggle score.
scores = np.sqrt(mse_scores)

In [21]:
# Per-fold RMSE on the log-price target, one value for each of the 10 folds.
scores
# This is the kind of number reported as the Kaggle score

array([0.13197657, 0.11734106, 0.12674218, 0.17603549, 0.16704623,
       0.13068589, 0.13633373, 0.12306828, 0.19083807, 0.13923503])

In [24]:
# Average the per-fold RMSE values into a single figure; it gives an
# indication of how the model would perform against the test set.
scores.mean()

0.14393025367074608

In [25]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: bootstrap sampling and feature selection are random, so
# without a fixed random_state the fitted model and the feature importances
# read from it change on every run of the notebook.
rf = RandomForestRegressor(random_state=42)

In [26]:
# Random-forest pipeline: same two encoders as the ridge pipeline, but with no
# StandardScaler step in between.
# NOTE(review): `ore` and `ohe` are the very same objects that were passed to
# `pipe`, so the two pipelines share encoder instances -- confirm this is intended.
rf_pipe = make_pipeline(ore, ohe, rf)

In [27]:
# Fit the encoders and the random forest on the full training set; the cell
# output is the repr of the fitted pipeline.
rf_pipe.fit(train, y)



Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['GarageFinish'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'GarageFinish',
                                          'mapping': {'Fin': 3, 'None': 0,
                                                      'RFn': 2, 'Unf': 1}}],
                                return_df=True, verbose=0)),
                ('onehotencoder',
                 OneHotEncoder(cols=['MSSubClass', 'MSZoning', 'Neighborhood',
                                     'GarageType'],
                               drop_...
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_d

In [30]:
# The pipeline steps are (name, estimator) pairs: the ordinal encoder comes
# first and the random forest is the last step, registered under the name
# 'randomforestregressor'.
# Random forests don't have coefficients, but they DO have feature
# importances: one number for every column that reaches the forest.
rf_pipe.named_steps['randomforestregressor'].feature_importances_

array([1.72657805e-03, 8.73925499e-04, 6.77072234e-04, 1.06397430e-03,
       2.38094100e-04, 1.66152338e-04, 2.11483636e-03, 8.60722955e-05,
       3.40226738e-03, 3.17066487e-05, 6.23854403e-04, 1.26619619e-04,
       1.01281120e-04, 6.84997682e-06, 1.13073810e-04, 7.47325931e-04,
       8.48908718e-03, 6.41901790e-03, 2.84345491e-04, 3.57025340e-05,
       3.05724096e-02, 1.08062560e-03, 2.81109226e-04, 2.22115966e-03,
       3.19154536e-04, 6.97224538e-04, 4.01081856e-04, 1.55137910e-03,
       1.31983978e-03, 8.50843551e-04, 9.23801001e-04, 7.13247189e-04,
       1.93134651e-03, 2.56210932e-04, 1.23050297e-03, 2.09643317e-05,
       2.89200831e-03, 4.73316074e-04, 3.20713466e-04, 1.19278147e-04,
       7.78571681e-04, 1.70656011e-04, 2.30180934e-05, 6.05653772e-07,
       3.36997636e-04, 4.39462281e-05, 5.44400091e-01, 1.68491676e-02,
       3.83852078e-02, 8.61147682e-02, 7.08699895e-02, 1.00334459e-02,
       4.23954569e-02, 6.99544306e-03, 2.26363952e-03, 3.63197754e-03,
      

In [37]:
# Ask the fitted one-hot encoder (the pipeline's second step) for the feature
# names it produced. Once you one hot encode something the number of columns
# is going to be very different from the raw frame, so the names must come
# from the encoder rather than from `train`.
columns = rf_pipe.named_steps['onehotencoder'].get_feature_names()

In [38]:
# Keep the forest's feature importances in a variable so they can be paired
# with the column names and turned into a dataframe.
importances = rf_pipe.named_steps['randomforestregressor'].feature_importances_

In [39]:
# Pair every encoded column name with its importance and order the rows from
# most to least important.
features = (
    pd.DataFrame({'Column': columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

In [40]:
# The frame is sorted by importance, so these are the five most important features.
features.head()
# Gives you the list of features and their relative importance

Unnamed: 0,Column,Importance
46,OverallQual,0.5444
49,GrLivArea,0.086115
50,1stFlrSF,0.07087
64,GarageCars,0.053685
52,GrLivArea.1,0.042395


In [42]:
# Sanity check on the importances: summed over all features
features['Importance'].sum()
# they should add up to 1

1.0

In [43]:
# Top 25 features by importance.
features.head(25)
# The importances get small pretty quickly after the first few rows

Unnamed: 0,Column,Importance
46,OverallQual,0.5444
49,GrLivArea,0.086115
50,1stFlrSF,0.07087
64,GarageCars,0.053685
52,GrLivArea.1,0.042395
48,YearBuilt,0.038385
20,LotArea,0.030572
62,GarageYrBlt,0.026917
47,OverallCond,0.016849
63,GarageFinish,0.010322


In [45]:
# Combined importance of the 25 least important features.
features.tail(25)['Importance'].sum()
# The bottom 25 features account for roughly 4/10ths of 1 percent in total
# You could probably get rid of these entirely
# There's a pretty obvious cutoff point beyond which features stop adding impact

0.0041287019951707455