# Regression
- Housing prices

## 1. Importing libraries

In [140]:
import numpy as np
import pandas as pd
from collections import namedtuple
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from ipywidgets import interact, Dropdown
from pandas.api.types import is_numeric_dtype, is_object_dtype
import warnings

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

warnings.filterwarnings("ignore")

## 2. Loading the data

In [9]:
df = pd.read_csv('../docs/Housing_Train.csv')

In [6]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [10]:
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1350,70,RM,50.0,5250,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,122000
1,1138,50,RL,54.0,6342,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,94000
2,748,70,RM,65.0,11700,Pave,Pave,IR1,Lvl,AllPub,...,0,,,,0,5,2009,WD,Normal,265979
3,305,75,RM,87.0,18386,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,295000
4,1133,70,RM,90.0,9900,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,117500


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [13]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


## 2. Model fitting

In [145]:
def plot_scatter(num_column, cat_column):
    """Housing prices as function of numeric and categorical types"""
    
    fig, ax = plt.subplots(1,2,figsize=(13,6))
    fig.tight_layout(pad=5)
    ax = ax.ravel()
    
    ### NUMERICAL DATA ###
    #_______________________________________________________________
    # Code for model fitting Linear model and Gradientboosted regressor
    X = df_new[num_column].values
    y = df_new['SalePrice'].values
    X_test, X_train, y_test, y_train = train_test_split(X, y, train_size=0.8, random_state=30)
    vals = np.sort(X).reshape(-1,1)
    
    # SGD
    sgd_pipe = Pipeline([('scaler', StandardScaler()),
                         ('sgd', SGDRegressor())])
    sgd_gs = GridSearchCV(estimator=sgd_pipe,
                          cv=5,
                          n_jobs=-1,
                          param_grid = {'sgd__eta0':[0.001, 0.01, 0.1, 0.3],
                                        'sgd__max_iter':[300, 1000, 3000]})
    sgd_gs.fit(X_train.reshape(-1,1), y_train.reshape(-1,1))
    
    # Gradient Boosted Trees
    gbt_gs = GridSearchCV(estimator=GradientBoostingRegressor(),
                          cv=5,
                          n_jobs=-1,
                          param_grid = {'max_depth':[3, 10, 30],
                                        'n_estimators':[30, 100, 300]})
    gbt_gs.fit(X_train.reshape(-1,1), y_train.reshape(-1,1))
    
    
    # Numerical plot
    ax[0].scatter(df[num_column].values, df['SalePrice'].values , color='k', s=10)
    ax[0].plot(vals, sgd_gs.best_estimator_.predict(vals), color='r')
    ax[0].plot(vals, gbt_gs.best_estimator_.predict(vals), color='b')
    ax[0].set_xlabel('Sale price', fontsize=15)
    ax[0].set_ylabel(num_column, fontsize=15)
    
    
    ### CATEGORICAL DATA ###
    #_______________________________________________________________
    # Categorical plots
    ax[1] = sns.boxplot(x='SalePrice', y=cat_column, data=df, ax=ax[1])
    ax[1].set_xlabel('Sale price', fontsize=15)
    ax[1].set_ylabel(cat_column, fontsize=15)
    

# Cleaning 
num_rows = df.shape[0]
columns_to_include = [i for i in df.columns if df[i].isna().sum() < int(0.1*num_rows)]
df_new = df[columns_to_include].dropna(how='any').copy()

numeric_columns = [b for _,b in enumerate(df_new.dtypes.to_dict()) if is_numeric_dtype(df_new[b])] 
numeric_columns.remove('SalePrice')
cat_columns = [b for _,b in enumerate(df_new.dtypes.to_dict()) if is_object_dtype(df_new[b])] 


# Ipywidgets
interact(plot_scatter,
         num_column = Dropdown(options=numeric_columns,
                      value='MSSubClass',
                      description='Numeric Feature:',
                      disabled=False),
        cat_column = Dropdown(options=cat_columns,
                      value='Street',
                      description='Categorical Feature:',
                      disabled=False))
    

interactive(children=(Dropdown(description='Numeric Feature:', index=1, options=('Id', 'MSSubClass', 'LotArea'…

<function __main__.plot_scatter(num_column, cat_column)>

## 3. Model fitting and prediction

In [105]:
df_new.shape

(1338, 75)

In [104]:
# Removing columns with significant number of Nan's and 
# entries that have nan's 
num_rows = df.shape[0]
columns_to_include = [i for i in df.columns if df[i].isna().sum() < int(0.1*num_rows)]
df_new = df[columns_to_include].dropna(how='any').copy()

# Selecting numeric and categorical columns
numeric_columns = [b for _,b in enumerate(df_new.dtypes.to_dict()) if is_numeric_dtype(df_new[b])] 
numeric_columns.remove('SalePrice')
cat_columns = [b for _,b in enumerate(df_new.dtypes.to_dict()) if is_object_dtype(df_new[b])] 

# Defining inputs and outputs
X = df_new.drop(columns = ['SalePrice']).copy()
y = df_new['SalePrice']

# One hot encoder
transformer_name = 'ohe_categorial'
ohe_final = ColumnTransformer([
            (transformer_name, OneHotEncoder(sparse=False), cat_columns)],
            remainder='passthrough')

sgd_pipe = Pipeline([('ohe', ohe_final),
                     ('scaler', StandardScaler()),
                     ('sgd', SGDRegressor())])

# Train test split
X_test, X_train, y_test, y_train = train_test_split(X, y, train_size=0.8, random_state=30)
sgd_gs = GridSearchCV(estimator=sgd_pipe,
                      cv=5,
                      n_jobs=-1,
                      param_grid = {'sgd__eta0':[0.001, 0.01, 0.1, 0.3],
                                    'sgd__max_iter':[300, 1000, 3000]})
# Fitting the model
sgd_gs.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/ade/opt/anaconda3/envs/ML/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/ade/opt/anaconda3/envs/ML/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/Users/ade/opt/anaconda3/envs/ML/lib/python3.9/site-packages/sklearn/pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "/Users/ade/opt/anaconda3/envs/ML/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "/Users/ade/opt/anaconda3/envs/ML/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "/Users/ade/opt/anaconda3/envs/ML/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dis

In [103]:
sgd_gs.best_estimator_.predict(X_test)

ValueError: Found unknown categories ['C (all)'] in column 0 during transform