# MLR with Categorical predictors

In [20]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

In [21]:
# Print the installed statsmodels version so the recorded outputs below can be
# tied to a specific release.
from statsmodels import __version__
print(__version__)

0.14.0


In [22]:
# Load the raw housing data and preview its first three rows.
df0 = pd.read_csv(filepath_or_buffer='homes.csv')
df0.head(3)

Unnamed: 0,price,area,beds,baths,garage,year,style,lotsize,ac,pool,quality,highway
0,360000,3032,4,4,2,1972,1,22221,YES,NO,MEDIUM,NO
1,340000,2058,4,2,2,1976,1,22912,YES,NO,MEDIUM,NO
2,250000,1780,4,3,2,1980,1,21345,YES,NO,MEDIUM,NO


In [23]:
# Change style to categorical

In [24]:
# Store the integer style codes as object dtype, then list every column's dtype.
df0 = df0.astype({'style': object})
df0.dtypes

price       int64
area        int64
beds        int64
baths       int64
garage      int64
year        int64
style      object
lotsize     int64
ac         object
pool       object
quality    object
highway    object
dtype: object

In [25]:
# Frequency of each bedroom count. The Series method is used because the
# top-level pd.value_counts() helper is deprecated in pandas 2.1+.
df0['beds'].value_counts()

3    202
4    179
2     64
5     52
6     12
1      9
7      3
0      1
Name: beds, dtype: int64

In [26]:
# Keep only homes with 2, 3 or 4 bedrooms

In [27]:
# Work on an independent copy so df0 keeps every original row.
df = df0.copy(deep=True)

In [28]:
# Keep rows whose bedroom count is strictly between 1 and 5, then compare the
# filtered shape with the unfiltered one.
df = df.loc[df['beds'].gt(1) & df['beds'].lt(5)]
df.shape, df0.shape

((445, 12), (522, 12))

In [29]:
# (rows, columns) of the filtered frame followed by the original frame.
tuple(frame.shape for frame in (df, df0))

((445, 12), (522, 12))

## statsmodels.formula.api

In [30]:
# OLS of price on every predictor, with the categorical columns wrapped in C().
# The formula is built from adjacent string literals inside parentheses rather
# than backslash continuations inside one literal, so the source indentation is
# no longer embedded in the formula text. The terms themselves are unchanged.
model1 = smf.ols(
    formula=(
        'price ~ area+beds+baths+garage+year+'
        'C(style)+lotsize+C(ac)+C(pool)+'
        'C(quality)+C(highway)'
    ),
    data=df,
).fit()

In [31]:
# Same predictors, written without C(): style, ac, pool, quality and highway are
# object-dtype columns, and the summary reports them as treatment-coded levels
# (e.g. style[T.2]).
# The formula is built from adjacent string literals inside parentheses rather
# than a backslash continuation inside one literal, so the source indentation is
# no longer embedded in the formula text. The terms themselves are unchanged.
model1 = smf.ols(
    formula=(
        'price ~ area+beds+baths+garage+year+'
        'style+lotsize+ac+pool+quality+highway'
    ),
    data=df,
).fit()
model1.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.845
Method:,Least Squares,F-statistic:,135.9
Date:,"Wed, 20 Sep 2023",Prob (F-statistic):,1.08e-163
Time:,13:12:15,Log-Likelihood:,-5451.8
No. Observations:,445,AIC:,10940.0
Df Residuals:,426,BIC:,11020.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.473e+06,3.93e+05,-6.285,0.000,-3.25e+06,-1.7e+06
style[T.2],-2.006e+04,8638.441,-2.322,0.021,-3.7e+04,-3080.425
style[T.3],-1.151e+04,8251.632,-1.395,0.164,-2.77e+04,4707.107
style[T.4],2.31e+04,1.72e+04,1.346,0.179,-1.06e+04,5.68e+04
style[T.5],-7407.6304,1.55e+04,-0.479,0.632,-3.78e+04,2.3e+04
style[T.6],-3.06e+04,1.51e+04,-2.029,0.043,-6.02e+04,-951.905
style[T.7],-4.664e+04,8514.836,-5.477,0.000,-6.34e+04,-2.99e+04
style[T.9],-9.094e+04,5.26e+04,-1.728,0.085,-1.94e+05,1.25e+04
ac[T.YES],-1075.0761,7501.411,-0.143,0.886,-1.58e+04,1.37e+04

0,1,2,3
Omnibus:,58.445,Durbin-Watson:,1.344
Prob(Omnibus):,0.0,Jarque-Bera (JB):,269.978
Skew:,0.449,Prob(JB):,2.37e-59
Kurtosis:,6.709,Cond. No.,4260000.0


In [32]:
# Change quality base level to LOW

In [33]:
# Refit the full model with LOW as the reference (baseline) level of quality,
# requested through Treatment(reference="LOW").
# The formula is built from adjacent string literals inside parentheses rather
# than backslash continuations inside one literal, so the source indentation is
# no longer embedded in the formula text. The terms themselves are unchanged.
model1 = smf.ols(
    formula=(
        'price ~ area+beds+baths+garage+year+'
        'C(style)+lotsize+C(ac)+C(pool)+'
        'C(quality,Treatment(reference="LOW"))+'
        'C(highway)'
    ),
    data=df,
).fit()
model1.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.845
Method:,Least Squares,F-statistic:,135.9
Date:,"Wed, 20 Sep 2023",Prob (F-statistic):,1.08e-163
Time:,13:12:15,Log-Likelihood:,-5451.8
No. Observations:,445,AIC:,10940.0
Df Residuals:,426,BIC:,11020.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.61e+06,3.89e+05,-6.711,0.000,-3.37e+06,-1.85e+06
C(style)[T.2],-2.006e+04,8638.441,-2.322,0.021,-3.7e+04,-3080.425
C(style)[T.3],-1.151e+04,8251.632,-1.395,0.164,-2.77e+04,4707.107
C(style)[T.4],2.31e+04,1.72e+04,1.346,0.179,-1.06e+04,5.68e+04
C(style)[T.5],-7407.6304,1.55e+04,-0.479,0.632,-3.78e+04,2.3e+04
C(style)[T.6],-3.06e+04,1.51e+04,-2.029,0.043,-6.02e+04,-951.905
C(style)[T.7],-4.664e+04,8514.836,-5.477,0.000,-6.34e+04,-2.99e+04
C(style)[T.9],-9.094e+04,5.26e+04,-1.728,0.085,-1.94e+05,1.25e+04
C(ac)[T.YES],-1075.0761,7501.411,-0.143,0.886,-1.58e+04,1.37e+04

0,1,2,3
Omnibus:,58.445,Durbin-Watson:,1.344
Prob(Omnibus):,0.0,Jarque-Bera (JB):,269.978
Skew:,0.449,Prob(JB):,2.37e-59
Kurtosis:,6.709,Cond. No.,4220000.0


In [34]:
# Remove beds, ac

In [35]:
# Reduced model: the same specification as the LOW-reference fit, but without
# the beds and ac terms.
# The formula is built from adjacent string literals inside parentheses rather
# than backslash continuations inside one literal, so the source indentation is
# no longer embedded in the formula text. The terms themselves are unchanged.
model2 = smf.ols(
    formula=(
        'price ~ area+baths+garage+year+'
        'C(style)+lotsize+C(pool)+'
        'C(quality,Treatment(reference="LOW"))+'
        'C(highway)'
    ),
    data=df,
).fit()
model2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,153.4
Date:,"Wed, 20 Sep 2023",Prob (F-statistic):,8.210000000000001e-166
Time:,13:12:15,Log-Likelihood:,-5452.0
No. Observations:,445,AIC:,10940.0
Df Residuals:,428,BIC:,11010.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.601e+06,3.82e+05,-6.803,0.000,-3.35e+06,-1.85e+06
C(style)[T.2],-2.081e+04,8517.731,-2.443,0.015,-3.76e+04,-4067.036
C(style)[T.3],-1.162e+04,8230.471,-1.412,0.159,-2.78e+04,4557.609
C(style)[T.4],2.298e+04,1.71e+04,1.343,0.180,-1.06e+04,5.66e+04
C(style)[T.5],-7512.9785,1.53e+04,-0.490,0.624,-3.76e+04,2.26e+04
C(style)[T.6],-3.101e+04,1.5e+04,-2.066,0.039,-6.05e+04,-1506.413
C(style)[T.7],-4.67e+04,8487.820,-5.502,0.000,-6.34e+04,-3e+04
C(style)[T.9],-9.06e+04,5.25e+04,-1.725,0.085,-1.94e+05,1.26e+04
C(pool)[T.YES],2.179e+04,1.05e+04,2.068,0.039,1075.399,4.25e+04

0,1,2,3
Omnibus:,58.505,Durbin-Watson:,1.34
Prob(Omnibus):,0.0,Jarque-Bera (JB):,268.359
Skew:,0.452,Prob(JB):,5.33e-59
Kurtosis:,6.695,Cond. No.,4150000.0


### Create DataFrame for prediction

In [36]:
# One-row template for prediction: the first row of df without the response.
newvalue = df.head(1).drop(columns='price')

In [37]:
# Display the one-row template (first row of df with price removed).
newvalue

Unnamed: 0,area,beds,baths,garage,year,style,lotsize,ac,pool,quality,highway
0,3032,4,4,2,1972,1,22221,YES,NO,MEDIUM,NO


In [38]:
# Predictor values of the home to price, in the column order of the template
# row: area, beds, baths, garage, year, style, lotsize, ac, pool, quality,
# highway.
list1 = [
    3150, 2, 3, 2, 1996, 1, 26250,
    'YES', 'YES', 'HIGH', 'NO',
]
list1

[3150, 2, 3, 2, 1996, 1, 26250, 'YES', 'YES', 'HIGH', 'NO']

In [39]:
# Predictor names, taken from the template row's column labels.
list2 = list(newvalue.columns)
list2

['area',
 'beds',
 'baths',
 'garage',
 'year',
 'style',
 'lotsize',
 'ac',
 'pool',
 'quality',
 'highway']

In [40]:
# Build the one-row frame directly from a list of rows. Transposing a single
# column (pd.DataFrame(list1, index=list2).T) forces every column to object
# dtype because the values are mixed int/str; passing the row with `columns=`
# lets pandas infer a dtype per column, so numeric predictors stay numeric.
newvalue2 = pd.DataFrame([list1], columns=list2)
newvalue2

Unnamed: 0,area,beds,baths,garage,year,style,lotsize,ac,pool,quality,highway
0,3150,2,3,2,1996,1,26250,YES,YES,HIGH,NO


In [41]:
# Abandoned attempt: newvalue.append(list1,columns = list2) -- DataFrame.append accepts no `columns` argument and was removed in pandas 2.0

In [42]:
# Overwrite the template row with the characteristics of the home to price.
# Columns are set by name through assign() rather than by attribute access:
# `style` is also the name of a built-in DataFrame accessor, so
# `newvalue.style = 1` only reaches the column through pandas' fallback path,
# and a mistyped attribute name would create a plain attribute instead of
# failing.
newvalue = newvalue.assign(
    area=3150,
    beds=2,
    baths=3,
    garage=2,
    year=1996,
    style=1,
    lotsize=26250,
    ac='YES',
    pool='YES',
    quality='HIGH',
    highway='NO',
)

In [43]:
# Display the prediction row after its values have been overwritten.
newvalue

Unnamed: 0,area,beds,baths,garage,year,style,lotsize,ac,pool,quality,highway
0,3150,2,3,2,1996,1,26250,YES,YES,HIGH,NO


In [44]:
# Point prediction of price for the new home from the statsmodels fit.
model1.predict(exog=newvalue)

0    585922.526796
dtype: float64

In [45]:
# prediction with intervals

In [46]:
# Prediction results for the new home; the summary frame reports the predicted
# mean with its confidence limits (mean_ci_*) and the prediction limits for a
# single observation (obs_ci_*), at the default alpha of 0.05.
df1 = model1.get_prediction(exog=newvalue)
df1.summary_frame(alpha=0.05)

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,585922.526796,15323.494616,555803.458271,616041.595321,479849.693761,691995.359831


In [47]:
# 95% prediction interval for the house price: (479849.69, 691995.36) dollars

## sklearn

In [48]:
from sklearn.linear_model import LinearRegression

In [49]:
# transform categoricals to binary columns

In [50]:
# Replace each categorical column with indicator columns, dropping the first
# level of each so it serves as the baseline; then preview five rows.
df2 = pd.get_dummies(
    data=df,
    columns=['style', 'ac', 'pool', 'quality', 'highway'],
    drop_first=True,
)
df2.head(5)

  df2 = pd.get_dummies(df,columns = ['style','ac','pool','quality','highway'],\


Unnamed: 0,price,area,beds,baths,garage,year,lotsize,style_2,style_3,style_4,style_5,style_6,style_7,style_9,ac_YES,pool_YES,quality_LOW,quality_MEDIUM,highway_YES
0,360000,3032,4,4,2,1972,22221,0,0,0,0,0,0,0,1,0,0,1,0
1,340000,2058,4,2,2,1976,22912,0,0,0,0,0,0,0,1,0,0,1,0
2,250000,1780,4,3,2,1980,21345,0,0,0,0,0,0,0,1,0,0,1,0
3,205500,1638,4,2,2,1963,17342,0,0,0,0,0,0,0,1,0,0,1,0
4,275500,2196,4,3,2,1968,21786,0,0,0,0,0,1,0,1,0,0,1,0


In [51]:
# Split df2 into response and predictors

In [52]:
# Response vector and predictor matrix for scikit-learn.
Y = df2['price']
X = df2.drop(columns=['price'])

In [53]:
# Number of predictor columns after dummy coding.
X.shape[1]

18

In [54]:
# fit model

In [55]:
# Least-squares fit with scikit-learn. Note that this rebinds `model2`, which
# previously held the reduced statsmodels fit.
model2 = LinearRegression().fit(X=X, y=Y)

In [56]:
# Fitted intercept of the scikit-learn LinearRegression.
model2.intercept_

-2472913.9489935217

In [57]:
# suppress scientific notation (on numpy output)

In [58]:
# Print small/large floats in fixed-point rather than scientific notation;
# the precision setting is left at its current value.
np.set_printoptions(suppress=True)

In [59]:
# model2 regression coefficients

In [60]:
# Table of fitted coefficients, rounded to two decimals and labelled with the
# predictor names.
df3 = pd.DataFrame(
    {'coef': model2.coef_.round(decimals=2)},
    index=X.columns,
)
df3

Unnamed: 0,coef
area,117.22
beds,-2222.98
baths,9246.56
garage,7423.74
year,1300.62
lotsize,1.33
style_2,-20059.7
style_3,-11511.87
style_4,23097.27
style_5,-7407.63


In [61]:
# create DataFrame for prediction

In [62]:
# One-row template in the dummy-coded layout: the first row of df2 without the
# response column.
newvalue = df2.head(1).drop(columns='price')
newvalue

Unnamed: 0,area,beds,baths,garage,year,lotsize,style_2,style_3,style_4,style_5,style_6,style_7,style_9,ac_YES,pool_YES,quality_LOW,quality_MEDIUM,highway_YES
0,3032,4,4,2,1972,22221,0,0,0,0,0,0,0,1,0,0,1,0


In [63]:
# Column labels of the dummy-coded prediction row.
newvalue.columns

Index(['area', 'beds', 'baths', 'garage', 'year', 'lotsize', 'style_2',
       'style_3', 'style_4', 'style_5', 'style_6', 'style_7', 'style_9',
       'ac_YES', 'pool_YES', 'quality_LOW', 'quality_MEDIUM', 'highway_YES'],
      dtype='object')

In [64]:
# Describe the new home in the dummy-coded layout used to fit model2.
# The style indicators are reset explicitly instead of being inherited from
# the template row: the home has style 1, the level dropped by drop_first,
# so every style_* column must be 0 whatever the template row contained.
# Columns are set by name through assign() rather than by attribute access, so
# a mistyped name cannot quietly create a plain attribute.
newvalue = newvalue.assign(
    **{col: 0 for col in newvalue.columns if col.startswith('style_')},
    area=3150,
    beds=2,
    baths=3,
    garage=2,
    year=1996,
    lotsize=26250,
    ac_YES=1,
    pool_YES=1,
    quality_LOW=0,      # quality HIGH has no indicator column of its own
    quality_MEDIUM=0,
    highway_YES=0,
)
newvalue

Unnamed: 0,area,beds,baths,garage,year,lotsize,style_2,style_3,style_4,style_5,style_6,style_7,style_9,ac_YES,pool_YES,quality_LOW,quality_MEDIUM,highway_YES
0,3150,2,3,2,1996,26250,0,0,0,0,0,0,0,1,1,0,0,0


In [65]:
# Predicted price for the new home from the scikit-learn fit.
model2.predict(X=newvalue)

array([585922.52679621])