In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.regression.linear_model as sm

# algo and estimators
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('../../datasets/house_pricing.csv')
df.head()

Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12
3,SACRAMENTO,Residential,2,1,852,130904.95
4,SACRAMENTO,Residential,2,1,797,120266.19


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


In [4]:
df.describe(include='object')

Unnamed: 0,City,Type
count,814,814
unique,36,3
top,SACRAMENTO,Residential
freq,424,759


In [5]:
city_henc = OneHotEncoder(drop='first')
city_dummies = city_henc.fit_transform(df[['City']]).toarray()

In [6]:
city_dummies.shape

(814, 35)

In [7]:
type_henc = OneHotEncoder(drop='first')
type_dummies = type_henc.fit_transform(df[['Type']]).toarray()

In [8]:
type_dummies.shape

(814, 2)

In [9]:
df.drop(labels=['City','Type'],axis=1,inplace=True)

In [10]:
df.values

array([[2.0000000e+00, 1.0000000e+00, 8.3600000e+02, 1.3815985e+05],
       [3.0000000e+00, 1.0000000e+00, 1.1670000e+03, 1.6754146e+05],
       [2.0000000e+00, 1.0000000e+00, 7.9600000e+02, 1.1909512e+05],
       ...,
       [3.0000000e+00, 2.0000000e+00, 1.2160000e+03, 1.8174698e+05],
       [4.0000000e+00, 2.0000000e+00, 1.6850000e+03, 2.4538559e+05],
       [3.0000000e+00, 2.0000000e+00, 1.3620000e+03, 2.6335513e+05]])

In [11]:
y = df['Price']

In [12]:
X = df.drop(labels=['Price'],axis=1).values

In [13]:
X.shape

(814, 3)

In [14]:
X = np.concatenate((X,city_dummies,type_dummies),axis=1)

In [15]:
X.shape

(814, 40)

In [16]:
X = np.append(np.ones((814,1)),values=X,axis=1)

In [83]:
z = [0,3,4,5,6,14,15,16,17,26,28,30,31,35,36,37,38]
X_opt = X[:,z]
regressor = sm.OLS(endog=y,exog=X_opt).fit()
regressor.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.881
Model:,OLS,Adj. R-squared:,0.878
Method:,Least Squares,F-statistic:,367.6
Date:,"Wed, 04 Nov 2020",Prob (F-statistic):,0.0
Time:,11:40:24,Log-Likelihood:,-9640.7
No. Observations:,814,AIC:,19320.0
Df Residuals:,797,BIC:,19400.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.968e+04,3269.304,9.079,0.000,2.33e+04,3.61e+04
x1,127.9755,1.928,66.384,0.000,124.191,131.760
x2,1.143e+05,1.97e+04,5.805,0.000,7.57e+04,1.53e+05
x3,1.224e+05,3.41e+04,3.591,0.000,5.55e+04,1.89e+05
x4,7.689e+04,7716.227,9.964,0.000,6.17e+04,9.2e+04
x5,6.985e+04,1.14e+04,6.118,0.000,4.74e+04,9.23e+04
x6,1.098e+05,8966.462,12.242,0.000,9.22e+04,1.27e+05
x7,902.2751,7717.606,0.117,0.907,-1.42e+04,1.61e+04
x8,8.274e+04,1.97e+04,4.194,0.000,4.4e+04,1.21e+05

0,1,2,3
Omnibus:,988.31,Durbin-Watson:,1.86
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96003.681
Skew:,6.147,Prob(JB):,0.0
Kurtosis:,54.763,Cond. No.,50700.0


In [84]:
regressor.pvalues.sort_values(ascending=False)

x7       9.069598e-01
x15      8.414185e-01
x11      7.537006e-01
x12      2.622322e-01
x10      1.033002e-02
x3       3.494249e-04
x14      1.365296e-04
x8       3.051416e-05
x2       9.306833e-09
x5       1.488749e-09
const    8.448303e-19
x16      2.248079e-19
x9       1.473126e-20
x4       4.030878e-22
x6       1.068613e-31
x13      7.916157e-83
x1       0.000000e+00
dtype: float64