# 🕵️‍♀️ Inference analysis with OLS (Ordinary Least Squares)

In [1]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset from 'clean_df.csv' (path is relative to the working directory).
# NOTE(review): the file name suggests the data is already cleaned — confirm upstream.
df = pd.read_csv(filepath_or_buffer='clean_df.csv')

# OLS without feature scaling

In [3]:
# Response variable: the column the regression tries to explain.
y = df['median_house_value']
# Regressors: every remaining column of the frame.
X = df.drop(columns=['median_house_value'])

In [4]:
# Split the dataset.
# random_state pins the shuffle so that Restart Kernel -> Run All reproduces the
# same split, and therefore the same coefficients and summary, on every run.
# NOTE(review): train_size=0.3 fits the model on only 30% of the rows, and the
# test split is not used in the cells visible here — confirm this was not meant
# to be test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=42
)

# OLS regression without standard scaling.
# sm.OLS does not add an intercept by itself, so a constant column is prepended.
X_train_smOLS = sm.add_constant(X_train)
smOLS = sm.OLS(y_train, X_train_smOLS).fit()

print(smOLS.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.650
Model:                            OLS   Adj. R-squared:                  0.649
Method:                 Least Squares   F-statistic:                     764.1
Date:                Fri, 24 Sep 2021   Prob (F-statistic):               0.00
Time:                        09:47:47   Log-Likelihood:                -62145.
No. Observations:                4953   AIC:                         1.243e+05
Df Residuals:                    4940   BIC:                         1.244e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const              -1.761e+06   1.45

# OLS with feature scaling (standardization)

In [6]:
# Standardize the features (per-column zero mean and unit variance).
# StandardScaler returns a bare NumPy array, which drops the column names and
# makes the regression summary label the regressors x1, x2, ... instead of the
# feature names. Re-wrap the result in a DataFrame so every coefficient stays
# attached to its feature. The index must be X_train's index so the rows still
# align with y_train when the model is fitted.
X_std = pd.DataFrame(
    StandardScaler().fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [7]:
# Fit OLS on the standardized features.
# An explicit intercept column is prepended before fitting. Note that the fitted
# result is bound to `smOLS` again, replacing the model fitted on the unscaled
# features earlier in the notebook.
X_train_smOLS_std = sm.add_constant(X_std)
ols_std_spec = sm.OLS(y_train, X_train_smOLS_std)
smOLS = ols_std_spec.fit()

print(smOLS.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.647
Model:                            OLS   Adj. R-squared:                  0.646
Method:                 Least Squares   F-statistic:                     824.1
Date:                Wed, 22 Sep 2021   Prob (F-statistic):               0.00
Time:                        15:44:31   Log-Likelihood:                -62179.
No. Observations:                4953   AIC:                         1.244e+05
Df Residuals:                    4941   BIC:                         1.245e+05
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.074e+05    974.716    212.831      0.0