In [2]:
# reading in the libraries and functions that we will need as we do this work.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
 
import scipy.stats as st
import statsmodels.api as sm 
import pylab as py 

# here are some of the tools we will use for our analyses
from sklearn.linear_model import LinearRegression
from sklearn.metrics import PredictionErrorDisplay
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
ames = pd.read_csv(r"C:\Users\andre\Downloads\ames2.csv", na_values={'NA'})
ames.dropna(inplace=True)
ames.head()

Unnamed: 0,LotArea,GrLivArea,OverallQual,FullBath,TotalPorchSF,BsmtFinSF1,GaragedCat,TotRmsAbvGrd,TotalBsmtSF,YearBuilt,YrSold,BsmtUnfSF,GarageArea,MoSold,PavedDrive01,MSSubClass,logSalePrice
0,9120,1820,7,2,100,329,1,8,1026,1925,2008,697,240,6,0,50,12.122691
1,4060,1337,6,2,68,266,0,5,1405,1998,2008,1139,511,8,1,120,12.106252
2,34650,1056,5,1,0,1056,0,5,1056,1955,2006,0,572,1,1,190,11.884489
3,21750,1771,5,1,0,0,0,9,0,1960,2009,0,336,11,1,20,11.652687
4,11500,845,4,1,0,0,1,5,0,1957,2009,0,290,1,0,20,11.338572


In [4]:
# Make model for all variables
X_all = ames[['LotArea','GrLivArea','OverallQual','FullBath','TotalPorchSF','BsmtFinSF1','GaragedCat','TotRmsAbvGrd','TotalBsmtSF','YearBuilt','YrSold','BsmtUnfSF','GarageArea','MoSold','PavedDrive01','MSSubClass']]
y = ames['logSalePrice']
model_all = LinearRegression()
model_all.fit(X_all, y)
model_all1 = sm.OLS(y, sm.add_constant(X_all)).fit()
print(model_all1.summary())


                            OLS Regression Results                            
Dep. Variable:           logSalePrice   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     405.7
Date:                Thu, 05 Feb 2026   Prob (F-statistic):               0.00
Time:                        15:49:06   Log-Likelihood:                 501.20
No. Observations:                1000   AIC:                            -968.4
Df Residuals:                     983   BIC:                            -885.0
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           14.5370      7.208      2.017   

In [5]:
# Checking correlations(r) between independent variables
r = X_all.corr()
print(r)

               LotArea  GrLivArea  OverallQual  FullBath  TotalPorchSF  \
LotArea       1.000000   0.271826     0.124242  0.155140      0.089529   
GrLivArea     0.271826   1.000000     0.610830  0.639111      0.255120   
OverallQual   0.124242   0.610830     1.000000  0.579177      0.174789   
FullBath      0.155140   0.639111     0.579177  1.000000      0.083149   
TotalPorchSF  0.089529   0.255120     0.174789  0.083149      1.000000   
BsmtFinSF1    0.174474   0.136687     0.215519  0.080169      0.019961   
GaragedCat   -0.130252  -0.103092    -0.246449 -0.213392     -0.006152   
TotRmsAbvGrd  0.190936   0.842012     0.461186  0.542785      0.173314   
TotalBsmtSF   0.235157   0.403473     0.539971  0.340306      0.130104   
YearBuilt     0.036196   0.220854     0.557037  0.505925     -0.135091   
YrSold       -0.023266   0.016548    -0.005187  0.046511     -0.037365   
BsmtUnfSF     0.030978   0.242545     0.311923  0.266291      0.073482   
GarageArea    0.202571   0.462122     

In [6]:
# Checking for multicollinearity with VIF
vif_data_all = pd.DataFrame()
vif_data_all["feature"] = X_all.columns
vif_data_all["VIF"] = [variance_inflation_factor(X_all.values, i) for i in range(len(X_all.columns))]
print(vif_data_all)

         feature           VIF
0        LotArea      3.241113
1      GrLivArea     54.813909
2    OverallQual     54.133025
3       FullBath     21.364582
4   TotalPorchSF      1.997963
5     BsmtFinSF1     16.152773
6     GaragedCat      2.100484
7   TotRmsAbvGrd     63.973197
8    TotalBsmtSF     57.797499
9      YearBuilt  11136.591626
10        YrSold  10366.300345
11     BsmtUnfSF     21.004217
12    GarageArea     11.100300
13        MoSold      6.591728
14  PavedDrive01     14.779276
15    MSSubClass      3.485891


In [8]:
# Making model after removing variables with high VIF
X_reduced = ames[['LotArea', 'GrLivArea', 'GarageArea', 'OverallQual', 'BsmtFinSF1', 'YearBuilt']]
y = ames['logSalePrice']
model_reduced = LinearRegression()
model_reduced.fit(X_reduced, y)
model_reduced1 = sm.OLS(y, sm.add_constant(X_reduced)).fit()
print(model_reduced1.summary())

                            OLS Regression Results                            
Dep. Variable:           logSalePrice   R-squared:                       0.856
Model:                            OLS   Adj. R-squared:                  0.855
Method:                 Least Squares   F-statistic:                     985.4
Date:                Thu, 05 Feb 2026   Prob (F-statistic):               0.00
Time:                        16:00:19   Log-Likelihood:                 456.52
No. Observations:                1000   AIC:                            -899.0
Df Residuals:                     993   BIC:                            -864.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           5.4309      0.399     13.618      

In [9]:
# Check VIF scores
vif_data_reduced = pd.DataFrame()
vif_data_reduced["feature"] = X_reduced.columns
vif_data_reduced["VIF"] = [variance_inflation_factor(X_reduced.values, i) for i in range(len(X_reduced.columns))]
print(vif_data_reduced)

       feature        VIF
0      LotArea   3.038864
1    GrLivArea  17.364805
2   GarageArea   9.017902
3  OverallQual  40.810212
4   BsmtFinSF1   2.233396
5    YearBuilt  22.752201
