In [None]:
# Reading in the libraries and functions that we will need as we do this work.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
 
import scipy.stats as st
import statsmodels.api as sm 
import pylab as py 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import PredictionErrorDisplay
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
ames = pd.read_csv(r"C:\Users\andre\Downloads\ames2.csv", na_values={'NA'})
ames.dropna(inplace=True)
ames.head()

Unnamed: 0,LotArea,GrLivArea,OverallQual,FullBath,TotalPorchSF,BsmtFinSF1,GaragedCat,TotRmsAbvGrd,TotalBsmtSF,YearBuilt,YrSold,BsmtUnfSF,GarageArea,MoSold,PavedDrive01,MSSubClass,logSalePrice
0,9120,1820,7,2,100,329,1,8,1026,1925,2008,697,240,6,0,50,12.122691
1,4060,1337,6,2,68,266,0,5,1405,1998,2008,1139,511,8,1,120,12.106252
2,34650,1056,5,1,0,1056,0,5,1056,1955,2006,0,572,1,1,190,11.884489
3,21750,1771,5,1,0,0,0,9,0,1960,2009,0,336,11,1,20,11.652687
4,11500,845,4,1,0,0,1,5,0,1957,2009,0,290,1,0,20,11.338572


In [8]:
# Make model for first few features to get a baseline for our model. We will add more features later and see how the model changes.
X_base = ames[['LotArea','GrLivArea','OverallQual']]
y = ames['logSalePrice']
model_base = LinearRegression()
model_base.fit(X_base, y)
model_base1 = sm.OLS(y, sm.add_constant(X_base)).fit()
print(model_base1.summary())


                            OLS Regression Results                            
Dep. Variable:           logSalePrice   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     1129.
Date:                Mon, 09 Feb 2026   Prob (F-statistic):          7.98e-320
Time:                        19:16:13   Log-Likelihood:                 227.66
No. Observations:                1000   AIC:                            -447.3
Df Residuals:                     996   BIC:                            -427.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          10.4722      0.028    374.411      

In [11]:
# Now will check remaining features to see if r^2 adj increases with the addition of more features. We will add features one at a time and see how the model changes.
remaining_features = [
    'FullBath','TotalPorchSF','BsmtFinSF1','GaragedCat',
    'TotRmsAbvGrd','TotalBsmtSF','YearBuilt','YrSold',
    'BsmtUnfSF','GarageArea','MoSold','PavedDrive01','MSSubClass'
]

useful_features = []

for feature in remaining_features:
    X_test = sm.add_constant(
        ames[['LotArea', 'GrLivArea', 'OverallQual', feature]]
    )
    model_test = sm.OLS(y, X_test).fit()
    
    useful_features.append(
        {
            "Variable Added": feature,
            "Adj_R2": model_test.rsquared_adj,
            "Delta_Adj_R2": model_test.rsquared_adj - model_base1.rsquared_adj
        }
    )

results_df = pd.DataFrame(useful_features).sort_values(
    "Delta_Adj_R2", ascending=False
)

results_df

Unnamed: 0,Variable Added,Adj_R2,Delta_Adj_R2
6,YearBuilt,0.821723,0.049676
2,BsmtFinSF1,0.807187,0.03514
9,GarageArea,0.806675,0.034628
5,TotalBsmtSF,0.805308,0.033261
11,PavedDrive01,0.795632,0.023585
12,MSSubClass,0.779044,0.006997
3,GaragedCat,0.777944,0.005897
0,FullBath,0.774533,0.002486
8,BsmtUnfSF,0.774092,0.002045
4,TotRmsAbvGrd,0.773226,0.001178


- Will add strongest features to make new model

In [13]:
x_final = ames[['LotArea', 'OverallQual', 'GrLivArea', 'YearBuilt', 'BsmtFinSF1', 'GarageArea', 'PavedDrive01']]
y_final = ames['logSalePrice']
model_final = sm.OLS(y_final, sm.add_constant(x_final)).fit()
print(model_final.summary())

                            OLS Regression Results                            
Dep. Variable:           logSalePrice   R-squared:                       0.859
Model:                            OLS   Adj. R-squared:                  0.858
Method:                 Least Squares   F-statistic:                     861.2
Date:                Mon, 09 Feb 2026   Prob (F-statistic):               0.00
Time:                        19:34:12   Log-Likelihood:                 465.26
No. Observations:                1000   AIC:                            -914.5
Df Residuals:                     992   BIC:                            -875.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            5.9619      0.415     14.352   

In [14]:
# Checking for multicollinearity with VIF
vif_data = pd.DataFrame()
vif_data["feature"] = x_final.columns
vif_data["VIF"] = [variance_inflation_factor(x_final.values, i) for i in range(len(x_final.columns))]
print(vif_data)

        feature        VIF
0       LotArea   3.039931
1   OverallQual  41.046476
2     GrLivArea  17.476771
3     YearBuilt  30.140681
4    BsmtFinSF1   2.275390
5    GarageArea   9.286744
6  PavedDrive01  13.123496
