In [24]:
# Common imports
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# data processing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from joblib import dump, load

In [25]:
data = pd.read_csv("data_processed.csv")
data

Unnamed: 0,Overall Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Garage Cars,Garage Area,SalePrice
0,6,1080.0,1656,1656,2.0,528.0,215000
1,5,882.0,896,896,1.0,730.0,105000
2,6,1329.0,1329,1329,1.0,312.0,172000
3,7,2110.0,2110,2110,2.0,522.0,244000
4,5,928.0,928,1629,2.0,482.0,189900
...,...,...,...,...,...,...,...
2923,6,1003.0,1003,1003,2.0,588.0,142500
2924,5,864.0,902,902,2.0,484.0,131000
2925,5,912.0,970,970,0.0,0.0,132000
2926,5,1389.0,1389,1389,2.0,418.0,170000


#### Define X and Y

In [77]:
### Best model performance using only `Overral Qual`, `Total Bsmt SF`, `Gr Liv Area` and `Garage Cars`
X=data.drop(['SalePrice', 'Garage Area', '1st Flr SF'], axis=1)

y=data['SalePrice']

#### Scaling on the feature for linear model

In [78]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Model 1 - Linear model

In [79]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [80]:
# Create a linear regression model
linearmodel = LinearRegression()

# Fit the model to the training data
linearmodel.fit(X_train, y_train)


LinearRegression()

In [81]:
y_pred = linearmodel.predict(X_test)

In [82]:
MAE_linear = metrics.mean_absolute_error(y_test,y_pred)
MSE_linear = metrics.mean_squared_error(y_test,y_pred)
RMSE_linear = np.sqrt(MSE_linear)

pd.DataFrame([MAE_linear, MSE_linear, RMSE_linear],
             index=['MAE', 'MSE', 'RMSE'], columns=['metrics'])

Unnamed: 0,metrics
MAE,25877.38
MSE,1814903000.0
RMSE,42601.68


### Model 2- Polynomial model

#### Transform features into polynomial terms

In [83]:
# Create polynomial features of degree 2
polynomial_converter = PolynomialFeatures(degree=2, include_bias=False)
X_poly = polynomial_converter.fit_transform(X_scaled)


In [84]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)


#### Training a Polynomial Regression Model

In [85]:
polymodel=LinearRegression()

In [86]:
polymodel.fit(X_train, y_train)

LinearRegression()

#### Predicting testing data

In [87]:
y_pred=polymodel.predict(X_test)

In [88]:
pd.DataFrame({'Y_Test': y_test,'Y_Pred':y_pred, 'Residuals':(y_test-y_pred) }).head(5)

Unnamed: 0,Y_Test,Y_Pred,Residuals
2390,344133,276996.872565,67136.127435
196,132000,135464.226525,-3464.226525
2307,192100,249802.240131,-57702.240131
1731,198444,187668.17415,10775.82585
1100,250000,297120.103327,-47120.103327


In [89]:
MAE_Poly = metrics.mean_absolute_error(y_test,y_pred)
MSE_Poly = metrics.mean_squared_error(y_test,y_pred)
RMSE_Poly = np.sqrt(MSE_Poly)

pd.DataFrame([MAE_Poly, MSE_Poly, RMSE_Poly],
             index=['MAE', 'MSE', 'RMSE'], columns=['metrics'])


Unnamed: 0,metrics
MAE,21630.17
MSE,1036006000.0
RMSE,32187.04


### Model 3 - Random Forest Model

In [90]:
# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [91]:
# Initialize and train the Random Forest regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # 100 trees in the forest
rf_model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [92]:
# Predict on the test set
y_pred = rf_model.predict(X_test)

In [93]:
MAE_rf = metrics.mean_absolute_error(y_test,y_pred)
MSE_rf = metrics.mean_squared_error(y_test,y_pred)
RMSE_rf = np.sqrt(MSE_linear)

pd.DataFrame([MAE_rf, MSE_rf, RMSE_rf],
             index=['MAE', 'MSE', 'RMSE'], columns=['metrics'])

Unnamed: 0,metrics
MAE,20312.28
MSE,906887700.0
RMSE,42601.68
