In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler



In [3]:
# Load the engineered dataset. read_csv already returns a DataFrame, so the
# extra pd.DataFrame(...) wrapper is unnecessary.
df = pd.read_csv("data_engineered.csv")
f = df  # alias retained in case other cells still refer to `f`

In [4]:
# Summarise the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2617 entries, 0 to 2616
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SalePrice        2617 non-null   float64
 1   LnSalePrice      2617 non-null   float64
 2   Age              2617 non-null   float64
 3   GrLivArea        2617 non-null   float64
 4   BaseLivArea      2617 non-null   float64
 5   Location         2617 non-null   int64  
 6   Amenities        2617 non-null   int64  
 7   RoadRail         2617 non-null   int64  
 8   BedroomAbvGr     2617 non-null   float64
 9   Bathrooms        2617 non-null   float64
 10  OverallCond      2617 non-null   float64
 11  OverallQual      2617 non-null   float64
 12  LotFrontage      2617 non-null   float64
 13  LotArea          2617 non-null   float64
 14  TwoStory_dum     2617 non-null   int64  
 15  FlatContour_dum  2617 non-null   int64  
 16  FlatRoof_dum     2617 non-null   int64  
 17  GarageArea    

In [5]:
# Keep only the rows in which 'GarageArea' is populated, then dump the result
df = df.loc[df['GarageArea'].notnull()]
print(df)

      SalePrice  LnSalePrice   Age  GrLivArea  BaseLivArea  Location  \
0      215000.0    12.278393  50.0     1656.0        639.0         2   
1      105000.0    11.561716  49.0      896.0        612.0         2   
2      172000.0    12.055250  52.0     1329.0        923.0         2   
3      244000.0    12.404924  42.0     2110.0       1065.0         2   
4      189900.0    12.154253  13.0     1629.0        791.0         2   
...         ...          ...   ...        ...          ...       ...   
2612   142500.0    11.867097  22.0     1003.0        819.0         2   
2613   131000.0    11.782953  23.0      902.0        625.0         2   
2614   132000.0    11.790557  14.0      970.0        337.0         2   
2615   170000.0    12.043554  32.0     1389.0       1194.0         2   
2616   188000.0    12.144197  13.0     2000.0        758.0         2   

      Amenities  RoadRail  BedroomAbvGr  Bathrooms  ...  LowQualFinSF  \
0             0         0           3.0        1.0  ...       

In [6]:
# True if any cell anywhere in the frame is still missing
df.isna().to_numpy().any()

False

In [7]:
# Remove the 'Bathrooms' column from the working frame
df = df.drop(columns=['Bathrooms'])

In [8]:
# Modelling subset: every row that is not flagged YrSold_2010 == 1
df_0609 = df[~(df['YrSold_2010'] == 1)]
df_0609.shape

(2307, 29)

In [9]:
# Rows flagged YrSold_2010 == 1 are set aside as the holdout test set.
# NOTE(review): df_2010 is not referenced by any later cell visible here.
df_2010 = df[df['YrSold_2010'] == 1]

In [10]:
# Candidate targets: the raw sale price and its logged counterpart column
y_SP, y_lnSP = df_0609['SalePrice'], df_0609['LnSalePrice']

In [11]:
# Feature matrix: everything in the modelling subset except the two targets
X = df_0609.drop(columns=['SalePrice', 'LnSalePrice'])

In [12]:
# Confirm the feature matrix dimensions (rows, columns)
X.shape

(2307, 27)

In [13]:
# 70/30 train/test partition against the 'LnSalePrice' target, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_lnSP,
    test_size=0.3,
    random_state=8,
)

In [14]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions (the test rows never influence the fit).
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [15]:
# Ordinary least-squares baseline fitted on the scaled training features;
# the target is y_train, i.e. the 'LnSalePrice' column.
linear_regression_model = LinearRegression()
linear_regression_model.fit(X=X_train, y=y_train)

In [16]:
# Score the scaled test partition with the fitted linear model
y_pred = linear_regression_model.predict(X=X_test)


In [17]:
# Error metrics on the test partition (same units as the 'LnSalePrice' target)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [18]:
# Report each metric on its own line as "<label> <value>"
for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
):
    print(label, value)

Mean Absolute Error (MAE): 0.07703466855469435
Mean Squared Error (MSE): 0.011182778443829379
Root Mean Squared Error (RMSE): 0.10574865693629106


In [19]:
# Polynomial (2nd-degree) regression model for sale price prediction.
# The target is y_lnSP ('LnSalePrice'), so every metric below is in the units
# of that column.
#
# NOTE(review): this cell draws its own split (test_size=0.2, random_state=42),
# which differs from the split used for the linear model (test_size=0.3,
# random_state=8). The two models are therefore scored on different rows and
# their test metrics are not directly comparable.
train_x, test_x, train_y, test_y = train_test_split(X, y_lnSP, test_size=0.2, random_state=42)

# Expand the features with all degree-2 terms; the expansion is fitted on the
# training rows and re-applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2)
train_x_poly = poly.fit_transform(train_x)
test_x_poly = poly.transform(test_x)

# Fit ordinary least squares on the expanded feature set
model = LinearRegression()
model.fit(train_x_poly, train_y)

# Predict on both partitions so train and test error can be compared
train_y_pred = model.predict(train_x_poly)
test_y_pred = model.predict(test_x_poly)

# Calculate metrics
train_mae = mean_absolute_error(train_y, train_y_pred)
test_mae = mean_absolute_error(test_y, test_y_pred)

train_mse = mean_squared_error(train_y, train_y_pred)
test_mse = mean_squared_error(test_y, test_y_pred)

train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Print metrics. The train figures were previously computed but never shown;
# reporting them next to the test figures makes any train/test gap
# (over-fitting) visible.
print("Train MAE:", train_mae)
print("Train MSE:", train_mse)
print("Train RMSE:", train_rmse)

print("Test MAE:", test_mae)
print("Test MSE:", test_mse)
print("Test RMSE:", test_rmse)


Test MAE: 0.08800245095837231
Test MSE: 0.053181719618027
Test RMSE: 0.23061162073500763
