In [513]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [514]:
# Load the housing dataset; the path is relative to the notebook's working directory.
DATASET_PATH = "../../dataset/housing_price_dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [515]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [516]:
# Summary statistics for the numeric columns.
# NOTE(review): the output recorded below shows a negative minimum Price; such rows
# look like data errors and are not filtered in any of the visible cells.
summary_stats = df.describe()
summary_stats

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2006.37468,3.4987,1.99542,1985.40442,224827.325151
std,575.513241,1.116326,0.815851,20.719377,76141.842966
min,1000.0,2.0,1.0,1950.0,-36588.165397
25%,1513.0,3.0,1.0,1967.0,169955.860225
50%,2007.0,3.0,2.0,1985.0,225052.141166
75%,2506.0,4.0,3.0,2003.0,279373.630052
max,2999.0,5.0,3.0,2021.0,492195.259972


In [517]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [518]:
# Change YearBuilt into the age of the building.
# The reference year is pinned instead of read from the clock: using the calendar
# year at run time would make `Age` drift each time the notebook is re-executed.
# 2026 reproduces the recorded output below (built 1969 -> age 57).
REFERENCE_YEAR = 2026

# Guarded so that re-running this cell after YearBuilt has already been replaced is a
# no-op instead of a KeyError.
if 'YearBuilt' in df.columns:
    df = (
        df
        .assign(Age=REFERENCE_YEAR - df['YearBuilt'])
        .drop(columns=['YearBuilt'])
    )

df.head()


Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,Price,Age
0,2126,4,1,Rural,215355.283618,57
1,2459,3,2,Rural,195014.221626,46
2,1860,2,1,Suburb,306891.012076,56
3,2294,2,1,Urban,206786.787153,30
4,2130,5,2,Suburb,272436.239065,25


In [519]:
# Row count per Neighborhood category, to check how balanced the classes are.
neighborhood_counts = df['Neighborhood'].value_counts()
neighborhood_counts

Neighborhood
Suburb    16721
Rural     16676
Urban     16603
Name: count, dtype: int64

In [520]:
# One-hot encode Neighborhood directly as 0/1 integer columns.
# drop_first=True omits the first category, leaving Neighborhood_Suburb and
# Neighborhood_Urban as the indicator columns.
df = pd.get_dummies(df, columns=['Neighborhood'], drop_first=True, dtype=int)


In [521]:
# Preview the first five rows again to confirm the encoded indicator columns.
df.head(n=5)

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Price,Age,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,215355.283618,57,0,0
1,2459,3,2,195014.221626,46,0,0
2,1860,2,1,306891.012076,56,1,0
3,2294,2,1,206786.787153,30,0,1
4,2130,5,2,272436.239065,25,1,0


In [522]:
# Correlation of every numeric column with Price, listed from highest to lowest.
price_corr = df.corr(numeric_only=True)['Price']
price_corr.sort_values(ascending=False)


Price                  1.000000
SquareFeet             0.750720
Bedrooms               0.072624
Bathrooms              0.028418
Neighborhood_Urban     0.021658
Age                    0.002288
Neighborhood_Suburb   -0.014831
Name: Price, dtype: float64

In [523]:
# Separate the target (Price) from the feature matrix (all remaining columns).
target_col = 'Price'
y = df[target_col]
x = df.drop(columns=[target_col])


In [524]:
# Standardize the feature scales
from sklearn.preprocessing  import StandardScaler

In [525]:
# Fit the scaler on the feature matrix, then replace x with its standardized values.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# performed in a later cell, so test-set statistics feed into the scaling.
# Consider fitting on the training rows only.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)


In [526]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [527]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [528]:
# Baseline model: linear regression trained on the training split.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

# Score the hold-out predictions with each metric in turn.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

for label, value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("R^2 Score:", r2),
):
    print(label, value)



Mean Absolute Error: 39430.16533829791
Mean Squared Error: 2436249371.3072467
R^2 Score: 0.5755628630306235


In [531]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting: 200 depth-3 trees with a 0.05 learning rate and a fixed seed.
gbr_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
gbr = GradientBoostingRegressor(**gbr_params).fit(x_train, y_train)
y_pred = gbr.predict(x_test)

# Score the hold-out predictions with each metric in turn.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

for label, value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("R^2 Score:", r2),
):
    print(label, value)



Mean Absolute Error: 39456.73960785616
Mean Squared Error: 2442732171.879167
R^2 Score: 0.5744334460888706


In [532]:
# Try another model to see whether it works any better: a random forest.

from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed as the split and the boosting model) so the metrics
# printed below are identical on every run; without it each run scores differently.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Hold-out metrics for the random forest.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


Mean Absolute Error: 41920.7556060296
Mean Squared Error: 2767092936.8387284
R^2 Score: 0.5179241428763011


In [533]:
# Try a single decision tree for comparison.

from sklearn.tree import DecisionTreeRegressor

# Seed the tree so the fit, and therefore the metrics printed below, is repeatable
# (scikit-learn permutes candidate features at each split).
dt = DecisionTreeRegressor(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

# Hold-out metrics for the decision tree.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Absolute Error: 57851.38945891545
Mean Squared Error: 5268817222.663641
R^2 Score: 0.0820801336924103
