<a href="https://colab.research.google.com/github/bakudbilla/house_price-prediction/blob/main/multiple_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the necessary python libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [None]:
# Load the housing-price dataset from the remote CSV file into a DataFrame
url = "https://raw.githubusercontent.com/kennedyuche/linear-regression/main/housing_price_dataset.csv"
data = pd.read_csv(url, sep=",")  # comma separator, spelled out (it is also the pandas default)

In [None]:
# Display the loaded DataFrame (the repr is truncated to the first and last rows)
data


Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [None]:
# Show summary statistics for the DataFrame.
# NOTE(review): the saved output below shows a single ';'-joined column, which does not
# match the six columns reported by data.info() -- it looks stale; re-run to confirm.
data.describe()

Unnamed: 0,SquareFeet;Bedrooms;Bathrooms;Neighborhood;YearBuilt;Price
count,50000
unique,50000
top,2126;4;1;Rural;1969;215355.2836182
freq,1


In [None]:
# Check for missing data: count the null values in each column.
# NOTE(review): the saved output below lists a single ';'-joined column rather than the
# six columns reported by data.info() -- it looks stale; re-run to confirm.
data.isnull().sum()

SquareFeet;Bedrooms;Bathrooms;Neighborhood;YearBuilt;Price    0
dtype: int64

# Data Preprocessing

In [None]:
# Convert the categorical Neighborhood column to integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target values, and the
# integer codes give a nominal feature an arbitrary order in a linear model -- consider
# one-hot encoding instead; confirm nothing downstream relies on `label_encoder` first.
label_encoder = LabelEncoder()

# Learn the category -> code mapping, then overwrite the column with the codes.
label_encoder.fit(data["Neighborhood"])
data["Neighborhood"] = label_encoder.transform(data["Neighborhood"])

In [None]:
# Re-display the DataFrame to inspect the Neighborhood column after encoding
data

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,0,1969,215355.283618
1,2459,3,2,0,1980,195014.221626
2,1860,2,1,1,1970,306891.012076
3,2294,2,1,2,1996,206786.787153
4,2130,5,2,1,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,0,1975,100080.865895
49996,2854,2,2,1,1988,374507.656727
49997,2979,5,3,1,1962,384110.555590
49998,2596,5,2,0,1984,380512.685957


In [None]:
# Separate the target (Price) from the predictor columns.
# NOTE(review): YearBuilt is excluded from the features as well; the reason is not stated here.
y = data["Price"]
x = data.drop(columns=["Price", "YearBuilt"])

In [None]:
# Display the feature matrix (every column except Price and YearBuilt)
x

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood
0,2126,4,1,0
1,2459,3,2,0
2,1860,2,1,1
3,2294,2,1,2
4,2130,5,2,1
...,...,...,...,...
49995,1282,5,3,0
49996,2854,2,2,1
49997,2979,5,3,1
49998,2596,5,2,0


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training

In [None]:
# Create a linear regression estimator and train it on the training split
model = LinearRegression()

model.fit(X=x_train, y=y_train)

# Model Evaluation

In [None]:
# Generate price predictions for the held-out test features
y_pred = model.predict(X=x_test)

In [None]:
# Display the test-set features that the predictions were made for
x_test

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood
33553,1894,5,1,0
9427,1001,5,3,1
199,2264,4,3,1
12447,2299,5,1,1
39489,2651,2,1,1
...,...,...,...,...
28567,2005,3,3,2
25079,1725,4,2,1
18707,2885,3,2,2
15200,1674,5,2,2


In [None]:
# Display the array of predicted prices for the test set
y_pred

array([217838.91793321, 135758.35530229, 256005.99550325, ...,
       310213.01289175, 200362.96188532, 243624.01808175])

In [None]:
# Score the test-set predictions: mean squared error and R2 (coefficient of determination)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the raw mean squared error (in squared units of the Price target)
mse

2437198334.2361856

In [None]:
# Display the raw R2 score computed on the test set
r2

0.5753975371352564

In [None]:
# Report both metrics rounded to two decimal places, one per line
print(
    f"MSE => {round(mse, 2)}",
    f"R2 => {round(r2, 2)}",
    sep="\n",
)

MSE => 2437198334.24
R2 => 0.58


In [None]:
# An R2 score of about 0.58 indicates a fair fit: the model explains roughly 58% of the variance in price, so it can be used to make approximate price predictions from the features used for training.