<a href="https://colab.research.google.com/github/abelcodes501/Machine-Learning-Regression-Models-using-House-Price-Dataset/blob/main/House_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Price Prediction

In [1]:
import pandas as pd
import numpy as np



# Load the Dataset

In [2]:
df = pd.read_csv("Housing Price.csv")
df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


# Pre‑Processing (One‑Hot Encoding)

We separate numerical and categorical features.

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df_copy = df.copy()

categorical_cols = ["mainroad","guestroom","basement","hotwaterheating",
                    "airconditioning","prefarea","furnishingstatus"]

numeric_cols = ["area","bedrooms","bathrooms","stories","parking"]

preprocess = ColumnTransformer([
    ("onehot", OneHotEncoder(drop="first"), categorical_cols)
], remainder="passthrough")

# Select Important Features



In [4]:
important_features = [
    "area", "bedrooms", "bathrooms", "stories", "parking",
    "mainroad","airconditioning","furnishingstatus"
]

df_imp = df_copy[["price"] + important_features]

categorical_cols = ["mainroad","airconditioning","furnishingstatus"]
numeric_cols = ["area","bedrooms","bathrooms","stories","parking"]

preprocess = ColumnTransformer([
    ("onehot", OneHotEncoder(drop="first"), categorical_cols)
], remainder="passthrough")


# Split into Train & Test (Unseen Data)

In [6]:
from sklearn.model_selection import train_test_split

X = df_imp.drop("price", axis=1)
y = df_imp["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


**MODEL TRAINING**

---




#Simple Linear Regression (1 Feature Only)

In [8]:
from sklearn.linear_model import LinearRegression

X_train_simple = X_train[["area"]]
X_test_simple = X_test[["area"]]

simple_lr = LinearRegression()
simple_lr.fit(X_train_simple, y_train)

y_pred_simple = simple_lr.predict(X_test_simple)

 # Multiple Linear Regression

In [10]:
from sklearn.pipeline import Pipeline

multi_lr = Pipeline([
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

multi_lr.fit(X_train, y_train)
y_pred_multi = multi_lr.predict(X_test)

 # Polynomial Regression (Degree 2)**bold text**

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

poly_model = Pipeline([
    ("preprocess", preprocess),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", LinearRegression())
])

poly_model.fit(X_train, y_train)
y_pred_poly = poly_model.predict(X_test)

# KNN Regression

In [15]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline # Import Pipeline if it's not already globally imported or in the same cell

knn = Pipeline([
    ("preprocess", preprocess),
    ("model", KNeighborsRegressor(n_neighbors=5))
])

knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Model Evaluation

Function to calculate metrics

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate(y_test, y_pred):
    mae  = mean_absolute_error(y_test, y_pred)
    mse  = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_test, y_pred)
    return mae, mse, rmse, r2

In [23]:
results = {
    "Simple Linear Regression": evaluate(y_test, y_pred_simple),
    "Multiple Linear Regression": evaluate(y_test, y_pred_multi),
    "Polynomial Regression": evaluate(y_test, y_pred_poly),
    "KNN Regression": evaluate(y_test, y_pred_knn)
}

results


{'Simple Linear Regression': (1474748.1337969352,
  3675286604768.185,
  np.float64(1917103.7021424233),
  0.27287851871974644),
 'Multiple Linear Regression': (1040832.4738026386,
  1969639146224.6365,
  np.float64(1403438.3300397051),
  0.6103251017941324),
 'Polynomial Regression': (1029666.0112659814,
  2104357168857.339,
  np.float64(1450640.2616973442),
  0.5836723863175346),
 'KNN Regression': (1337276.1467889908,
  3281002836422.0186,
  np.float64(1811353.8683598018),
  0.3508839176218841)}

  Final Summary

In [22]:
for model, (mae, mse, rmse, r2) in results.items():
    print(f"\n {model}")
    print(f"MAE  : {mae}")
    print(f"MSE  : {mse}")
    print(f"RMSE : {rmse}")
    print(f"R2   : {r2}")



 Simple Linear Regression
MAE  : 1474748.1337969352
MSE  : 3675286604768.185
RMSE : 1917103.7021424233
R2   : 0.27287851871974644

 Multiple Linear Regression
MAE  : 1040832.4738026386
MSE  : 1969639146224.6365
RMSE : 1403438.3300397051
R2   : 0.6103251017941324

 Polynomial Regression
MAE  : 1029666.0112659814
MSE  : 2104357168857.339
RMSE : 1450640.2616973442
R2   : 0.5836723863175346

 KNN Regression
MAE  : 1337276.1467889908
MSE  : 3281002836422.0186
RMSE : 1811353.8683598018
R2   : 0.3508839176218841
