<center><h1>Housing Price Prediction (Boston Dataset)</h1></center>

<h4>
    Prerequisite Installations
</h4>

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install streamlit pandas numpy joblib scikit-learn seaborn math

In [None]:
# Ask the notebook front-end to autosave this notebook every 10 seconds.
%autosave 10

- This notebook builds machine learning models with the scikit-learn library. The models predict house prices: they take a set of housing features as inputs and output a predicted price.
- The dataset used is based on the 'Boston Housing dataset' available on Kaggle.
- Boston : <a href="https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset">https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset</a>

In [None]:
# Importing necessary Python libraries and dependencies.
import math
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

<h2>Phase One : Data Collection</h2>

In [None]:
# URL of the raw Boston Housing CSV hosted on GitHub; it is passed to pd.read_csv in the next cell.
dataset_link = 'https://raw.githubusercontent.com/bishnu9009/Popular-ML-Datasets/main/boston_housing.csv'

In [None]:
# Create the working DataFrame from the dataset using the pandas library.

# boston_df = pd.read_csv('./BostonHousing.csv')     # Uncomment this line if you have the dataset downloaded locally, and make sure the file
                                                     # name matches that of your downloaded dataset.
boston_df = pd.read_csv(dataset_link)

In [None]:
# Display the first ten rows of the Boston housing DataFrame.
print("Boston Housing Dataset first 10 Entries:\n")
display(boston_df.head(10))

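<h3>About the dataset : </h3>

Each row holds thirteen input features (crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat) and the target column medv, the median value of owner-occupied homes in $1000s.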

<h2>Phase Two : Data Preprocessing</h2>

In [None]:
# Descriptive statistics of the Boston DataFrame.
print("Descriptive Stats about Boston DF : \n")
display(boston_df.describe())

In [None]:
# Column names, dtypes, non-null counts and memory usage of the Boston DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
print("Information on Boston DF : \n")
boston_df.info()

In [None]:
# Count the missing (NULL / NaN) entries of every column in the Boston dataset.

print("For Boston Dataset : ")
# isnull().sum() yields one missing-value count per column, in column order.
for column, missing in boston_df.isnull().sum().items():
    print(f"Column {column} has {missing}")
print("Null/NaN value(s).")

From the above analysis we can see that:
- The Boston dataset contains 5 missing values in the 'rm' column.

Some considerations for the datasets : 

- The Boston dataset's 'rm' column is the average number of rooms per dwelling. Since it contains some missing values, we impute them with the column's mode (its most frequent value).

In [None]:
# Missing-value imputation for the 'rm' column of the Boston DataFrame.
# mode() returns a Series (several values can tie), so [0] picks the first mode value.
boston_rm_mode = boston_df['rm'].mode()[0]

# Assign the filled column back to the DataFrame. Calling fillna(..., inplace = True) on the
# boston_df['rm'] selection is a chained in-place operation: pandas 2.x emits a FutureWarning
# for it, and with Copy-on-Write enabled it no longer modifies boston_df at all.
boston_df['rm'] = boston_df['rm'].fillna(boston_rm_mode)

In [None]:
# Re-count the missing (NULL / NaN) entries of every column after imputation.

print("For Boston Dataset : ")
# isnull().sum() yields one missing-value count per column, in column order.
for column, missing in boston_df.isnull().sum().items():
    print(f"Column {column} has {missing}")
print("Null/NaN value(s).")

In [None]:
# Show the first seven rows of the Boston DataFrame.
boston_df.head(7)

In [None]:
# Normalisation and feature scaling for the Boston dataset.

# Separate the target column 'medv' from the remaining (predictor) columns.
y_boston = boston_df['medv']
X_boston = boston_df.drop(columns = 'medv')

# StandardScaler comes from scikit-learn; fit_transform learns the scaling from
# X_boston and returns the scaled feature matrix.
scaler = StandardScaler()
X_boston_scaled = scaler.fit_transform(X_boston)

In [None]:
# Data splitting for training and testing (Boston dataset).

# Split the *unscaled* features first and learn the scaling statistics from the training rows
# only. Fitting the scaler on the whole dataset before splitting lets the test rows influence
# the mean / standard deviation used to scale the training data (data leakage), which can make
# the test metrics look better than they really are.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_boston, y_boston, test_size = 0.2, random_state = 42)

# Refit the shared `scaler` on the training split and reuse those statistics for the test split.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

<h2>Phase Three : Model Development</h2>

In [None]:
# Linear Regression on the Boston dataset: create, fit, inspect and predict.

print("Creating the model")
linear_model = LinearRegression()

print("Fitting the model")
linear_model.fit(X_train, y_train)

# One fitted coefficient per input feature, labelled with the feature's column name.
coef_df = pd.DataFrame({'Coefficient': linear_model.coef_}, index = X_boston.columns)
print(coef_df)

# Predict the target for the held-out test features.
print("Predicting using the X_test")
y_pred_lm = linear_model.predict(X_test)

In [None]:
# Diagnostic plots for the linear regression predictions.

# Actual vs. predicted prices; the red diagonal is where predicted == actual.
fig, ax = plt.subplots(figsize = (10, 6))
ax.scatter(y_test, y_pred_lm, edgecolor = 'k', alpha = 0.7)
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, color = 'red', linewidth = 2)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs. Predicted Prices')
plt.show()

# Residuals (actual - predicted) against the predicted prices; the red line marks zero error.
residuals = y_test - y_pred_lm
fig, ax = plt.subplots(figsize = (10, 6))
ax.scatter(y_pred_lm, residuals, edgecolor = 'k', alpha = 0.7)
ax.axhline(y = 0, color = 'red', linewidth = 2)
ax.set_xlabel('Predicted Prices')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

In [None]:
# Random Forest regression on the Boston dataset: create, fit and predict.

print("Creating the model")
# A fixed random_state makes the randomness used while growing the trees repeatable, so the
# metrics, the plots and the saved model are the same on every "Restart & Run All".
random_regression = RandomForestRegressor(random_state = 42)

# Train the model
print("Fitting the model")
random_regression.fit(X_train, y_train)

# Predict the target for the held-out test features.
print("Predicting using the X_test")
y_pred_rf = random_regression.predict(X_test)

In [None]:
# Diagnostic plots for the Random Forest regression predictions.

# Actual vs. predicted prices; the red diagonal is where predicted == actual.
fig, ax = plt.subplots(figsize = (10, 6))
ax.scatter(y_test, y_pred_rf, edgecolor = 'k', alpha = 0.7)
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, color = 'red', linewidth = 2)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs. Predicted Prices')
plt.show()

# Residuals (actual - predicted) against the predicted prices; the red line marks zero error.
residuals = y_test - y_pred_rf
fig, ax = plt.subplots(figsize = (10, 6))
ax.scatter(y_pred_rf, residuals, edgecolor = 'k', alpha = 0.7)
ax.axhline(y = 0, color = 'red', linewidth = 2)
ax.set_xlabel('Predicted Prices')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

<h2>Phase Four : Model Evaluation</h2>

In [None]:
# Score the linear regression predictions on the held-out test split.
lm_mse = mean_squared_error(y_test, y_pred_lm)
lm_scores = {
    "Linear Model R2 Score": r2_score(y_test, y_pred_lm),
    "Mean Absolute Error (MAE)": mean_absolute_error(y_test, y_pred_lm),
    "Mean Squared Error (MSE)": lm_mse,
    "Root Mean Squared Error (RMSE)": math.sqrt(lm_mse),   # RMSE is the square root of the MSE
}

print("Performance Metrics for Linear Regression Model \n","--"*25)
for label, score in lm_scores.items():
    print(f"{label} : {score}")

In [None]:
# Score the Random Forest predictions on the held-out test split.
# Every value below is computed from y_pred_rf, so the R2 line is labelled for the
# Random Forest rather than for the linear model.
print("Performance Metrics for Random Forest Regressor Model \n","--"*25)
print(f"Random Forest R2 Score : {r2_score(y_test, y_pred_rf)}")
print(f"Mean Absolute Error (MAE) : {mean_absolute_error(y_test, y_pred_rf)}")
print(f"Mean Squared Error (MSE) : {mean_squared_error(y_test, y_pred_rf)}")
print(f"Root Mean Squared Error (RMSE) : {math.sqrt(mean_squared_error(y_test, y_pred_rf))}")

In [None]:
# Compute the Random Forest test-set scores and store them in metrics.pkl.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)                              # RMSE is the square root of the MSE
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

# Keys are the names under which each score is stored in the pickle.
metrics = dict(MAE = mae, MSE = mse, RMSE = rmse, R2 = r2)
joblib.dump(metrics, 'metrics.pkl')

<h3>Since we can observe that our 'Random Forest Regressor' model performs better than the 'Linear Regression' model, we use it as our working model.</h3>

<h2>Phase Five : Model Based Prediction</h2>

<h3>The code below is for an application developed using Streamlit.</h3>

In [None]:
# Use joblib to store the Random Forest model for prediction in the Streamlit app.
# The forest was trained on standardised features, while the app passes raw feature values
# (slider values bounded by the raw column min / max) straight to model.predict(). Saving the
# fitted scaler together with the fitted forest as one pipeline makes the loaded object apply
# the same scaling before predicting. Both steps are already fitted, so the pipeline is only
# used for prediction and is not re-fitted here.
prediction_pipeline = make_pipeline(scaler, random_regression)
joblib.dump(prediction_pipeline, 'random_forest_model.pkl')
print("File Created and Saved Sucessfully")

In [None]:
# Build the two-panel performance figure that is saved for the application.
fig, (ax_fit, ax_err) = plt.subplots(1, 2, figsize = (10, 5))

# Left panel: actual vs. predicted prices with a red regression line.
sns.regplot(x = y_test, y = y_pred_rf, line_kws = {"color": "red"}, ax = ax_fit)
ax_fit.set_xlabel('Actual Prices')
ax_fit.set_ylabel('Predicted Prices')
ax_fit.set_title('Actual vs Predicted Prices')

# Right panel: distribution of the prediction error (predicted - actual) with a KDE curve.
error = y_pred_rf - y_test
sns.histplot(error, kde = True, ax = ax_err)
ax_err.set_xlabel('Prediction Error')
ax_err.set_title('Prediction Error Distribution')

fig.tight_layout()
fig.savefig('model_performance.png')

In [None]:
# Record the observed (min, max) of every input feature, keyed by column name.
feature_columns = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]
feature_min_max = {
    name: (boston_df[name].min(), boston_df[name].max())
    for name in feature_columns
}

# Save the feature min and max values
joblib.dump(feature_min_max, 'feature_min_max.pkl')

In [None]:
%%writefile app.py
import streamlit as st
import numpy as np
import joblib

# Load the trained model and feature min/max values
model = joblib.load('random_forest_model.pkl')
feature_min_max = joblib.load('feature_min_max.pkl')

st.set_page_config(page_title="Housing Price Prediction App", page_icon = ":house:",layout = "wide")

st.title('Boston Housing Price Prediction')

st.sidebar.header('Input Features')

# Function to get user input for prediction
def user_input_features():
    crim = st.sidebar.slider('crim', feature_min_max['crim'][0], feature_min_max['crim'][1], value=feature_min_max['crim'][0])
    zn = st.sidebar.slider('zn', feature_min_max['zn'][0], feature_min_max['zn'][1], value=feature_min_max['zn'][0])
    indus = st.sidebar.slider('indus', feature_min_max['indus'][0], feature_min_max['indus'][1], value=feature_min_max['indus'][0])
    chas = st.sidebar.selectbox('chas', [0, 1])
    nox = st.sidebar.slider('nox', feature_min_max['nox'][0], feature_min_max['nox'][1], value=feature_min_max['nox'][0])
    rm = st.sidebar.slider('rm', feature_min_max['rm'][0], feature_min_max['rm'][1], value=feature_min_max['rm'][0])
    age = st.sidebar.slider('age', feature_min_max['age'][0], feature_min_max['age'][1], value=feature_min_max['age'][0])
    dis = st.sidebar.slider('dis', feature_min_max['dis'][0], feature_min_max['dis'][1], value=feature_min_max['dis'][0])
    rad = st.sidebar.slider('rad', feature_min_max['rad'][0], feature_min_max['rad'][1], value=feature_min_max['rad'][0])
    tax = st.sidebar.slider('tax', feature_min_max['tax'][0], feature_min_max['tax'][1], value=feature_min_max['tax'][0])
    ptratio = st.sidebar.slider('ptratio', feature_min_max['ptratio'][0], feature_min_max['ptratio'][1], value=feature_min_max['ptratio'][0])
    b = st.sidebar.slider('b', feature_min_max['b'][0], feature_min_max['b'][1], value=feature_min_max['b'][0])
    lstat = st.sidebar.slider('lstat', feature_min_max['lstat'][0], feature_min_max['lstat'][1], value=feature_min_max['lstat'][0])
    
    data = {
        'crim': crim,
        'zn': zn,
        'indus': indus,
        'chas': chas,
        'nox': nox,
        'rm': rm,
        'age': age,
        'dis': dis,
        'rad': rad,
        'tax': tax,
        'ptratio': ptratio,
        'b': b,
        'lstat': lstat
    }
    
    features = np.array([list(data.values())])
    return features

input_df = user_input_features()

# Predict button
if st.sidebar.button('Predict'):
    # Make prediction
    prediction = model.predict(input_df)
    
    st.subheader('Prediction')
    st.write(f'The predicted median value of owner-occupied homes in $1000s is: {prediction[0]:.2f}')
    
# Display model performance metrics
st.subheader('Model Performance Metrics')
metrics = joblib.load('metrics.pkl')
st.write(f"Mean Absolute Error (MAE): {metrics['MAE']:.2f}")
st.write(f"Mean Squared Error (MSE): {metrics['MSE']:.2f}")
st.write(f"Root Mean Squared Error (RMSE): {metrics['RMSE']:.2f}")
st.write(f"R-squared (R2): {metrics['R2']:.2f}")

# Display model performance plots
st.subheader('Model Performance Plots')
st.image('model_performance.png')

In [None]:
# The running application can be stopped by clicking on this cell and pressing "I" twice ("I, I"), or by stopping the kernel of the .ipynb file.
!streamlit run app.py