In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
data=pd.read_csv("P:\\projects\\house price prediction\\melb_data.csv")


In [None]:
data.head(5)


In [None]:
data.describe()


In [None]:
data.info()


-----------------------------------------------------------------------------------------------------------------------------------------------------

Data Preprocessing

-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
data.isna().sum()


In [None]:
data.nunique()


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iterative_imp = IterativeImputer()
data[['BuildingArea']] = iterative_imp.fit_transform(data[['BuildingArea']])

data[['YearBuilt']]=iterative_imp.fit_transform(data[['YearBuilt']])

data[['Car']] = iterative_imp.fit_transform(data[['Car']])


In [None]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='constant',fill_value='Unknown')
data[['CouncilArea']] = imp.fit_transform(data[['CouncilArea']])

In [None]:
data.isna().sum()

Dataset had been cleaned , pre processed and do not contain any missing value

------------------------------------------------------------------------------------------------------------------------------------------

Data Visualization

-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#Creating graphs between different columns to show the relationship between them

In [None]:
# creating numerical_dataset for heatmap
corr_graph = data.select_dtypes(include=['number'])
sns.heatmap(corr_graph.corr(),vmin=None, vmax=None,fmt='.1f',linewidths=2)
plt.figure(figsize=(5,6))
plt.show()


In [None]:
def visualize_outliers(data,column,bins=50,fliersize=10,kde=True):

    fig, axes = plt.subplots(2, 1, figsize=(10, 8))
    
    # Box Plot
    sns.boxplot(x=data[column], ax=axes[0], fliersize=fliersize)
    axes[0].set_title(f'Outliers in {column}')
    
    # Histogram
    sns.histplot(x=data[column], bins=bins, kde=kde, ax=axes[1])
    axes[1].set_title(f'Histogram for {column}')
    
    plt.tight_layout()
    plt.show()
    
visualize_outliers(data,'BuildingArea')

def skew_data(column):
    print(data[column].skew()) # >0 = Right Skewed, <0 = Left Skewed

skew_data('BuildingArea')

In [None]:
#plot for no. of houses in each council

sns.countplot(x=data['CouncilArea'])
plt.xticks(rotation=90,ha='right')
plt.title('Distribution of Houses by Council Area')
plt.show()


In [None]:

plt.figure(figsize=(5, 3))
sns.countplot(data=data, x="Type", palette="Set1")  

print('Total Values in column "Type" :', data['Type'].count())
print('Total unique values in column "Type" :',data['Type'].nunique())
print()

plt.xlabel("Property Type")
plt.ylabel("Count")
plt.title("Distribution of Property Types")
plt.show()


In [None]:
#price depends on rooms,type,no. of bathrooms
def scatter_plotting(data,column1,column2,style=None,hue=None,palette='coolwarm'):
    plt.figure(figsize=(20,10))
    sns.scatterplot(data=data,x=column1,y=column2,hue=hue,style=style,palette=palette)
    plt.show()

scatter_plotting(data,'Rooms','Price','Type','Bathroom','viridis')

In [None]:
#graph on how houses scatterd in between lattitude and longitude
plt.figure(figsize=(12, 6))
sns.scatterplot(x=data['Lattitude'], y=data['Longtitude'], hue=data['Price'], palette='coolwarm')
plt.title('House Prices by Location')
plt.show()


In [None]:
#Line Chart: To analyze trends over time.
data = data.sort_values(by="Date")

print('Total Values in column "Date" :', data['Date'].count())
print('Total unique values in column "Date" :',data['Date'].nunique())
print('Total Values in column "Price" :', data['Price'].count())
print('Total unique values in column "Price" :',data['Price'].nunique())
print()

plt.figure(figsize=(20,10))
sns.lineplot(data=data,x='Date',y='Price',marker='o',color='blue')
plt.grid()
plt.xticks(rotation=90)
plt.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------

Training, Testing and Prediction 

---------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.model_selection import train_test_split

X = data[["Rooms", "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea", 
          "YearBuilt", "Distance", "Lattitude", "Longtitude", "Propertycount"]]

y = data["Price"]  


print("X shape:", X.shape)
print("y shape:", y.shape)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shapes of the split datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor

models = {'DecisionTree':(DecisionTreeRegressor(),{'max_depth': [5,15,22]}), 
          'RandomForest':(RandomForestRegressor(),{'n_estimators':[50, 100] ,'max_depth':[5, 10]})}

for name,(model,params) in models.items():
    grid=GridSearchCV(model, params, cv=4, scoring='r2')
    grid.fit(X_train, y_train)
    print(f"{name} Best Params: {grid.best_params_}")
    print(f"{name} R² Score: {grid.best_score_:.4f}\n")


In [53]:
#fiting hte data in random forest

randomforest = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=42)

randomforest.fit(X_train, y_train)

In [54]:
#checking model accuracy using different accuracy metrics

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

r2_check = r2_score(y_test, y_pred)

mae = mean_absolute_error(y_test, y_pred)

rmse = np.sqrt(mean_absolute_error(y_test,y_pred))


print(f"r2 metrics score: {r2_check:.4f}")
print()
print(f"Mean Absolute Error: {mae:.4f}")
print()
print(f"Squared Mean Absolute Error: {rmse:4f}")
print()


r2 metrics score: 0.7669

Mean Absolute Error: 187897.6314

Squared Mean Absolute Error: 433.471604



In [55]:
# saving model

import joblib

joblib.dump(randomforest, "house_price_model.pkl")

loaded_model = joblib.load("house_price_model.pkl")


-------------------------------------------------------------------------------------------------------------------------------------------------------

In [61]:
import warnings
warnings.filterwarnings("ignore")  # Correct way to suppress warnings

# Ensure input matches the trained model's expected format
custom_input = [[3, 4, 3, 2, 870.0, 152.45600, 
                 2013, 9.8, -47.77615, 165.08907, 6575.000]]

#[["Rooms", "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea", 
         # "YearBuilt", "Distance", "Lattitude", "Longtitude", "Propertycount"]]

# Make prediction
custom_prediction = randomforest.predict(custom_input)

# Display result
print(f"Predicted House Price: {custom_prediction[0]:.5f}")


Predicted House Price: 1702252.45899
