In [636]:
# Importing pandas for data manipulation and analysis
import pandas as pd

In [637]:
# Importing numpy for numerical operations
import numpy as np

In [638]:
# Importing model selection utilities:
# - train_test_split to split data into training and testing sets
# - cross_val_score to evaluate models with cross-validation
# - GridSearchCV for hyperparameter tuning using grid search with cross-validation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [639]:
# Importing preprocessing tools:
# - StandardScaler to normalize numerical features
# - OneHotEncoder to convert categorical features into one-hot encoded format
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [640]:
# ColumnTransformer allows applying different preprocessing steps to different columns
from sklearn.compose import ColumnTransformer

In [641]:
# Pipeline is used to chain preprocessing and modeling steps into one object
from sklearn.pipeline import Pipeline

In [642]:
# Importing regression models:
# - LinearRegression for simple linear modeling
# - Ridge for linear regression with L2 regularization
from sklearn.linear_model import LinearRegression, Ridge

In [643]:
# RandomForestRegressor is an ensemble model that builds multiple decision trees for regression
from sklearn.ensemble import RandomForestRegressor

In [644]:
# Importing performance metrics for regression models:
# - mean_absolute_error: average absolute difference between predicted and actual values
# - r2_score: proportion of variance explained by the model
# - mean_squared_error: average squared difference between predicted and actual values
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [645]:
# Importing XGBoost, a powerful gradient boosting library for regression/classification
import xgboost as xgb

In [646]:
# Plotly Express and Graph Objects for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go

In [647]:
# Joblib is used to save and load Python objects (e.g., trained models)
import joblib

In [648]:
# Suppress all warnings to keep the output clean (useful in notebooks or production)
import warnings
warnings.filterwarnings('ignore')

In [649]:
# Step 1: Load and explore the dataset
data = pd.read_csv('/kaggle/input/house-price/house_prices.csv')

In [650]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Index              187531 non-null  int64  
 1   Title              187531 non-null  object 
 2   Description        184508 non-null  object 
 3   Amount(in rupees)  187531 non-null  object 
 4   Price (in rupees)  169866 non-null  float64
 5   location           187531 non-null  object 
 6   Carpet Area        106858 non-null  object 
 7   Status             186916 non-null  object 
 8   Floor              180454 non-null  object 
 9   Transaction        187448 non-null  object 
 10  Furnishing         184634 non-null  object 
 11  facing             117298 non-null  object 
 12  overlooking        106095 non-null  object 
 13  Society            77853 non-null   object 
 14  Bathroom           186703 non-null  object 
 15  Balcony            138596 non-null  object 
 16  Ca

In [651]:
pd.set_option("display.max_columns", 500)
data.head(5)

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,Furnishing,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
0,0,1 BHK Ready to Occupy Flat for sale in Srushti...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,Unfurnished,,,Srushti Siddhi Mangal Murti Complex,1,2.0,,,,,
1,1,2 BHK Ready to Occupy Flat for sale in Dosti V...,One can find this stunning 2 BHK flat for sale...,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,Dosti Vihar,2,,1 Open,Freehold,,,
2,2,2 BHK Ready to Occupy Flat for sale in Sunrise...,Up for immediate sale is a 2 BHK apartment in ...,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,Unfurnished,East,Garden/Park,Sunrise by Kalpataru,2,,1 Covered,Freehold,,,
3,3,1 BHK Ready to Occupy Flat for sale Kasheli,This beautiful 1 BHK Flat is available for sal...,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,Unfurnished,,,,1,1.0,,,,,
4,4,2 BHK Ready to Occupy Flat for sale in TenX Ha...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",TenX Habitat Raymond Realty,2,,1 Covered,Co-operative Society,,,


In [652]:
data.describe()

Unnamed: 0,Index,Price (in rupees),Dimensions,Plot Area
count,187531.0,169866.0,0.0,0.0
mean,93765.0,7583.772,,
std,54135.681003,27241.71,,
min,0.0,0.0,,
25%,46882.5,4297.0,,
50%,93765.0,6034.0,,
75%,140647.5,9450.0,,
max,187530.0,6700000.0,,


In [653]:
data.isnull().sum()

Index                     0
Title                     0
Description            3023
Amount(in rupees)         0
Price (in rupees)     17665
location                  0
Carpet Area           80673
Status                  615
Floor                  7077
Transaction              83
Furnishing             2897
facing                70233
overlooking           81436
Society              109678
Bathroom                828
Balcony               48935
Car Parking          103357
Ownership             65517
Super Area           107685
Dimensions           187531
Plot Area            187531
dtype: int64

In [654]:
data.shape

(187531, 21)

In [655]:
data = data.drop(columns=['Plot Area', 'Dimensions'])

In [656]:
data.shape

(187531, 19)

In [657]:
# Drop columns with too many missing values (>50% missing) and not useful Columns
data = data.drop(columns=['Society', 'Description', 'Index','Title','Car Parking', 'Super Area']) 

In [658]:
data.shape

(187531, 13)

In [659]:
data.head(5)

Unnamed: 0,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership
0,42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,Unfurnished,,,1,2.0,
1,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,2,,Freehold
2,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,Unfurnished,East,Garden/Park,2,,Freehold
3,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,Unfurnished,,,1,1.0,
4,1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",2,,Co-operative Society


In [660]:
# Rename target column
data = data.rename(columns={'Amount(in rupees)': 'price'})

In [661]:
data.head(5)

Unnamed: 0,price,Price (in rupees),location,Carpet Area,Status,Floor,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership
0,42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,Unfurnished,,,1,2.0,
1,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,2,,Freehold
2,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,Unfurnished,East,Garden/Park,2,,Freehold
3,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,Unfurnished,,,1,1.0,
4,1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",2,,Co-operative Society


In [662]:
data.price.isnull().sum()

0

# **🔹 Convert Carpet Area to numeric:**

In [663]:
#Convert Carpet Area to numeric:
data['Carpet Area'] = data['Carpet Area'].str.extract(r'(\d+\.?\d*)')  # Extract numeric part
data['Carpet Area'] = pd.to_numeric(data['Carpet Area'], errors='coerce')  # Convert to float

In [664]:
data['Carpet Area']

0          500.0
1          473.0
2          779.0
3          530.0
4          635.0
           ...  
187526       NaN
187527       NaN
187528    1250.0
187529       NaN
187530       NaN
Name: Carpet Area, Length: 187531, dtype: float64

In [665]:
data['Carpet Area'].isnull().sum()

80673

In [666]:
#80673 missing values , Best Practice: Group-wise Median Imputation

In [667]:
# Group by location and bathroom count (or just location)
data['Carpet Area'] = data.groupby(['location', 'Bathroom'])['Carpet Area'].transform(
    lambda x: x.fillna(x.median())
)

In [668]:
data['Carpet Area'].isnull().sum()

920

In [669]:
data['Carpet Area'] = data['Carpet Area'].fillna(data['Carpet Area'].median())

In [670]:
data['Carpet Area'].isnull().sum()

0

In [671]:
data['Carpet Area']

0          500.0
1          473.0
2          779.0
3          530.0
4          635.0
           ...  
187526    1300.0
187527    1300.0
187528    1250.0
187529     950.0
187530    1650.0
Name: Carpet Area, Length: 187531, dtype: float64

# **🔹 Extract Floor Info:**

In [672]:
data[['Floor Number', 'Total Floors']] = data['Floor'].str.extract(r'(\d+)\s*out\s*of\s*(\d+)', expand=True).astype(float)

In [673]:
data[['Floor Number', 'Total Floors']]

Unnamed: 0,Floor Number,Total Floors
0,10.0,11.0
1,3.0,22.0
2,10.0,29.0
3,1.0,3.0
4,20.0,42.0
...,...,...
187526,2.0,4.0
187527,4.0,6.0
187528,1.0,3.0
187529,2.0,2.0


In [674]:
data[['Floor Number', 'Total Floors']].isnull().sum()

Floor Number    19837
Total Floors    19837
dtype: int64

In [675]:
#Best way to handle missing values in Floor Number and Total Floors:
#Fill missing Floor Number and Total Floors with group-wise median (optional)
data['Floor Number'] = data.groupby('location')['Floor Number'].transform(lambda x: x.fillna(x.median()))
data['Total Floors'] = data.groupby('location')['Total Floors'].transform(lambda x: x.fillna(x.median()))

In [676]:
data[['Floor Number', 'Total Floors']].isnull().sum()

Floor Number    0
Total Floors    0
dtype: int64

# **🔹 Convert Bathroom and Balcony to numeric:**

In [677]:
data['Bathroom'] = pd.to_numeric(data['Bathroom'], errors='coerce')
data['Balcony'] = pd.to_numeric(data['Balcony'], errors='coerce')

In [678]:
data[['Bathroom','Balcony']].isnull().sum()

Bathroom      863
Balcony     48957
dtype: int64

In [679]:
data[['Bathroom','Balcony']]

Unnamed: 0,Bathroom,Balcony
0,1.0,2.0
1,2.0,
2,2.0,
3,1.0,1.0
4,2.0,
...,...,...
187526,3.0,3.0
187527,3.0,
187528,3.0,2.0
187529,2.0,


In [680]:
# Handling Missing Bathroom & Balcony Values:
data['Bathroom'] = data.groupby('location')['Bathroom'].transform(lambda x: x.fillna(x.median()))
data['Balcony'] = data.groupby('location')['Balcony'].transform(lambda x: x.fillna(x.median()))

In [681]:
data[['Bathroom','Balcony']].isnull().sum()

Bathroom    0
Balcony     0
dtype: int64

# **Encode Categorical Columns**

In [682]:
data.head()

Unnamed: 0,price,Price (in rupees),location,Carpet Area,Status,Floor,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership,Floor Number,Total Floors
0,42 Lac,6000.0,thane,500.0,Ready to Move,10 out of 11,Resale,Unfurnished,,,1.0,2.0,,10.0,11.0
1,98 Lac,13799.0,thane,473.0,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,2.0,1.0,Freehold,3.0,22.0
2,1.40 Cr,17500.0,thane,779.0,Ready to Move,10 out of 29,Resale,Unfurnished,East,Garden/Park,2.0,1.0,Freehold,10.0,29.0
3,25 Lac,,thane,530.0,Ready to Move,1 out of 3,Resale,Unfurnished,,,1.0,1.0,,1.0,3.0
4,1.60 Cr,18824.0,thane,635.0,Ready to Move,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",2.0,1.0,Co-operative Society,20.0,42.0


In [683]:
data[['location', 'Status', 'Transaction', 'Furnishing', 'facing', 'overlooking', 'Ownership']].isnull().sum()

location           0
Status           615
Transaction       83
Furnishing      2897
facing         70233
overlooking    81436
Ownership      65517
dtype: int64

In [684]:
categorical_cols = ['location', 'Status', 'Transaction', 'Furnishing', 'facing', 'overlooking', 'Ownership']

In [685]:
data[categorical_cols] = data[categorical_cols].fillna('Unknown')

In [686]:
data[['location', 'Status', 'Transaction', 'Furnishing', 'facing', 'overlooking', 'Ownership']].isnull().sum()

location       0
Status         0
Transaction    0
Furnishing     0
facing         0
overlooking    0
Ownership      0
dtype: int64

In [687]:
data.head(5)

Unnamed: 0,price,Price (in rupees),location,Carpet Area,Status,Floor,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership,Floor Number,Total Floors
0,42 Lac,6000.0,thane,500.0,Ready to Move,10 out of 11,Resale,Unfurnished,Unknown,Unknown,1.0,2.0,Unknown,10.0,11.0
1,98 Lac,13799.0,thane,473.0,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,2.0,1.0,Freehold,3.0,22.0
2,1.40 Cr,17500.0,thane,779.0,Ready to Move,10 out of 29,Resale,Unfurnished,East,Garden/Park,2.0,1.0,Freehold,10.0,29.0
3,25 Lac,,thane,530.0,Ready to Move,1 out of 3,Resale,Unfurnished,Unknown,Unknown,1.0,1.0,Unknown,1.0,3.0
4,1.60 Cr,18824.0,thane,635.0,Ready to Move,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",2.0,1.0,Co-operative Society,20.0,42.0


In [688]:
data.isnull().sum()

price                    0
Price (in rupees)    17665
location                 0
Carpet Area              0
Status                   0
Floor                 7077
Transaction              0
Furnishing               0
facing                   0
overlooking              0
Bathroom                 0
Balcony                  0
Ownership                0
Floor Number             0
Total Floors             0
dtype: int64

In [689]:
data = data.dropna(subset=['Floor'])
data = data.dropna(subset=['Price (in rupees)'])

In [690]:
data.isnull().sum()

price                0
Price (in rupees)    0
location             0
Carpet Area          0
Status               0
Floor                0
Transaction          0
Furnishing           0
facing               0
overlooking          0
Bathroom             0
Balcony              0
Ownership            0
Floor Number         0
Total Floors         0
dtype: int64

In [691]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 163023 entries, 0 to 187530
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   price              163023 non-null  object 
 1   Price (in rupees)  163023 non-null  float64
 2   location           163023 non-null  object 
 3   Carpet Area        163023 non-null  float64
 4   Status             163023 non-null  object 
 5   Floor              163023 non-null  object 
 6   Transaction        163023 non-null  object 
 7   Furnishing         163023 non-null  object 
 8   facing             163023 non-null  object 
 9   overlooking        163023 non-null  object 
 10  Bathroom           163023 non-null  float64
 11  Balcony            163023 non-null  float64
 12  Ownership          163023 non-null  object 
 13  Floor Number       163023 non-null  float64
 14  Total Floors       163023 non-null  float64
dtypes: float64(6), object(9)
memory usage: 19.9+ MB


In [692]:
data=data.drop(columns=["price"])

In [693]:
data=data.drop(columns=["Floor"])

In [694]:
data.head(5)

Unnamed: 0,Price (in rupees),location,Carpet Area,Status,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership,Floor Number,Total Floors
0,6000.0,thane,500.0,Ready to Move,Resale,Unfurnished,Unknown,Unknown,1.0,2.0,Unknown,10.0,11.0
1,13799.0,thane,473.0,Ready to Move,Resale,Semi-Furnished,East,Garden/Park,2.0,1.0,Freehold,3.0,22.0
2,17500.0,thane,779.0,Ready to Move,Resale,Unfurnished,East,Garden/Park,2.0,1.0,Freehold,10.0,29.0
4,18824.0,thane,635.0,Ready to Move,Resale,Unfurnished,West,"Garden/Park, Main Road",2.0,1.0,Co-operative Society,20.0,42.0
5,6618.0,thane,450.0,Ready to Move,Resale,Unfurnished,East,"Garden/Park, Main Road",1.0,1.0,Co-operative Society,2.0,7.0


In [695]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load your data

# Drop duplicate or unnecessary columns

# Define target and features
target = 'Price (in rupees)'
X = data.drop(columns=[target])
y = data[target]

# List of categorical columns (based on your schema)
categorical_cols = ['location', 'Status', 'Transaction',
                    'Furnishing', 'facing', 'overlooking', 'Ownership']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize CatBoostRegressor
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    verbose=100,
    random_seed=42
)

# Fit model
model.fit(X_train, y_train, cat_features=categorical_cols)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.4f}")


0:	learn: 24351.3790373	total: 89.2ms	remaining: 1m 29s
100:	learn: 20521.6860061	total: 6.59s	remaining: 58.7s
200:	learn: 18123.5918014	total: 13.7s	remaining: 54.4s
300:	learn: 16910.1305045	total: 19.9s	remaining: 46.1s
400:	learn: 15807.5632472	total: 25.6s	remaining: 38.2s
500:	learn: 13937.5337583	total: 31s	remaining: 30.9s
600:	learn: 13354.1693640	total: 36.1s	remaining: 23.9s
700:	learn: 13095.5274397	total: 41.7s	remaining: 17.8s
800:	learn: 12897.2969635	total: 48.2s	remaining: 12s
900:	learn: 12502.9791897	total: 54.7s	remaining: 6s
999:	learn: 12002.1259100	total: 1m 1s	remaining: 0us
Mean Squared Error: 1510874272.97
R² Score: -0.0215


# **POOR MODEL**