## Delhi House Price Predition

In [1]:
import pandas as pd
import numpy as np

### Load the data

In [3]:
data = pd.read_csv("Delhi_v2.csv")

### Data info

In [39]:
data.head()

Unnamed: 0,price,Address,area,latitude,longitude,Bedrooms,Bathrooms,Status,neworold,type_of_building,Price_sqft
0,5600000.0,"Noida Extension, Noida, Delhi NCR",1350.0,28.60885,77.46056,3.0,3.0,Under Construction,New Property,Flat,4148.148148
1,8800000.0,"Sector 79, Gurgaon, Delhi NCR",1490.0,28.374236,76.952416,3.0,3.0,Ready to Move,New Property,Flat,5906.040268
2,16500000.0,"Vaishali, Ghaziabad, Delhi NCR",2385.0,28.645769,77.38511,4.0,5.0,Ready to Move,New Property,Flat,6918.238994
3,3810000.0,"Link Road, F Block, Sector 50, Noida, Uttar Pr...",1050.0,28.566914,77.436434,2.0,2.0,Ready to Move,New Property,Flat,3628.571429
4,6200000.0,"Jaypee Pavilion Court Sector 128, Noida, Secto...",1350.0,28.520732,77.356491,2.0,2.0,Ready to Move,Resale,Flat,4592.592593


In [9]:
data.shape

(7738, 18)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7738 entries, 0 to 7737
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        7738 non-null   int64  
 1   price             7738 non-null   float64
 2   Address           7738 non-null   object 
 3   area              7738 non-null   float64
 4   latitude          7738 non-null   float64
 5   longitude         7738 non-null   float64
 6   Bedrooms          7738 non-null   float64
 7   Bathrooms         7738 non-null   float64
 8   Balcony           5166 non-null   float64
 9   Status            7164 non-null   object 
 10  neworold          7738 non-null   object 
 11  parking           2612 non-null   float64
 12  Furnished_status  4124 non-null   object 
 13  Lift              1733 non-null   float64
 14  Landmarks         2759 non-null   object 
 15  type_of_building  7738 non-null   object 
 16  desc              7738 non-null   object 


### Chekcing for missing values

In [37]:
print(data.isnull().sum())

price               0
Address             0
area                0
latitude            0
longitude           0
Bedrooms            0
Bathrooms           0
Status              0
neworold            0
type_of_building    0
Price_sqft          0
dtype: int64


### Dropping irrelevant columns

In [15]:
data.drop(["Unnamed: 0", "desc", "Lift", "Landmarks", "parking", "Furnished_status", "Balcony"], axis=1, inplace=True)


In [19]:
data.shape

(7738, 11)

### Handling Missing values in one column only -> Status

In [27]:
print(data["Status"].unique())



['Under Construction' 'Ready to Move' nan]


In [29]:
print(data["Status"].value_counts())


Status
Ready to Move         7148
Under Construction      16
Name: count, dtype: int64


In [35]:
data["Status"].fillna("Ready to Move", inplace=True)


### Describing the data now

In [50]:
data.describe()

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Price_sqft
count,7738.0,7738.0,7738.0,7738.0,7738.0,7738.0,7738.0
mean,8320635.0,1409.506591,28.552092,77.273476,2.708193,2.501163,5543.660241
std,7223197.0,718.929581,0.10742,0.180606,0.877026,0.86705,2408.659307
min,1700000.0,501.0,28.240023,76.884101,2.0,2.0,2100.0
25%,4200000.0,990.0,28.455539,77.07859,2.0,2.0,3950.986915
50%,6000000.0,1250.0,28.574637,77.34532,3.0,2.0,4972.674332
75%,9500000.0,1650.0,28.64252,77.421054,3.0,3.0,6350.638693
max,85000000.0,9500.0,28.799748,77.688028,10.0,10.0,44378.698225


In [60]:
print(data.duplicated().sum())  # Should be 0 ideally


0


In [58]:
data.drop_duplicates(inplace=True)


In [76]:
print(data.dtypes)


price                                float64
area                                 float64
latitude                             float64
longitude                            float64
Bedrooms                             float64
Bathrooms                            float64
Status                                 int64
neworold                               int64
Price_sqft                           float64
type_of_building_Individual House       bool
dtype: object


In [64]:
data.drop(columns=["Address"], inplace=True)


### Numerical Encoding (Label Encoding)
#### column status and neworold-> converting them to numerical data

In [68]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data["Status"] = le.fit_transform(data["Status"])  # Converts "Ready to Move" → 0, "Under Construction" → 1
data["neworold"] = le.fit_transform(data["neworold"])  # Converts categories to 0/1


### converting type_of_building column 
#### Since type_of_building has multiple unique categories
#### so we cannot use Label Encoding because it would treat "Villa" > "Apartment", which makes no sense.

### 👉 Instead, we use One-Hot Encoding:

In [74]:
data = pd.get_dummies(data, columns=["type_of_building"], drop_first=True)


In [91]:
print(data.dtypes)


price                                float64
area                                 float64
latitude                             float64
longitude                            float64
Bedrooms                             float64
Bathrooms                            float64
Status                                 int64
neworold                               int64
Price_sqft                           float64
type_of_building_Individual House      int64
dtype: object


In [89]:
data["type_of_building_Individual House"] = data["type_of_building_Individual House"].astype(int)


## Split the data into training set and test set

In [94]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = data.drop(columns=["price"])  # Drop 'price' since it's what we want to predict
y = data["price"]  # Target variable

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 6151, Testing samples: 1538


## Train the Machine Learning Model
### Start with a simple model like Linear Regression:

In [98]:
from sklearn.linear_model import LinearRegression

# Initialize model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


In [100]:
y_pred

array([4081097.55774014, 5502341.31381713,  581193.53965276, ...,
       2230370.99095581, 5077401.78426568, 8414763.73177579])

In [102]:
from sklearn.metrics import mean_absolute_error, r2_score

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.2f}")



Mean Absolute Error: 1025821.40
R² Score: 0.93


#### 🔥 My R² Score = 0.93, which means my model explains 93% of the variance in house prices.

#### However, Mean Absolute Error (MAE) = ₹10,25,821, which means, on average, my predictions are off by ~10 lakh rupees. That could be improved. 

### Feature Scaling (for models like Linear Regression)

In [107]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [111]:
# Train the model again with scaled data
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_scaled = model.predict(X_test_scaled)

# Evaluate performance
from sklearn.metrics import mean_absolute_error, r2_score

mae_scaled = mean_absolute_error(y_test, y_pred_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)

print(f"Mean Absolute Error (After Scaling): {mae_scaled:.2f}")
print(f"R² Score (After Scaling): {r2_scaled:.2f}")


Mean Absolute Error (After Scaling): 1025821.40
R² Score (After Scaling): 0.93


# 🔹 Save the Trained Model

In [114]:
import joblib

# Save the trained model
joblib.dump(model, "house_price_model.pkl")

# Save the scaler too (if used)
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

# 🔹 Create a Python API (Flask or FastAPI)

In [117]:
pip install flask joblib pandas numpy


Note: you may need to restart the kernel to use updated packages.
