In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/real_estate_india.csv")

# Preview data
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [2]:
df.shape

(14528, 9)

In [3]:
df.columns


Index(['Name', 'Property Title', 'Price', 'Location', 'Total_Area',
       'Price_per_SQFT', 'Description', 'Baths', 'Balcony'],
      dtype='object')

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14528 non-null  object 
 1   Property Title  14528 non-null  object 
 2   Price           14528 non-null  object 
 3   Location        14528 non-null  object 
 4   Total_Area      14528 non-null  int64  
 5   Price_per_SQFT  14528 non-null  float64
 6   Description     14528 non-null  object 
 7   Baths           14528 non-null  int64  
 8   Balcony         14528 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1021.6+ KB


In [5]:
df.isnull().sum()


Name              0
Property Title    0
Price             0
Location          0
Total_Area        0
Price_per_SQFT    0
Description       0
Baths             0
Balcony           0
dtype: int64

In [7]:
list(df.columns)


['Name',
 'Property Title',
 'Price',
 'Location',
 'Total_Area',
 'Price_per_SQFT',
 'Description',
 'Baths',
 'Balcony']

In [10]:
df['Location'].head(10)


0                   Kanathur Reddikuppam, Chennai
1           Ramanathan Nagar, Pozhichalur,Chennai
2        Kasthuribai Nagar, West Tambaram,Chennai
3         Naveenilaya,Chepauk, Triplicane,Chennai
4                                  Avadi, Chennai
5                               Siruseri, Chennai
6    THIRAN FLATS ,Gowrivakkam, Sembakkam,Chennai
7                    Mahindra World City, Chennai
8         Brindavan Colony, West Tambaram,Chennai
9                   New Colony, Chromepet,Chennai
Name: Location, dtype: object

In [11]:
# Extract city from Location (last word after comma)
df['city'] = df['Location'].apply(
    lambda x: x.split(',')[-1].strip() if isinstance(x, str) else x
)


In [12]:
df['city'].value_counts()


city
Bangalore    4513
Pune         2964
New Delhi    2165
Chennai      1595
Kolkata      1392
Mumbai       1353
Hyderabad     540
Thane           6
Name: count, dtype: int64

In [13]:
df[['Location', 'city']].head(10)


Unnamed: 0,Location,city
0,"Kanathur Reddikuppam, Chennai",Chennai
1,"Ramanathan Nagar, Pozhichalur,Chennai",Chennai
2,"Kasthuribai Nagar, West Tambaram,Chennai",Chennai
3,"Naveenilaya,Chepauk, Triplicane,Chennai",Chennai
4,"Avadi, Chennai",Chennai
5,"Siruseri, Chennai",Chennai
6,"THIRAN FLATS ,Gowrivakkam, Sembakkam,Chennai",Chennai
7,"Mahindra World City, Chennai",Chennai
8,"Brindavan Colony, West Tambaram,Chennai",Chennai
9,"New Colony, Chromepet,Chennai",Chennai


In [14]:
df.shape


(14528, 10)

In [15]:
df[['Price', 'Total_Area', 'Price_per_SQFT', 'Baths', 'Balcony']].describe()


Unnamed: 0,Total_Area,Price_per_SQFT,Baths
count,14528.0,14528.0,14528.0
mean,1297.916988,11719.456222,2.751239
std,1245.694305,49036.068632,0.898243
min,70.0,0.0,1.0
25%,650.0,4480.0,2.0
50%,1000.0,6050.0,3.0
75%,1439.0,9312.5,3.0
max,35000.0,999000.0,6.0


In [16]:
df['Price'].head(10)


0    ₹1.99 Cr
1    ₹2.25 Cr
2     ₹1.0 Cr
3    ₹3.33 Cr
4     ₹48.0 L
5     ₹40.0 L
6     ₹60.0 L
7    ₹72.35 L
8     ₹42.0 L
9     ₹30.0 L
Name: Price, dtype: object

In [22]:
import re
import numpy as np

def clean_price(price):
    if not isinstance(price, str):
        return np.nan
    
    price = price.lower()
    price = price.replace('₹', '').replace(',', '').strip()
    
    # crore
    if 'cr' in price or 'crore' in price:
        number = re.findall(r"[\d\.]+", price)
        return float(number[0]) * 100 if number else np.nan
    
    # lakh
    if 'l' in price or 'lac' in price or 'acs' in price or 'lakhs' in price:
        number = re.findall(r"[\d\.]+", price)
        return float(number[0]) if number else np.nan
    
    # pure number case
    number = re.findall(r"[\d\.]+", price)
    return float(number[0]) / 100000 if number else np.nan


In [23]:
df['price_lakhs'] = df['Price'].apply(clean_price)


In [24]:
df[['Price', 'price_lakhs']].head(15)


Unnamed: 0,Price,price_lakhs
0,₹1.99 Cr,199.0
1,₹2.25 Cr,225.0
2,₹1.0 Cr,100.0
3,₹3.33 Cr,333.0
4,₹48.0 L,48.0
5,₹40.0 L,40.0
6,₹60.0 L,60.0
7,₹72.35 L,72.35
8,₹42.0 L,42.0
9,₹30.0 L,30.0


In [25]:
df['price_lakhs'].isnull().sum()


np.int64(0)

In [26]:
df = df.dropna(subset=['price_lakhs'])


In [27]:
df['total_area_sqft'] = pd.to_numeric(df['Total_Area'], errors='coerce')
df['Baths'] = pd.to_numeric(df['Baths'], errors='coerce')
df['Balcony'] = pd.to_numeric(df['Balcony'], errors='coerce')

df = df.dropna(subset=['total_area_sqft'])


In [28]:
df = df[df['total_area_sqft'] < 10000]
df = df[df['price_lakhs'] < 1000]


In [29]:
final_features = [
    'total_area_sqft',
    'Baths',
    'Balcony',
    'city'
]

X = df[final_features]
y = df['price_lakhs']


In [30]:
X = pd.get_dummies(X, columns=['city'], drop_first=True)


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [33]:
X.isnull().sum()


total_area_sqft        0
Baths                  0
Balcony            14404
city_Chennai           0
city_Hyderabad         0
city_Kolkata           0
city_Mumbai            0
city_New Delhi         0
city_Pune              0
city_Thane             0
dtype: int64

In [34]:
# Fill numeric missing values
X['Baths'] = X['Baths'].fillna(X['Baths'].median())
X['Balcony'] = X['Balcony'].fillna(0)


In [35]:
X.isnull().sum()


total_area_sqft    0
Baths              0
Balcony            0
city_Chennai       0
city_Hyderabad     0
city_Kolkata       0
city_Mumbai        0
city_New Delhi     0
city_Pune          0
city_Thane         0
dtype: int64

In [37]:
# Step 1: Recreate X from df (fresh)
X = df[['total_area_sqft', 'Baths', 'Balcony', 'city']]
y = df['price_lakhs']


In [39]:
X = df[['total_area_sqft', 'Baths', 'Balcony', 'city']].copy()


In [40]:
X['Baths'] = X['Baths'].fillna(X['Baths'].median())
X['Balcony'] = X['Balcony'].fillna(0)


In [41]:
# Step 3: Encode city (ONLY ONCE)
X = pd.get_dummies(X, columns=['city'], drop_first=True)


In [42]:
# Step 4: Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [43]:
# Step 5: Train Linear Regression
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [44]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

y_pred = lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

mae, rmse, r2


(42.198549944114944, np.float64(77.47544241561233), 0.44353832981841224)

In [45]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

mae_rf, rmse_rf, r2_rf


(42.244794307285304, np.float64(81.20275760012697), 0.3887080440945273)

In [47]:
import pandas as pd

results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest'],
    'MAE': [mae, mae_rf],
    'RMSE': [rmse, rmse_rf],
    'R2 Score': [r2, r2_rf]
})

results


Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,42.19855,77.475442,0.443538
1,Random Forest,42.244794,81.202758,0.388708


In [48]:
lr_importance = pd.Series(
    lr.coef_,
    index=X.columns
).sort_values(key=abs, ascending=False)

lr_importance.head(10)


city_Mumbai        9.237501e+01
city_New Delhi     3.884753e+01
city_Thane         2.299715e+01
city_Kolkata      -1.526542e+01
Baths              1.116140e+01
city_Pune          5.352787e+00
city_Chennai       5.129704e+00
city_Hyderabad     3.006597e+00
total_area_sqft    6.573363e-02
Balcony            2.842171e-14
dtype: float64

In [49]:
type(lr)

sklearn.linear_model._base.LinearRegression

In [50]:
X.columns

Index(['total_area_sqft', 'Baths', 'Balcony', 'city_Chennai', 'city_Hyderabad',
       'city_Kolkata', 'city_Mumbai', 'city_New Delhi', 'city_Pune',
       'city_Thane'],
      dtype='object')

In [55]:
import joblib


In [57]:
import os
os.makedirs("../model", exist_ok=True)


In [58]:
joblib.dump(lr, "../model/house_price_lr_model.pkl")


['../model/house_price_lr_model.pkl']

In [59]:
joblib.dump(X.columns.tolist(), "../model/feature_columns.pkl")


['../model/feature_columns.pkl']

In [60]:
import numpy as np
import pandas as pd

def predict_price(input_data):
    """
    input_data: dict with keys:
    total_area_sqft, Baths, Balcony, city
    """
    
    # Load model & columns
    model = joblib.load("../model/house_price_lr_model.pkl")
    feature_columns = joblib.load("../model/feature_columns.pkl")
    
    # Create input dataframe
    input_df = pd.DataFrame([input_data])
    
    # One-hot encode city
    input_df = pd.get_dummies(input_df, columns=['city'], drop_first=True)
    
    # Align columns with training data
    input_df = input_df.reindex(columns=feature_columns, fill_value=0)
    
    # Predict
    prediction = model.predict(input_df)[0]
    
    return prediction


In [61]:
sample_input = {
    'total_area_sqft': 1200,
    'Baths': 2,
    'Balcony': 1,
    'city': 'Mumbai'
}

predicted_price = predict_price(sample_input)
predicted_price


np.float64(70.53692458507567)

In [62]:
import os

print(os.getcwd())


C:\Users\Dell\Desktop\house_price_prediction


In [63]:
print(os.listdir(".."))


['AI Powered Excel Masterclass Practice file', 'College (MHSSCOE) - Shortcut.lnk', 'Deer.PNG', 'desktop.ini', 'house_price_prediction', 'model', 'Qskill_ai.pdf', 'sms_spam_collection', 'spam_mail_detector']


In [64]:
print(os.listdir("../model")) if os.path.exists("../model") else "NO MODEL FOLDER"


['feature_columns.pkl', 'house_price_lr_model.pkl']


In [65]:
# UPDATING THE PROJECT...

In [66]:
def derive_bhk(area):
    if area < 600:
        return 1
    elif area < 900:
        return 2
    elif area < 1300:
        return 3
    else:
        return 4

df['bhk'] = df['total_area_sqft'].apply(derive_bhk)


In [67]:
df['price_per_sqft'] = (df['price_lakhs'] * 100000) / df['total_area_sqft']


In [68]:
import numpy as np

df['property_type'] = np.random.choice(
    ['Apartment', 'Independent House', 'Villa'],
    size=len(df),
    p=[0.7, 0.2, 0.1]
)


In [69]:
df['furnishing'] = np.random.choice(
    ['Unfurnished', 'Semi-Furnished', 'Fully Furnished'],
    size=len(df),
    p=[0.5, 0.3, 0.2]
)


In [70]:
df['property_age'] = np.random.choice(
    ['0-1', '1-5', '5-10', '10+'],
    size=len(df),
    p=[0.2, 0.4, 0.25, 0.15]
)


In [71]:
df['total_floors'] = np.random.randint(1, 25, size=len(df))
df['floor_number'] = np.minimum(
    np.random.randint(0, 25, size=len(df)),
    df['total_floors']
)

df['floor_ratio'] = df['floor_number'] / df['total_floors']


In [72]:
df[['bhk', 'price_per_sqft', 'property_type', 'furnishing', 'property_age', 'floor_ratio']].head()


Unnamed: 0,bhk,price_per_sqft,property_type,furnishing,property_age,floor_ratio
0,4,7704.219899,Apartment,Semi-Furnished,10+,1.0
1,4,3214.285714,Apartment,Fully Furnished,1-5,1.0
2,4,7575.757576,Apartment,Unfurnished,1-5,1.0
3,4,7835.294118,Apartment,Unfurnished,1-5,1.0
4,3,5000.0,Apartment,Semi-Furnished,5-10,0.625


In [74]:
age_mapping = {
    '0-1': 0,
    '1-5': 1,
    '5-10': 2,
    '10+': 3
}

df['property_age_encoded'] = df['property_age'].map(age_mapping)


In [75]:
furnishing_mapping = {
    'Unfurnished': 0,
    'Semi-Furnished': 1,
    'Fully Furnished': 2
}

df['furnishing_encoded'] = df['furnishing'].map(furnishing_mapping)


In [76]:
df = pd.get_dummies(
    df,
    columns=['property_type'],
    drop_first=True
)


In [77]:
df['floor_ratio'] = df['floor_ratio'].clip(0, 1)


In [78]:
df[[
    'bhk',
    'price_per_sqft',
    'property_age_encoded',
    'furnishing_encoded',
    'floor_ratio'
]].describe()


Unnamed: 0,bhk,price_per_sqft,property_age_encoded,furnishing_encoded,floor_ratio
count,14404.0,14404.0,14404.0,14404.0,14404.0
mean,2.689947,8162.449647,1.35622,0.700778,0.727369
std,1.089908,7338.14779,0.965388,0.778823,0.341027
min,1.0,0.000556,0.0,0.0,0.0
25%,2.0,4444.444444,1.0,0.0,0.45
50%,3.0,6000.0,1.0,1.0,1.0
75%,4.0,9090.909091,2.0,1.0,1.0
max,4.0,168776.371308,3.0,2.0,1.0


In [90]:
X.isna().sum().sort_values(ascending=False).head(10)


Balcony                            14404
total_area_sqft                        0
bhk                                    0
Baths                                  0
price_per_sqft                         0
property_age_encoded                   0
furnishing_encoded                     0
floor_ratio                            0
property_type_Independent House        0
property_type_Villa                    0
dtype: int64

In [91]:
X = X.fillna(0)


In [92]:
# Target
y = df['price_lakhs']

# Feature set (ALL controlled + encoded features)
feature_cols = [
    'total_area_sqft',
    'bhk',
    'Baths',
    'Balcony',
    'price_per_sqft',
    'property_age_encoded',
    'furnishing_encoded',
    'floor_ratio'
]

# Add one-hot encoded property_type columns
property_type_cols = [c for c in df.columns if c.startswith('property_type_')]

# Add one-hot encoded city columns
city_cols = [c for c in df.columns if c.startswith('city_')]


# Rebuild X cleanly
X = df[feature_cols + property_type_cols + city_cols].copy()

# VERY IMPORTANT
X = X.fillna(0)


In [93]:
X.isna().sum().sum()   # should be 0


np.int64(0)

In [94]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [95]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

lr_v2 = LinearRegression()
lr_v2.fit(X_train, y_train)

y_pred_lr = lr_v2.predict(X_test)

lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)

lr_mae, lr_rmse, lr_r2


(25.444454071226605, np.float64(52.15430304464039), 0.747833484433855)

In [96]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

rf_mae, rf_rmse, rf_r2


(1.491484659679473, np.float64(9.750595702067763), 0.9911860735194956)

In [97]:
import pandas as pd

results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "MAE": [lr_mae, rf_mae],
    "RMSE": [lr_rmse, rf_rmse],
    "R2": [lr_r2, rf_r2]
})

results


Unnamed: 0,Model,MAE,RMSE,R2
0,Linear Regression,25.444454,52.154303,0.747833
1,Random Forest,1.491485,9.750596,0.991186


In [98]:
final_model = rf


In [99]:
import joblib
import os

MODEL_DIR = os.path.join(os.getcwd(), "model")
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(final_model, os.path.join(MODEL_DIR, "house_price_model.pkl"))
joblib.dump(X.columns.tolist(), os.path.join(MODEL_DIR, "feature_columns.pkl"))

os.listdir(MODEL_DIR)


['feature_columns.pkl', 'house_price_model.pkl']

In [100]:
loaded_model = joblib.load(os.path.join(MODEL_DIR, "house_price_model.pkl"))
loaded_model.predict(X_test[:5])


array([ 50.65331109,  49.9924619 , 139.543633  , 140.09412434,
        38.73826952])