In [1]:

import pandas as pd  
import numpy as np   
import pickle        
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Read the raw housing listings and report how many rows/columns came in.
df = pd.read_csv('../data/housing.csv')
print("Original Data Size:", df.shape)

# These columns are not referenced by the rest of this notebook, so they are
# removed before discarding every row that still has a missing value.
unused_columns = ['area_type', 'availability', 'society', 'balcony']
df = df.drop(columns=unused_columns).dropna()
print("Data Size after cleaning missing values:", df.shape)

Original Data Size: (13320, 9)
Data Size after cleaning missing values: (13246, 5)


In [3]:

# The bedroom count is the leading integer token of the `size` text
# (everything before the first space); once extracted, the raw text column
# is no longer needed.
df['bhk'] = df['size'].map(lambda size_text: int(size_text.split(' ')[0]))
df = df.drop(columns='size')

def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Handles two shapes of input:

    * a plain number (numeric value or numeric string) -> that number as float
    * a range written as ``"low - high"`` -> the midpoint of the two bounds

    Parameters
    ----------
    x : object
        The raw cell value.

    Returns
    -------
    float or None
        The parsed area, or ``None`` when the value cannot be interpreted
        (e.g. text with units, a range with a non-numeric bound, or a
        non-string that ``float`` rejects). Callers are expected to drop the
        resulting missing values.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Not a plain number; fall through and try the "low - high" form.
        pass

    # Only strings can be split into a range; anything else is unparsable.
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) != 2:
        return None

    try:
        low, high = float(tokens[0]), float(tokens[1])
    except ValueError:
        # One side of the range is not numeric (e.g. "1000 - abc").
        return None
    return (low + high) / 2

# Normalise every total_sqft entry to a float; entries the converter cannot
# parse come back as missing values and their rows are discarded right after.
df['total_sqft'] = df['total_sqft'].map(convert_sqft_to_num)
df = df.dropna(axis=0, how='any')

In [4]:
# Drop listings that offer less than 300 sqft per bedroom.
# The mask is negated (rather than written as `>= 300`) so that rows whose
# ratio is NaN, which compare False against 300, are kept.
sqft_per_bedroom = df['total_sqft'] / df['bhk']
is_too_small = sqft_per_bedroom < 300
df = df[~is_too_small]
print("Size after removing small bedrooms:", df.shape)

Size after removing small bedrooms: (12456, 5)


In [5]:
# Scale price by 100000 before dividing by the area.
# NOTE(review): the metric printout later in this notebook labels price errors
# "in Lakhs", so this presumably converts lakhs to rupees (confirm).
scaled_price = df['price'] * 100000
df['price_per_sqft'] = scaled_price / df['total_sqft']

def remove_pps_outliers(df):
    """Keep, per location, only rows within one standard deviation of the mean.

    For every ``location`` group the mean ``m`` and population standard
    deviation ``st`` (``ddof=0``, matching ``np.std``) of ``price_per_sqft``
    are computed, and only rows with ``m - st < price_per_sqft <= m + st``
    are retained.

    NOTE: because the lower bound is strict, a group with zero spread
    (e.g. a location with a single listing) contributes no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained rows, ordered by location group, with a fresh
        0..n-1 index. If the input has no groups, an empty frame with the
        input's columns is returned.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        st = pps.std(ddof=0)
        within_band = (pps > (m - st)) & (pps <= (m + st))
        kept_groups.append(subdf[within_band])

    if not kept_groups:
        # pd.concat refuses an empty list; preserve the schema instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end rather than growing a frame inside the loop,
    # which re-copies all accumulated rows on every iteration.
    return pd.concat(kept_groups, ignore_index=True)

# Run the per-location price-per-sqft filter over the whole frame and report
# how many rows survive.
df = df.pipe(remove_pps_outliers)
print("Size after removing extreme price outliers:", df.shape)

Size after removing extreme price outliers: (9259, 6)


In [6]:

# Replace each location name with an integer code. The fitted encoder is kept
# in `label_encoder` so the same mapping remains available afterwards.
label_encoder = LabelEncoder()
df['location'] = label_encoder.fit_transform(df['location'])

# Target is the price; the features are every remaining column except
# price_per_sqft, which was computed from price earlier in the notebook.
y = df['price']
X = df.drop(columns=['price', 'price_per_sqft'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _fit_and_evaluate(estimator):
    """Fit `estimator` on the training split and evaluate it on the test split.

    Returns a tuple ``(score, predictions)`` where ``score`` is the value of
    ``estimator.score`` on the held-out data and ``predictions`` are the
    estimator's outputs for ``X_test``.
    """
    estimator.fit(X_train, y_train)
    holdout_score = estimator.score(X_test, y_test)
    holdout_predictions = estimator.predict(X_test)
    return holdout_score, holdout_predictions


# Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_score, y_pred_rf = _fit_and_evaluate(model)
print(f"Random Forest Accuracy (R2 Score): {rf_score * 100:.2f}%")
print(f"Random Forest Mean Error (in Lakhs): {mean_absolute_error(y_test, y_pred_rf):.2f}")

# Decision Tree Regressor, trained and scored on the same split for comparison
dt_model = DecisionTreeRegressor(random_state=42)
dt_score, y_pred_dt = _fit_and_evaluate(dt_model)
print(f"Decision Tree Accuracy (R2 Score): {dt_score * 100:.2f}%")
print(f"Decision Tree Mean Error (in Lakhs): {mean_absolute_error(y_test, y_pred_dt):.2f}")

Random Forest Accuracy (R2 Score): 76.04%
Random Forest Mean Error (in Lakhs): 23.26
Decision Tree Accuracy (R2 Score): 56.65%
Decision Tree Mean Error (in Lakhs): 28.68


In [7]:

# Persist both fitted models, the fitted location encoder, and the ordered
# list of feature column names. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as its object has been written.
artifacts = [
    (model, '../models/trained_model.pkl'),
    (dt_model, '../models/decision_tree_model.pkl'),
    (label_encoder, '../models/label_encoder.pkl'),
    (X.columns.tolist(), '../models/columns.pkl'),
]
for artifact, artifact_path in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)
print("Models saved successfully!")

Models saved successfully!
