In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [3]:
df = pd.read_csv('mumbai_house_prices.csv')
top_10_rows= df.head(2)
top_10_rows

#0 New, 1 Resale, 2 Unknown

Unnamed: 0,bhk,type,locality,area,region,status,age,price_in_cr,age_new
0,3,Apartment,Lak And Hanware The Residency Tower,685,Andheri West,Ready to move,New,2.5,0
1,2,Apartment,Radheya Sai Enclave Building No 2,640,Naigaon East,Under Construction,New,0.5251,0


In [4]:
# Prepare Data
X = df[['bhk', 'region', 'age_new', 'area']]
y = df['price_in_cr']

In [5]:
# Encode categorical variable
encoder = OneHotEncoder()
X_encoded_region = encoder.fit_transform(X[['region']]).toarray()

X_encoded_region_df = pd.DataFrame(X_encoded_region, columns=encoder.get_feature_names_out(['region']))



In [6]:
# Combine encoded columns with the rest of the data
X = X.drop(columns=['region']).reset_index(drop=True)
X = pd.concat([X, X_encoded_region_df], axis=1)

In [7]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)

In [9]:
# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)

In [10]:
print(f'Linear Regression Mean Squared Error: {lr_mse}')
print(f'Random Forest Regressor Mean Squared Error: {rf_mse}')

Linear Regression Mean Squared Error: 145694660856650.8
Random Forest Regressor Mean Squared Error: 0.4063260246337531


In [11]:
# Example Prediction
#0 New, 1 Resale, 2 Unknown
example = pd.DataFrame({'bhk': [2], 'age_new': [0], 'area': [650], 'region': ['Andheri West']})
example_encoded = encoder.transform(example[['region']]).toarray()
example_encoded_df = pd.DataFrame(example_encoded, columns=encoder.get_feature_names_out(['region']))
example = example.drop(columns=['region']).reset_index(drop=True)
example = pd.concat([example, example_encoded_df], axis=1)

lr_price_prediction = lr_model.predict(example)
rf_price_prediction = rf_model.predict(example)

In [12]:
print(f'Linear Regression Predicted Price: {lr_price_prediction[0]} Cr')
print(f'Random Forest Regressor Predicted Price: {rf_price_prediction[0]} Cr')

Linear Regression Predicted Price: 2.127359500620514 Cr
Random Forest Regressor Predicted Price: 2.1880206349206355 Cr
