In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import  RandomizedSearchCV
from scipy.stats import randint


In [2]:
# Load the raw listings table from the CSV that sits next to this notebook.
# Category coding used further down (presumably for `age_new` -- confirm
# against the dataset): 0 = New, 1 = Resale, 2 = Unknown.
df = pd.read_csv(filepath_or_buffer='mumbai_house_prices.csv')

In [3]:
# Prepare Data
# X: the five predictor columns (two of them categorical text columns that are
#    one-hot encoded in later cells); y: the regression target.
feature_columns = ['bhk', 'region', 'locality', 'age_new', 'area']
target_column = 'price_in_cr'

X = df[feature_columns]
y = df[target_column]

In [4]:
# Encode categorical variables
# One one-hot encoder per categorical column, so each can be fitted once and
# then reused unchanged when encoding new rows for prediction.
#
# handle_unknown='ignore': at transform() time a category that was not present
# when the encoder was fitted is encoded as an all-zero row instead of raising
# ValueError (the default, 'error'). The prediction cells feed hand-written
# region/locality strings through these encoders, so a typo or a new locality
# would otherwise abort the prediction.
encoder_region = OneHotEncoder(handle_unknown='ignore')
encoder_locality = OneHotEncoder(handle_unknown='ignore')

In [5]:
# Fit encoders
def _fit_one_hot(encoder, column):
    """Fit `encoder` on X[[column]] and return (dense array, labelled DataFrame).

    The DataFrame's column labels come from the encoder itself
    (`get_feature_names_out`), and its index is the default positional one.
    """
    dense = encoder.fit_transform(X[[column]]).toarray()
    labelled = pd.DataFrame(dense, columns=encoder.get_feature_names_out([column]))
    return dense, labelled


# NOTE: .toarray() materialises every one-hot column as a dense float array,
# so memory grows with (rows x distinct categories).
X_encoded_region, X_encoded_region_df = _fit_one_hot(encoder_region, 'region')
X_encoded_locality, X_encoded_locality_df = _fit_one_hot(encoder_locality, 'locality')



In [6]:
# Combine encoded columns with the rest of the data
# The numeric features are selected from `df` instead of dropping the
# categorical columns from `X` in place. Dropping from `X` made this cell fail
# with a KeyError when run a second time (the columns were already gone);
# selecting from `df` gives the same three columns in the same order and can be
# re-run safely. Assumes `X` was built from `df` without row filtering, as in
# the "Prepare Data" cell.
#
# reset_index(drop=True) gives the numeric part a positional index so that it
# lines up row-for-row with the encoded frames, which were built from plain
# arrays and therefore carry a default positional index.
numeric_features = df[['bhk', 'age_new', 'area']].reset_index(drop=True)
X = pd.concat([numeric_features, X_encoded_region_df, X_encoded_locality_df], axis=1)

In [7]:
# Split data
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Randomized Search for Hyperparameter Tuning
# Search space sampled by RandomizedSearchCV for the random forest.
# scipy's randint(low, high) draws integers from [low, high), i.e. the upper
# bound is exclusive.
param_dist = {
    'n_estimators': randint(10, 100),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    # None means "consider all features at every split". It replaces the
    # former 'auto' option, which meant the same thing for
    # RandomForestRegressor but was deprecated in scikit-learn 1.1 and removed
    # in 1.3, where it makes the affected candidates fail to fit.
    'max_features': [None, 'sqrt', 'log2']
}

In [10]:
# Force a full garbage-collection pass to release unreachable objects before
# the memory-heavy model fitting cells. As the last expression of the cell,
# the return value (number of unreachable objects found) is displayed.
import gc
gc.collect()

83

In [45]:
# Linear Regression
# Baseline model: ordinary least squares on the one-hot encoded features,
# scored by mean squared error on the held-out test split.
lr_model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
lr_pred = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_pred)

In [12]:
# Random Forest Regressor
# Tune the forest with a randomized search: 50 parameter draws from
# `param_dist`, each scored with 5-fold cross-validation, all CPU cores used.
# NOTE(review): the output recorded for this cell ends in a MemoryError raised
# while allocating a dense float64 array for one CV fold -- the very wide
# one-hot feature matrix may not fit in memory on every machine.
rf_model = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the best configuration found (refitted on the whole training split).
best_rf_model = random_search.best_estimator_

# Score the tuned forest on the held-out test split.
rf_pred = best_rf_model.predict(X_test)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_pred)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


MemoryError: Unable to allocate 3.63 GiB for an array with shape (10010, 48664) and data type float64

In [21]:
# Report the test-set mean squared error of both models, one per line.
print(
    f'Linear Regression Mean Squared Error: {lr_mse}',
    f'Random Forest Regressor Mean Squared Error: {rf_mse}',
    sep='\n',
)

Linear Regression Mean Squared Error: 145694660856650.8
Random Forest Regressor Mean Squared Error: 0.4063260246337531


In [27]:
# Example Prediction
# Category coding (presumably for `age_new` -- confirm): 0 New, 1 Resale, 2 Unknown
example = pd.DataFrame(
    {
        'bhk': [2],
        'age_new': [0],
        'area': [650],
        'region': ['Andheri West'],
        'locality': ['Lak And Hanware The Residency Tower'],
    }
)

# One-hot encode the categorical columns with the encoders fitted earlier, so
# the resulting columns carry the same labels as the training features.
example_encoded_region = encoder_region.transform(example[['region']]).toarray()
example_encoded_region_df = pd.DataFrame(
    example_encoded_region,
    columns=encoder_region.get_feature_names_out(['region']),
)

example_encoded_locality = encoder_locality.transform(example[['locality']]).toarray()
example_encoded_locality_df = pd.DataFrame(
    example_encoded_locality,
    columns=encoder_locality.get_feature_names_out(['locality']),
)

# Swap the raw categorical columns for their encoded counterparts
# (numeric columns first, then region, then locality).
example = pd.concat(
    [
        example.drop(columns=['region', 'locality']).reset_index(drop=True),
        example_encoded_region_df,
        example_encoded_locality_df,
    ],
    axis=1,
)

# Predict the price with both fitted models.
lr_price_prediction = lr_model.predict(example)
rf_price_prediction = best_rf_model.predict(example)

In [25]:
# Show the single-row predictions of both models, one per line.
print(
    f'Linear Regression Predicted Price: {lr_price_prediction[0]} Cr',
    f'Random Forest Regressor Predicted Price: {rf_price_prediction[0]} Cr',
    sep='\n',
)

Linear Regression Predicted Price: 2.121846809750423 Cr
Random Forest Regressor Predicted Price: 1.7194300000000016 Cr


In [None]:
from joblib import dump, load

# Save RandomForestRegressor model
dump(best_rf_model, 'random_forest_model.joblib')

# Save LinearRegression model
dump(lr_model, 'linear_regression_model.joblib')

# Save the fitted one-hot encoders as well. Both models expect inputs encoded
# with exactly these encoders, so without them the saved models cannot be fed
# correctly from a fresh process.
dump(encoder_region, 'encoder_region.joblib')
dump(encoder_locality, 'encoder_locality.joblib')


In [None]:
import pandas as pd
from joblib import load

# Restore both persisted models from disk.
loaded_rf_model = load('random_forest_model.joblib')
loaded_lr_model = load('linear_regression_model.joblib')

# Example row to price.
example = pd.DataFrame({
    'bhk': [2],
    'age_new': [0],
    'area': [650],
    'region': ['Andheri West'],
    'locality': ['Lak And Hanware The Residency Tower'],
})

# Encode the categorical columns with the encoders used in training.
# NOTE: `encoder_region` / `encoder_locality` are taken from the running
# kernel, not loaded from disk, so this cell needs the fitting cells to have
# been executed in the same session.
example_encoded_region = encoder_region.transform(example[['region']]).toarray()
example_encoded_locality = encoder_locality.transform(example[['locality']]).toarray()

region_labels = encoder_region.get_feature_names_out(['region'])
locality_labels = encoder_locality.get_feature_names_out(['locality'])
example_encoded_region_df = pd.DataFrame(example_encoded_region, columns=region_labels)
example_encoded_locality_df = pd.DataFrame(example_encoded_locality, columns=locality_labels)

# Replace the raw text columns by the encoded ones: numeric, region, locality.
example = example.drop(['region', 'locality'], axis=1).reset_index(drop=True)
example = pd.concat(
    [example, example_encoded_region_df, example_encoded_locality_df],
    axis=1,
)

# Predict with RandomForestRegressor
rf_price_prediction = loaded_rf_model.predict(example)
print(f'Random Forest Regressor Predicted Price: {rf_price_prediction[0]} Cr')

# Predict with LinearRegression
lr_price_prediction = loaded_lr_model.predict(example)
print(f'Linear Regression Predicted Price: {lr_price_prediction[0]} Cr')
