<a href="https://colab.research.google.com/github/abhi78945/mumbai-house-pricee-prediction/blob/main/mumbai_house_pricee_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

Mumbai House Data

In [None]:
# Load the Mumbai house price dataset into a pandas DataFrame.
# NOTE(review): the absolute '/content/' path presumably targets a Colab runtime —
# adjust it when running elsewhere.
house_dataset = pd.read_csv('/content/Mumbai House Prices.csv')

In [None]:
# Preview the first 5 rows of the raw dataset
house_dataset.head()

In [None]:
# (rows, columns) of the raw dataset
house_dataset.shape

In [None]:
# Column dtypes and non-null counts of the raw dataset
house_dataset.info()

In [None]:
def convert_price_to_inr(price, unit):
    """Convert a price quoted in lakh or crore into plain rupees.

    Args:
        price: Numeric amount expressed in ``unit``.
        unit: ``'L'`` for lakh or ``'Cr'`` for crore.

    Returns:
        ``price * 1e5`` for lakh, ``price * 1e7`` for crore, and ``np.nan``
        for any other unit.
    """
    if unit == 'Cr':
        return price * 1e7  # 1 crore = 10,000,000 rupees
    if unit == 'L':
        return price * 1e5  # 1 lakh = 100,000 rupees
    # Unrecognised unit: signal "unknown" instead of guessing a scale.
    return np.nan

In [None]:
# Build the rupee price from each row's (price, price_unit) pair
house_dataset['price_in_inr'] = [
    convert_price_to_inr(amount, unit)
    for amount, unit in zip(house_dataset['price'], house_dataset['price_unit'])
]

In [None]:
# Replace the (price, price_unit) pair with the single rupee column,
# which takes over the name 'price'.
house_dataset = (
    house_dataset
    .drop(columns=['price', 'price_unit'])
    .rename(columns={'price_in_inr': 'price'})
)

In [None]:
# Confirm that 'price' now holds the rupee value and 'price_unit' is gone
house_dataset.head()

In [None]:
# (rows, columns) after the two price columns were merged into one
house_dataset.shape

In [None]:
# Re-check dtypes and non-null counts; rows with an unrecognised price unit now have a NaN price
house_dataset.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plots of the two numeric columns that are filtered for outliers further down
for column_name, plot_title in (('area', 'Box plot of Area'), ('price', 'Box plot of Price')):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=house_dataset[column_name])
    plt.title(plot_title)
    plt.show()

In [None]:
def remove_outliers(house_dataset, column):
    """Return the rows whose ``column`` value lies within the 1.5 x IQR fences.

    Args:
        house_dataset: DataFrame to filter; it is not modified.
        column: Name of the numeric column the fences are computed on.

    Returns:
        The rows of ``house_dataset`` (original index kept) with
        ``Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR``. Both fences are inclusive.
    """
    values = house_dataset[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    # between() is inclusive on both ends by default
    return house_dataset[values.between(q1 - fence, q3 + fence)]

# Apply the IQR filter to 'area' first, then to 'price' on the already-filtered rows
df_cleaned = house_dataset
for outlier_column in ('area', 'price'):
    df_cleaned = remove_outliers(df_cleaned, outlier_column)

# Shape of the frame that survived both filters
print(df_cleaned.shape)

# Preview of the cleaned frame
df_cleaned.head()

In [None]:
from scipy import stats

# Second outlier pass: keep only rows whose 'area' and 'price' are both less than
# `threshold` standard deviations away from their column mean.
#
# The absolute z-scores are held in standalone arrays instead of being written into
# df_cleaned. df_cleaned is a filtered slice of house_dataset, so assigning new columns
# to it raises pandas' SettingWithCopyWarning, and the helper columns were dropped again
# right after filtering anyway.
area_zscore = np.abs(np.asarray(stats.zscore(df_cleaned['area'])))
price_zscore = np.abs(np.asarray(stats.zscore(df_cleaned['price'])))

# Define threshold for Z-score (commonly 3)
threshold = 3

# Positional boolean mask, one entry per row of df_cleaned
within_threshold = (area_zscore < threshold) & (price_zscore < threshold)

# reset_index returns a new, independent frame with a fresh 0..n-1 index
df2 = df_cleaned[within_threshold].reset_index(drop=True)

In [None]:
# Shape of df2 after the z-score pass
print("Shape of df2 after Z-score cleaning:", df2.shape)

# Preview the first rows of df2
df2.head()

In [None]:
# Summary statistics of df2 after both outlier passes
df2.describe()

In [None]:
# Columns treated as categorical in the steps that follow
categorical_vars = ['locality', 'region', 'status', 'age']

# Frequency table per categorical column, each followed by a blank line
for column_name in categorical_vars:
    print(f"Unique values and counts for {column_name}:")
    print(df2[column_name].value_counts(), end="\n\n")

In [None]:
def group_low_count_categories(series, threshold):
    """Collapse infrequent values of ``series`` into the single label ``'Other'``.

    Args:
        series: pandas Series of category labels; it is not modified.
        threshold: Minimum number of occurrences a value needs in order to be kept.

    Returns:
        A new Series with the same index. Values that occur at least
        ``threshold`` times are kept; every other entry becomes ``'Other'``.
    """
    frequencies = series.value_counts()
    frequent_values = frequencies[frequencies >= threshold].index
    # where() keeps entries for which the condition holds and replaces the rest
    return series.where(series.isin(frequent_values), 'Other')

# Fold rare localities (< 50 rows) and rare regions (< 100 rows) into 'Other'.
# Adjust the thresholds as needed.
df2 = df2.assign(
    locality=group_low_count_categories(df2['locality'], threshold=50),
    region=group_low_count_categories(df2['region'], threshold=100),
)

# Frequency tables after grouping, separated by one blank line
print("Updated unique values and counts for locality after grouping:")
print(df2['locality'].value_counts(), end="\n\n")

print("Updated unique values and counts for region after grouping:")
print(df2['region'].value_counts())


In [None]:
import json
# Function to save DataFrame columns as JSON
def save_columns_as_json(house_dataset, columns, filename):
    """Write the distinct values of each listed column to its own JSON file.

    Args:
        house_dataset: DataFrame that holds the columns.
        columns: Iterable of column names to export.
        filename: Path prefix; each column is written to ``{filename}_{col}.json``.

    Returns:
        None. One JSON array per column is written to disk, in order of first
        appearance, with missing values omitted.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``house_dataset``.
    """
    for col in columns:
        # Drop missing values first: json.dump would serialise NaN as the bare token
        # `NaN`, which is not valid JSON and is rejected by strict parsers.
        unique_values = house_dataset[col].dropna().unique().tolist()
        with open(f'{filename}_{col}.json', 'w', encoding='utf-8') as f:
            json.dump(unique_values, f)

# Save the distinct 'type', 'status', 'age', 'locality' and 'region' values as JSON,
# one file per column named unique_values_<column>.json.
# NOTE(review): this reads house_dataset, i.e. the data before outlier removal and
# before rare localities/regions were grouped into 'Other' in df2 — confirm that the
# consumer of these files expects the ungrouped values.
save_columns_as_json(house_dataset, ['type', 'status', 'age', 'locality', 'region'], 'unique_values')

print("JSON files saved successfully.")

In [None]:
# Target (mean) encoding: compute the mean 'price' for each locality and each region.
# NOTE(review): the means are computed over every row of df2, including rows that the
# later train_test_split assigns to the test set, so the encoded features carry
# information about test-set prices. Consider computing the means on the training
# split only.
locality_means = df2.groupby('locality')['price'].mean()
region_means = df2.groupby('region')['price'].mean()

# Map mean prices back to the dataframe
df2['locality_target_encoded'] = df2['locality'].map(locality_means)
df2['region_target_encoded'] = df2['region'].map(region_means)

# Print head to verify
df2[['locality_target_encoded', 'region_target_encoded']].head()

In [None]:
import joblib
import pickle

# The encoding maps are exactly the per-category mean prices, so take them straight
# from the groupby results instead of re-zipping every row of df2. This yields the same
# {category: mean price} dictionaries and keeps the cell runnable after the
# 'locality' / 'region' columns have been dropped from df2.
locality_encoding_map = locality_means.to_dict()
region_encoding_map = region_means.to_dict()

# Save locality encoding map to file
with open('locality_encoding_map.pkl', 'wb') as f:
    pickle.dump(locality_encoding_map, f)

# Save region encoding map to file
with open('region_encoding_map.pkl', 'wb') as f:
    pickle.dump(region_encoding_map, f)

In [None]:
# Remove the raw locality and region labels; their target-encoded columns remain
df2 = df2.drop(columns=['locality', 'region'])

In [None]:
# One-hot encode 'type', 'status' and 'age'. drop_first=True omits the first level of
# each column and dtype=int yields 0/1 integer indicator columns.
df2 = pd.get_dummies(df2, columns=['type','status', 'age'], drop_first=True, dtype=int)

# Print head to verify
df2.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize scaler
scaler = MinMaxScaler()

# Min-max scale only 'bhk' and 'area'; 'price' (the target) and the encoded columns
# are left as they are.
# NOTE(review): the scaler is fitted on all rows before the train/test split in the
# next cell, so its min/max include test rows — consider fitting on the training
# split only.
df2_scaled = df2.copy()  # Make a copy to preserve original data
df2_scaled[['bhk', 'area']] = scaler.fit_transform(df2_scaled[['bhk', 'area']])

# Persist the fitted scaler to disk
joblib.dump(scaler, 'min_max_scaler.pkl')

In [None]:
from sklearn.model_selection import train_test_split

# Target is 'price'; every remaining column of df2_scaled is a feature
y = df2_scaled['price']
X = df2_scaled.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression

# Candidate estimators, each paired with the hyper-parameter grid searched for it.
models = {
    'Linear Regression': (LinearRegression(), {
        # 'copy_X' is deliberately not searched: it only controls whether the input
        # array is copied before fitting and cannot change the fitted model, so
        # including it merely doubled the number of fits.
        'fit_intercept': [True, False]
    }),
    'Ridge Regression': (Ridge(), {
        'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']
    }),
    'Lasso Regression': (Lasso(), {
        'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
        'max_iter': [1000, 5000, 10000]
    }),
    # Fixed seed so repeated runs build the same trees (ties between equally good
    # splits are otherwise broken randomly), in line with the seeded train/test split.
    'DecisionTree Regressor': (DecisionTreeRegressor(random_state=42), {
        'max_depth': [None, 10, 20, 30, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5, 10]
    })
}

# Track the best model and its score
best_model = None
best_score = -float('inf')
best_model_name = None

# Perform GridSearchCV for each model
for name, (model, param_grid) in models.items():
    # 3-fold cross-validated search scored by R^2, using all available cores
    grid_search = GridSearchCV(model, param_grid, scoring='r2', cv=3, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation score for {name}: {grid_search.best_score_:.4f}")
    print()

    # Evaluate on test set using the R^2 score (reported only, not used for selection)
    test_score = grid_search.best_estimator_.score(X_test, y_test)
    print(f"Test set score (R^2) for {name}: {test_score:.4f}")
    print()

    # Check if this model is the best so far. The comparison uses the cross-validation
    # score, so the held-out test set plays no part in choosing the model.
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_model_name = name


In [None]:
# Persist the winning estimator with joblib. The file name is the model's display name
# lower-cased with spaces replaced by underscores, plus '_regression_model.pkl'
# (e.g. 'Ridge Regression' -> 'ridge_regression_regression_model.pkl').
if best_model is None:
    print("No best model found.")
else:
    filename = f"{best_model_name.lower().replace(' ', '_')}_regression_model.pkl"
    joblib.dump(best_model, filename)
    print(f"Saved {best_model_name} model as {filename}")