In [2]:
# Importing libraries
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    make_scorer,
)
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [3]:
# Load dataset
# The CSV is addressed relative to the working directory rather than through a
# machine-specific absolute drive path, so the notebook can run on any checkout.
# NOTE(review): assumes the kernel's working directory is the notebook_and_dataset/
# folder that holds csvdata.csv (the model is already saved via a relative path) — confirm.
DATA_PATH = "csvdata.csv"
data = pd.read_csv(DATA_PATH)

# read_csv already returns a DataFrame; keep an independent working copy so that
# later changes to `df` cannot alter the raw `data` frame.
df = data.copy()
data.head()

Unnamed: 0.1,Unnamed: 0,City,Price,Area,Location,No. of Bedrooms
0,0,Bangalore,30000000,3340,JP Nagar Phase 1,4
1,1,Bangalore,7888000,1045,Dasarahalli on Tumkur Road,2
2,2,Bangalore,4866000,1179,Kannur on Thanisandra Main Road,2
3,3,Bangalore,8358000,1675,Doddanekundi,3
4,4,Bangalore,6845000,1670,Kengeri,3


In [4]:
# Separate the target column ("Price") from the remaining predictor columns
y = df["Price"]
X = df.drop(columns=["Price"])

In [5]:
# Column names grouped by type: numeric columns and text (categorical) columns
numerical_features = [
    "Area",
    "No. of Bedrooms",
]
categorical_features = [
    "City",
    "Location",
]

In [6]:
# Preprocessing
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal "missing", then one-hot encode.
# Categories not seen during fit are ignored at transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [7]:
# Base models
# Each base learner is a Pipeline of the shared preprocessing definition followed
# by a regressor, so every model accepts the raw feature columns directly.
#
# The tree ensembles are seeded explicitly. GradientBoostingRegressor permutes the
# features at every split, so without a fixed random_state tied splits may be
# resolved differently from run to run. XGBRegressor is seeded too so that results
# stay repeatable if row/column subsampling is enabled later.
RANDOM_STATE = 42  # same seed value the train/test split uses

base_model_1 = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)

base_model_2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            XGBRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=3,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

base_model_3 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            GradientBoostingRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=3,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

In [8]:
# Stacking ensemble: the three base pipelines feed a linear-regression
# final estimator, with cv=5 passed to the stacker.
base_estimators = [
    ("lr", base_model_1),
    ("xgb", base_model_2),
    ("grad", base_model_3),
]
stacked_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=LinearRegression(),
    cv=5,
)

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Train the stacked ensemble on the training split
stacked_model.fit(X=X_train, y=y_train)

In [11]:
# 5-fold cross-validated R^2 of the stacked ensemble, computed on the training split only
r2_scorer = make_scorer(r2_score)
cv_scores = cross_val_score(
    stacked_model,
    X_train,
    y_train,
    scoring=r2_scorer,
    cv=5,
)
print(f"Cross-validated R^2 scores: {cv_scores}")
print(f"Mean Cross-validated R^2 score: {cv_scores.mean()}")

Cross-validated R^2 scores: [0.0737101  0.13839881 0.05111877 0.22326376 0.21153141]
Mean Cross-validated R^2 score: 0.139604570741465


In [12]:
# Score the held-out test rows with the fitted ensemble
y_pred = stacked_model.predict(X=X_test)

In [13]:
# Compare test-set predictions with the true prices using MAE, MSE and R^2
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Absolute Error: 7709116.54
Mean Squared Error: 626601586240176.50
R^2 Score: 0.16


In [14]:
# Predict the price of a single hand-written listing (raw, unprocessed columns)
sample_listing = {
    "City": "Bangalore",
    "Area": 3340,
    "Location": "JP Nagar Phase 1",
    "No. of Bedrooms": 4,
}
new_data = pd.DataFrame([sample_listing])

predicted_price = stacked_model.predict(new_data)

print(f"Predicted Price: {predicted_price[0]:.2f}")

Predicted Price: 23238448.34


In [15]:
# Save the model
# joblib.dump does not create missing folders, so make sure the target directory
# exists first; on a fresh checkout without ../model the dump would otherwise fail.
MODEL_PATH = '../model/hybrid_model.pkl'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# The path is passed as the original string so the returned file list is unchanged.
joblib.dump(stacked_model, MODEL_PATH)

['../model/hybrid_model.pkl']

In [18]:
# Get unique city names
# `data` is the frame read from csvdata.csv in the "Load dataset" cell. Nothing
# modifies it in between, so it is reused here instead of importing pandas again
# and re-reading the same file through a machine-specific absolute path.
unique_cities = data['City'].unique()
print(unique_cities)


['Bangalore' 'Chennai' 'Delhi' 'Hyderabad' 'Kolkata' 'Mumbai']


In [19]:
# Build {city: [locations]} from the distinct Location values within each City,
# then a second mapping with each city's list sorted.
locations_by_city = data.groupby('City')['Location'].unique()
unique_locations_per_city = {
    city: list(locations) for city, locations in locations_by_city.items()
}
unique_locations_per_city_sorted = {}
for city, locations in unique_locations_per_city.items():
    unique_locations_per_city_sorted[city] = sorted(locations)

print(unique_locations_per_city_sorted)


{'Bangalore': ['5th Phase', '5th Stage BEML Layout', '5th Stage Raja Rajeshwari Nagar', '6th phase jp nagar', '7th Phase JP Nagar', '8th Phase JP Nagar', 'AECS Layout A Block Singasandra', 'AGS Layout Arehalli', 'Abbigere', 'Adugodi', 'Akshayanagar', 'Amruthahalli', 'Anagalapura Near Hennur Main Road', 'Ananth Nagar', 'Anekal City', 'Anjanapura', 'Anjanapura Township', 'Annapurneshwari Nagar', 'Armane Nagar', 'Ashok Nagar', 'Ashoka Road', 'Ashwathkatte Road', 'Attibele', 'Avalahalli Off Sarjapur Road', 'BEML Layout', 'BEML Layout 5th Stage', 'BTM Layout', 'BTM Layout 2nd Stage', 'Babusa Palya', 'Babusabpalya', 'Badamanavarthekaval', 'Bagalur', 'Bagaluru', 'Bagaluru Near Yelahanka', 'Banashankari', 'Banashankari 5th stage', 'Banashankari Stage III', 'Banaswadi', 'Bangalore Road', 'Bannerghatta', 'Bannerghatta Main Road', 'Bannerghatta Road Jigani', 'Bannerughatta', 'Basapura Main Road', 'Basavanagudi', 'Basaveswarnagar', 'Battarahalli', 'Begur', 'Begur Road', 'Bellandur', 'Bellari Road'