In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Read the raw listings CSV (path is relative to the working directory)
file_path = 'Bengaluru_House_Data.csv'
data = pd.read_csv(file_path)


def _bedrooms_from_size(size_label):
    """Return the leading integer of a 'size' label, or None for non-string entries."""
    if not isinstance(size_label, str):
        return None
    leading_token = size_label.split(' ')[0]
    return int(leading_token)


# Derive the bedroom count ('BHK') from the text in the 'size' column
data['BHK'] = data['size'].apply(_bedrooms_from_size)

# Convert 'total_sqft' to numeric (handle ranges like "2100 - 2850")
def convert_sqft_to_num(x):
    """Convert a raw 'total_sqft' entry to a float.

    Plain numbers (``1200``, ``"1200"``) are parsed directly. Ranges written
    as ``"low - high"`` are replaced by the midpoint of their first two parts.

    Parameters
    ----------
    x : object
        Raw cell value (number, string, or missing value).

    Returns
    -------
    float or None
        The parsed value, or None when ``x`` is neither a number nor a
        numeric range.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow unrelated errors such as KeyboardInterrupt.
        pass

    parts = str(x).split('-')
    if len(parts) < 2:
        return None
    try:
        return (float(parts[0]) + float(parts[1])) / 2
    except ValueError:
        # A range whose ends are not both numeric (e.g. "1000 - 12Sq. Yards")
        # is treated like any other unparseable entry instead of raising.
        return None

# Replace the raw 'total_sqft' text with its numeric form
data['total_sqft'] = data['total_sqft'].apply(convert_sqft_to_num)

# Group columns by dtype: 64-bit numeric columns vs. object (text) columns
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Column mean fills numeric gaps; the most frequent value fills text gaps
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its column group and write the filled values back
for _columns, _imputer in ((num_cols, num_imputer), (cat_cols, cat_imputer)):
    data[_columns] = _imputer.fit_transform(data[_columns])

print("Data cleaning and preprocessing complete.")


Data cleaning and preprocessing complete.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Load the dataset
file_path = 'Bengaluru_House_Data.csv'  # Update this path to your local dataset path
data = pd.read_csv(file_path)

# Ensure price column is numeric
data['price'] = pd.to_numeric(data['price'], errors='coerce')

# 'total_sqft' is read as text because some entries are ranges such as
# "2100 - 2850". Convert it here so the histogram and the correlation heatmap
# in this cell work on numbers rather than on raw strings:
#   * plain numeric entries are parsed directly,
#   * "low - high" ranges become their midpoint,
#   * anything else becomes NaN.
sqft_text = data['total_sqft'].astype(str)
sqft_range = sqft_text.str.extract(
    r'^\s*(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*$'
).astype(float)
data['total_sqft'] = pd.to_numeric(data['total_sqft'], errors='coerce').fillna(
    sqft_range.mean(axis=1)
)

# Extract number of bedrooms from 'size' column and create 'BHK'
data['BHK'] = data['size'].apply(
    lambda size_label: int(size_label.split(' ')[0]) if isinstance(size_label, str) else None
)

def _save_histogram(column, title, xlabel, filename):
    """Draw a 30-bin histogram with a KDE overlay for ``data[column]`` and save it to ``filename``."""
    plt.figure(figsize=(10, 6))
    sns.histplot(data[column], bins=30, kde=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.savefig(filename)  # Saves the image locally
    plt.close()


# One histogram each for price, total square feet and bathrooms, in that order
for _spec in (
    ('price', 'Distribution of House Prices', 'Price (in Lakhs)', 'price_distribution.png'),
    ('total_sqft', 'Distribution of Total Square Feet', 'Total Square Feet', 'total_sqft_distribution.png'),
    ('bath', 'Distribution of Bathrooms', 'Number of Bathrooms', 'bathroom_distribution.png'),
):
    _save_histogram(*_spec)

# Pairwise correlations between the float/int columns, drawn as an annotated heatmap
numeric_data = data.select_dtypes(include=[float, int])
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.close()

# Mean price for each BHK value, sorted by that mean
average_price_by_bhk = data.groupby('BHK')['price'].mean().sort_values()
bhk_levels = average_price_by_bhk.index
mean_prices = average_price_by_bhk.values

plt.figure(figsize=(12, 6))
sns.barplot(x=bhk_levels, y=mean_prices)
plt.title('Average Price by BHK')
plt.xlabel('BHK')
plt.ylabel('Average Price (in Lakhs)')
plt.savefig('average_price_by_bhk.png')
plt.close()

# The ten most frequent locations and how many listings each has
top_locations = data['location'].value_counts().nlargest(10)
location_names = top_locations.index
listing_counts = top_locations.values

plt.figure(figsize=(12, 6))
sns.barplot(x=location_names, y=listing_counts)
plt.title('Number of Properties by Location (Top 10)')
plt.xlabel('Location')
plt.ylabel('Number of Properties')
plt.xticks(rotation=45)  # tilt the location names on the x axis
plt.savefig('properties_by_location.png')
plt.close()


In [2]:
import numpy as np

# Ratio of listing price to area
data['price_per_sqft'] = data['price'].div(data['total_sqft'])

# log(1 + price), used to reduce the skew of the price distribution
data['log_price'] = np.log1p(data['price'])

# Expand 'area_type' and 'location' into indicator columns,
# dropping the first level of each
columns_to_encode = ['area_type', 'location']
data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)

print("Feature engineering complete.")


Feature engineering complete.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

# Start from a fresh copy of the raw CSV
file_path = 'Bengaluru_House_Data.csv'
data = pd.read_csv(file_path)

# 'BHK' is the leading integer of the 'size' text; non-string entries map to None
data['BHK'] = data['size'].apply(
    lambda size_label: int(size_label.split(' ')[0]) if isinstance(size_label, str) else None
)

# Function to convert 'total_sqft' to numeric (handling ranges and non-numeric values)
def convert_sqft_to_num(x):
    """Convert a raw 'total_sqft' entry to a float.

    Plain numbers (``1200``, ``"1200"``) are parsed directly. Ranges written
    as ``"low - high"`` are replaced by the midpoint of their first two parts.

    Parameters
    ----------
    x : object
        Raw cell value (number, string, or missing value).

    Returns
    -------
    float or None
        The parsed value, or None when ``x`` is neither a number nor a
        numeric range.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # float() raises TypeError (not ValueError) for None and other
        # non-numeric objects; both mean "not a plain number".
        pass

    parts = str(x).split('-')
    if len(parts) < 2:
        return None
    try:
        return (float(parts[0]) + float(parts[1])) / 2
    except ValueError:
        # A range whose ends are not both numeric (e.g. "1000 - 12Sq. Yards")
        # is treated like any other unparseable entry instead of raising.
        return None

# Convert every raw 'total_sqft' entry with convert_sqft_to_num
data['total_sqft'] = data['total_sqft'].apply(convert_sqft_to_num)

# Drop rows whose 'total_sqft' is missing after conversion.
# NOTE(review): only 'total_sqft' gaps are removed here; confirm whether
# 'BHK', 'bath' and 'balcony' can still contain missing values at this point.
data = data.dropna(subset=['total_sqft'])

# Training target: log(1 + price). The serving code inverts this with np.expm1.
data['log_price'] = np.log1p(data['price'])

# Overwrite 'location' with the labels produced by a LabelEncoder fitted on it
le = LabelEncoder()
data['location'] = le.fit_transform(data['location'])

# Persist the encoder inside a dict keyed by column name; the Flask app reads
# it back as label_encoders['location']
with open('label_encoders.pkl', 'wb') as le_file:
    joblib.dump({'location': le}, le_file)

# Model inputs (encoded location first, then the numeric columns) and the
# log-price target
features = data[['location', 'BHK', 'total_sqft', 'bath', 'balcony']]
target = data['log_price']

# Keep the encoded location apart from the columns that will be standardized
location = features[['location']].values  # 2-D array (one column) so it can be concatenated later
numerical_features = features.drop('location', axis=1)

# Record the feature order and save it; the /predict endpoint loads this file
# to check the number of submitted features
feature_columns = list(features.columns)
print("Feature columns used for training:", feature_columns)
with open('feature_columns.pkl', 'wb') as f:
    joblib.dump(feature_columns, f)

# Split numeric features, encoded location and target in ONE call so all three
# are partitioned with the same shuffled row indices. Splitting 'location' in
# a second, separately seeded call only stays aligned as long as both calls
# keep identical sizes and seeds, which is easy to break by accident.
(X_train_num, X_test_num,
 X_train_loc, X_test_loc,
 y_train, y_test) = train_test_split(
    numerical_features, location, target, test_size=0.2, random_state=42
)

# Standardize only the numerical features; the scaler is fitted on the
# training split and then applied unchanged to the test split
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Save the scaler for future use (the Flask app loads 'scaler.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Final design matrices: encoded location in column 0, followed by the
# scaled numerical features
X_train_scaled = np.concatenate([X_train_loc, X_train_num_scaled], axis=1)
X_test_scaled = np.concatenate([X_test_loc, X_test_num_scaled], axis=1)

# Random forest with 100 trees and a fixed seed, fitted on the training matrix
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Persist the fitted model
joblib.dump(model, 'house_price_model.pkl')

# Predictions for both splits (same scale as the target passed to fit)
y_test_pred = model.predict(X_test_scaled)
y_train_pred = model.predict(X_train_scaled)

# R² and mean squared error per split
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Report the metrics
print("Model training complete.")
print(f"Train R² Score: {train_r2:.4f}")
print(f"Test R² Score: {test_r2:.4f}")
print(f"Train Mean Squared Error: {train_mse:.4f}")
print(f"Test Mean Squared Error: {test_mse:.4f}")


Feature columns used for training: ['location', 'BHK', 'total_sqft', 'bath', 'balcony']
Model training complete.
Train R² Score: 0.9623
Test R² Score: 0.7760
Train Mean Squared Error: 0.0191
Test Mean Squared Error: 0.1153


In [None]:
from flask import Flask, request, jsonify, render_template
import logging
import joblib
import numpy as np
from flask_cors import CORS

# Flask application object; the routes below are registered on it
app = Flask(__name__)
# CORS is applied to the whole app with no options.
# NOTE(review): presumably this allows cross-origin requests from any origin;
# confirm against the flask-cors defaults before exposing the service.
CORS(app)

# Configure logging: root logger at INFO so the app.logger.info calls are shown
logging.basicConfig(level=logging.INFO)

# Module-level placeholders for the trained artifacts; load_artifacts()
# assigns them and predict() reads them
model = None
scaler = None
label_encoders = None

# Load the trained model, scaler, and label encoders
def load_artifacts():
    """Populate the module-level ``model``, ``scaler`` and ``label_encoders``.

    Reads 'house_price_model.pkl', 'scaler.pkl' and 'label_encoders.pkl' from
    the working directory, in that order, logging each success.

    Raises
    ------
    Exception
        Any error hit while opening or loading a file is logged and re-raised.
    """
    global model, scaler, label_encoders

    def _read(path):
        # NOTE(review): joblib files are pickle-based; only load artifacts
        # that come from a trusted source.
        with open(path, 'rb') as handle:
            return joblib.load(handle)

    try:
        model = _read('house_price_model.pkl')
        app.logger.info("Model loaded successfully")

        scaler = _read('scaler.pkl')
        app.logger.info("Scaler loaded successfully")

        label_encoders = _read('label_encoders.pkl')
        app.logger.info("Label encoders loaded successfully")
    except Exception as e:
        app.logger.error(f"Error loading artifacts: {e}")
        raise e

# Populate the globals once, at import/start-up time
load_artifacts()

# Home route
@app.route('/')
def home():
    """Serve the landing page by rendering the 'index.html' template."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """Predict a house price from a JSON request body.

    The body must be ``{"features": [location, v1, v2, ...]}`` with one entry
    per column stored in 'feature_columns.pkl': the location name first,
    followed by the numerical features in training order. A JSON ``null``
    among the numerical values is passed on as NaN.

    Returns
    -------
    flask.Response
        ``{"predicted_price": <float>}`` on success. On failure,
        ``{"error": <message>}`` with status 400 for invalid input
        (``ValueError``) or 500 for any other exception.
    """
    try:
        # silent=True returns None instead of raising for a wrong content
        # type or malformed JSON, so those requests are answered with 400 by
        # the check below rather than 500 by the generic handler.
        data = request.get_json(silent=True)
        app.logger.info('Received data: %s', data)

        if not isinstance(data, dict) or 'features' not in data:
            raise ValueError("Invalid input data: 'features' key not found.")

        features = data['features']
        app.logger.info('Raw features: %s', features)

        if not isinstance(features, list):
            raise ValueError("Invalid input data: 'features' must be a list.")

        # Load feature columns (re-read from disk on every request)
        with open('feature_columns.pkl', 'rb') as f:
            feature_columns = joblib.load(f)

        # Ensure the correct number of features are passed
        if len(features) != len(feature_columns):
            raise ValueError(f"Invalid number of features provided. Expected {len(feature_columns)}, got {len(features)}.")

        # Extract and encode location; anything that is not a known string
        # label is rejected before it reaches the encoder
        location = features[0]
        location_encoder = label_encoders['location']
        if not isinstance(location, str) or location not in location_encoder.classes_:
            raise ValueError(f"Location '{location}' not recognized.")

        location_encoded = location_encoder.transform([location])[0]

        # Convert the remaining entries to floats explicitly so that bad
        # values (nested lists, dicts, non-numeric text) surface as a 400
        # instead of an unhandled TypeError further down.
        try:
            numeric_values = [
                np.nan if value is None else float(value)
                for value in features[1:]
            ]
        except (TypeError, ValueError):
            raise ValueError("Numerical features must be numbers.") from None
        numerical_features = np.array(numeric_values, dtype=float).reshape(1, -1)

        # Scale numerical features
        numerical_features_scaled = scaler.transform(numerical_features)

        # Combine encoded location and scaled numerical features
        final_features = np.concatenate([[location_encoded], numerical_features_scaled[0]])

        # Predict using the model
        prediction = model.predict([final_features])
        app.logger.info('Prediction: %s', prediction)

        # Convert log prediction back to normal price (inverse of log1p)
        predicted_price = float(np.expm1(prediction[0]))

        return jsonify({'predicted_price': round(predicted_price, 2)})
    except ValueError as ve:
        error_message = f"Value Error: {str(ve)}"
        app.logger.error(error_message)
        return jsonify({'error': error_message}), 400
    except Exception as e:
        error_message = f"Error during prediction: {str(e)}"
        # logger.exception records the traceback along with the message
        app.logger.exception(error_message)
        return jsonify({'error': error_message}), 500

# Analytics route
@app.route('/analytics')
def analytics():
    """Serve the analytics page by rendering the 'analytics.html' template."""
    return render_template('analytics.html')

# Run the app locally
if __name__ == '__main__':
    # Starts the development server on port 5002 with debug mode on and the
    # auto-reloader off.
    # NOTE(review): Flask's documentation advises against debug mode outside
    # local development; confirm this entry point is never used in production.
    app.run(debug=True, port=5002, use_reloader=False)


INFO:__main__:Model loaded successfully
INFO:__main__:Scaler loaded successfully
INFO:__main__:Label encoders loaded successfully


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5002
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:37:36] "GET / HTTP/1.1" 200 -
INFO:__main__:Received data: {'features': ['Electronic City Phase II', 2, 1056, 2, 1]}
INFO:__main__:Raw features: ['Electronic City Phase II', 2, 1056, 2, 1]
INFO:__main__:Prediction: [3.8848991]
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:37:41] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:38:00] "GET /analytics HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:38:00] "[36mGET /static/images/total_sqft_distribution.png HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:38:00] "[36mGET /static/images/price_distribution.png HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:38:00] "[36mGET /static/images/average_price_by_bhk.png HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [20/Sep/2024 07:38:00] "[36mGET /static/images/properties_by_location.png HTTP/1.1[0m" 304