In [1]:
from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

## Data Loading
Load and prepare the training data:
- Read preprocessed train data from CSV
- Split features (X_train) and target variable (y_train)

In [2]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]

## Initialize Preprocessing Components
Set up preprocessing components:
- ExtendedTransformation for complex feature engineering
- SimpleFilter for feature selection

In [3]:
preprocessor = ExtendedTransformation()
filter = SimpleFilter()

## Fit Preprocessor
Fit the preprocessor to learn data characteristics:
- Analyzes feature distributions and relationships
- Prepares for transformation

In [4]:
preprocessor.fit(X_train, y_train)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37


## Apply Data Transformation
Execute the preprocessing pipeline:
- Transforms raw features into engineered features
- Includes feature scaling, encoding, and feature crossing
- Outputs processed features (X_processed) and target (y_processed)

In [5]:
X_processed, y_processed = preprocessor.transform(X_train, y_train)

X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)


## Examine Processed Data
Display the first few rows of the processed test data:
- Shows the structure and values of engineered features

In [6]:
X_processed.head()

Unnamed: 0,Area,No. of Bedrooms,city_Bangalore,city_Chennai,city_Delhi,city_Hyderabad,city_Kolkata,city_Mumbai,Resale_NO,Resale_SI,...,Stadium_NO Stadium_NO_DISPONIBLE,Stadium_NO Stadium_SI,Stadium_NO Area,Stadium_NO No. of Bedrooms,Stadium_NO_DISPONIBLE Stadium_SI,Stadium_NO_DISPONIBLE Area,Stadium_NO_DISPONIBLE No. of Bedrooms,Stadium_SI Area,Stadium_SI No. of Bedrooms,Area No. of Bedrooms
0,0.693273,-0.552165,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.693273,-0.552165,-0.382801
1,0.983529,0.897222,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.983529,0.897222,0.0,0.0,0.882444
2,-0.417346,0.897222,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,-0.0,0.0,0.0,-0.417346,0.897222,-0.0,0.0,-0.374452
3,-1.331565,-0.552165,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,-1.331565,-0.552165,0.0,-0.0,-0.0,-0.0,-0.0,0.735243
4,0.317853,0.897222,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.317853,0.897222,0.0,0.0,0.0,0.0,0.0,0.285184


## Feature Selection
Apply feature selection using SimpleFilter:
- Fits the filter to the processed data
- Identifies most relevant features for the model

In [7]:
filter.fit(X_processed, y_processed)

(20974, 6835)
(20974, 4173)
(20974, 3193)
(20974, 1635)


In [8]:
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

(20974, 4173)
(20974, 3193)
(20974, 1635)


In [9]:
X_filtered.shape

(20974, 1635)

In [10]:
from sklearn.metrics import ( root_mean_squared_error, 
                             mean_absolute_error, 
                             mean_absolute_percentage_error )

## Train Machine Learning Model
Train a RandomForestRegressor:
- Uses processed and filtered data
- Configured with 500 estimators

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_m_2 = RandomForestRegressor(500)
rf_m_2.fit(X_filtered, y_filtered)

  return fit_method(estimator, *args, **kwargs)


## Evaluate Model Performance
Calculate the R² score of the trained model:
- Measures how well the model fits the training data
- Provides a baseline for model performance

In [None]:
rf_m_2.score(X_filtered, y_filtered)

0.8320713352254937

## Load Test Dataset
Prepare the test dataset:
- Read from CSV file
- Split features (X_test) and target variable (y_test)

In [None]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]

## Process Test Data
Apply the same preprocessing pipeline to test data:
- Transform features using preprocessor
- Apply feature selection using filter
- Output processed and filtered test data

In [None]:
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


## Generate Predictions
- Use trained model to predict test data
- Convert scaled predictions back to original scale

In [20]:
y_hat = rf_m_2.predict(X_test_filtered)
y_hat_unscaled = preprocessor.inverse_transform(y_hat.reshape(-1,1))



## Calculate Performance Metrics
Evaluate model performance on test data:
- Root Mean Squared Error (RMSE)
- Mean Absolute Error (MAE)
- Mean Absolute Percentage Error (MAPE)

In [21]:
y_true = y_test.values
y_pred = y_hat_unscaled
rmse = root_mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

# Format metrics with units and percentages
metrics = {
    "RMSE (₹)": f"{rmse:,.2f}",
    "MAE (₹)": f"{mae:,.2f}",
    "MAPE (%)": f"{mape:.2%}"
}

# Display metrics in a formatted table
print("\nModel Performance Metrics:\n")
print(f"{'Metric':<15} {'Value':>15}")
print("-" * 30)
for metric, value in metrics.items():
    print(f"{metric:<15} {value:>15}")

# Add a summary interpretation
print("\nInterpretation:")
print(f"- RMSE: The model's predictions are typically off by ₹{rmse:,.2f} on average")
print(f"- MAE: The average absolute error is ₹{mae:,.2f}")
print(f"- MAPE: The predictions are off by {mape:.1%} on average")


Model Performance Metrics:

Metric                    Value
------------------------------
RMSE (₹)          25,065,272.48
MAE (₹)            6,032,043.73
MAPE (%)                 42.60%

Interpretation:
- RMSE: The model's predictions are typically off by ₹25,065,272.48 on average
- MAE: The average absolute error is ₹6,032,043.73
- MAPE: The predictions are off by 42.6% on average
