In [25]:
import pandas as pd

import numpy as np

In [26]:
# Read the saved predictions CSV into `df`; later cells use its
# `id`, `true_price` and `predicted_price` columns.
df = pd.read_csv(
    "artifacts/test_predictions.csv"
)

In [27]:
# Side-by-side preview of the predicted and true price columns.
df.loc[:, ["predicted_price", "true_price"]]

Unnamed: 0,predicted_price,true_price
0,599709.75,612001.0
1,367968.97,392001.0
2,365732.72,399889.0
3,385528.53,385001.0
4,307601.53,235001.0
...,...,...
3237,889323.56,1075001.0
3238,673782.60,822001.0
3239,417309.97,425001.0
3240,356566.50,421001.0


In [28]:
# Copy the id, ground-truth and prediction columns out of `df` as NumPy arrays.
ids, y_true, price_pred = (
    np.array(df[column])
    for column in ("id", "true_price", "predicted_price")
)

In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --------------------------------------------------
# Error analysis by price bucket.
#
# ASSUMES YOU ALREADY HAVE:
# ids           -> array-like
# y_true        -> true prices (₹, NOT log)
# price_pred    -> predicted prices (₹, NOT log)
# --------------------------------------------------

# Build evaluation dataframe (one row per prediction)
eval_df = pd.DataFrame({
    "id": ids,
    "true_price": y_true,
    "pred_price": price_pred
})

# Absolute & percentage errors
eval_df["abs_error"] = np.abs(
    eval_df["true_price"] - eval_df["pred_price"]
)
# NOTE: a true price of 0 yields inf/NaN here; such rows also fall outside
# every bucket below (intervals are right-closed, starting at (0, 10L]).
eval_df["ape"] = eval_df["abs_error"] / eval_df["true_price"]

# --------------------------------------------------
# Price buckets (₹)
# --------------------------------------------------
# pd.cut uses right-closed intervals by default:
#   (0, 10L], (10L, 30L], (30L, 50L], (50L, inf)
# Labels must be kept in sync with these edges.
bins = [0, 10_00_000, 30_00_000, 50_00_000, np.inf]
labels = [
    "< 10L",
    "10L – 30L",   # was mislabelled "10L – 20L"; the bin actually ends at 30L
    "30L – 50L",
    "> 50L"
]

eval_df["price_bucket"] = pd.cut(
    eval_df["true_price"],
    bins=bins,
    labels=labels
)

# --------------------------------------------------
# Metrics per bucket
# --------------------------------------------------
results = []

for bucket in labels:
    subset = eval_df[eval_df["price_bucket"] == bucket]

    # Skip buckets with no samples: the sklearn metrics raise on empty input.
    if len(subset) == 0:
        continue

    # Loop temporaries are prefixed with `bucket_` so that re-running this
    # cell does not overwrite the notebook-level `rmse` helper function.
    bucket_mae = mean_absolute_error(
        subset["true_price"],
        subset["pred_price"]
    )

    bucket_rmse = np.sqrt(mean_squared_error(
        subset["true_price"],
        subset["pred_price"]
    ))

    bucket_mape = np.mean(subset["ape"]) * 100

    results.append({
        "Price Bucket": bucket,
        "Samples": len(subset),
        "MAE (₹)": round(bucket_mae, 2),
        "RMSE (₹)": round(bucket_rmse, 2),
        "MAPE (%)": round(bucket_mape, 2)
    })

bucket_metrics_df = pd.DataFrame(results)

# --------------------------------------------------
# Display
# --------------------------------------------------
print("\n📊 Error vs Price Buckets\n" + "-" * 40)
display(bucket_metrics_df)

# --------------------------------------------------
# OPTIONAL: Error contribution per bucket
# --------------------------------------------------
print("\n🔥 Total Absolute Error Contribution")
display(
    # observed=False is spelled out to keep empty buckets in the result
    # (summing to 0) and to silence the pandas FutureWarning about the
    # changing default for categorical groupers.
    eval_df.groupby("price_bucket", observed=False)["abs_error"]
    .sum()
    .sort_values(ascending=False)
)



ðŸ“Š Error vs Price Buckets
----------------------------------------


Unnamed: 0,Price Bucket,Samples,MAE (â‚¹),RMSE (â‚¹),MAPE (%)
0,< 10L,3020,61587.79,85419.32,14.72
1,10L â€“ 20L,217,333811.63,423292.37,21.88
2,30L â€“ 50L,4,853657.08,926133.45,27.43
3,> 50L,1,1831041.5,1831041.5,35.83



ðŸ”¥ Total Absolute Error Contribution


  eval_df.groupby("price_bucket")["abs_error"]


price_bucket
< 10L        1.859951e+08
10L â€“ 20L    7.243712e+07
30L â€“ 50L    3.414628e+06
> 50L        1.831041e+06
Name: abs_error, dtype: float64

In [35]:
# Imported before the helper below that relies on it.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# NOTE(review): this cell rebinds `df`, `y_true` and `price_pred`, which
# earlier cells also define; re-running those cells afterwards will see
# these values instead.
# NOTE(review): the inputs are *train* files (train_df.csv,
# price_reference_train.csv) but the report below is titled "Test Set
# Evaluation" and the predictions file is named "..._test_predictions" —
# confirm which split is intended.
df = pd.read_csv("Data/final_train_test_data/train_df.csv")

TEST_PRICE_CSV = "Data/price/price_reference_train.csv"
price_df = pd.read_csv(TEST_PRICE_CSV)
# Normalise ids to stripped strings so they can be used as lookup keys.
price_df["id"] = price_df["id"].astype(str).str.strip()

# id -> log_price lookup.
# NOTE(review): not used anywhere in this cell; the metrics below pair rows
# purely by position. If the prediction rows are not in the same order as
# `df`, the metrics are wrong — presumably this dict was meant for an
# id-based alignment; verify.
id_to_logprice = dict(
    zip(price_df["id"], price_df["log_price"])
)

price_pred = pd.read_csv("artifacts/tabular_only_test_predictions.csv")
log_price_pred = price_pred['predicted_log_price']
price_pred = price_pred['predicted_price']

y_log_true = np.array(df['log_price'])
# NOTE(review): the true prices shown earlier in this notebook end in ...001,
# which hints that log_price may be log1p(price); if so, np.expm1 would be
# the matching inverse — confirm against the code that writes log_price.
y_true = np.exp(y_log_true)

metrics = {
    "RMSE_log": rmse(y_log_true, log_price_pred),
    "RMSE_price": rmse(y_true, price_pred),
    "MAE_price": mean_absolute_error(y_true, price_pred),
    "R2_price": r2_score(y_true, price_pred),
}

print("\n📊 Test Set Evaluation")
print("-" * 40)
for k, v in metrics.items():
    print(f"{k:15s}: {v:,.4f}")


ðŸ“Š Test Set Evaluation
----------------------------------------
RMSE_log       : 0.1705
RMSE_price     : 117,518.6783
MAE_price      : 67,547.7652
R2_price       : 0.8605
