# Project Introduction

# (TITLE OF THE PROJECT)
#### AIM - 
###### (LINKS)

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Date and time
from datetime import datetime

# Preprocessing and modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
import scipy.stats as stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score

# Utilities
import warnings
warnings.filterwarnings("ignore")

# Data Import

In [None]:
# LOAD DATASETS
# Read the two raw CSV inputs; both paths are relative to the working directory.
domain = pd.read_csv("data/domain_properties.csv")  # per-property table (price, date_sold, num_bed, ... are used later)
suburb = pd.read_csv("data/syd_sub_rev.csv")  # per-suburb table (median prices, rents, ratings are used later)

In [None]:
# CHECKING DATA
# DataFrame.info() prints its own report and returns None, so it is called as a
# statement instead of being passed to print(). head() and describe() have to
# be *called* to obtain the preview / summary tables; without the parentheses
# print() only shows the bound-method repr.
print("Domain Properties\n")
domain.info()
print("\n\n", domain.head(), "\n\n", domain.describe())
print("\n\nSydney Suburb Review\n")
suburb.info()
print("\n\n", suburb.head(), "\n\n", suburb.describe())

# Basic Data Cleaning

In [None]:
# COLUMN STANDARDISATION
# Give both tables the same header convention: surrounding whitespace removed,
# lower case, and spaces replaced by underscores.
for frame in (domain, suburb):
    trimmed = frame.columns.str.strip()
    frame.columns = trimmed.str.lower().str.replace(" ", "_")

In [None]:
# CLEANING SUBURB FIRST
# The suburb table's key column is called "name"; rename it to "suburb" so it
# matches the key used when the datasets are merged later.
suburb.rename(columns={"name": "suburb"}, inplace=True)

# CHECK FOR DUPLICATE SUBURBS
suburb_names = suburb["suburb"]
print(f"{suburb_names.nunique()} unique suburbs out of {suburb.shape[0]} rows")
print(f"{suburb_names.duplicated().sum()} duplicate suburb entries")

In [None]:
# REMOVING SYMBOLS AND CONVERTING TYPES
def clean_currency(val):
    """Convert a currency string such as "$1,200" into a number.

    Non-string values are returned unchanged. For strings, every "$" and ","
    is removed and the remainder is parsed with ``pd.to_numeric``; text that
    still cannot be parsed becomes NaN (``errors="coerce"``).
    """
    if not isinstance(val, str):
        return val
    stripped = val.replace("$", "").replace(",", "")
    return pd.to_numeric(stripped, errors="coerce")

def clean_percent(val):
    """Convert a percentage string such as "12.5%" into a number.

    Non-string values are returned unchanged. For strings, every "%" is
    removed and the remainder is parsed with ``pd.to_numeric``; text that
    cannot be parsed becomes NaN (``errors="coerce"``). The result is not
    divided by 100.
    """
    if not isinstance(val, str):
        return val
    without_sign = val.replace("%", "")
    return pd.to_numeric(without_sign, errors="coerce")

def clean_int(val):
    """Convert a number string with thousands separators, e.g. "12,000".

    Non-string values are returned unchanged. For strings, every "," is
    removed and the remainder is parsed with ``pd.to_numeric``; text that
    cannot be parsed becomes NaN (``errors="coerce"``).
    """
    if not isinstance(val, str):
        return val
    without_commas = val.replace(",", "")
    return pd.to_numeric(without_commas, errors="coerce")

# Columns grouped by the cleaner that applies to them.
currency_cols = [
    "median_house_price_(2020)", "median_house_price_(2021)",
    "median_apartment_price_(2020)", "median_house_rent_(per_week)",
    "median_apartment_rent_(per_week)",
]
percent_cols = ["%_change", "public_housing_%"]
int_cols = ["population_(rounded)*"]

# Run each cleaner exactly once per column. The earlier version converted
# every column with an explicit statement and then again inside these loops;
# the second pass was a no-op because the cleaners return non-string values
# unchanged, so dropping it leaves the result the same.
for cleaner, columns in (
    (clean_currency, currency_cols),
    (clean_percent, percent_cols),
    (clean_int, int_cols),
):
    for col in columns:
        suburb[col] = suburb[col].apply(cleaner)

# Display the resulting dtypes to confirm the conversion.
suburb.dtypes

In [None]:
# DROPPING IRRELEVANT COLUMNS
# Names that are not present are skipped (errors="ignore").
unused_cols = [
    "region",
    "ethnic_breakdown_2016",
    "nearest_train_station",
    "highlights/attractions",
    "ideal_for",
    "review_link",
]
suburb.drop(columns=unused_cols, inplace=True, errors="ignore")

# Keep only the first run of digits from each travel-time string and store it
# as a float; values containing no digits become NaN.
travel_time_cols = (
    "time_to_cbd_(public_transport)_[town_hall_st]",
    "time_to_cbd_(driving)_[town_hall_st]",
)
for col in travel_time_cols:
    first_number = suburb[col].str.extract(r"(\d+)", expand=False)
    suburb[col] = first_number.astype(float)

In [None]:
# CHECKING FOR NULLS IN SUBURB
# Displays a tuple: per-column null counts (largest first) and the column dtypes.
suburb.isnull().sum().sort_values(ascending=False), suburb.dtypes

In [None]:
# FILLING NULL NUMERIC VALUES WITH THE COLUMN MEDIAN WHERE APPLICABLE
median_cols = [
    "median_apartment_price_(2020)",
    "median_apartment_rent_(per_week)",
    "avg._years_held",
    "median_house_rent_(per_week)",
    "median_house_price_(2021)",
    "median_house_price_(2020)",
    "%_change",
    "traffic",
    "public_housing_%",
    "time_to_cbd_(public_transport)_[town_hall_st]",
    "time_to_cbd_(driving)_[town_hall_st]",
]

# Each column's own median replaces its missing entries.
column_medians = suburb[median_cols].median()
suburb[median_cols] = suburb[median_cols].fillna(column_medians)

# dropping more irrelevant columns (names that are absent are skipped)
suburb.drop(columns=["things_to_see/do", "postcode"], inplace=True, errors="ignore")
suburb.sample(20)

In [None]:
# Rating columns to inspect; the same list is used by the next cell to drop them.
cols = [
    "traffic", "public_transport", "affordability_(rental)", "affordability_(buying)",
    "nature", "noise", "family-friendliness", "pet_friendliness",
    "safety", "overall_rating"
]
# Count, per column, how many rows hold exactly 0, to judge which columns to drop.
zero_counts = {col: (suburb[col] == 0).sum() for col in cols}
print(zero_counts)

In [None]:
# Drop every column named in `cols`; names that are not present are skipped.
suburb.drop(columns=cols, inplace=True, errors="ignore")
suburb

In [None]:
# Final check: print the per-column null counts to verify no missing values remain.
print(suburb.isnull().sum())

In [None]:
# CLEANING DOMAIN DATASET
# DataFrame.info() writes its report directly and returns None. Passing it to
# print() emitted the report *before* the heading and then printed a stray
# "None", so the heading is printed first and info() is called on its own.
print("Domain dataset info\n")
domain.info()
print("\n\nDomain dataset describe\n", domain.describe(include='all'))
print("\n\nDomain dataset sample rows\n", domain.sample(20))

In [None]:
# CONVERT DATE_SOLD TO PROPER FORMAT
# Strings are parsed as day/month/two-digit-year; anything that does not match
# is coerced to NaT instead of raising.
parsed_dates = pd.to_datetime(domain["date_sold"], format="%d/%m/%y", errors="coerce")
domain["date_sold"] = parsed_dates

# CHECKING FOR ANY NULLS AFTER DATE CONVERSION
invalid_dates = domain.loc[domain["date_sold"].isna()]
print(invalid_dates)

# CHECKING SPECIFIC FEATURES FOR OUTLIERS
size_and_room_cols = ["num_bath", "num_bed", "num_parking", "property_size"]
domain[size_and_room_cols].describe()

In [None]:
# CHECKING THE NUMBER OF OUTLIERS
# Upper limits for bathrooms, bedrooms, parking spaces and property size;
# rows above a limit are treated as rare cases, i.e. outliers.
outlier_limits = {
    "Bath outliers:": ("num_bath", 10),
    "Bed outliers:": ("num_bed", 10),
    "Parking outliers:": ("num_parking", 10),
    "Property size outliers:": ("property_size", 5000),
}
for label, (column, limit) in outlier_limits.items():
    print(label, domain[domain[column] > limit].shape[0])

In [None]:
# DROPPING THE OUTLIERS
# Keep rows that are within every limit. Comparisons against NaN are False, so
# rows with a missing value in any of these four columns are removed as well.
within_limits = (
    (domain['num_bath'] <= 10) &
    (domain['num_bed'] <= 10) &
    (domain['num_parking'] <= 10) &
    (domain['property_size'] <= 5000)
)

# .copy() makes `domain` an independent frame rather than a selection of the
# original one. Later cells modify it in place (drop(..., inplace=True) and a
# column assignment), which on a filtered selection triggers pandas'
# chained-assignment warning and has ambiguous semantics.
domain = domain[within_limits].copy()

domain

In [None]:
# CHECKING DOMAIN FOR ANY REMAINING STEPS
# Display each column's dtype to spot anything that still needs converting.
domain.dtypes

In [None]:
# dropping irrelevant columns
# Suburb area / latitude / longitude / elevation fields are removed from the domain table.
drop_cols = [
    "suburb_sqkm",
    "suburb_lat",
    "suburb_lng",
    "suburb_elevation"
]

# Names that are not present are skipped (errors="ignore").
domain.drop(columns=drop_cols, inplace=True, errors="ignore")

In [None]:
# checking the different types of properties
# Print each distinct value of the "type" column on its own line.
for val in domain["type"].unique():
    print(val)

In [None]:
# CONSOLIDATING TYPE COLUMN FROM DOMAIN
# Collapse the detailed property types into a handful of broader groups.
type_map = {
    'House' : 'House',
    'Townhouse' : 'House',
    'Semi-Detached' : 'House',
    'Duplex' : 'House',
    'Villa' : 'House',
    'Terrace' : 'House',
    'Vacant land' : 'Land',
    'New land' : 'Land',
    'Apartment / Unit / Flat' : 'Apartment',
    'Studio' : 'Apartment',
    'Block of Units' : 'Apartment',
    'New House & Land' : 'Off the Plan House',
    'New Apartments / Off the Plan' : 'Off the Plan Apartments',
    'Development Site' : 'Other',
    'Acreage / Semi-Rural' : 'Other',
    'Rural' : 'Other'
}

# Series.map turns every value that is not a key of type_map into NaN, and the
# later dropna() would then discard those rows without any trace. Report such
# values first so the loss is visible; this also catches an accidental re-run
# of this cell, after which the consolidated labels are no longer keys.
unmapped_types = [t for t in domain["type"].dropna().unique() if t not in type_map]
if unmapped_types:
    print("Property types missing from type_map (will become NaN):", unmapped_types)

domain["type"] = domain["type"].map(type_map)

In [None]:
# CHECKING AFTER CONSOLIDATING
# Print the distinct consolidated types; a nan entry here means some original
# type was not a key of type_map.
for val in domain["type"].unique():
    print(val)

# Per-column null counts of the domain table.
print("\n",domain.isnull().sum())

# Merging Datasets

In [None]:
# MERGING DATASETS
# Left join on "suburb": every domain row is kept, and the suburb columns are
# NaN for rows whose suburb name has no match in the suburb table.
merged = domain.merge(suburb, on="suburb", how="left")
print(merged.isnull().sum())  # per-column null counts after the join
# NOTE(review): the CSV is written before the rows with nulls are dropped in the
# next cell, and to_csv() also writes the DataFrame index as an extra unnamed
# column — confirm both are intended.
merged.to_csv("data/merged_dataset.csv") # saved merged dataset in /data folder

In [None]:
# DROPPING UNMATCHED ROWS
# dropna() with no arguments removes every row that has a null in ANY column,
# not only in the columns that came from the suburb table.
merged.dropna(inplace=True)

# Confirm no nulls remain.
print(merged.isnull().sum())

In [None]:
# Summary statistics of the merged table's numeric columns.
merged.describe()

In [None]:
# Number of rows per consolidated property type.
merged["type"].value_counts()

# Initial Exploration

In [None]:
# UNIVARIATE ANALYSIS
# One histogram per column, drawn on a single large figure.
merged.hist(figsize=(40,25))

In [None]:
# CHECKING CORRELATION
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(20,15))
sns.heatmap(merged.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f")
plt.show()

# Deep Cleaning

In [None]:
# FILTERING CORR VALUES WITH PRICE
# Correlation matrix over the numeric columns only.
corr_matrix = merged.corr(numeric_only=True)

# Each numeric column's correlation with price, sorted from most positive to most negative.
price_corr = corr_matrix["price"].sort_values(ascending=False)
print(price_corr)

In [None]:
# DROPPING COLUMNS BASED ON LOW CORR
drop_cols = [
    "avg._years_held",
    "%_change",
    "public_housing_%",
    "cash_rate"
]

# Names that are not present are skipped (errors="ignore").
# NOTE: corr_matrix and price_corr are not recomputed after this drop, so they
# still contain entries for these columns.
merged.drop(columns=drop_cols, inplace=True, errors="ignore")

In [None]:
# CHECKING MULTICOLLINEARITY
# Flag every pair of distinct features whose absolute correlation is >= 0.8.
corr_values = corr_matrix.to_numpy()

# The comparison produces a brand-new boolean array, so clearing its diagonal
# (each feature against itself) never writes into a DataFrame's internal buffer.
pair_mask = np.abs(corr_values) >= 0.8
np.fill_diagonal(pair_mask, False)
high_corr = pd.DataFrame(pair_mask, index=corr_matrix.index, columns=corr_matrix.columns)

# Build the pair table from the flagged positions. 'Correlation' holds the
# actual signed coefficient (the previous stack()-based version stored the
# boolean mask value there). Each pair appears in both orders, (a, b) and
# (b, a), exactly as before.
row_pos, col_pos = np.nonzero(pair_mask)
high_corr_pairs = pd.DataFrame({
    'Feature 1': corr_matrix.index[row_pos],
    'Feature 2': corr_matrix.columns[col_pos],
    'Correlation': corr_values[row_pos, col_pos],
})
print(high_corr_pairs)

In [None]:
# FILTERING COLUMNS BASED ON THE MULTICOLLINEARITY
# For each highly correlated pair in which BOTH features are only weakly
# related to price, drop the one with the weaker relationship.
# Strength is measured by the absolute correlation: comparing the signed value
# against 0.3 would class a strongly negative feature (e.g. -0.7) as "weak".
abs_price_corr = price_corr.abs()

drop_candidates = set()
seen_pairs = set()
for f1, f2 in zip(high_corr_pairs["Feature 1"], high_corr_pairs["Feature 2"]):
    # The pair table lists (a, b) and (b, a); handle each unordered pair once,
    # otherwise a tie in price correlation would drop both features.
    pair = frozenset((f1, f2))
    if pair in seen_pairs:
        continue
    seen_pairs.add(pair)

    if abs_price_corr[f1] < 0.3 and abs_price_corr[f2] < 0.3:
        drop_candidates.add(f2 if abs_price_corr[f1] >= abs_price_corr[f2] else f1)

to_drop = list(drop_candidates)

# Names that are no longer present are skipped (errors="ignore").
merged.drop(columns=to_drop, inplace=True, errors="ignore")

merged.columns, merged.dtypes

# Exploratory Data Analysis

In [None]:
# For every int64/float64 column, draw a histogram (with KDE) and a boxplot side by side.
numerical_cols = merged.select_dtypes(include=["int64", "float64"]).columns

for col in numerical_cols:
    series = merged[col]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(series, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    sns.boxplot(x=series, ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    plt.tight_layout()
    plt.show()

In [None]:
# DUPLICATING THE MERGED DATASET
# Work on a copy so the feature engineering below leaves `merged` untouched.
merged_lr = merged.copy()

# Feature Engineering

In [None]:
# LOG TRANSFORMATION
# Columns replaced by log(1 + x). The target "price" is among them, so every
# model below is fitted and scored on the log1p scale.
log_transform_cols = [
    "price",
    "property_size",
    "population_(rounded)*",
    "median_house_price_(2020)",
    "median_house_price_(2021)",
    "median_house_rent_(per_week)",
    "median_apartment_price_(2020)",
    "median_apartment_rent_(per_week)",
]

merged_lr[log_transform_cols] = merged_lr[log_transform_cols].apply(np.log1p)

In [None]:
# CAPPING OUTLIERS
# Counts above 6 are clipped down to 6 for bathrooms, bedrooms and parking.
for count_col in ("num_bath", "num_bed", "num_parking"):
    merged_lr[count_col] = merged_lr[count_col].clip(upper=6)

In [None]:
# CHECKING NUMERIC PLOTS AFTER TRANSFORMATIONS
# Same histogram + boxplot pair as before, now on the transformed copy.
numerical_cols = merged_lr.select_dtypes(include=["int64", "float64"]).columns

for col in numerical_cols:
    series = merged_lr[col]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(series, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    sns.boxplot(x=series, ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    plt.tight_layout()
    plt.show()

In [None]:
# One-hot encode "type". drop_first=True omits the first category so the
# remaining indicators are not perfectly collinear (presumably "Apartment" is
# the omitted baseline, as it has no entry in the rename map — TODO confirm).
merged_lr = pd.get_dummies(merged_lr, columns=["type"], drop_first=True)
# Strip the "type_" prefix from the generated indicator columns.
merged_lr.rename(columns={
    "type_House" : "House",
    "type_Land" : "Land",
    "type_Off the Plan Apartments" : "Off the Plan Apartments",
    "type_Off the Plan House" : "Off the Plan House",
    "type_Other" : "Other"
}, inplace=True)
# NOTE: this cell cannot be re-run on its own — after the first run the "type"
# column no longer exists, so use Run All from the top instead.

In [None]:
# Display the final column list after encoding.
merged_lr.columns

# Train-Test Split & Scaling

In [None]:
# TRAIN TEST SPLIT
# Features: everything except the target and the suburb / date_sold columns.
X = merged_lr.drop(columns=["price", "suburb", "date_sold"])
# Target: price, which was log1p-transformed in the feature-engineering step.
y = merged_lr["price"]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# SCALING
scalar = StandardScaler()

# Fit the scaler on the training features only, then apply the same
# transformation to the test features so no test information leaks into it.
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

# Baseline Linear Regression

In [None]:
# BASELINE LINEAR REGRESSION
# Ordinary least squares on the scaled features; all metrics are on the scale
# of y_test (price was log1p-transformed earlier).
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

lr_y_pred = lr_model.predict(X_test_scaled)

lr_mse = mean_squared_error(y_test, lr_y_pred)
lr_rmse = np.sqrt(lr_mse)
lr_mae = mean_absolute_error(y_test, lr_y_pred)
lr_r2 = r2_score(y_test, lr_y_pred)

print(
    "Linear Model Results:\nRMSE = ", lr_rmse,
    "\nMSE = ", lr_mse,
    "\nMAE = ", lr_mae,
    "\nR2 = ", lr_r2,
)

# Residual Diagnostics

In [None]:
# CHECKING RESIDUALS
# Residual = actual minus predicted, for the baseline linear model.
lr_residuals = y_test - lr_y_pred

# RESIDUALS VS FITTED
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(x=lr_y_pred, y=lr_residuals, ax=ax)
ax.axhline(0, color="red", linestyle="--")  # zero-error reference line
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs Fitted Values")
plt.show()

In [None]:
# Q-Q PLOT
# Compare the residual quantiles with those of a normal distribution.
plt.figure(figsize=(12,12))
stats.probplot(lr_residuals, dist = "norm", plot = plt)
plt.title("Q-Q Plot")
plt.show()

In [None]:
# HISTOGRAM OF RESIDUAL
# Distribution of the baseline model's residuals with a KDE overlay.
plt.figure(figsize=(12,12))
sns.histplot(lr_residuals, kde = True)
plt.xlabel("Residuals")
plt.title("Histogram of Residuals")
plt.show()

In [None]:
# STANDARDISING THE RESIDUALS
# Residuals are divided by their standard deviation; they are not mean-centred.
residual_std = np.std(lr_residuals)
std_residuals = lr_residuals / residual_std

fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(x=lr_y_pred, y=std_residuals, ax=ax)
ax.axhline(0, color="red", linestyle="--")
ax.set_xlabel("Fitted values")
ax.set_ylabel("Standardised Residuals")
ax.set_title("Standardised Residuals vs Fitted")
plt.show()

# Interaction Model

In [None]:
# FEATURE INTERACTION
# Each new column is the element-wise product of two existing features:
#   new column name -> (left factor, right factor)
interaction_terms = {
    "bed * size": ("num_bed", "property_size"),
    "bath * price": ("num_bath", "median_house_price_(2021)"),
    "income * price": ("suburb_median_income", "median_house_price_(2021)"),
    "parking * income": ("num_parking", "suburb_median_income"),
}

# Work on copies so the baseline feature frames stay unchanged.
X_train_interaction = X_train.copy()
X_test_interaction = X_test.copy()

# Add the same columns, in the same order, to both the train and test frames.
for frame in (X_train_interaction, X_test_interaction):
    for new_col, (left, right) in interaction_terms.items():
        frame[new_col] = frame[left] * frame[right]

In [None]:
# RETRAINING MODEL WITH INTERACTION TERMS
# Use a dedicated scaler for the interaction feature set. Re-fitting the shared
# `scalar` here would overwrite the statistics it learned for the baseline
# features, so re-running any earlier cell that calls scalar.transform(X_test)
# would then fail on the column-count mismatch.
interaction_scaler = StandardScaler()
X_train_interaction_scaled = interaction_scaler.fit_transform(X_train_interaction)
X_test_interaction_scaled = interaction_scaler.transform(X_test_interaction)

lr_interaction_model = LinearRegression()
lr_interaction_model.fit(X_train_interaction_scaled, y_train)

lr_y_pred_interaction = lr_interaction_model.predict(X_test_interaction_scaled)

# Metrics are on the scale of y_test (price was log1p-transformed earlier).
lr_mse_interaction = mean_squared_error(y_test, lr_y_pred_interaction)
lr_rmse_interaction = np.sqrt(lr_mse_interaction)
lr_mae_interaction = mean_absolute_error(y_test, lr_y_pred_interaction)
lr_r2_interaction = r2_score(y_test, lr_y_pred_interaction)

print(
    "Linear Model Results with Interaction terms:\nRMSE = ", lr_rmse_interaction,
    "\nMSE = ", lr_mse_interaction,
    "\nMAE = ", lr_mae_interaction,
    "\nR2 = ", lr_r2_interaction,
)

# Regularisation Models

In [None]:
# REGULARISATION
# RIDGE
# Alpha is chosen by 5-fold cross-validation from 100 log-spaced candidates
# between 1e-4 and 1e4.
ridge_alphas = np.logspace(-4, 4, 100)
ridge_cv = RidgeCV(alphas=ridge_alphas, cv=5)
ridge_cv.fit(X_train_interaction_scaled, y_train)

ridge_y_pred = ridge_cv.predict(X_test_interaction_scaled)

ridge_mse = mean_squared_error(y_test, ridge_y_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_mae = mean_absolute_error(y_test, ridge_y_pred)
ridge_r2 = r2_score(y_test, ridge_y_pred)

print(
    "Ridge Regression Results:\nRMSE = ", ridge_rmse,
    "\nMSE = ", ridge_mse,
    "\nMAE = ", ridge_mae,
    "\nR2 = ", ridge_r2,
)

In [None]:
# LASSO
# Alpha is chosen by 5-fold cross-validation from 100 log-spaced candidates
# between 1e-4 and 1e4.
lasso_alphas = np.logspace(-4, 4, 100)
lasso_cv = LassoCV(alphas=lasso_alphas, cv=5)
lasso_cv.fit(X_train_interaction_scaled, y_train)

lasso_y_pred = lasso_cv.predict(X_test_interaction_scaled)

lasso_mse = mean_squared_error(y_test, lasso_y_pred)
lasso_rmse = np.sqrt(lasso_mse)
lasso_mae = mean_absolute_error(y_test, lasso_y_pred)
lasso_r2 = r2_score(y_test, lasso_y_pred)

print(
    "Lasso Regression Results:\nRMSE = ", lasso_rmse,
    "\nMSE = ", lasso_mse,
    "\nMAE = ", lasso_mae,
    "\nR2 = ", lasso_r2,
)

In [None]:
# ELASTICNET
# Cross-validate jointly over alpha (100 log-spaced values, 1e-4 to 1e4) and
# the L1/L2 mixing ratio (0.1 to 0.9 in nine steps).
elasticnet_alphas = np.logspace(-4, 4, 100)
elasticnet_l1_ratios = np.linspace(0.1, 0.9, 9)

elasticnet_cv = ElasticNetCV(
    alphas=elasticnet_alphas,
    l1_ratio=elasticnet_l1_ratios,
    cv=5,
)
elasticnet_cv.fit(X_train_interaction_scaled, y_train)

elasticnet_y_pred = elasticnet_cv.predict(X_test_interaction_scaled)

elasticnet_mse = mean_squared_error(y_test, elasticnet_y_pred)
elasticnet_rmse = np.sqrt(elasticnet_mse)
elasticnet_mae = mean_absolute_error(y_test, elasticnet_y_pred)
elasticnet_r2 = r2_score(y_test, elasticnet_y_pred)

print(
    "ElastiNet Regression Results:\nRMSE = ", elasticnet_rmse,
    "\nMSE = ", elasticnet_mse,
    "\nMAE = ", elasticnet_mae,
    "\nR2 = ", elasticnet_r2,
)

# Decision Tree Model

In [None]:
# BASELINE DECISION TREE
# Default hyperparameters, fitted on the unscaled interaction features; the
# seed is fixed for reproducibility.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train_interaction, y_train)

dt_y_pred = dt_model.predict(X_test_interaction)

dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_rmse = np.sqrt(dt_mse)
dt_mae = mean_absolute_error(y_test, dt_y_pred)
dt_r2 = r2_score(y_test, dt_y_pred)

print(
    "Decision Tree Regression Results:\nRMSE = ", dt_rmse,
    "\nMSE = ", dt_mse,
    "\nMAE = ", dt_mae,
    "\nR2 = ", dt_r2,
)

In [None]:
# HYPERPARAMETER DECISION TREE
# Grid of candidate settings for the tree.
# "auto" is intentionally not listed under max_features: current scikit-learn
# releases reject it for DecisionTreeRegressor, so every grid point using it
# fails to fit and is scored NaN (the warning is hidden by the global warnings
# filter). Where it was accepted it meant "use all features", i.e. the same as
# None, so the search space is unchanged.
dt_params = {
    "max_depth" : [5, 10, 15, 20, None],
    "min_samples_split" : [2, 5, 10],
    "min_samples_leaf" : [1, 2, 4],
    "max_features" : ["sqrt", "log2", None]
}

# Exhaustive 5-fold search, selecting the combination with the lowest MSE.
dt_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid=dt_params,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=1
)

dt_grid.fit(X_train_interaction, y_train)
best_dt = dt_grid.best_estimator_  # refitted on the full training set

dt_y_pred_best = best_dt.predict(X_test_interaction)

dt_mse_best = mean_squared_error(y_test, dt_y_pred_best)
dt_rmse_best = np.sqrt(dt_mse_best)
dt_mae_best = mean_absolute_error(y_test, dt_y_pred_best)
dt_r2_best = r2_score(y_test, dt_y_pred_best)

print(
    "Tuned Decision Tree Regression Results:\nRMSE = ",dt_rmse_best,
    "\nMSE = ", dt_mse_best,
    "\nMAE = ", dt_mae_best,
    "\nR2 = ",dt_r2_best
    )

In [None]:
# RANDOM FOREST
# Hyperparameter grid searched with 5-fold cross-validation, selecting the
# combination with the lowest MSE.
rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2"],
}

rf_estimator = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(
    rf_estimator,
    param_grid=rf_params,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=1,
)
rf_grid.fit(X_train_interaction, y_train)

# Best configuration, refitted on the full training set by GridSearchCV.
best_rf = rf_grid.best_estimator_
rf_y_pred_best = best_rf.predict(X_test_interaction)

rf_mse_best = mean_squared_error(y_test, rf_y_pred_best)
rf_rmse_best = np.sqrt(rf_mse_best)
rf_mae_best = mean_absolute_error(y_test, rf_y_pred_best)
rf_r2_best = r2_score(y_test, rf_y_pred_best)

print(
    "Random Forrest Regression Results:\nRMSE = ", rf_rmse_best,
    "\nMSE = ", rf_mse_best,
    "\nMAE = ", rf_mae_best,
    "\nR2 = ", rf_r2_best,
)

# Model Comparison Plots

In [None]:
# Labels for the x-axis, in the same order as the score lists below.
model_names = [
    "Linear (Baseline)",
    "Linear (Interaction)",
    "Ridge",
    "Lasso",
    "Elasticnet",
    "Decision Tree",
    "Decision Tree (Tuned)",
    "Random Forrest",
]

# One list per metric, ordered like model_names.
r2_scores = [lr_r2, lr_r2_interaction, ridge_r2, lasso_r2, elasticnet_r2, dt_r2, dt_r2_best, rf_r2_best]
rmse_scores = [lr_rmse, lr_rmse_interaction, ridge_rmse, lasso_rmse, elasticnet_rmse, dt_rmse, dt_rmse_best, rf_rmse_best]
mse_scores = [lr_mse, lr_mse_interaction, ridge_mse, lasso_mse, elasticnet_mse, dt_mse, dt_mse_best, rf_mse_best]
mae_scores = [lr_mae, lr_mae_interaction, ridge_mae, lasso_mae, elasticnet_mae, dt_mae, dt_mae_best, rf_mae_best]

# Plot configuration per metric: values, line colour and y-axis label.
metrics = {
    "R2 Score": {"values": r2_scores, "color": "green", "ylabel": "R2 Score"},
    "RMSE": {"values": rmse_scores, "color": "blue", "ylabel": "RMSE"},
    "MSE": {"values": mse_scores, "color": "black", "ylabel": "MSE"},
    "MAE": {"values": mae_scores, "color": "red", "ylabel": "MAE"},
}

# One line chart per metric, with each point annotated with its value.
for title, config in metrics.items():
    values = config["values"]
    colour = config["color"]

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.plot(model_names, values, marker='o', color=colour)
    for position, value in enumerate(values):
        # Label sits slightly above the marker.
        ax.text(position, value + 0.005, f"{value:.3f}", ha='center', color=colour)

    ax.set_title(f"{title} Comparison")
    ax.set_xlabel("Model")
    ax.set_ylabel(config["ylabel"])
    plt.xticks(rotation=45)
    ax.grid(True, linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()
