# **Objectives and Hypotheses**

## **Primary Hypotheses:**
### Cluster-Specific Modeling Hypothesis:
Training separate XGBoost regression models for distinct unsupervised data clusters will yield improved predictive performance (lower RMSE and MAE, higher R²) compared to a single model trained on the entire dataset.

### Distance Metric Hypothesis:
K-means clustering using Mahalanobis distance will produce more meaningful and effective clusters for bike rental prediction than traditional Euclidean distance, resulting in better model performance metrics.

## **Experimental Design:**
### Baseline Model:
XGBoost Regressor
### Target Variable:
Rented Bike Count
### Dataset:
Seoul Bike Sharing Demand
### Data Split:
80% training, 20% testing
### Clustering:
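K-means with k=3 under two distance metrics: standard Euclidean K-means, and a Mahalanobis variant that starts from the Euclidean assignments and iteratively reassigns points to the nearest centroid by Mahalanobis distance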

### Visualization:
PCA for dimensionality reduction and cluster visualization

### Performance Metrics:
RMSE, MAE, and R²

## **Expected Outcomes:**


*  Cluster-specific models will capture local patterns within each cluster that might be overlooked by a global model, leading to improved overall prediction accuracy.
*  Mahalanobis distance-based clustering will better account for feature correlations and variance differences in the Seoul Bike dataset, resulting in more meaningful clusters and subsequently better predictive performance than Euclidean-based clustering.




# **Import Libraries**

In [None]:
pip install ucimlrepo

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
import math
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.spatial.distance import mahalanobis
from ucimlrepo import fetch_ucirepo
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor


from matplotlib.colors import LinearSegmentedColormap
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D


# Global plot styling: seaborn whitegrid theme, then dark-grey axes edges and ticks.
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams.update({
    'axes.edgecolor': '#333333',
    'axes.linewidth': 0.8,
    'xtick.color': '#333333',
    'ytick.color': '#333333',
})


# **Import Data**

In [None]:
# Fetch Seoul Bike Sharing Demand dataset from UCI ML Repository
# (dataset id 560). The returned object's .data, .metadata and .variables
# attributes are read by the cells that follow.
seoul_bike_sharing_demand = fetch_ucirepo(id=560)

In [None]:
# Data (as pandas dataframes)
# X_original is the repository's feature table and y_original its declared
# target table; the preprocessing cell may re-assign which column is used
# as the prediction target.
X_original = seoul_bike_sharing_demand.data.features
y_original = seoul_bike_sharing_demand.data.targets


In [None]:
# Print dataset information: repository metadata, then the variable table
for heading, payload in (
    ("Dataset Metadata:", seoul_bike_sharing_demand.metadata),
    ("\nVariable Information:", seoul_bike_sharing_demand.variables),
):
    print(heading)
    print(payload)

In [None]:
# Examine feature information: column names of the feature and target tables
original_feature_names = X_original.columns.tolist()
original_target_names = y_original.columns.tolist()

print("\nOriginal feature columns:")
print(original_feature_names)
print("\nOriginal target variable:")
print(original_target_names)

# **Data Preprocessing**

In [None]:
# Make 'Rented Bike Count' the prediction target when it is delivered as a feature
target_column = 'Rented Bike Count'

if target_column not in X_original.columns:
    # The column is not among the features, so the repository's own
    # target table and feature table are used unchanged.
    print("'Rented Bike Count' is already the target variable.")
    y = y_original
    X = X_original
else:
    # Promote the column to be the target, and move the repository's
    # original target table into the feature matrix in its place.
    y = X_original[[target_column]]
    X = pd.concat(
        [X_original.drop(columns=target_column), y_original],
        axis=1,
    )

In [None]:
# Report the feature and target column names after the target re-assignment
print("\nNew feature columns:", X.columns.tolist(), sep="\n")
print("\nNew target variable:", y.columns.tolist(), sep="\n")

In [None]:
# Check for missing values: per-column null counts for features and target
missing_in_features = X.isna().sum()
missing_in_target = y.isna().sum()

print("\nMissing values in features:", missing_in_features, sep="\n")
print("\nMissing values in target:", missing_in_target, sep="\n")

In [None]:
# 1. Parse the DD/MM/YYYY date strings and replace the column with
#    numeric calendar components.
if 'Date' in X.columns:
    X['Date'] = pd.to_datetime(X['Date'], format='%d/%m/%Y')

    # New column name -> attribute of the .dt accessor that supplies it
    date_parts = {
        'Year': 'year',
        'Month': 'month',
        'Day': 'day',
        'DayOfWeek': 'dayofweek',
    }
    for part_name, accessor in date_parts.items():
        X[part_name] = getattr(X['Date'].dt, accessor)

    # The raw datetime column is no longer needed once decomposed
    X = X.drop(columns=['Date'])

In [None]:
# 2. Convert categorical features to numeric using one-hot encoding
# drop_first=True omits the first level of each encoded column, so a
# k-level categorical yields k-1 indicator columns.
X = pd.get_dummies(X, drop_first=True)

In [None]:
# Display the encoded feature matrix (notebook cell output)
X

In [None]:
# Split data into training and testing sets
# (20% of the rows are held out for testing; random_state=42 fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Pairwise correlations between the training features, drawn as a heatmap
X_train_df = pd.DataFrame(X_train, columns=X_train.columns)
correlation_matrix = X_train_df.corr()

heatmap_options = {'cmap': 'coolwarm', 'annot': False, 'fmt': '.2f'}

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Function to calculate VIF (Variance Inflation Factor)
def calculate_vif(X):
    """Return a table of variance inflation factors, largest first.

    Only the numeric-dtype columns of ``X`` are scored.  The result is a
    DataFrame with the columns ``feature`` and ``VIF``, sorted by
    descending VIF.  When ``X`` has no numeric columns a warning is
    printed and an empty frame with those two columns is returned.
    """
    numeric_part = X.select_dtypes(include=np.number)
    n_features = numeric_part.shape[1]

    # Nothing to score: say so and hand back an empty, correctly-shaped table
    if n_features == 0:
        print("Warning: No numeric columns found for VIF calculation.")
        return pd.DataFrame(columns=["feature", "VIF"])

    design_matrix = numeric_part.values
    vif_scores = [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(n_features)
    ]

    vif_table = pd.DataFrame()
    vif_table["feature"] = numeric_part.columns
    vif_table["VIF"] = vif_scores
    return vif_table.sort_values("VIF", ascending=False)

# Function to remove collinear features based on correlation threshold
def remove_collinear_features(X, threshold=0.8):
    """Drop the later member of each highly correlated feature pair.

    Absolute pairwise correlations are computed over the numeric columns
    of ``X``.  A column is dropped when its absolute correlation with any
    column that precedes it exceeds ``threshold``.

    Side effects: draws the lower-triangle correlation heatmap, saves it
    to ``correlation_matrix.png``, shows it, and prints the names of the
    dropped columns.

    Returns ``X`` (all dtypes) without the dropped columns.
    """
    numeric_part = X.select_dtypes(include=np.number)
    abs_corr = numeric_part.corr().abs()

    # Keep only the entries strictly above the diagonal so every pair is
    # inspected once and a column is never compared with itself.
    strictly_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(strictly_upper)

    # Features with correlation greater than threshold against an earlier column
    to_drop = [name for name in upper.columns if (upper[name] > threshold).any()]

    # Plot correlation matrix (upper triangle and diagonal hidden)
    plt.figure(figsize=(12, 10))
    hidden_cells = np.triu(np.ones_like(abs_corr, dtype=bool))
    sns.heatmap(
        abs_corr,
        mask=hidden_cells,
        cmap='coolwarm',
        annot=False,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.show()

    print(f"Dropping {len(to_drop)} collinear features: {', '.join(to_drop)}")

    # The drop is applied to the original frame, not just its numeric part
    return X.drop(to_drop, axis=1)


In [None]:
# VIF table for the numeric training features before filtering
print("Variance Inflation Factors before colinearity removal:")
X_train_numeric = X_train.select_dtypes(include=np.number)
vif_before = calculate_vif(X_train_numeric)
top_vif_before = vif_before.head(10)  # the ten largest VIF values
print(top_vif_before)

# Decide which columns to drop from the training split only, then align
# the test split to the surviving columns.
X_train_filtered = remove_collinear_features(X_train, threshold=0.8)
kept_columns = X_train_filtered.columns
X_test_filtered = X_test[kept_columns]

# VIF table again, now on what is left
print("\nVariance Inflation Factors after collinearity removal:")
X_train_filtered_numeric = X_train_filtered.select_dtypes(include=np.number)
vif_after = calculate_vif(X_train_filtered_numeric)
top_vif_after = vif_after.head(10)
print(top_vif_after)

In [None]:
# Standardize the filtered features
# The scaler's statistics are fitted on the training split only and then
# reused unchanged to transform the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

# **Model Development**

In [None]:
# Baseline: a single XGBoost regressor fitted on the whole training split
print("Training baseline XGBoost regression model...")
baseline_model = xgb.XGBRegressor(random_state=42)
baseline_model.fit(X_train_scaled, y_train.values.ravel())
baseline_preds = baseline_model.predict(X_test_scaled)

# Held-out scores used as the reference for the clustered variants
baseline_rmse = math.sqrt(mean_squared_error(y_test, baseline_preds))
baseline_r2 = r2_score(y_test, baseline_preds)
baseline_metrics = {'rmse': baseline_rmse, 'r2': baseline_r2}


In [None]:
# Euclidean K-means clustering
# Three clusters fitted on the scaled training features (n_init=10 runs,
# fixed random_state). The fitted estimator is reused later to label the
# test split and to seed the Mahalanobis refinement.
print("Performing Euclidean K-means clustering...")
euclidean_kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
euclidean_clusters = euclidean_kmeans.fit_predict(X_train_scaled)

In [None]:
# Mahalanobis K-means clustering: prepare the inverse covariance matrix
print("Performing Mahalanobis distance-based clustering...")

# Feature covariance of the scaled training data (columns are variables)
cov = np.cov(X_train_scaled, rowvar=False)

# A small ridge on the diagonal keeps the matrix invertible
diagonal_ridge = np.eye(cov.shape[0]) * 1e-6
cov += diagonal_ridge

inv_cov = np.linalg.inv(cov)

In [None]:
# Function to calculate Mahalanobis distance
def mahalanobis_distance(x, y, inv_cov):
    """Return the Mahalanobis distance between the points ``x`` and ``y``.

    ``inv_cov`` is the inverse covariance matrix; the computation is
    delegated to ``scipy.spatial.distance.mahalanobis``.
    """
    distance = mahalanobis(u=x, v=y, VI=inv_cov)
    return distance


In [None]:
# Custom K-means with Mahalanobis distance.
# Start from the labels of the fitted Euclidean K-means model.
mahalanobis_clusters = euclidean_kmeans.predict(X_train_scaled)

# Initial centroids: per-cluster mean of the scaled training points
centroids = np.array([
    X_train_scaled[mahalanobis_clusters == label].mean(axis=0)
    for label in range(3)
])

# Refine clusters using Mahalanobis distance (fixed 5 rounds for simplicity)
for _ in range(5):
    # Assignment step: nearest centroid under Mahalanobis distance
    nearest = []
    for point in X_train_scaled:
        distances = [mahalanobis_distance(point, centroid, inv_cov) for centroid in centroids]
        nearest.append(np.argmin(distances))
    mahalanobis_clusters = np.asarray(nearest, dtype=int)

    # Update step: an emptied cluster keeps its previous centroid
    for label in range(3):
        members = mahalanobis_clusters == label
        if members.any():
            centroids[label] = X_train_scaled[members].mean(axis=0)

In [None]:
# Train cluster-specific models
print("Training cluster-specific XGBoost regression models...")


def _fit_cluster_model(cluster_indices, announcement):
    """Fit an XGBoost regressor on the masked training rows.

    Returns None, without printing, when the mask selects no rows;
    otherwise prints ``announcement`` and returns the fitted model.
    """
    if not sum(cluster_indices) > 0:
        return None
    print(announcement)
    model = xgb.XGBRegressor(random_state=42)
    model.fit(X_train_scaled[cluster_indices], y_train.iloc[cluster_indices].values.ravel())
    return model


euclidean_models = []
mahalanobis_models = []

for i in range(3):
    # Euclidean cluster model
    cluster_indices = euclidean_clusters == i
    euclidean_models.append(_fit_cluster_model(
        cluster_indices,
        f"Training Euclidean cluster {i} model with {sum(cluster_indices)} samples",
    ))

    # Mahalanobis cluster model
    cluster_indices = mahalanobis_clusters == i
    mahalanobis_models.append(_fit_cluster_model(
        cluster_indices,
        f"Training Mahalanobis cluster {i} model with {sum(cluster_indices)} samples",
    ))

In [None]:
# Predict test set clusters.
# Euclidean labels come from the fitted K-means model; Mahalanobis labels
# are the nearest refined centroid under Mahalanobis distance.
euclidean_test_clusters = euclidean_kmeans.predict(X_test_scaled)

nearest_for_test = []
for point in X_test_scaled:
    distances = [mahalanobis_distance(point, centroid, inv_cov) for centroid in centroids]
    nearest_for_test.append(np.argmin(distances))
mahalanobis_test_clusters = np.asarray(nearest_for_test, dtype=int)

In [None]:
# Make predictions using cluster-specific models
print("Making predictions with cluster-specific models...")


def _predict_with_cluster_models(models, test_labels, fallback_preds, variant_name):
    """Route every test row to the model of its assigned cluster.

    Parameters:
        models: one fitted regressor (or None) per cluster id.
        test_labels: cluster id of each test row.
        fallback_preds: predictions used for rows whose cluster has no model.
        variant_name: clustering name used in the warning message.

    Returns a float array with one prediction per test row.

    A cluster can have no model when it was empty in the training split.
    Such rows previously kept the zero they were initialised with, which
    silently inflated the error metrics; they now keep the fallback
    prediction and a warning is printed.
    """
    preds = np.array(fallback_preds, dtype=float)  # independent copy
    for cluster_id, model in enumerate(models):
        members = test_labels == cluster_id
        if not members.any():
            continue
        if model is None:
            print(
                f"Warning: no {variant_name} model for cluster {cluster_id}; "
                f"using baseline predictions for {int(members.sum())} test samples"
            )
            continue
        preds[members] = model.predict(X_test_scaled[members])
    return preds


euclidean_preds = _predict_with_cluster_models(
    euclidean_models, euclidean_test_clusters, baseline_preds, "Euclidean"
)
mahalanobis_preds = _predict_with_cluster_models(
    mahalanobis_models, mahalanobis_test_clusters, baseline_preds, "Mahalanobis"
)

In [None]:
# Calculate regression metrics
def _regression_scores(predictions):
    """Return the RMSE and R² of ``predictions`` against ``y_test``."""
    return {
        'rmse': math.sqrt(mean_squared_error(y_test, predictions)),
        'r2': r2_score(y_test, predictions),
    }


euclidean_metrics = _regression_scores(euclidean_preds)

mahalanobis_metrics = _regression_scores(mahalanobis_preds)

# **Evaluation and Visualizations**

In [None]:



# Colour palettes, one entry per cluster
euclidean_palette = ['#FF5E5B', '#D8D8D8', '#39A0ED']  # Coral, Light Gray, Blue
mahalanobis_palette = ['#FFD166', '#06D6A0', '#118AB2']  # Yellow, Teal, Blue

# Project the scaled training features onto the first three principal components
print("Generating enhanced 3D PCA visualization...")
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_scaled)

# Share of variance captured by each component, in percent
explained_variance = pca.explained_variance_ratio_ * 100

fig = plt.figure(figsize=(18, 9))
fig.patch.set_facecolor('#f8f9fa')  # Light background for the entire figure

# One 3D panel per clustering: (subplot position, title, labels, palette)
panel_specs = (
    (121, 'Euclidean K-means Clusters', euclidean_clusters, euclidean_palette),
    (122, 'Mahalanobis K-means Clusters', mahalanobis_clusters, mahalanobis_palette),
)
panels_3d = []
for position, panel_title, train_labels, palette in panel_specs:
    panel = fig.add_subplot(position, projection='3d')

    # One scatter call per cluster so each gets its own colour and legend entry
    for i in range(3):
        cluster_points = X_train_pca[train_labels == i]
        panel.scatter(
            cluster_points[:, 0],
            cluster_points[:, 1],
            cluster_points[:, 2],
            s=50,  # Marker size
            c=[palette[i]],
            label=f'Cluster {i}',
            alpha=0.8,
            edgecolors='w',
            linewidth=0.5,
        )

    panel.set_title(panel_title, fontsize=16, pad=20, fontweight='bold', color='#333333')
    panel.set_xlabel(f'PC1 ({explained_variance[0]:.1f}%)', fontsize=12, labelpad=10)
    panel.set_ylabel(f'PC2 ({explained_variance[1]:.1f}%)', fontsize=12, labelpad=10)
    panel.set_zlabel(f'PC3 ({explained_variance[2]:.1f}%)', fontsize=12, labelpad=10)
    panel.grid(True, alpha=0.3)

    # Unfilled panes with white edges
    for axis in (panel.xaxis, panel.yaxis, panel.zaxis):
        axis.pane.fill = False
    for axis in (panel.xaxis, panel.yaxis, panel.zaxis):
        axis.pane.set_edgecolor('w')

    panel.view_init(elev=30, azim=45)  # Set viewing angle
    panels_3d.append(panel)

ax1, ax2 = panels_3d

# Add legends with custom styling
legend_style = {
    'title': "Cluster Groups",
    'title_fontsize': 12,
    'fontsize': 10,
    'loc': 'upper right',
    'bbox_to_anchor': (1.15, 0.9),
    'frameon': True,
    'facecolor': 'white',
    'edgecolor': '#dddddd',
}
for ax in [ax1, ax2]:
    legend = ax.legend(**legend_style)
    legend.get_frame().set_alpha(0.9)

# title
plt.suptitle(
    '3D PCA Visualization of Clustering Results',
    fontsize=20,
    y=0.98,
    fontweight='bold',
    color='#333333',
)

# explanatory text
fig.text(
    0.5,
    0.01,
    f'Total explained variance: {sum(explained_variance[:3]):.1f}%',
    ha='center',
    fontsize=12,
    color='#555555',
    style='italic',
)

# layout and save
plt.tight_layout()
plt.subplots_adjust(top=0.9, bottom=0.1)
plt.savefig('3d_clusters_visualization.png', dpi=300, bbox_inches='tight', facecolor='#f8f9fa')
plt.show()

# Create an additional 2D plot with density contours for better pattern visibility
fig, axes_2d = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
ax1, ax2 = axes_2d
fig.patch.set_facecolor('#f8f9fa')

# Function to add contours
def add_density_contour(ax, x, y, color):
    """Overlay a 5-level KDE contour of the (x, y) points on ``ax`` in ``color``."""
    contour_style = {'levels': 5, 'color': color, 'alpha': 0.3, 'linewidths': 1}
    sns.kdeplot(x=x, y=y, ax=ax, **contour_style)

# 2D panels (PC1 vs PC2) with contours: (axes, title, labels, palette)
panel_2d_specs = (
    (ax1, 'Euclidean K-means Clusters', euclidean_clusters, euclidean_palette),
    (ax2, 'Mahalanobis K-means Clusters', mahalanobis_clusters, mahalanobis_palette),
)

# Scatter plus density contour for every cluster of every clustering
for panel, _, train_labels, palette in panel_2d_specs:
    for i in range(3):
        cluster_points = X_train_pca[train_labels == i]
        first_pc, second_pc = cluster_points[:, 0], cluster_points[:, 1]
        panel.scatter(
            first_pc,
            second_pc,
            s=60,
            c=[palette[i]],
            label=f'Cluster {i}',
            alpha=0.7,
            edgecolors='w',
            linewidth=0.5,
        )
        add_density_contour(panel, first_pc, second_pc, palette[i])

# Customize 2D plots
for ax, title, _, _ in panel_2d_specs:
    ax.set_title(title, fontsize=16, pad=20, fontweight='bold', color='#333333')
    ax.set_xlabel(f'PC1 ({explained_variance[0]:.1f}%)', fontsize=12)
    ax.set_ylabel(f'PC2 ({explained_variance[1]:.1f}%)', fontsize=12)
    ax.grid(True, alpha=0.3)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Add legends with custom styling
    legend = ax.legend(
        title="Cluster Groups",
        title_fontsize=12,
        fontsize=10,
        loc='upper right',
        frameon=True,
        facecolor='white',
        edgecolor='#dddddd',
    )
    legend.get_frame().set_alpha(0.9)

plt.suptitle(
    '2D PCA Visualization with Density Contours',
    fontsize=20,
    y=0.98,
    fontweight='bold',
    color='#333333',
)

plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.savefig('2d_clusters_with_contours.png', dpi=300, bbox_inches='tight', facecolor='#f8f9fa')
plt.show()

In [None]:
# Visualize the target in PCA space: each training point coloured by its bike count
plt.subplot(2, 2, 3)

bike_count_cmap = LinearSegmentedColormap.from_list(
    "custom_cmap", ['#FF5E5B', '#D8D8D8', '#39A0ED']
)
scatter = plt.scatter(
    X_train_pca[:, 0],
    X_train_pca[:, 1],
    c=y_train.values.ravel(),
    cmap=bike_count_cmap,
    s=50,
    alpha=0.7,
    edgecolors='w',
    linewidth=0.5,
)

cbar = plt.colorbar(scatter, label='Rented Bike Count')
cbar.ax.tick_params(labelsize=10)
cbar.set_label('Rented Bike Count', fontsize=12, fontweight='bold', color='#333333')

plt.title('Bike Count Distribution in PCA Space', fontsize=16, pad=20, fontweight='bold', color='#333333')
plt.xlabel(f'PC1 ({explained_variance[0]:.1f}%)', fontsize=12)
plt.ylabel(f'PC2 ({explained_variance[1]:.1f}%)', fontsize=12)
plt.grid(True, alpha=0.3)

count_axes = plt.gca()
for side in ('top', 'right'):
    count_axes.spines[side].set_visible(False)

In [None]:

# Top-10 feature importances of the baseline model
plt.subplot(2, 2, 4)
feature_importance = baseline_model.feature_importances_
sorted_idx = np.argsort(feature_importance)[-10:]  # Top 10 features, ascending

# The baseline model was fitted on the scaled X_train_filtered matrix, so
# importance positions refer to X_train_filtered's columns. Indexing X.columns
# would mislabel the bars whenever the collinearity filter removed a column.
feature_names = X_train_filtered.columns[sorted_idx]

colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(sorted_idx)))

bars = plt.barh(
    range(len(sorted_idx)),
    feature_importance[sorted_idx],
    color=colors,
    edgecolor='white',
    linewidth=0.7,
)

# Add value labels to the bars
for bar in bars:
    width = bar.get_width()
    plt.text(
        width + 0.002,
        bar.get_y() + bar.get_height()/2,
        f'{width:.4f}',
        ha='left',
        va='center',
        fontsize=9,
        color='#333333',
    )

# Customize appearance
plt.yticks(range(len(sorted_idx)), feature_names)
plt.title('Top 10 Feature Importance', fontsize=16, pad=20, fontweight='bold', color='#333333')
plt.xlabel('Importance Score', fontsize=12, color='#333333')
plt.grid(axis='x', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

In [None]:
# Create a bar chart comparing the metrics
# Keep a handle on the figure created here so the background colour is
# applied to it; the module-level `fig` still refers to an earlier figure.
metrics_fig = plt.figure(figsize=(12, 6))
metrics_fig.patch.set_facecolor('#f8f9fa')

# Define the colors from our palette
colors = ['#39A0ED', '#FF5E5B', '#D8D8D8']  # Blue, Coral, Light Gray

# RMSE comparison (lower is better)
plt.subplot(1, 2, 1)
models = ['Baseline', 'Euclidean\nClustering', 'Mahalanobis\nClustering']
rmse_values = [baseline_metrics['rmse'], euclidean_metrics['rmse'], mahalanobis_metrics['rmse']]
bars = plt.bar(models, rmse_values, color=colors,
               edgecolor='white', linewidth=0.8, width=0.7)

# Add values on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{height:.2f}', ha='center', va='bottom', fontweight='bold')

plt.title('RMSE Comparison (lower is better)', fontsize=16, pad=20, fontweight='bold', color='#333333')
plt.ylabel('RMSE', fontsize=12, color='#333333')
plt.grid(axis='y', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Add a horizontal line at the minimum RMSE for reference
min_rmse = min(rmse_values)
plt.axhline(y=min_rmse, color='#333333', linestyle='--', alpha=0.5)
plt.text(len(models)-1, min_rmse-0.5, f'Best: {min_rmse:.2f}',
         ha='right', va='top', color='#333333', alpha=0.7)

In [None]:
# R² comparison (higher is better) in the right-hand slot of a 1x2 grid
plt.subplot(1, 2, 2)
r2_values = [
    scores['r2']
    for scores in (baseline_metrics, euclidean_metrics, mahalanobis_metrics)
]
bars = plt.bar(
    models,
    r2_values,
    color=colors,  # Using the same color palette
    edgecolor='white',
    linewidth=0.8,
    width=0.7,
)

# Add values on top of bars
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    plt.text(bar_center, height + 0.01, f'{height:.4f}',
             ha='center', va='bottom', fontweight='bold')

# Customize appearance
plt.title('R² Comparison (higher is better)', fontsize=16, pad=20, fontweight='bold', color='#333333')
plt.ylabel('R²', fontsize=12, color='#333333')
plt.grid(axis='y', alpha=0.3)
r2_axes = plt.gca()
for side in ('top', 'right'):
    r2_axes.spines[side].set_visible(False)

# Add a horizontal line at the maximum R² for reference
max_r2 = max(r2_values)
plt.axhline(y=max_r2, color='#333333', linestyle='--', alpha=0.5)
plt.text(len(models)-1, max_r2-0.02, f'Best: {max_r2:.4f}',
         ha='right', va='top', color='#333333', alpha=0.7)

plt.tight_layout()
plt.savefig('seoul_bike_metrics_comparison.png', dpi=300, bbox_inches='tight', facecolor='#f8f9fa')
plt.show()

In [None]:
# MAE comparison (lower is better)
from sklearn.metrics import mean_absolute_error

# Draw on a dedicated figure. Reusing subplot(1, 2, 2) of whatever figure
# happens to be current can land on the R² axes when the previous figure
# has not been closed.
mae_fig = plt.figure(figsize=(6, 6))
mae_fig.patch.set_facecolor('#f8f9fa')
plt.subplot(1, 1, 1)

# Calculate MAE for each model
baseline_mae = mean_absolute_error(y_test, baseline_preds)
euclidean_mae = mean_absolute_error(y_test, euclidean_preds)
mahalanobis_mae = mean_absolute_error(y_test, mahalanobis_preds)

# `models` and `colors` are the labels/palette defined with the RMSE chart
mae_values = [baseline_mae, euclidean_mae, mahalanobis_mae]
bars = plt.bar(models, mae_values, color=colors,
               edgecolor='white', linewidth=0.8, width=0.7)

# Add values on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{height:.2f}', ha='center', va='bottom', fontweight='bold')

plt.title('MAE Comparison (lower is better)', fontsize=16, pad=20, fontweight='bold', color='#333333')
plt.ylabel('MAE', fontsize=12, color='#333333')
plt.grid(axis='y', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Add a horizontal line at the minimum MAE for reference
min_mae = min(mae_values)
plt.axhline(y=min_mae, color='#333333', linestyle='--', alpha=0.5)
plt.text(len(models)-1, min_mae-0.5, f'Best: {min_mae:.2f}',
         ha='right', va='top', color='#333333', alpha=0.7)

plt.tight_layout()
# Saved under its own name: the previous name was the one used for the
# RMSE/R² chart, so this cell overwrote that image.
plt.savefig('seoul_bike_mae_comparison.png', dpi=300, bbox_inches='tight', facecolor='#f8f9fa')
plt.show()

In [None]:
# Visualizing Cluster Analysis: cluster sizes and mean target per cluster,
# Euclidean vs Mahalanobis side by side.
# Keep a handle on the figure created here so the background colour is
# applied to it; the module-level `fig` still refers to an earlier figure.
cluster_fig = plt.figure(figsize=(18, 12))
cluster_fig.patch.set_facecolor('#f8f9fa')

euclidean_palette = ['#FF5E5B', '#D8D8D8', '#39A0ED']  # Coral, Light Gray, Blue
mahalanobis_bar_colors = ['#FFD166', '#06D6A0', '#118AB2']  # Yellow, Teal, Blue

x = np.arange(3)  # the label locations
width = 0.35  # the width of the bars


def _mean_target_in_cluster(train_labels, cluster_id):
    """Mean of y_train over the rows in ``cluster_id``, or 0 for an empty cluster."""
    members = train_labels == cluster_id
    if not members.any():
        return 0
    return y_train.iloc[members].values.mean()


def _plot_cluster_comparison(euclidean_values, mahalanobis_values, ylabel, title, value_format):
    """Draw paired Euclidean/Mahalanobis bars on the current axes.

    Each bar is annotated with its height formatted by ``value_format``
    (a format spec such as '.2f'; '' gives the default representation).
    Returns the two bar containers (Euclidean, Mahalanobis).
    """
    bar_groups = (
        plt.bar(x - width/2, euclidean_values, width, label='Euclidean',
                color=euclidean_palette, edgecolor='white', linewidth=0.8),
        plt.bar(x + width/2, mahalanobis_values, width, label='Mahalanobis',
                color=mahalanobis_bar_colors, edgecolor='white', linewidth=0.8),
    )

    plt.xlabel('Cluster', fontsize=12, color='#333333')
    plt.ylabel(ylabel, fontsize=12, color='#333333')
    plt.title(title, fontsize=16, pad=20, fontweight='bold', color='#333333')
    plt.xticks(x, ['Cluster 0', 'Cluster 1', 'Cluster 2'])
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Value label just above every bar
    for rects in bar_groups:
        for rect in rects:
            height = rect.get_height()
            plt.annotate(format(height, value_format),
                         xy=(rect.get_x() + rect.get_width()/2, height),
                         xytext=(0, 3),  # 3 points vertical offset
                         textcoords="offset points",
                         ha='center', va='bottom', fontweight='bold')
    return bar_groups


# 1. Cluster Sizes - Top Left
plt.subplot(2, 2, 1)
euclidean_sizes = [sum(euclidean_clusters == i) for i in range(3)]
mahalanobis_sizes = [sum(mahalanobis_clusters == i) for i in range(3)]
rects1, rects2 = _plot_cluster_comparison(
    euclidean_sizes, mahalanobis_sizes,
    'Number of Samples', 'Cluster Sizes Comparison', '',
)

# 2. Average Bike Count per Cluster - Top Right
plt.subplot(2, 2, 2)
euclidean_avg_counts = [_mean_target_in_cluster(euclidean_clusters, i) for i in range(3)]
mahalanobis_avg_counts = [_mean_target_in_cluster(mahalanobis_clusters, i) for i in range(3)]
rects1, rects2 = _plot_cluster_comparison(
    euclidean_avg_counts, mahalanobis_avg_counts,
    'Average Bike Count', 'Average Bike Count per Cluster', '.2f',
)

plt.tight_layout()
plt.savefig('cluster_analysis_visualization.png', dpi=300, bbox_inches='tight', facecolor='#f8f9fa')
plt.show()

In [None]:
# 3. Distinctive Features Heatmap (Euclidean clusters)
# For each cluster, take the 5 features whose cluster mean is furthest (in
# absolute terms, on the unscaled training data) from the overall mean. The
# heatmap shows, for the union of those features, every cluster's
# percentage difference from the overall mean.
overall_means = X_train.mean()
cluster_feature_means = {}
top_feature_names = set()
for i in range(3):
    cluster_indices = euclidean_clusters == i
    if not cluster_indices.any():
        continue
    feature_means = X_train.iloc[cluster_indices].mean()
    cluster_feature_means[i] = feature_means
    feature_diff = (feature_means - overall_means).abs()
    top_feature_names.update(feature_diff.nlargest(5).index)

# Sorted so the row order is the same on every run (set order is not)
all_top_features = sorted(top_feature_names)

# Fill every (feature, cluster) cell. Filling only the cells where the
# feature was in that cluster's own top 5 showed a false 0.0% elsewhere.
heatmap_data = np.zeros((len(all_top_features), 3))
for row, feature in enumerate(all_top_features):
    reference = overall_means[feature]
    if reference == 0:
        continue  # percentage difference is undefined; the cell stays 0
    for cluster, feature_means in cluster_feature_means.items():
        heatmap_data[row, cluster] = (feature_means[feature] - reference) / reference * 100

# Diverging colormap: blue (negative) -> white (zero) -> coral (positive)
colors_for_map = ['#39A0ED', '#FFFFFF', '#FF5E5B']
custom_cmap = LinearSegmentedColormap.from_list('custom_diverging', colors_for_map, N=256)

# The heatmap gets a large figure of its own
plt.figure(figsize=(20, 12))
plt.subplot(1, 1, 1)

# Colour scale is fixed at +/-400%; values beyond that saturate
im = plt.imshow(heatmap_data, cmap=custom_cmap, aspect='auto', vmin=-400, vmax=400)
cbar = plt.colorbar(im, label='% Difference from Overall Mean', fraction=0.046, pad=0.04)
cbar.ax.tick_params(labelsize=12)
cbar.set_label('% Difference from Overall Mean', fontsize=14, color='#333333')

# Add labels with larger font sizes
plt.yticks(range(len(all_top_features)), all_top_features, fontsize=12)
plt.xticks(range(3), ['Cluster 0', 'Cluster 1', 'Cluster 2'], fontsize=14)
plt.title('Distinctive Features by Cluster (Euclidean)', fontsize=18, pad=20, fontweight='bold', color='#333333')

# Add text annotations with larger font size
for i in range(len(all_top_features)):
    for j in range(3):
        value = heatmap_data[i, j]
        # Use white text for extreme values, black for moderate values
        text_color = 'white' if abs(value) > 100 else 'black'
        plt.text(j, i, f'{value:.1f}%', ha='center', va='center',
                 color=text_color, fontweight='bold', fontsize=12)

plt.tight_layout()
plt.savefig('distinctive_features_heatmap.png', dpi=300, bbox_inches='tight', facecolor='#f8f9fa')
plt.show()

In [None]:
# Additional analysis: cluster sizes and mean target per cluster
print("\n=== CLUSTER ANALYSIS ===")

# (heading for the size listing, heading for the mean listing, training labels)
clusterings = (
    ("\nEuclidean Cluster Sizes:", "\nEuclidean Clusters:", euclidean_clusters),
    ("\nMahalanobis Cluster Sizes:", "\nMahalanobis Clusters:", mahalanobis_clusters),
)

for size_heading, _, train_labels in clusterings:
    print(size_heading)
    for i in range(3):
        print(f"Cluster {i}: {sum(train_labels == i)} samples")

# Average bike count per cluster (empty clusters are not listed)
print("\nAverage Bike Count per Cluster:")
for _, mean_heading, train_labels in clusterings:
    print(mean_heading)
    for i in range(3):
        cluster_indices = train_labels == i
        if not sum(cluster_indices) > 0:
            continue
        avg_count = y_train.iloc[cluster_indices].values.mean()
        print(f"Cluster {i}: {avg_count:.2f}")

# Analyze cluster characteristics
print("\nCluster Characteristics (Average Feature Values):")
print("\nEuclidean Clusters:")
for i in range(3):
    cluster_indices = euclidean_clusters == i
    if sum(cluster_indices) > 0:
        cluster_data = X_train.iloc[cluster_indices]
        # Get top 5 most distinctive features (largest difference from overall mean)
        feature_means = cluster_data.mean()
        overall_means = X_train.mean()
        feature_diff = abs(feature_means - overall_means)
        top_features = feature_diff.nlargest(5).index

        print(f"\nCluster {i} distinctive features:")
        for feature in top_features:
            print(f"{feature}: {feature_means[feature]:.2f} (overall: {overall_means[feature]:.2f})")