### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.spatial.distance import mahalanobis
from ucimlrepo import fetch_ucirepo

### Notebook Functions

In [None]:
def create_cyclical_encoding(X_data):
    """Add sine/cosine encodings of the calendar features to a copy of ``X_data``.

    Each periodic quantity is projected onto the unit circle so that the end
    of a cycle lies next to its start (hour 23 next to hour 0, December next
    to January, and so on).

    Args:
        X_data: DataFrame with the columns ``Date``, ``Month``, ``Hour``,
            ``DayOfWeek`` and ``Seasons``. ``Date`` may already be a datetime
            column or hold day-first strings such as ``'01/12/2017'``.

    Returns:
        A new DataFrame (``X_data`` itself is left untouched) in which
        ``Date`` is a datetime column and the following columns were added:
        ``DayOfYear``, ``Season_numeric`` and a ``*_sin`` / ``*_cos`` pair
        for ``Month``, ``DayOfYear``, ``Hour``, ``DayOfWeek`` and ``Season``.
        Season labels missing from the mapping yield NaN in the ``Season_*``
        columns.
    """
    X_cyclical = X_data.copy()

    # Parse only when necessary, and read strings day-first: preprocess_data()
    # parses this column as '%d/%m/%Y', whereas a format-less to_datetime
    # would silently read '01/12/2017' as 12 January.
    if not pd.api.types.is_datetime64_any_dtype(X_cyclical['Date']):
        X_cyclical['Date'] = pd.to_datetime(X_cyclical['Date'], dayfirst=True)

    def add_sin_cos(prefix, values, period):
        # Map ``values`` (one full cycle == ``period``) onto the unit circle.
        angle = 2 * np.pi * values / period
        X_cyclical[f'{prefix}_sin'] = np.sin(angle)
        X_cyclical[f'{prefix}_cos'] = np.cos(angle)

    # The statement order fixes the column order of the result, which later
    # becomes the feature order seen by the models.
    add_sin_cos('Month', X_cyclical['Month'], 12)
    X_cyclical['DayOfYear'] = X_cyclical['Date'].dt.dayofyear
    add_sin_cos('DayOfYear', X_cyclical['DayOfYear'], 365)
    add_sin_cos('Hour', X_cyclical['Hour'], 24)
    add_sin_cos('DayOfWeek', X_cyclical['DayOfWeek'], 7)

    # Seasonal mapping and encoding
    season_mapping = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Autumn': 3}
    X_cyclical['Season_numeric'] = X_cyclical['Seasons'].map(season_mapping)
    add_sin_cos('Season', X_cyclical['Season_numeric'], 4)

    return X_cyclical

def preprocess_data():
    """Fetch the bike sharing data and return standardised train/test splits.

    Pipeline: fetch UCI repository dataset 560, take ``Rented Bike Count`` as
    the target when it is delivered among the features (the repository's own
    target columns then become features), derive calendar columns from
    ``Date``, add the cyclical encodings, one-hot encode the remaining
    categorical columns, split 80/20 with a fixed seed, drop a fixed list of
    columns and standardise using statistics fitted on the training part only.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler,
        feature_names)`` where the scaled matrices are NumPy arrays, the
        targets are the unscaled pandas objects, ``scaler`` is the fitted
        ``StandardScaler`` and ``feature_names`` lists the column names in
        the order of the scaled matrices.
    """
    target_name = 'Rented Bike Count'

    dataset = fetch_ucirepo(id=560)
    raw_features = dataset.data.features
    raw_targets = dataset.data.targets

    if target_name not in raw_features.columns:
        y = raw_targets
        X = raw_features
    else:
        # The bike count arrives as a feature: promote it to the target and
        # append the repository's target columns to the feature frame.
        y = raw_features[[target_name]]
        X = pd.concat([raw_features.drop(columns=target_name), raw_targets], axis=1)

    if 'Date' in X.columns:
        X['Date'] = pd.to_datetime(X['Date'], format='%d/%m/%Y')
        date_parts = X['Date'].dt
        X['Year'] = date_parts.year
        X['Month'] = date_parts.month
        X['Day'] = date_parts.day
        X['DayOfWeek'] = date_parts.dayofweek

    encoded = create_cyclical_encoding(X).drop(columns='Date')
    X = pd.get_dummies(encoded, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Columns to discard when present (their cyclical encodings are kept).
    unwanted = ("DayOfYear", "Month", "Day", "Dew point temperature")
    present = [name for name in unwanted if name in X_train.columns]
    if present:
        X_train = X_train.drop(columns=present)
        X_test = X_test.drop(columns=present)

    # Fit on the training split only, then apply the same transform to test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    feature_names = list(X_train.columns)
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, feature_names

def mahalanobis_distance(x, y, inv_cov):
    """Return the Mahalanobis distance between the vectors ``x`` and ``y``.

    Thin wrapper around ``scipy.spatial.distance.mahalanobis``; ``inv_cov``
    is the inverse of the covariance matrix that defines the metric.
    """
    distance = mahalanobis(u=x, v=y, VI=inv_cov)
    return distance

### Load and Preprocess Data

In [None]:
# Build the standardised train/test matrices (cyclical features included)
print("=== CYCLICAL ENCODING + CLUSTERING EXPERIMENT ===")
print("Loading and preprocessing data with cyclical features...")
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    scaler,
    feature_names,
) = preprocess_data()
print(f"Data preprocessed: {len(feature_names)} features, {len(X_train_scaled)} training samples")

# 1. BASELINE: one global XGBoost regressor trained on every training row;
# the cluster-specific models are compared against it later on.
print("\n1. BASELINE: Single XGBoost model with cyclical encoding")
baseline_cyclical_model = xgb.XGBRegressor(random_state=42)
baseline_cyclical_model.fit(X_train_scaled, y_train.values.ravel())
baseline_cyclical_preds = baseline_cyclical_model.predict(X_test_scaled)

# RMSE is the square root of scikit-learn's mean squared error.
baseline_cyclical_metrics = dict(
    rmse=math.sqrt(mean_squared_error(y_test, baseline_cyclical_preds)),
    r2=r2_score(y_test, baseline_cyclical_preds),
    mae=mean_absolute_error(y_test, baseline_cyclical_preds),
)

print(f"Baseline Cyclical Model - RMSE: {baseline_cyclical_metrics['rmse']:.2f}, "
      f"R²: {baseline_cyclical_metrics['r2']:.4f}, MAE: {baseline_cyclical_metrics['mae']:.2f}")

### Cluster on Cyclical-Encoded Features

In [None]:
# 2. CLUSTERING on cyclical-encoded features
print("\n2. CLUSTERING on cyclical-encoded features")

# Standard K-means (Euclidean metric) with three clusters
print("Performing Euclidean K-means clustering on cyclical features...")
euclidean_kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
euclidean_clusters = euclidean_kmeans.fit_predict(X_train_scaled)

# Mahalanobis variant: the metric is defined by the inverse feature covariance.
print("Performing Mahalanobis distance-based clustering...")
cov = np.cov(X_train_scaled, rowvar=False)
# A small ridge on the diagonal keeps the matrix invertible.
cov = cov + 1e-6 * np.identity(len(cov))
inv_cov = np.linalg.inv(cov)

# Start from the K-means assignment, then run five rounds of
# "reassign to nearest centroid / recompute centroids" under Mahalanobis.
mahalanobis_clusters = euclidean_kmeans.predict(X_train_scaled)
centroids = np.array(
    [X_train_scaled[mahalanobis_clusters == label].mean(axis=0) for label in range(3)]
)

for iteration in range(5):
    # Assignment step: centroids are fixed here, so every point can be
    # relabelled independently; slice assignment keeps the label array's dtype.
    mahalanobis_clusters[:] = [
        np.argmin([mahalanobis_distance(point, centroid, inv_cov) for centroid in centroids])
        for point in X_train_scaled
    ]

    # Update step: an emptied cluster keeps its previous centroid.
    for label in range(3):
        members = mahalanobis_clusters == label
        if members.any():
            centroids[label] = X_train_scaled[members].mean(axis=0)

### Train Cluster-Specific Models with Cyclical Features

In [None]:
euclidean_models = []
mahalanobis_models = []

# (name used in the log line, training labels, list that collects the models)
clusterings = (
    ("Euclidean", euclidean_clusters, euclidean_models),
    ("Mahalanobis", mahalanobis_clusters, mahalanobis_models),
)

# One XGBoost regressor per cluster and per clustering scheme. Position i of
# each list holds the model of cluster i, or None when that cluster is empty.
for i in range(3):
    for scheme, labels, models in clusterings:
        cluster_indices = labels == i
        n_members = cluster_indices.sum()
        if n_members == 0:
            models.append(None)
            continue

        print(f"Training {scheme} cluster {i} model with {n_members} samples")
        model = xgb.XGBRegressor(random_state=42)
        model.fit(
            X_train_scaled[cluster_indices],
            y_train.iloc[cluster_indices].values.ravel(),
        )
        models.append(model)

### Euclidean and Mahalanobis Predictions

In [None]:
# Assign test samples to clusters. K-means supplies the Euclidean assignment;
# the Mahalanobis assignment is the nearest refined centroid under inv_cov.
euclidean_test_clusters = euclidean_kmeans.predict(X_test_scaled)
mahalanobis_test_clusters = np.array(
    [
        np.argmin([mahalanobis_distance(point, centroid, inv_cov) for centroid in centroids])
        for point in X_test_scaled
    ],
    dtype=int,
)

# Make predictions: every test row is scored by the model of its cluster.
euclidean_preds = np.zeros(len(y_test))
mahalanobis_preds = np.zeros(len(y_test))

# (test labels, per-cluster models, output array) for each clustering scheme
prediction_jobs = (
    (euclidean_test_clusters, euclidean_models, euclidean_preds),
    (mahalanobis_test_clusters, mahalanobis_models, mahalanobis_preds),
)

for i in range(3):
    for test_clusters, models, preds in prediction_jobs:
        cluster_indices = test_clusters == i
        if not cluster_indices.any():
            continue

        # A cluster that was empty at training time has no model (None) but
        # can still attract test rows through its centroid. Leaving those
        # rows at the initial 0.0 would silently distort every metric, so
        # they are scored by the global baseline model instead.
        model = models[i] if models[i] is not None else baseline_cyclical_model
        preds[cluster_indices] = model.predict(X_test_scaled[cluster_indices])

### Calculate Metrics

In [None]:
# Test-set scores of the two cluster-routed prediction vectors
euclidean_cyclical_metrics = dict(
    rmse=math.sqrt(mean_squared_error(y_test, euclidean_preds)),
    r2=r2_score(y_test, euclidean_preds),
    mae=mean_absolute_error(y_test, euclidean_preds),
)

mahalanobis_cyclical_metrics = dict(
    rmse=math.sqrt(mean_squared_error(y_test, mahalanobis_preds)),
    r2=r2_score(y_test, mahalanobis_preds),
    mae=mean_absolute_error(y_test, mahalanobis_preds),
)

# Side-by-side comparison of the three approaches
separator = "=" * 60
print("\n" + separator)
print("RESULTS COMPARISON")
print(separator)

compared_models = [
    ('Baseline Cyclical', baseline_cyclical_metrics),
    ('Euclidean + Cyclical', euclidean_cyclical_metrics),
    ('Mahalanobis + Cyclical', mahalanobis_cyclical_metrics),
]

results_df = pd.DataFrame({
    'Model': [name for name, _ in compared_models],
    'RMSE': [scores['rmse'] for _, scores in compared_models],
    'R²': [scores['r2'] for _, scores in compared_models],
    'MAE': [scores['mae'] for _, scores in compared_models],
})

print(results_df.round(4))

### Improvements vs Baseline

In [None]:
# Relative change of each clustered approach against the baseline, in percent.
# Positive numbers mean "better": lower RMSE/MAE, higher R².
clustered_results = (
    ('Euclidean + Cyclical', euclidean_cyclical_metrics),
    ('Mahalanobis + Cyclical', mahalanobis_cyclical_metrics),
)

for model_name, metrics in clustered_results:
    reference = baseline_cyclical_metrics

    rmse_improvement = ((reference['rmse'] - metrics['rmse']) / reference['rmse']) * 100
    r2_improvement = ((metrics['r2'] - reference['r2']) / reference['r2']) * 100
    mae_improvement = ((reference['mae'] - metrics['mae']) / reference['mae']) * 100

    print(f"{model_name}:")
    print(f"  RMSE: {rmse_improvement:+6.2f}%")
    print(f"  R²:   {r2_improvement:+6.2f}%")
    print(f"  MAE:  {mae_improvement:+6.2f}%")

### Cluster Analysis

In [None]:
print("CLUSTER ANALYSIS (with cyclical features):")

# Number of training rows each scheme placed in every cluster
print("\nCluster sizes:")
for i in range(3):
    euc_size = (euclidean_clusters == i).sum()
    mah_size = (mahalanobis_clusters == i).sum()
    print(f"Cluster {i}: Euclidean={euc_size}, Mahalanobis={mah_size}")

# Mean training target per cluster; empty clusters are skipped
print("\nAverage bike count per cluster:")
training_labels = (
    ("Euclidean", euclidean_clusters),
    ("Mahalanobis", mahalanobis_clusters),
)
for i in range(3):
    for scheme, labels in training_labels:
        members = labels == i
        if not members.any():
            continue
        cluster_avg = y_train.iloc[members].values.mean()
        print(f"{scheme} Cluster {i}: {cluster_avg:.1f} bikes")

### Feature Importance

In [None]:
# Importance of every feature according to the baseline model, largest first
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': baseline_cyclical_model.feature_importances_
}).sort_values('importance', ascending=False)

# Every sine/cosine column produced by the cyclical encoding. The pattern is
# anchored at the end of the name so that only the '_sin' / '_cos' suffix
# counts, and uses a non-capturing group to avoid pandas' match-group warning.
is_cyclical = feature_importance['feature'].str.contains(r'_(?:sin|cos)$')
all_cyclical_features = feature_importance[is_cyclical]

# Show top cyclical features (display only)
cyclical_features = all_cyclical_features.head(8)
print("Top cyclical features:")
for i, (_, row) in enumerate(cyclical_features.iterrows(), 1):
    print(f"  {i}. {row['feature']:<20s}: {row['importance']:.4f}")

# The share is taken over ALL cyclical features. Summing only the eight rows
# listed above would understate it whenever more than eight such columns exist.
cyclical_importance_sum = all_cyclical_features['importance'].sum()
total_importance = feature_importance['importance'].sum()
# Guard against a model that reports zero importance everywhere.
cyclical_percentage = (cyclical_importance_sum / total_importance) * 100 if total_importance else 0.0
print(f"\nCyclical features account for {cyclical_percentage:.1f}% of total importance")

### Conclusion

In [None]:
# Pick the row with the lowest RMSE. results_df contains the baseline itself,
# so the best RMSE can never exceed the baseline RMSE: either a clustered
# model wins strictly, or the baseline is the best row.
best_model = results_df.loc[results_df['RMSE'].idxmin(), 'Model']
best_rmse = results_df['RMSE'].min()
baseline_rmse = baseline_cyclical_metrics['rmse']

if best_rmse < baseline_rmse:
    improvement = ((baseline_rmse - best_rmse) / baseline_rmse) * 100
    print(f"The {best_model} model has a {improvement:.2f}% RMSE improvement")
    print("  The combination of cyclical encoding + clustering provides measurable benefit")
else:
    # Reached only when the baseline row is the minimum; a "degradation"
    # relative to the baseline would always be zero here, so none is computed.
    print("The Baseline cyclical model has the better RMSE score")
    print("  Cyclical encoding alone captures the temporal patterns effectively")

print(f"\nKey Finding: Cyclical features represent {cyclical_percentage:.1f}% of model importance,")
print("suggesting temporal patterns are the dominant signal in bike demand prediction.")