In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

# Cluster Analysis, Correlation Analysis, and Predictive Modeling

This section adds advanced analysis to the health and sleep dataset, including clustering, correlation analysis, and predictive modeling.

## 1. Cluster Analysis

### Step 1: Data Preprocessing for Clustering

# 🧹 Data Preprocessing <a id='Data-Preprocessing'></a>

Data preprocessing is a critical step that involves cleaning, transforming, and preparing data for analysis. This section includes handling missing values, converting categorical variables to numeric, and normalizing data.

In [None]:
# Load the dataset
# Every later cell derives its frames from `health_sleep_data`, so the raw
# CSV is read exactly once here.
data_path = '/kaggle/input/health-and-sleep-statistics/Health_Sleep_Statistics.csv'
health_sleep_data = pd.read_csv(filepath_or_buffer=data_path)

### 🔄 Converting Categorical Variables to Numerical Formats

To effectively use machine learning algorithms, we need to convert categorical variables into numerical formats. This process, known as **encoding**, transforms non-numeric data into a format that can be more easily processed by algorithms. Below is a summary of the conversions:

- **Gender:** Converted to binary format:
  - `'m'` → `1`
  - `'f'` → `0`

- **Physical Activity Level:** Encoded to represent different activity levels:
  - `'low'` → `0`
  - `'medium'` → `1`
  - `'high'` → `2`

- **Dietary Habits:** Transformed to indicate diet quality:
  - `'unhealthy'` → `0`
  - `'medium'` → `1`
  - `'healthy'` → `2`

- **Binary Variables (e.g., Sleep Disorders, Medication Usage):** Converted to numerical binary format:
  - `'yes'` → `1`
  - `'no'` → `0`

These conversions standardize the data, making it suitable for further analysis and machine learning models.


In [None]:
# Convert categorical variables to numerical representations.
# The raw frame is left untouched; all encoding happens on a copy, so this
# cell can be re-run at any time.
health_sleep_data_encoded = health_sleep_data.copy()

activity_mapping = {'low': 0, 'medium': 1, 'high': 2}
diet_mapping = {'unhealthy': 0, 'medium': 1, 'healthy': 2}
binary_mapping = {'yes': 1, 'no': 0}

# One lookup table per categorical column.
categorical_mappings = {
    'Gender': {'m': 1, 'f': 0},
    'Physical Activity Level': activity_mapping,
    'Dietary Habits': diet_mapping,
    'Sleep Disorders': binary_mapping,
    'Medication Usage': binary_mapping,
}

for column, mapping in categorical_mappings.items():
    raw_values = health_sleep_data_encoded[column]
    encoded_values = raw_values.map(mapping)

    # Series.map silently turns any label that is missing from the mapping
    # into NaN. Fail loudly instead, while leaving values that were already
    # missing in the raw data alone.
    unmapped_labels = raw_values[raw_values.notna() & encoded_values.isna()].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(
            f"Column '{column}' contains labels with no numeric encoding: "
            f"{sorted(map(str, unmapped_labels))}"
        )

    health_sleep_data_encoded[column] = encoded_values

### 📏 Standardizing Numerical Features for Clustering

To ensure our clustering algorithm performs optimally, we need to **standardize** the numerical features in our dataset. Standardization transforms the data to have a mean of `0` and a standard deviation of `1`, which helps in achieving uniformity and improving model performance.

- **Why Standardize?** Clustering algorithms like K-Means are sensitive to the scale of data. Features with larger scales can dominate the distance calculations, skewing the results. Standardizing ensures that each feature contributes equally.

- **Numerical Features Standardized:**
  - `Age`
  - `Sleep Quality`
  - `Daily Steps`
  - `Calories Burned`

We use `StandardScaler` from `scikit-learn` to apply standardization:

```python
scaler = StandardScaler()
health_sleep_data_encoded[numerical_features] = scaler.fit_transform(health_sleep_data_encoded[numerical_features])
```

By standardizing these features, we prepare our data for more effective clustering and ensure that no single feature disproportionately influences the clustering process.

In [None]:
# Standardize numerical features for clustering
numerical_features = ['Age', 'Sleep Quality', 'Daily Steps', 'Calories Burned']

# Fit the scaler on the selected columns and write the scaled values back
# over the same columns of the encoded frame.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(health_sleep_data_encoded[numerical_features])
health_sleep_data_encoded[numerical_features] = scaled_values

### 📊 Determining the Optimal Number of Clusters Using the Elbow Method

When clustering data, it's crucial to determine the right number of clusters (`K`) for optimal results. The **Elbow Method** helps us identify this number by plotting the **inertia** (sum of squared distances to the nearest cluster center) for different values of `K`.

- **Inertia and the Elbow Method:**
  - Inertia measures how internally coherent clusters are; lower values indicate tighter clusters.
  - The "elbow" point in the graph shows where increasing `K` yields diminishing returns in reducing inertia. This point is considered the optimal `K`.

- **Steps to Determine Optimal Clusters:**
  1. Iterate over a range of possible cluster numbers (e.g., `1` to `10`).
  2. For each value of `K`, apply the **K-Means** algorithm and compute inertia.
  3. Plot the inertia values against `K` to visualize the "elbow."

- **Applying K-Means Clustering:**
  - After determining the optimal number of clusters (`K = 3`), we apply K-Means clustering with this `K` value.
  - The `n_init` parameter is explicitly set to `10` to suppress future warnings related to this parameter's default change.


In [None]:
# Determine the optimal number of clusters using the Elbow Method

# Clustering input: every column except 'User ID', 'Bedtime', and 'Wake-up Time'
clustering_features = health_sleep_data_encoded.drop(
    columns=['User ID', 'Bedtime', 'Wake-up Time']
)

# Fit one K-Means model per candidate K and record its inertia
cluster_range = range(1, 11)
inertia = []
for k in cluster_range:
    # `n_init` is passed explicitly to suppress the warning
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(clustering_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against K to expose the "elbow"
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_range, inertia, marker='o', linestyle='-')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_xticks(cluster_range)
ax.grid(True)
plt.show()

# Final clustering with K = 3 (again with explicit `n_init`); the labels are
# stored on the encoded frame as the 'Cluster' column
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
health_sleep_data_encoded['Cluster'] = kmeans.fit_predict(clustering_features)


### 🚫 Handling Infinite Values and Visualizing Clusters

When working with real-world data, we must address issues like infinite (`inf`) values that can arise during data processing. To ensure the data is clean and ready for clustering, we perform the following steps:

1. **Convert Infinite Values to `NaN`:** Infinite values can distort analysis and model training. We convert any `inf` or `-inf` values in the dataset to `NaN` to handle them appropriately.

2. **Handle Missing Values (`NaN`):** After replacing `inf` values, we can either fill or drop `NaN` values depending on the context of the analysis. Here, we choose to drop rows with `NaN` values to ensure data integrity.

3. **Apply K-Means Clustering:** With a clean dataset, we apply K-Means clustering using the optimal number of clusters (`K = 3`). This step groups individuals based on similar patterns in selected features (e.g., `Age`, `Sleep Quality`).

4. **Visualize Clusters Using a Pair Plot:** To understand the clustering results better, we use a pair plot to visualize the distribution and relationships of selected features across the clusters. This plot provides insights into how different features contribute to the clustering.

By handling inf values, ensuring clean data, and visualizing the clustering results, we provide a comprehensive approach to understanding patterns in the health and sleep data.


In [None]:
# Convert any inf values to NaN, then drop rows that contain NaN.
# The frame is reassigned rather than mutated in place, so the cell's effect
# is explicit and it can be re-run safely.
health_sleep_data_encoded = (
    health_sleep_data_encoded
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Rebuild the clustering matrix from the *cleaned* frame. Reusing a matrix
# built before the cleanup would ignore the cleaning, and as soon as a row is
# dropped the label array would no longer match the frame's length.
# 'Cluster' is excluded so that labels from an earlier fit are never fed back
# in as a feature; errors='ignore' covers the case where it does not exist yet.
clustering_features = health_sleep_data_encoded.drop(
    columns=['User ID', 'Bedtime', 'Wake-up Time', 'Cluster'],
    errors='ignore',
)

# Apply K-Means clustering with K = 3
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
health_sleep_data_encoded['Cluster'] = kmeans.fit_predict(clustering_features)

# Visualize the clusters using a pair plot for selected features
sns.pairplot(
    health_sleep_data_encoded,
    vars=['Age', 'Sleep Quality', 'Daily Steps', 'Calories Burned'],
    hue='Cluster',
    palette='viridis',
    diag_kind='hist',
)
plt.suptitle('Pair Plot of Clusters for Selected Features', y=1.02)
plt.show()


In [None]:
# Calculate the correlation matrix for the numerical features
heatmap_columns = ['Age', 'Sleep Quality', 'Daily Steps', 'Calories Burned']
correlation_matrix = health_sleep_data_encoded.loc[:, heatmap_columns].corr()

# Draw the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Health and Sleep Data')
plt.show()


In [None]:
# Convert 'Bedtime' and 'Wake-up Time' to minutes since midnight for numerical processing
def time_to_minutes(time_str):
    """Return the minutes elapsed since midnight for an 'H:M' clock string.

    The string is split on ':' and must yield exactly two integer fields
    (hours, minutes); anything else raises ValueError.
    """
    hours, minutes = (int(field) for field in time_str.split(':'))
    return 60 * hours + minutes

# Apply the conversion.
# Columns that are already numeric are skipped, so re-running this cell is a
# no-op instead of an AttributeError from calling time_to_minutes on the
# integers produced by a previous run.
for time_column in ['Bedtime', 'Wake-up Time']:
    if not pd.api.types.is_numeric_dtype(health_sleep_data_encoded[time_column]):
        health_sleep_data_encoded[time_column] = (
            health_sleep_data_encoded[time_column].apply(time_to_minutes)
        )

# Prepare the data for predictive modeling
# Define target variable and features
target = 'Sleep Quality'

# Exclude the target and the derived 'Cluster' label. 'User ID' is excluded
# as well: it is a row identifier (the clustering step leaves it out for the
# same reason) and carries no information a model should learn from.
features = health_sleep_data_encoded.drop(columns=[target, 'Cluster', 'User ID'])

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    health_sleep_data_encoded[target],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors to compare on the same train/test split
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, objective='reg:squarederror'),
}

# Fit each model on the training split and score it on the held-out split
model_performance = {}
for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
    r2 = r2_score(y_test, y_pred)  # R-squared
    model_performance[model_name] = {'MSE': mse, 'R^2': r2}

# One row per model, one column per metric
model_performance_df = pd.DataFrame(model_performance).transpose()
model_performance_df
