# Phase 1 & 2: Data Loading, Cleaning, and Exploratory Data Analysis (EDA)

This notebook covers:
- **Phase 1**: Environment setup and data inspection
- **Phase 2**: Data cleaning, preprocessing, and exploratory data analysis
- **Focus**: Understanding the dataset through descriptive statistics, visualizations, and data quality assessment

In [29]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

## Phase 1.2: Load and Inspect Data

In [30]:
# Load the dataset
df = pd.read_csv('data/Data Science Dataset - DATABASE.csv')

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head()

Dataset shape: (81, 15)

First few rows:


Unnamed: 0,Date,Day of the Week,Sleep_Hours,Work_Hours,Study_Hours,Chore_Time_Mins,Distraction_Time_Mins,Travel Time (Hours),Mode of Transport,Music_Time_Hours,Main_Music_Genre,Tasks_Completed,Mood_Rating,Focus_Rating,Notes
0,11-19-25,Wednesday,5.0,10.0,3.0,65.0,127.0,2.8,Public Transport,2.0,"Pop, R&B",8.0,4.0,3.0,"Done finishing admin, Need to layout design sa..."
1,11-20-25,Thursday,5.5,7.0,5.0,60.0,252.0,1.3,"Public Transport, Motorcycle",1.0,"Pop, News, Jazz",4.0,3.0,4.0,"maraming gagawin sa shop, deliveries, payroll,..."
2,11-21-25,Friday,6.0,9.0,4.0,60.0,174.0,1.1,Public Transport,2.0,"Jazz, Classical",5.0,5.0,2.0,"Reformat PC, create designs and payroll ng tao"
3,11-22-25,Monday,6.0,3.0,7.516667,20.0,270.0,2.65,"Public Transport, Motorcycle",1.0,"Jazz, Lofi",4.0,5.0,2.0,
4,11-23-25,Tuesday,3.0,4.0,10.1,60.0,270.0,1.4,Public Transport,4.0,"Classical, Jazz, News, Rock",10.0,4.0,5.0,"Done house clean, PC clean, Repairs, Stocks re..."


In [31]:
# Basic information about the dataset
print("Dataset Info:")
print(df.info())
print("\n" + "="*80)
print("\nMissing Values:")
print(df.isnull().sum())
print("\n" + "="*80)
print("\nDescriptive Statistics:")
df.describe()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   81 non-null     object 
 1   Day of the Week        74 non-null     object 
 2   Sleep_Hours            74 non-null     float64
 3   Work_Hours             74 non-null     float64
 4   Study_Hours            74 non-null     float64
 5   Chore_Time_Mins        74 non-null     float64
 6   Distraction_Time_Mins  74 non-null     float64
 7   Travel Time (Hours)    74 non-null     float64
 8   Mode of Transport      74 non-null     object 
 9   Music_Time_Hours       74 non-null     float64
 10  Main_Music_Genre       73 non-null     object 
 11  Tasks_Completed        74 non-null     float64
 12  Mood_Rating            74 non-null     float64
 13  Focus_Rating           74 non-null     float64
 14  Notes                  61 non-null     object 

Unnamed: 0,Sleep_Hours,Work_Hours,Study_Hours,Chore_Time_Mins,Distraction_Time_Mins,Travel Time (Hours),Music_Time_Hours,Tasks_Completed,Mood_Rating,Focus_Rating
count,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0
mean,6.898649,4.945946,4.596171,106.121622,240.108108,1.778378,4.77027,3.783784,3.432432,3.202703
std,1.942636,3.289218,2.503008,98.313442,83.727308,0.756664,2.888557,1.881928,1.11135,1.281905
min,2.0,0.0,0.0,19.0,70.0,1.0,0.0,0.0,1.0,1.0
25%,6.0,1.0,3.0,45.0,195.0,1.15,2.25,3.0,2.0,2.0
50%,6.0,6.5,4.2,60.0,240.0,1.425,5.0,3.0,4.0,3.0
75%,8.0,8.0,5.958333,120.0,284.75,2.6375,6.75,5.0,4.0,4.0
max,12.0,10.0,11.733333,480.0,500.0,3.0,13.0,10.0,5.0,5.0


## Phase 2.1: Handle Missing Values

In [32]:
# Drop rows with missing dates (per methodology)
print(f"Rows before dropping missing dates: {len(df)}")
df = df.dropna(subset=['Date'])
print(f"Rows after dropping missing dates: {len(df)}")

# Check for rows with all NaN values (completely empty rows)
empty_rows = df[df.isnull().all(axis=1)]
print(f"Completely empty rows: {len(empty_rows)}")
if len(empty_rows) > 0:
    df = df.dropna(how='all')
    print(f"Rows after dropping completely empty rows: {len(df)}")

Rows before dropping missing dates: 81
Rows after dropping missing dates: 81
Completely empty rows: 0


In [33]:
# Identify numeric columns for imputation
numeric_columns = ['Sleep_Hours', 'Work_Hours', 'Study_Hours', 'Chore_Time_Mins', 
                   'Distraction_Time_Mins', 'Travel Time (Hours)', 'Music_Time_Hours', 
                   'Tasks_Completed', 'Mood_Rating', 'Focus_Rating']

# Log missing values before imputation
missing_before = df[numeric_columns].isnull().sum()
print("Missing values before imputation:")
print(missing_before[missing_before > 0])

# Impute missing numeric values with 0 (per methodology)
df[numeric_columns] = df[numeric_columns].fillna(0)

# Verify imputation
missing_after = df[numeric_columns].isnull().sum()
print("\nMissing values after imputation:")
print(missing_after[missing_after > 0])

Missing values before imputation:
Sleep_Hours              7
Work_Hours               7
Study_Hours              7
Chore_Time_Mins          7
Distraction_Time_Mins    7
Travel Time (Hours)      7
Music_Time_Hours         7
Tasks_Completed          7
Mood_Rating              7
Focus_Rating             7
dtype: int64

Missing values after imputation:
Series([], dtype: int64)


## Phase 2.2: Parse and Validate Dates

In [34]:
# Parse Date column to datetime
# Format appears to be MM-DD-YY
df['Date'] = pd.to_datetime(df['Date'], format='%m-%d-%y')

# Validate Day of the Week against computed weekday
df['Computed_Day'] = df['Date'].dt.day_name()
df['Day_Match'] = df['Day of the Week'] == df['Computed_Day']

# Check for mismatches
mismatches = df[~df['Day_Match']]
if len(mismatches) > 0:
    print(f"Found {len(mismatches)} date/day mismatches:")
    print(mismatches[['Date', 'Day of the Week', 'Computed_Day']])
else:
    print("All dates match their day of the week!")

# Add Is_Weekend column for "Weekend Bleed" hypothesis
df['Is_Weekend'] = df['Date'].dt.dayofweek.isin([5, 6])  # Saturday=5, Sunday=6

print(f"\nWeekend days: {df['Is_Weekend'].sum()}")
print(f"Weekday days: {(~df['Is_Weekend']).sum()}")

Found 73 date/day mismatches:
         Date Day of the Week Computed_Day
3  2025-11-22          Monday     Saturday
4  2025-11-23         Tuesday       Sunday
5  2025-11-24       Wednesday       Monday
6  2025-11-25        Thursday      Tuesday
7  2025-11-26          Friday    Wednesday
..        ...             ...          ...
76 2026-02-03             NaN      Tuesday
77 2026-02-04             NaN    Wednesday
78 2026-02-05             NaN     Thursday
79 2026-02-06             NaN       Friday
80 2026-02-07             NaN     Saturday

[73 rows x 3 columns]

Weekend days: 23
Weekday days: 58


## Phase 2.3: Handle Categorical Variables

In [35]:
# Explore categorical variables
print("Mode of Transport unique values:")
print(df['Mode of Transport'].value_counts())
print("\nMain_Music_Genre unique values:")
print(df['Main_Music_Genre'].value_counts())

# Fill missing values in categorical columns with empty string for processing
df['Mode of Transport'] = df['Mode of Transport'].fillna('')
df['Main_Music_Genre'] = df['Main_Music_Genre'].fillna('')

Mode of Transport unique values:
Mode of Transport
Public Transport                47
Public Transport, Motorcycle    27
Name: count, dtype: int64

Main_Music_Genre unique values:
Main_Music_Genre
Lofi                           9
Pop                            6
R&B                            6
Oldies                         5
Classical                      5
Jazz                           5
Rock                           4
R&B, News                      3
Pop, Rock                      3
Pop, R&B                       2
News                           2
Pop, News, Jazz                1
Jazz, Lofi                     1
Lofi, R&B                      1
Pop, Podcast                   1
R&B, Rock                      1
Classical, Jazz, News, Rock    1
Jazz, Classical                1
Podcast, News, Pop             1
Pop, Classical, Jazz           1
Jazz, News, Podcast            1
Oldies, Jazz                   1
Lofi, Pop                      1
Rock, R&B                      1
Podcast, Ne

## Phase 2.3.1: Encode Categorical Variables to Numerical

Convert text/categorical data to numerical format for normalization and clustering.

In [36]:
# Encode Mode of Transport using one-hot encoding
# Split comma-separated values and create binary columns
transport_modes = set()
for transport_str in df['Mode of Transport'].dropna():
    if pd.notna(transport_str) and transport_str.strip():
        modes = [m.strip() for m in str(transport_str).split(',')]
        transport_modes.update(modes)

print("Unique Transport Modes found:", sorted(transport_modes))

# Create one-hot encoded columns for each transport mode
for mode in transport_modes:
    col_name = f'Transport_{mode.replace(" ", "_")}'
    df[col_name] = df['Mode of Transport'].apply(
        lambda x: 1 if pd.notna(x) and mode in str(x) else 0
    )

transport_encoded_cols = [f'Transport_{mode.replace(" ", "_")}' for mode in transport_modes]
print(f"\nCreated {len(transport_encoded_cols)} transport encoding columns:")
print(transport_encoded_cols)

Unique Transport Modes found: ['Motorcycle', 'Public Transport']

Created 2 transport encoding columns:
['Transport_Public_Transport', 'Transport_Motorcycle']


In [37]:
# Encode Main_Music_Genre using one-hot encoding
# Split comma-separated values and create binary columns
music_genres = set()
for genre_str in df['Main_Music_Genre'].dropna():
    if pd.notna(genre_str) and genre_str.strip():
        genres = [g.strip() for g in str(genre_str).split(',')]
        music_genres.update(genres)

print("Unique Music Genres found:", sorted(music_genres))

# Create one-hot encoded columns for each music genre
for genre in music_genres:
    col_name = f'Genre_{genre.replace(" ", "_").replace("&", "and")}'
    df[col_name] = df['Main_Music_Genre'].apply(
        lambda x: 1 if pd.notna(x) and genre in str(x) else 0
    )

genre_encoded_cols = [f'Genre_{genre.replace(" ", "_").replace("&", "and")}' for genre in music_genres]
print(f"\nCreated {len(genre_encoded_cols)} genre encoding columns:")
print(genre_encoded_cols)

# Show summary
print(f"\nTotal encoded categorical columns: {len(transport_encoded_cols) + len(genre_encoded_cols)}")
print(f"  - Transport modes: {len(transport_encoded_cols)}")
print(f"  - Music genres: {len(genre_encoded_cols)}")

Unique Music Genres found: ['Classical', 'Jazz', 'Lofi', 'News', 'OPM', 'Oldies', 'Podcast', 'Pop', 'R&B', 'Rock']

Created 10 genre encoding columns:
['Genre_Rock', 'Genre_OPM', 'Genre_Podcast', 'Genre_Jazz', 'Genre_Lofi', 'Genre_Classical', 'Genre_News', 'Genre_Pop', 'Genre_Oldies', 'Genre_RandB']

Total encoded categorical columns: 12
  - Transport modes: 2
  - Music genres: 10


## Phase 2.4: Feature Selection for Clustering

In [38]:

# Inputs: Sleep_Hours, Music_Time_Hours, Travel Time (Hours)
# Behaviors: Work_Hours, Study_Hours, Chore_Time_Mins, Distraction_Time_Mins, Tasks_Completed
# Categorical (encoded): Transport modes and Music genres (optional)

clustering_features = [
    'Sleep_Hours',           # Input
    'Music_Time_Hours',      # Input
    'Travel Time (Hours)',   # Input
    'Work_Hours',            # Behavior
    'Study_Hours',           # Behavior
    'Chore_Time_Mins',       # Behavior
    'Distraction_Time_Mins', # Behavior
    'Tasks_Completed'        # Behavior
]

# Option to include encoded categorical features in clustering
# Uncomment the line below want to include transport and genre in clustering
# clustering_features.extend(transport_encoded_cols + genre_encoded_cols)

# Create feature matrix for clustering
X = df[clustering_features].copy()

# Targets (excluded from clustering, used for validation)
targets = ['Mood_Rating', 'Focus_Rating']

print("Features selected for clustering:")
print(clustering_features)
print(f"\nFeature matrix shape: {X.shape}")
print("\nFeature statistics:")
X.describe()

Features selected for clustering:
['Sleep_Hours', 'Music_Time_Hours', 'Travel Time (Hours)', 'Work_Hours', 'Study_Hours', 'Chore_Time_Mins', 'Distraction_Time_Mins', 'Tasks_Completed']

Feature matrix shape: (81, 8)

Feature statistics:


Unnamed: 0,Sleep_Hours,Music_Time_Hours,Travel Time (Hours),Work_Hours,Study_Hours,Chore_Time_Mins,Distraction_Time_Mins,Tasks_Completed
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,6.302469,4.358025,1.624691,4.518519,4.198971,96.950617,219.358025,3.45679
std,2.69221,3.071273,0.880487,3.439154,2.721312,98.590301,104.9069,2.091945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,2.0,1.1,1.0,2.25,40.0,175.0,2.0
50%,6.0,4.0,1.4,6.0,4.0,60.0,210.0,3.0
75%,7.0,6.0,2.6,8.0,5.8,100.0,270.0,4.0
max,12.0,13.0,3.0,10.0,11.733333,480.0,500.0,10.0


## Phase 2.5: Scaling

In [39]:
# Normalize encoded categorical features (if you want to include them)
# Note: One-hot encoded features are already 0/1, but we can still scale them
# for consistency with other features

if len(transport_encoded_cols) > 0 or len(genre_encoded_cols) > 0:
    categorical_features = transport_encoded_cols + genre_encoded_cols
    X_categorical = df[categorical_features].copy()
    
    # Apply StandardScaler to categorical features
    scaler_categorical = StandardScaler()
    X_categorical_scaled = scaler_categorical.fit_transform(X_categorical)
    X_categorical_scaled_df = pd.DataFrame(
        X_categorical_scaled, 
        columns=categorical_features, 
        index=X_categorical.index
    )
    
    print("Normalized categorical features:")
    print(f"  Shape: {X_categorical_scaled_df.shape}")
    print(f"  Mean: {X_categorical_scaled_df.mean().mean():.6f}")
    print(f"  Std: {X_categorical_scaled_df.std().mean():.6f}")
    
    # Option to combine with main features
    # Uncomment if you want to include categorical features in clustering
    # X_combined = pd.concat([X_scaled_df, X_categorical_scaled_df], axis=1)
    # print(f"\nCombined feature matrix shape: {X_combined.shape}")
else:
    print("No categorical features to normalize.")

Normalized categorical features:
  Shape: (81, 12)
  Mean: 0.000000
  Std: 1.006231


In [40]:
# Apply StandardScaler (Z-score normalization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=clustering_features, index=X.index)

print("Scaled feature matrix statistics:")
print(X_scaled_df.describe())
print("\nScaled feature means (should be ~0):")
print(X_scaled_df.mean())
print("\nScaled feature std (should be ~1):")
print(X_scaled_df.std())

Scaled feature matrix statistics:
       Sleep_Hours  Music_Time_Hours  Travel Time (Hours)    Work_Hours  \
count    81.000000      8.100000e+01         8.100000e+01  8.100000e+01   
mean      0.000000      7.675616e-17         2.741291e-17  1.260994e-16   
std       1.006231      1.006231e+00         1.006231e+00  1.006231e+00   
min      -2.355588     -1.427805e+00        -1.856715e+00 -1.322032e+00   
25%      -0.113050     -7.725516e-01        -5.996231e-01 -1.029451e+00   
50%      -0.113050     -1.172984e-01        -2.567798e-01  4.334532e-01   
75%       0.260707      5.379548e-01         1.114594e+00  1.018615e+00   
max       2.129488      2.831341e+00         1.571718e+00  1.603777e+00   

        Study_Hours  Chore_Time_Mins  Distraction_Time_Mins  Tasks_Completed  
count  8.100000e+01     8.100000e+01           8.100000e+01     8.100000e+01  
mean  -8.223874e-17    -8.223874e-17          -3.289550e-17     7.949745e-17  
std    1.006231e+00     1.006231e+00           1.0062

## Phase 2.6: Normalize Encoded Categorical Features 


In [41]:
# Normalize encoded categorical features 
# Note: One-hot encoded features are already 0/1, but we can still scale them
# for consistency with other features

if 'transport_encoded_cols' in locals() and 'genre_encoded_cols' in locals():
    if len(transport_encoded_cols) > 0 or len(genre_encoded_cols) > 0:
        categorical_features = transport_encoded_cols + genre_encoded_cols
        X_categorical = df[categorical_features].copy()
        
        # Apply StandardScaler to categorical features
        scaler_categorical = StandardScaler()
        X_categorical_scaled = scaler_categorical.fit_transform(X_categorical)
        X_categorical_scaled_df = pd.DataFrame(
            X_categorical_scaled, 
            columns=categorical_features, 
            index=X_categorical.index
        )
        
        print("Normalized categorical features:")
        print(f"  Shape: {X_categorical_scaled_df.shape}")
        print(f"  Mean: {X_categorical_scaled_df.mean().mean():.6f}")
        print(f"  Std: {X_categorical_scaled_df.std().mean():.6f}")
        
        # Option to combine with main features
        # Uncomment if  want to include categorical features in clustering
        # X_combined = pd.concat([X_scaled_df, X_categorical_scaled_df], axis=1)
        # print(f"\nCombined feature matrix shape: {X_combined.shape}")
    else:
        print("No categorical features to normalize.")
else:
    print("Categorical features not yet encoded. Run encoding cells first.")

Normalized categorical features:
  Shape: (81, 12)
  Mean: 0.000000
  Std: 1.006231


## Save Processed Data

Save the cleaned and scaled data for use in subsequent notebooks.

In [42]:
# Save processed data
# Save cleaned dataframe
df.to_csv('data/cleaned_data.csv', index=False)

# Save scaled features
X_scaled_df.to_csv('data/scaled_features.csv', index=False)

# Save feature names and targets for reference
import json
metadata = {
    'clustering_features': clustering_features,
    'targets': targets,
    'n_samples': len(df)
}
with open('data/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("Processed data saved successfully!")
print(f"- Cleaned data: data/cleaned_data.csv ({len(df)} rows)")
print(f"- Scaled features: data/scaled_features.csv")
print(f"- Metadata: data/metadata.json")

Processed data saved successfully!
- Cleaned data: data/cleaned_data.csv (81 rows)
- Scaled features: data/scaled_features.csv
- Metadata: data/metadata.json
