# Phase 1 & 2: Data Loading, Cleaning, and Preprocessing

This notebook covers:
- **Phase 1**: Environment setup and data inspection
- **Phase 2**: Data cleaning, preprocessing, and feature selection for clustering

In [None]:
# Import required libraries
import json
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Notebook-wide display and plotting defaults
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)  # both pandas display options are set to None
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## Phase 1.2: Load and Inspect Data

In [None]:
# Read the raw CSV export into the working DataFrame
df = pd.read_csv('data/Data Science Dataset - DATABASE.csv')

# Report (rows, columns), then preview the leading rows as the cell output
raw_shape = df.shape
print("Dataset shape:", raw_shape)
print("\nFirst few rows:")
df.head(5)

In [None]:
# Basic information about the dataset: schema, null counts, summary statistics
separator = "\n" + "=" * 80

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(separator)
print("\nMissing Values:")
print(df.isnull().sum())
print(separator)
print("\nDescriptive Statistics:")
# Last expression of the cell, so the summary table is rendered as the output
df.describe()

## Phase 2.1: Handle Missing Values

In [None]:
# Check for rows with all NaN values (completely empty rows).
# This must run BEFORE the Date filter: once rows without a Date are removed,
# no row can be entirely NaN, so checking afterwards would always report 0.
empty_rows = df[df.isnull().all(axis=1)]
print(f"Completely empty rows: {len(empty_rows)}")
if len(empty_rows) > 0:
    df = df.dropna(how='all')
    print(f"Rows after dropping completely empty rows: {len(df)}")

# Drop rows with missing dates (per methodology)
print(f"Rows before dropping missing dates: {len(df)}")
df = df.dropna(subset=['Date'])
print(f"Rows after dropping missing dates: {len(df)}")

In [None]:
# Numeric columns whose gaps are filled in this step
numeric_columns = [
    'Sleep_Hours',
    'Work_Hours',
    'Study_Hours',
    'Chore_Time_Mins',
    'Distraction_Time_Mins',
    'Travel Time (Hours)',
    'Music_Time_Hours',
    'Tasks_Completed',
    'Mood_Rating',
    'Focus_Rating',
]

# Per-column null counts before filling; only columns that have gaps are shown
missing_before = df[numeric_columns].isna().sum()
print("Missing values before imputation:")
print(missing_before.loc[missing_before > 0])

# Fill the gaps with 0 (per methodology)
df[numeric_columns] = df[numeric_columns].fillna(0)

# Recount the nulls in the same columns after filling
missing_after = df[numeric_columns].isna().sum()
print("\nMissing values after imputation:")
print(missing_after.loc[missing_after > 0])

## Phase 2.2: Parse and Validate Dates

In [None]:
# Convert the Date strings to datetime; the format is taken to be MM-DD-YY
df['Date'] = pd.to_datetime(df['Date'], format='%m-%d-%y')

# Cross-check the recorded weekday name against the one derived from the date
df['Computed_Day'] = df['Date'].dt.day_name()
df['Day_Match'] = df['Day of the Week'].eq(df['Computed_Day'])

# Report any rows where the two weekday names disagree
mismatches = df.loc[~df['Day_Match']]
if mismatches.empty:
    print("All dates match their day of the week!")
else:
    print(f"Found {len(mismatches)} date/day mismatches:")
    print(mismatches[['Date', 'Day of the Week', 'Computed_Day']])

# Flag weekends for the "Weekend Bleed" hypothesis (Saturday=5, Sunday=6)
df['Is_Weekend'] = df['Date'].dt.dayofweek >= 5

# Every row is either a weekend or a weekday, so the second count is the remainder
weekend_days = df['Is_Weekend'].sum()
print(f"\nWeekend days: {weekend_days}")
print(f"Weekday days: {len(df) - weekend_days}")

## Phase 2.3: Handle Categorical Variables

In [None]:
# Print the value distribution of each categorical column under its own header
for header, column in (
    ("Mode of Transport unique values:", 'Mode of Transport'),
    ("\nMain_Music_Genre unique values:", 'Main_Music_Genre'),
):
    print(header)
    print(df[column].value_counts())

# Note: per methodology, the categorical variables are left out of clustering
# but stay in the frame for exploratory analysis and hypothesis testing.
# They can be one-hot encoded later if that turns out to be needed.

## Phase 2.4: Feature Selection for Clustering

In [None]:
# Features for clustering, grouped by their role in the methodology
input_features = ['Sleep_Hours', 'Music_Time_Hours', 'Travel Time (Hours)']
behavior_features = [
    'Work_Hours',
    'Study_Hours',
    'Chore_Time_Mins',
    'Distraction_Time_Mins',
    'Tasks_Completed',
]

# Inputs first, then behaviors; this order fixes the column order of X
clustering_features = input_features + behavior_features

# Feature matrix for clustering, copied so it is independent of df
X = df.loc[:, clustering_features].copy()

# Targets (excluded from clustering, used for validation)
targets = ['Mood_Rating', 'Focus_Rating']

print("Features selected for clustering:")
print(clustering_features)
print(f"\nFeature matrix shape: {X.shape}")
print("\nFeature statistics:")
X.describe()

## Phase 2.5: Scaling

In [None]:
# Apply StandardScaler (Z-score normalization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Re-wrap the scaled array so it keeps X's column names and row index
X_scaled_df = pd.DataFrame(X_scaled, columns=clustering_features, index=X.index)

print("Scaled feature matrix statistics:")
# Note: the "std" row of describe() is the sample std (ddof=1), so it can sit
# slightly above 1 on small samples.
print(X_scaled_df.describe())
print("\nScaled feature means (should be ~0):")
print(X_scaled_df.mean())
print("\nScaled feature std (should be ~1):")
# StandardScaler normalises with the population std (ddof=0), whereas pandas'
# std() defaults to ddof=1; request ddof=0 so this check really reports ~1.
print(X_scaled_df.std(ddof=0))

## Save Processed Data

Save the cleaned and scaled data for use in subsequent notebooks.

In [None]:
# Save processed data
# Each output location is defined once so the writes and the summary printed
# below cannot drift apart.
cleaned_path = 'data/cleaned_data.csv'
scaled_path = 'data/scaled_features.csv'
metadata_path = 'data/metadata.json'

# Both frames are written without their index
df.to_csv(cleaned_path, index=False)
X_scaled_df.to_csv(scaled_path, index=False)

# Save feature names and targets for reference
metadata = {
    'clustering_features': clustering_features,
    'targets': targets,
    'n_samples': len(df)
}
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print("Processed data saved successfully!")
print(f"- Cleaned data: {cleaned_path} ({len(df)} rows)")
print(f"- Scaled features: {scaled_path}")
print(f"- Metadata: {metadata_path}")