In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Use seaborn's "whitegrid" style for the figures drawn later in the notebook.
sns.set_style("whitegrid")

In [None]:
# Load the daily-activity export; the cleaning cell derives `activity_df` from it.
raw_df = pd.read_csv('dailyActivity_merged.csv')

# Column dtypes and non-null counts.
raw_df.info()

# Number of missing entries in each column.
missing_counts = raw_df.isna().sum()
print("\nMissing values:\n", missing_counts)

In [None]:
sleep_df = pd.read_csv('sleepDay_merged.csv')

# Rename only the columns whose names actually change; every other column keeps
# its original name.
activity_df = raw_df.rename(columns={
    'ActivityDate': 'Date',
    'TotalSteps': 'Steps',
    'TotalDistance': 'Distance',
})

sleep_df = sleep_df.rename(columns={
    'SleepDay': 'Date',
    'TotalSleepRecords': 'SleepRecords',
    'TotalTimeInBed': 'TimeInBed',
})

# Parse both date columns. The sleep column also carries a time of day, so it is
# normalized to midnight to line up with the date-only activity key.
activity_df['Date'] = pd.to_datetime(activity_df['Date'], format='%m/%d/%Y')
sleep_df['Date'] = pd.to_datetime(
    sleep_df['Date'], format='%m/%d/%Y %I:%M:%S %p'
).dt.normalize()

# Keep one sleep row per (Id, Date). A repeated key on the right side of a left
# merge would duplicate the matching activity row; the first occurrence wins.
sleep_daily = (
    sleep_df[['Id', 'Date', 'TotalMinutesAsleep']]
    .drop_duplicates(subset=['Id', 'Date'])
)

# Merge: every activity row is kept, sleep minutes are attached where available.
# `validate` makes pandas raise if the sleep key is somehow still non-unique.
merged_df = pd.merge(
    activity_df,
    sleep_daily,
    on=['Id', 'Date'],
    how='left',
    validate='many_to_one',
)

# Impute missing sleep with each user's own mean. The fill is applied to
# TotalMinutesAsleep (not only to the derived SleepHours) so that the dropna()
# below does not throw the imputed rows away again.
user_mean_minutes = merged_df.groupby('Id')['TotalMinutesAsleep'].transform('mean')
merged_df['TotalMinutesAsleep'] = merged_df['TotalMinutesAsleep'].fillna(user_mean_minutes)
merged_df['SleepHours'] = merged_df['TotalMinutesAsleep'] / 60

# Rows that are still incomplete (e.g. users with no sleep records at all, whose
# mean is NaN) are removed. Reassign instead of mutating in place.
merged_df = merged_df.dropna()

merged_df.info()
merged_df.head()

In [None]:
# Summary statistics for the three headline metrics.
summary_columns = ['Steps', 'Calories', 'SleepHours']
print(merged_df[summary_columns].describe())

# Histogram of daily step counts with a KDE overlay.
steps_fig, steps_ax = plt.subplots(figsize=(10, 6))
sns.histplot(merged_df['Steps'], bins=30, kde=True, ax=steps_ax)
steps_ax.set_title('Distribution of Daily Steps')
steps_ax.set_xlabel('Steps')
steps_ax.set_ylabel('Frequency')
plt.show()

# Heatmap of pairwise correlations between the selected metrics.
heatmap_columns = ['Steps', 'Calories', 'SleepHours', 'VeryActiveMinutes', 'SedentaryMinutes']
correlation_matrix = merged_df[heatmap_columns].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Between Health Metrics')
plt.show()

In [None]:
# Step-count bands, left-closed: [0, 7500) Low, [7500, 10000) Moderate,
# [10000, inf) High.
bins = [0, 7500, 10000, float('inf')]
labels = ['Low', 'Moderate', 'High']

# Sleep bands: under 7 h, 7-8 h inclusive, over 8 h.
sleep_hours = merged_df['SleepHours']
conditions = [
    sleep_hours < 7,
    sleep_hours.between(7, 8),
    sleep_hours > 8,
]
outcomes = ['Poor', 'Optimal', 'Good']

# `assign` returns a new frame, so `merged_df` itself is left untouched.
# Rows matching none of the conditions (e.g. NaN SleepHours) get the default.
df_featured = merged_df.assign(
    DayOfWeek=merged_df['Date'].dt.day_name(),
    ActivityLevel=pd.cut(merged_df['Steps'], bins=bins, labels=labels, right=False),
    SleepQuality=np.select(conditions, outcomes, default='Poor'),
)

df_featured.head()

In [10]:
# Write the engineered dataset next to the other project data. The target
# directory is created first so a fresh checkout without ../data does not fail
# with an OSError on save.
output_path = Path('../data/fitbit_data_processed.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df_featured.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path.as_posix()}")