In [3]:
from google.colab import drive

# This command initiates the authentication process
drive.mount('/content/drive')

Mounted at /content/drive


# **1.Load datasets**

In [4]:
import pandas as pd

# Load files safely
sleep_df = pd.read_csv("/content/minuteSleep_merged.csv", low_memory=False)
steps_df = pd.read_csv("/content/minuteStepsNarrow_merged.csv", low_memory=False)
hr_df = pd.read_csv(
    "/content/heartrate_seconds_merged.csv",
    on_bad_lines="skip",
    low_memory=False
)

# Rename columns
sleep_df = sleep_df.rename(columns={"date": "Time", "value": "Sleep"})
steps_df = steps_df.rename(columns={"ActivityMinute": "Time"})
hr_df = hr_df.rename(columns={"Value": "HeartRate"})

# Fix datatypes
for df in [sleep_df, steps_df, hr_df]:
    df["Id"] = df["Id"].astype(str)

# Convert time
sleep_df["Time"] = pd.to_datetime(sleep_df["Time"], errors="coerce")
steps_df["Time"] = pd.to_datetime(steps_df["Time"], errors="coerce")
hr_df["Time"] = pd.to_datetime(hr_df["Time"], errors="coerce").dt.floor("min")

# Drop bad rows
sleep_df = sleep_df.dropna(subset=["Time"])
steps_df = steps_df.dropna(subset=["Time"])
hr_df = hr_df.dropna(subset=["Time"])

# AGGREGATE FIRST (THIS IS WHAT WAS MISSING)
steps_df = steps_df.groupby(
    ["Id", "Time"], as_index=False
)["Steps"].sum()

sleep_df = sleep_df.groupby(
    ["Id", "Time"], as_index=False
)["Sleep"].max()

hr_df = hr_df.groupby(
    ["Id", "Time"], as_index=False
)["HeartRate"].mean()

# NOW MERGE (ONE ROW PER MINUTE)
merged = steps_df.merge(
    sleep_df, on=["Id", "Time"], how="inner"
).merge(
    hr_df, on=["Id", "Time"], how="inner"
)

# Final cleanup
merged = merged.dropna().sort_values(["Id", "Time"])

# Save output
merged.to_csv("/content/fitbit_minute_OPTIMAL.csv", index=False)

print("FINAL OPTIMAL DATASET")
print("Shape:", merged.shape)


  steps_df["Time"] = pd.to_datetime(steps_df["Time"], errors="coerce")


FINAL OPTIMAL DATASET
Shape: (1095, 5)


In [5]:
d=pd.read_csv("/content/fitbit_minute_OPTIMAL.csv")
d

Unnamed: 0,Id,Time,Steps,Sleep,HeartRate
0,4020332650,2016-04-12 00:00:00,0,1,63.500000
1,4020332650,2016-04-12 00:01:00,0,1,65.913043
2,4020332650,2016-04-12 00:02:00,0,1,67.000000
3,4020332650,2016-04-12 00:03:00,0,1,67.000000
4,4020332650,2016-04-12 00:04:00,0,1,67.000000
...,...,...,...,...,...
1090,8792009665,2016-04-12 09:55:00,0,1,55.200000
1091,8792009665,2016-04-12 09:56:00,0,1,55.000000
1092,8792009665,2016-04-12 09:57:00,0,1,57.333333
1093,8792009665,2016-04-12 09:58:00,0,1,56.333333


# **Milestone 1: Data Collection and Preprocessing**

Objective:

The goal of this milestone is to implement a data ingestion and preprocessing pipeline for fitness tracker data. We will:

1.Ingest raw CSV files (Heart Rate, Steps, Sleep).

2.Validate the schema and clean column names.

3.Normalize all timestamps to UTC.

4.Align disparate data frequencies (seconds vs. minutes) to a consistent 1-minute interval.

5.Handle missing values using interpolation and zero-filling.


# **1.Data Ingestion & Schema Validation**

In [6]:
import pandas as pd

def load_and_validate(filepath):
    df = pd.read_csv(filepath, low_memory=False)

    required_cols = {"Id", "Time", "Steps", "Sleep", "HeartRate"}
    if not required_cols.issubset(df.columns):
        raise ValueError("Invalid schema: Missing required columns")

    return df

df = load_and_validate("/content/fitbit_minute_OPTIMAL.csv")
print("Data loaded & schema validated")


Data loaded & schema validated


# **2.Convert timestamps(UTC):**

In [7]:
def normalize_timestamps(df):
    df["Time"] = pd.to_datetime(df["Time"], errors="coerce", utc=True)
    df = df.dropna(subset=["Time"])
    return df

df = normalize_timestamps(df)
print("Timestamps normalized to UTC")


Timestamps normalized to UTC


# **3.Data Type Cleaning**

In [8]:
def clean_dtypes(df):
    df["Id"] = df["Id"].astype(str)
    df["Steps"] = pd.to_numeric(df["Steps"], errors="coerce")
    df["Sleep"] = pd.to_numeric(df["Sleep"], errors="coerce")
    df["HeartRate"] = pd.to_numeric(df["HeartRate"], errors="coerce")
    return df

df = clean_dtypes(df)
print("Data types cleaned")


Data types cleaned


# 4.Handling Missing Values

In [9]:
def handle_missing_values(df):
    df["Steps"] = df["Steps"].fillna(0)                  # No movement
    df["Sleep"] = df["Sleep"].fillna(0)                  # Awake
    df["HeartRate"] = df["HeartRate"].fillna(
        df["HeartRate"].median()                          # Robust estimate
    )
    return df

df = handle_missing_values(df)
print("Missing values handled")


Missing values handled


# 5.Resample & Align to 1-Minute

In [13]:
def align_to_minute(df):
    df = (
        df.set_index("Time")
          .groupby("Id")
          .resample("1min")
          .agg({
              "Steps": "sum",
              "Sleep": "max",
              "HeartRate": "mean"
          })
          .reset_index()
    )
    return df

df = align_to_minute(df)
print("Data aligned to 1-minute intervals")


Data aligned to 1-minute intervals


# 6.Final Cleanup & Ordering

In [14]:
def final_cleanup(df):
    df = df.dropna()
    df = df.sort_values(["Id", "Time"])
    return df

df = final_cleanup(df)
print("Final cleanup completed")
df.to_csv("/content/fitbit_clean_final.csv", index=False)
print("Final dataset saved as fitbit_clean_final.csv")



Final cleanup completed
Final dataset saved as fitbit_clean_final.csv


In [12]:
df

Unnamed: 0,Id,Time,Steps,Sleep,HeartRate
0,4020332650,2016-04-12 00:00:00+00:00,0,1.0,63.500000
1,4020332650,2016-04-12 00:01:00+00:00,0,1.0,65.913043
2,4020332650,2016-04-12 00:02:00+00:00,0,1.0,67.000000
3,4020332650,2016-04-12 00:03:00+00:00,0,1.0,67.000000
4,4020332650,2016-04-12 00:04:00+00:00,0,1.0,67.000000
...,...,...,...,...,...
1127,8792009665,2016-04-12 09:55:00+00:00,0,1.0,55.200000
1128,8792009665,2016-04-12 09:56:00+00:00,0,1.0,55.000000
1129,8792009665,2016-04-12 09:57:00+00:00,0,1.0,57.333333
1130,8792009665,2016-04-12 09:58:00+00:00,0,1.0,56.333333
