In [1]:
#importing librabries
import json
import pandas as pd
from pathlib import Path

In [2]:
#Pointing to my JSON data folder
DATA_DIR = Path("C:/Users/bisar/My fitbit dashboard project/Global Export Data")
DATA_DIR

WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data')

In [3]:
#Checking my JSON data
all_json_files = list(DATA_DIR.glob("*.json"))
all_json_files

[WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/badge.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-05-04.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-06-03.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-07-03.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-08-02.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-09-01.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-10-01.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-10-31.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/calories-2025-11-30.json'),
 WindowsPath('C:/Users/bisar/My fitbit dashboard project/Global Export Data/ca

###Problem exsists with with name convention for every file since the file names end with date at last###

In [6]:
from collections import defaultdict
metric_files = defaultdict(list)

In [7]:
metric_files

defaultdict(list, {})

In [8]:
SKIP_FILES = {"badge.json", "height-2025-05-04.json", "weight-2025-05-04.json"} 
for file in all_json_files:
    if file.name in SKIP_FILES:              #file.name takes the last part of the WindowsPath which is the name of the file
        continue                               #skips the badge.json coz we don't need it
    metric_name = file.stem.split("-")[0]      #splits file name and takes only the first part for a clean name
    metric_files[metric_name].append(file)     #.append(file) puts the file in the list

metric_files.keys() #just showing the keys in the metric_files dic

dict_keys(['calories', 'distance', 'lightly_active_minutes', 'moderately_active_minutes', 'resting_heart_rate', 'sedentary_minutes', 'steps', 'very_active_minutes'])

In [9]:
def ingest_fitbit_json(file_path: Path) -> pd.DataFrame:
    try:
        with open(file_path, "r") as f:   # Opens the json file safely and converts it from JSON to python object     
            data = json.load(f)

        if not isinstance(data, list):    # This one checks if the datatype for data is list or not, if other than list then it's an error
            raise ValueError("Expected list of records")

        df = pd.DataFrame(data)

        expected_cols = {"dateTime", "value"}
        if not expected_cols.issubset(df.columns):   #checks if the columns of dataframe are "dateTime" and "value"
            raise ValueError("Invalid schema")

        df["source_file"] = file_path.name     #Adding new "source_file" with names of the source file to trace back to the data where it came from

        return df

    except Exception as e:
        print(f"[FAILED] {file_path.name} | {e}")
        return pd.DataFrame()

        

In [10]:
raw_data = {}

for metric, files in metric_files.items():
    dfs = []

    for file in files:
        df = ingest_fitbit_json(file)
        if not df.empty:
            dfs.append(df)

        if dfs:
            combined_df = pd.concat(dfs, ignore_index = True)
            combined_df["metric"] = metric
            raw_data[metric] = combined_df

            print(f"[OK] {metric}: {combined_df.shape}")

[OK] calories: (43200, 4)
[OK] calories: (86400, 4)
[OK] calories: (129600, 4)
[OK] calories: (172800, 4)
[OK] calories: (216000, 4)
[OK] calories: (259200, 4)
[OK] calories: (302400, 4)
[OK] calories: (345600, 4)
[OK] calories: (351182, 4)
[OK] distance: (5123, 4)
[OK] distance: (11654, 4)
[OK] distance: (17278, 4)
[OK] distance: (23774, 4)
[OK] distance: (29386, 4)
[OK] distance: (34296, 4)
[OK] distance: (38981, 4)
[OK] distance: (44069, 4)
[OK] distance: (44615, 4)
[OK] lightly_active_minutes: (30, 4)
[OK] lightly_active_minutes: (60, 4)
[OK] lightly_active_minutes: (90, 4)
[OK] lightly_active_minutes: (120, 4)
[OK] lightly_active_minutes: (150, 4)
[OK] lightly_active_minutes: (180, 4)
[OK] lightly_active_minutes: (210, 4)
[OK] lightly_active_minutes: (240, 4)
[OK] lightly_active_minutes: (270, 4)
[OK] moderately_active_minutes: (30, 4)
[OK] moderately_active_minutes: (60, 4)
[OK] moderately_active_minutes: (90, 4)
[OK] moderately_active_minutes: (120, 4)
[OK] moderately_active_min

In [None]:
raw_data

**Cleaning and saving Calories dataframe**

In [11]:
df = raw_data["calories"].copy()

In [12]:
df.head()

Unnamed: 0,dateTime,value,source_file,metric
0,05/04/25 00:00:00,0.99,calories-2025-05-04.json,calories
1,05/04/25 00:01:00,0.99,calories-2025-05-04.json,calories
2,05/04/25 00:02:00,0.99,calories-2025-05-04.json,calories
3,05/04/25 00:03:00,0.99,calories-2025-05-04.json,calories
4,05/04/25 00:04:00,0.99,calories-2025-05-04.json,calories


In [13]:
df["dateTime"] = pd.to_datetime(df["dateTime"], errors = "coerce")

  df["dateTime"] = pd.to_datetime(df["dateTime"], errors = "coerce")


In [15]:
df["dateTime"].head()

0   2025-05-04 00:00:00
1   2025-05-04 00:01:00
2   2025-05-04 00:02:00
3   2025-05-04 00:03:00
4   2025-05-04 00:04:00
Name: dateTime, dtype: datetime64[ns]

In [16]:
df["dateTime"].dtype

dtype('<M8[ns]')

In [17]:
df["value"] = pd.to_numeric(df["value"], errors="coerce")

In [22]:
df = df.dropna(subset=["dateTime", "value"])

In [23]:
df = df.sort_values("dateTime").reset_index(drop=True)

In [24]:
df.head()

Unnamed: 0,dateTime,value,source_file,metric
0,2025-05-04 00:00:00,0.99,calories-2025-05-04.json,calories
1,2025-05-04 00:01:00,0.99,calories-2025-05-04.json,calories
2,2025-05-04 00:02:00,0.99,calories-2025-05-04.json,calories
3,2025-05-04 00:03:00,0.99,calories-2025-05-04.json,calories
4,2025-05-04 00:04:00,0.99,calories-2025-05-04.json,calories


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351182 entries, 0 to 351181
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   dateTime     351182 non-null  datetime64[ns]
 1   value        351182 non-null  float64       
 2   source_file  351182 non-null  object        
 3   metric       351182 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 10.7+ MB


In [26]:
cleaned_data = {}
cleaned_data["calories"] = df

In [32]:
cleaned_data["calories"].to_parquet("C:/Users/bisar/My fitbit dashboard project/Cleaned/calories.parquet", index=False)

**Cleaning and saving distance dataframe**

In [12]:
df_1 = raw_data["distance"].copy()

In [31]:
df_1.head()

Unnamed: 0,dateTime,value,source_file,metric
0,01/01/26 03:55:00,69,distance-2025-12-30.json,distance
1,01/01/26 03:56:00,411,distance-2025-12-30.json,distance
2,01/01/26 03:57:00,137,distance-2025-12-30.json,distance
3,01/01/26 03:58:00,1370,distance-2025-12-30.json,distance
4,01/01/26 05:33:00,69,distance-2025-12-30.json,distance


In [32]:
df_1["dateTime"] = pd.to_datetime(df_1["dateTime"], errors = "coerce")

  df_1["dateTime"] = pd.to_datetime(df_1["dateTime"], errors = "coerce")


In [33]:
df_1["value"] = pd.to_numeric(df_1["value"], errors="coerce")

In [34]:
df_1 = df_1.dropna(subset=["dateTime", "value"])

In [35]:
df_1 = df_1.sort_values("dateTime").reset_index(drop=True)

In [36]:
df_1.head()

Unnamed: 0,dateTime,value,source_file,metric
0,2025-05-05 18:06:00,69,distance-2025-05-04.json,distance
1,2025-05-05 18:07:00,480,distance-2025-05-04.json,distance
2,2025-05-05 18:34:00,137,distance-2025-05-04.json,distance
3,2025-05-05 18:35:00,1028,distance-2025-05-04.json,distance
4,2025-05-05 19:20:00,69,distance-2025-05-04.json,distance


In [37]:
df_1["dateTime"].dtype

dtype('<M8[ns]')

In [38]:
cleaned_data = {}
cleaned_data["distance"] = df_1

In [39]:
cleaned_data["distance"].to_parquet("C:/Users/bisar/My fitbit dashboard project/Cleaned/distance.parquet", index=False)

In [40]:
cleaned_data["distance"].to_csv("C:/Users/bisar/My fitbit dashboard project/Cleaned/distance.csv", index=False)