In [None]:
import sys
from pathlib import Path

# Make the local "src" directory importable. The membership test uses the same
# absolute path that gets appended, so re-running this cell never adds a
# duplicate entry to sys.path.
src_path = str(Path.cwd() / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

from sleepiness import SleepinessData

# Data-stream exports to load; this demo lists one file per phase folder.
file_paths = [
    "sleep-data/phase-1-1/data-streams.json",
    "sleep-data/phase-2-1/data-streams.json",
    "sleep-data/phase-3-1/data-streams.json"
]
# OR pass a single path as a plain string instead of a list:
# file_paths = "data/phase-1-1/data-streams.json"

# `sd` is the reader object that the later cells in this notebook call.
sd = SleepinessData(file_paths)

## Participant Data Integration
When loading multiple data folders, the library automatically loads `participant-data.json` from each folder and unifies participants across folders (using email/SSN as identifiers).

In [None]:
# View all participants across all loaded data folders.
# NOTE(review): presumably this is the summary from which IDs such as "P0002"
# (used by the participant heatmap cell) are picked — confirm against the output.
sd.print_participants()

### Data with Participant Info
Iterate through data items enriched with participant information:

In [None]:
# Alternative: look up the participant behind a deployment ID instead of an email.
# for item in sd._get_item_generator():
#     deployment_id = item.get('studyDeploymentId')
#     if deployment_id:
#         participant = sd.get_participant(deployment_id)
#         if participant:
#             print(f"Deployment: {deployment_id[:30]}...")
#             print(f"  Unified ID: {participant.unified_participant_id}")
#             print(f"  Email: {participant.email}")
#             print(f"  Source folder: {participant.source_folder}")
#         break

# Identifier of the demo participant; change it in this one place.
participant_email = "test@example.com"
# NOTE(review): assumes the handle returned by sd.participant() can be reused
# for several calls — confirm against the library.
participant = sd.participant(participant_email)

# Get participant info. The returned value is kept in a name because a notebook
# only echoes the last expression of a cell; anything earlier would be dropped.
participant_info = participant.info()
participant.print_info()

# Get all data for this participant, stopping after a few items to keep the
# demo output short.
max_preview_items = 5
for shown, item in enumerate(participant.all_data(), start=1):
    print(item)
    if shown >= max_preview_items:
        print("Limit output for demo")
        break

# Filter by data type (prints every matching item, without a limit)
for item in participant.all_data("dk.cachet.carp.location"):
    print(item)

# See available fields
available_fields = participant.available_fields()
participant.print_available_fields()

# See data types available
available_data_types = participant.data_types()
participant.print_data_types()

# Get count
item_count = participant.count()

# Get DataFrame
df = participant.dataframe("dk.cachet.carp.stepcount")

# Check if exists (last expression of the cell, so the notebook echoes it)
participant.exists

### DataFrame with Participant Info
Get a DataFrame enriched with participant columns:

In [None]:
# Build the step-count DataFrame that carries the participant columns, then
# preview just those columns when at least one row came back.
df = sd.get_dataframe_with_participants("dk.cachet.carp.stepcount")
participant_columns = ['participant_id', 'participant_email', 'participant_folder']
if not (df is None or df.empty):
    print(df[participant_columns].head())

### Visualize Participant Data on Map
Generate a heatmap aggregating data for a specific participant across all their deployments:

In [None]:
from sleepiness.plotting import LocationVisualizer

# Create visualizer on top of the data loaded into `sd`
viz = LocationVisualizer(sd)

# Plot heatmap for a specific participant (e.g., P0002 who appears in all 3 phases)
# NOTE(review): output_file is presumably the HTML file the map is written to,
# and location_type the data type name of the location records — confirm.
viz.plot_participant_heatmap(
    unified_participant_id="P0002",  # Choose a participant from the summary table
    output_file="participant_heatmap.html",
    location_type="dk.cachet.carp.location"
)

## 1. Schema Discovery
Scan the file to understand the structure of the data.

In [None]:
# Print the schema discovered for the loaded data (the output format is
# defined by the library).
sd.print_schema()

### Generate Type Definitions
You can generate a Python module with dataclasses representing the data schema. This allows for type-safe access to the data, including nested JSON objects.

In [None]:
import importlib
import sleepiness.reader
# Reload the reader module so that edits made to the library source after the
# kernel started are picked up without a restart.
importlib.reload(sleepiness.reader)

# Re-initialize sd to ensure latest code is used: the class is taken from the
# reloaded module rather than from the name imported at the top of the notebook.
sd = sleepiness.reader.SleepinessData(file_paths)
# Write the generated module to generated_types.py.
# NOTE(review): sample_size=500 presumably caps how many items are sampled to
# infer the schema — confirm against the library.
sd.generate_type_definitions(output_file="generated_types.py", sample_size=500)

In [None]:
# Example usage of generated types.
# Best-effort demo: failures are reported as a message instead of a traceback.
try:
    import generated_types
    import importlib
    # Pick up a generated_types.py that was rewritten after an earlier import.
    importlib.reload(generated_types)

    # Read one item and convert. The None default turns "no data loaded" into
    # an explicit message; a bare next() would raise StopIteration, which the
    # handler below would report as an empty "Error: " line.
    item = next(sd._get_item_generator(), None)
    if item is None:
        print("No items available to convert.")
    else:
        obj = generated_types.SleepinessItem.from_dict(item)
        print(f"Converted object type: {type(obj)}")
        if obj.dataStream and obj.dataStream.dataType:
            print(f"Data Stream: {obj.dataStream.dataType.name}")
except ImportError:
    print("Could not import generated_types. Please restart kernel or check file.")
except Exception as e:
    # Include the exception type: some exceptions carry an empty message.
    print(f"Error: {type(e).__name__}: {e}")

In [None]:
import importlib

# Regenerate the type definitions first, then (re)load the module so the object
# below is built from the file that was just written, not from a stale import.
sd.generate_type_definitions(output_file="generated_types.py", sample_size=500)
import generated_types
importlib.reload(generated_types)

# Convert a single item once, after regeneration.
item = next(sd._get_item_generator())
obj = generated_types.SleepinessItem.from_dict(item)

# Type-safe access
print(obj.dataStream.dataType.name)

## 2. Count Items
Count the total number of records in the file.

In [None]:
# Ask the reader for the total number of items and report it.
count = sd.count_items()
print(f"Total items: {count}")

## 3. Grouping Data
Split the large JSON file into smaller files based on a chosen field (the example below groups by participant email rather than by data type).

### Explore Available Fields
You can scan a sample of the data to list all available fields in dot-notation. This is helpful for deciding which field to group by.

In [None]:
# Gather the field names found in a sample of the data and print them as a
# bulleted list under a heading.
fields = sd.list_all_fields(sample_size=500)
report_lines = ["Available fields for grouping:"]
report_lines.extend(f" - {field_name}" for field_name in fields)
print("\n".join(report_lines))

In [None]:
# Name passed to the grouping call.
# NOTE(review): presumably the directory the per-group files are written to — confirm.
output_groups = "output_groups"
# Alternative: group on an arbitrary dot-notation field instead of email.
# sd.group_by_field("dataStream.studyDeploymentId", output_groups)
sd.group_by_email(output_groups)

## 4. Export to JSON
Export a specific data type to a separate JSON file.

In [None]:
# Export only the dk.cachet.carp.heartbeat data type, targeting heartbeat.json.
sd.export_to_json("heartbeat.json", data_type="dk.cachet.carp.heartbeat")

## 5. Convert to Parquet
Convert the data to Parquet format for efficient storage and loading.

In [None]:
# Directory handed to the converter; later cells pass the same name to
# sd.get_dataframe() as its second argument.
parquet_dir = "output_parquet"
sd.convert_to_parquet(parquet_dir)

## 6. Load DataFrame
Load data into a pandas DataFrame, utilizing the Parquet files if available.

In [None]:
# Load completed-task data (the type actually requested below), passing the
# Parquet directory from the previous step.
df = sd.get_dataframe("dk.cachet.carp.completedtask", parquet_dir)

# The result is treated as possibly None, so guard before using it.
if df is not None:
    print(f"Loaded {len(df)} records")
    # display() is the rich-output helper that IPython/Jupyter makes available
    # in notebooks, which is why it is not imported here.
    display(df.head())

In [None]:
# Inspect the `measurement` value of the row at position 313 (0-based) — this
# is not the first row.
# NOTE(review): raises IndexError when df has fewer than 314 rows, and fails
# outright if df is None.
df.iloc[313].measurement

## 7. Plotting
Generate a heatmap of user locations and overlay step count data.

In [None]:
from sleepiness.plotting import LocationVisualizer

# Initialize visualizer
viz = LocationVisualizer(sd)

# Pick a study deployment ID (you can find one from the grouping step or list_all_fields).
# For demo purposes a hardcoded ID is used; replace it with one that exists in
# your data.
study_deployment_id = "0efd5a7f-6428-48db-8099-8d65a62606b4" # Example ID

# Generate heatmap
# Note: Ensure you have 'dk.cachet.carp.location' and 'dk.cachet.carp.stepcount' data available
# (these are the type names passed below).
# You might need to run convert_to_parquet first if you haven't.


viz.plot_user_heatmap(
    study_deployment_id=study_deployment_id,
    location_type="dk.cachet.carp.location", # Adjust type name if different
    step_type="dk.cachet.carp.stepcount",       # Adjust type name if different
    output_file="user_heatmap.html"
)

# Display the map in the notebook
# from IPython.display import IFrame
# IFrame(src='user_heatmap.html', width=700, height=600)

### Plotting with Type-Safe Objects
You can also convert the data to type-safe objects and pass them directly to the visualizer. This is useful if you want to manipulate the objects before plotting.

In [None]:
# pandas is needed for the empty-DataFrame fallback below. It is imported here
# so the cell runs on its own; the earlier cells shown do not import it as `pd`,
# which made the fallback raise NameError whenever step data was missing.
import pandas as pd

# 1. Get DataFrames
df_loc = sd.get_dataframe("dk.cachet.carp.location", parquet_dir)
df_steps = sd.get_dataframe("dk.cachet.carp.stepcount", parquet_dir)

# 2. Filter by User
# Using the same ID as above
if df_loc is not None and not df_loc.empty:
    df_loc_user = df_loc[df_loc['studyDeploymentId'] == study_deployment_id]
    # Step data is optional: fall back to an empty frame so the conversion
    # below simply yields no step items.
    df_steps_user = df_steps[df_steps['studyDeploymentId'] == study_deployment_id] if df_steps is not None else pd.DataFrame()

    # 3. Convert to Objects
    # Note: generated_types.SleepinessItem.from_dict expects a dictionary structure matching the JSON.
    # If df_loc comes from Parquet, it might have nested columns as dicts (if read correctly) or flat columns.
    # Let's assume it has nested columns or we convert it.

    # If the dataframe has nested dicts (e.g. 'measurement' column contains dicts):
    location_items = [generated_types.SleepinessItem.from_dict(row) for row in df_loc_user.to_dict('records')]
    step_items = [generated_types.SleepinessItem.from_dict(row) for row in df_steps_user.to_dict('records')]

    print(f"Converted {len(location_items)} location items and {len(step_items)} step items.")

    # 4. Plot
    viz.plot_heatmap_from_items(
        location_items=location_items,
        step_items=step_items,
        output_file="user_heatmap_objects.html"
    )

    # Display
    # IFrame(src='user_heatmap_objects.html', width=700, height=600)
else:
    print("No data found to plot.")