In [65]:
# manually copy all the data to one file with a subject name
# add =TEXT(A1, "mm/dd/yyyy hh:mm:ss")
# add the record start time from sys sheet (manually) for each subject

In [40]:
import pandas as pd
from datetime import datetime, timedelta

# Load input data from CSV (Actual Systolic file)
df = pd.read_csv("/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_v2.csv", delimiter="\t")

df['TimeStart'] = pd.to_datetime(df['TimeStart'], format="%m/%d/%Y %H:%M:%S")
df['TimeEnd'] = pd.to_datetime(df['TimeEnd'], format="%m/%d/%Y %H:%M:%S")
df['reco_start'] = pd.to_datetime(df['reco_start'], format="%m/%d/%Y %H:%M:%S")
df['reco_end'] = pd.to_datetime(df['reco_end'], format="%m/%d/%Y %H:%M:%S")

time_diff_seconds = (df["TimeStart"].max() - df["TimeStart"].min()).total_seconds()
print(time_diff_seconds, "Seconds between min and max time")

time_diff_per_subject = (
    df.groupby('Subject')['TimeStart']
      .agg(lambda x: (x.max() - x.min()).total_seconds() / 60) 
      .reset_index()
      .rename(columns={'TimeStart': 'time_diff_seconds'})
)

print(time_diff_per_subject)


14123284.0 Seconds between min and max time
    Subject  time_diff_seconds
0         1         707.466667
1         2         584.950000
2         4         328.066667
3         5         564.200000
4         6         705.816667
5         7         667.750000
6         8         499.733333
7         9         506.583333
8        10         734.533333
9        11         448.716667
10       12         465.833333
11       13         403.533333
12       14         544.116667
13       15         443.783333
14       16         348.900000
15       18         480.816667
16       19         488.216667
17       20         530.400000
18       23         570.966667
19       25         156.433333
20       26         128.233333
21       27         384.466667
22       28         525.850000
23       30         545.516667


In [41]:
import pandas as pd
from datetime import timedelta

results = []
for subject, grp in df.groupby('Subject'):
    subject_seconds = 0
    start = grp['reco_start'].iloc[0]
    end   = grp['reco_end'].iloc[0]
    t = start

    while t <= end:
        subject_seconds += 1

        hit = grp[(grp['TimeStart'] <= t) & (grp['TimeEnd']   >= t)]
        if not hit.empty:
            row   = hit.iloc[0]
            val   = row['Value']
            ev    = row['Event name']
        else:
            val, ev, stage = None, None, None

        results.append({
            'Subject':     subject,
            'Timestamp':   t,
            'Value':       val,
            'Event name':  ev
        })

        t += timedelta(seconds=1)

    print(f"Subject {subject}: expanded {subject_seconds} seconds")

expanded_df = pd.DataFrame(results)

#Save output to CSV
output_file = "/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_expanded_v3.csv"
expanded_df.to_csv(output_file, index=False, na_rep="None") 




Subject 1: expanded 42600 seconds
Subject 2: expanded 36064 seconds
Subject 4: expanded 20883 seconds
Subject 5: expanded 34657 seconds
Subject 6: expanded 42992 seconds
Subject 7: expanded 41340 seconds
Subject 8: expanded 31516 seconds
Subject 9: expanded 31802 seconds
Subject 10: expanded 44791 seconds
Subject 11: expanded 27298 seconds
Subject 12: expanded 30240 seconds
Subject 13: expanded 25523 seconds
Subject 14: expanded 33061 seconds
Subject 15: expanded 26970 seconds
Subject 16: expanded 24572 seconds
Subject 18: expanded 32401 seconds
Subject 19: expanded 29610 seconds
Subject 20: expanded 32611 seconds
Subject 23: expanded 36809 seconds
Subject 25: expanded 29882 seconds
Subject 26: expanded 35880 seconds
Subject 27: expanded 23821 seconds
Subject 28: expanded 31917 seconds
Subject 30: expanded 35551 seconds


In [42]:

expanded_df_re = pd.read_csv("/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_expanded_v3.csv", delimiter=",")
expanded_df_re.head(3)



Unnamed: 0,Subject,Timestamp,Value,Event name
0,1,2024-10-31 22:21:30,,
1,1,2024-10-31 22:21:31,,
2,1,2024-10-31 22:21:32,,


In [13]:
import pandas as pd
from csv import QUOTE_NONE

# 1) Load the original events file (v2)
df_v2 = df

df_expanded = expanded_df_re

# 4) Compute unique event counts per subject
counts_v2 = (
    df_v2
    .groupby('Subject')['Event name']
    .nunique()
    .reset_index(name='unique_events_v2')
)

df_expanded_events = df_expanded[
    df_expanded['Event name'].notna() &
    (df_expanded['Event name'] != '') &
    (df_expanded['Event name'] != 'None')
]
counts_expanded = (
    df_expanded_events
    .groupby('Subject')['Event name']
    .nunique()
    .reset_index(name='unique_events_expanded_v3')
)

# 5) Merge and compare
comparison = counts_v2.merge(
    counts_expanded,
    on='Subject',
    how='outer'
).fillna(0)

comparison['unique_events_v2'] = comparison['unique_events_v2'].astype(int)
comparison['unique_events_expanded_v3'] = comparison['unique_events_expanded_v3'].astype(int)

# 6) Output the result
print(comparison.sort_values('Subject').to_string(index=False))

 Subject  unique_events_v2  unique_events_expanded_v3
       1                 5                          5
       2                 5                          5
       4                 3                          3
       5                 3                          3
       6                 5                          5
       7                 5                          5
       8                 5                          5
       9                 4                          4
      10                 4                          4
      11                 5                          5
      12                 4                          4
      13                 3                          3
      14                 5                          5
      15                 4                          4
      16                 3                          3
      18                 3                          3
      19                 5                          5
      20                 5  

In [67]:
# Consolidate in Excel

In [14]:
import pandas as pd

expanded_df_re = pd.read_csv("/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_expanded_v3.csv", delimiter=",")

df = expanded_df_re
expanded_df_re.head(3)


Unnamed: 0,Subject,Timestamp,Value,Event name
0,1,2024-10-31 22:21:30,,
1,1,2024-10-31 22:21:31,,
2,1,2024-10-31 22:21:32,,


Rank

In [7]:
import pandas as pd

df['Rank'] = pd.NA

# 3. Iterate, resetting per subject and treating the first event in each block
current_rank     = 0
prev_subject     = None
prev_event_flag  = False

for idx, row in df.iterrows():
    subj        = row['Subject']
    event_flag  = pd.notna(row['Event name'])
    
    # Reset when subject changes
    if subj != prev_subject:
        current_rank    = 0
        prev_event_flag = False
    
    # If this row is the first event of a new block, bump rank
    if event_flag and not prev_event_flag:
        current_rank += 1
    
    # Assign rank
    df.at[idx, 'Rank'] = current_rank
    
    # Update trackers
    prev_subject    = subj
    prev_event_flag = event_flag

# 4. Convert Rank to integer type
df['Rank'] = df['Rank'].astype('Int64')

# 5. Preview
df.head(20)

Unnamed: 0,Subject,Timestamp,Value,Event name,Rank
0,1,2024-10-31 22:21:30,,,0
1,1,2024-10-31 22:21:31,,,0
2,1,2024-10-31 22:21:32,,,0
3,1,2024-10-31 22:21:33,,,0
4,1,2024-10-31 22:21:34,,,0
5,1,2024-10-31 22:21:35,,,0
6,1,2024-10-31 22:21:36,,,0
7,1,2024-10-31 22:21:37,,,0
8,1,2024-10-31 22:21:38,,,0
9,1,2024-10-31 22:21:39,,,0


pre and post

In [15]:
import numpy as np
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format='%Y-%m-%d %H:%M:%S')

# 3. Iterate, resetting per subject and treating the first event in each block
current_rank     = 0
prev_subject     = None
prev_event_flag  = False

for idx, row in df.iterrows():
    subj        = row['Subject']
    event_flag  = pd.notna(row['Event name'])
    
    # Reset when subject changes
    if subj != prev_subject:
        current_rank    = 0
        prev_event_flag = False
    
    # If this row is the first event of a new block, bump rank
    if event_flag and not prev_event_flag:
        current_rank += 1
    
    # Assign rank
    df.at[idx, 'Rank'] = current_rank
    
    # Update trackers
    prev_subject    = subj
    prev_event_flag = event_flag

# 4. Convert Rank to integer type
df['Rank'] = df['Rank'].astype('Int64')


block_times = (
    df[df['Event name'].notna()]
      .groupby(['Subject','Rank'])['Timestamp']
      .agg(block_start='min', block_end='max')
      .reset_index()
)
# Merge back so we have block_start/block_end on each row
df = df.merge(block_times, on=['Subject','Rank'], how='left')

# 4. Bring in next block’s start time (older logic)
first_event_times_df = (
    block_times
      .loc[:, ['Subject','Rank','block_start']]
      .rename(columns={'Rank':'Next_Rank', 'block_start':'Next_block_start'})
)
df['RankPlus1'] = df['Rank'] + 1
df = df.merge(
    first_event_times_df,
    left_on=['Subject','RankPlus1'],
    right_on=['Subject','Next_Rank'],
    how='left'
)

# 5. Define your (post, pre) cutoff pairs
cutoff_pairs = [(5,5), (10,5), (5,10), (10,10)]

for post_sec, pre_sec in cutoff_pairs:
    post_col = f'Post_{post_sec}'
    pre_col  = f'Pre_{pre_sec}'
    rank_col = f'rank_{post_sec}_{pre_sec}'

    # Post‐cutoff (no change)
    df[post_col] = df['Rank']
    df.loc[df['Timestamp'] > df['block_end'] + pd.Timedelta(seconds=post_sec), post_col] = 0

    # PRE‐cutoff: compare against Next_block_start, not block_start
    cond = (
        ((df[post_col] == 0) | (df['Rank'] == 0)) &
        (df['Timestamp'] >= df['Next_block_start'] - pd.Timedelta(seconds=pre_sec)) &
        (df['Timestamp'] <  df['Next_block_start'])
    )
    df[pre_col] = np.where(cond, df['Next_Rank'], 0)

# 1. Identify all Post_* and Pre_* columns
post_pre_cols = [c for c in df.columns if c.startswith('Post_') or c.startswith('Pre_')]

# 2. Define the columns you want to keep
cols_to_keep = ['Subject', 'Timestamp', 'Value', 'Event name', 'Rank'] + post_pre_cols

# 3. Subset the DataFrame
df_selected = df[cols_to_keep]

# 4. Inspect
print(df_selected.head())

   Subject           Timestamp  Value Event name  Rank  Post_5  Pre_5  \
0        1 2024-10-31 22:21:30    NaN        NaN     0       0    0.0   
1        1 2024-10-31 22:21:31    NaN        NaN     0       0    0.0   
2        1 2024-10-31 22:21:32    NaN        NaN     0       0    0.0   
3        1 2024-10-31 22:21:33    NaN        NaN     0       0    0.0   
4        1 2024-10-31 22:21:34    NaN        NaN     0       0    0.0   

   Post_10  Pre_10  
0        0     0.0  
1        0     0.0  
2        0     0.0  
3        0     0.0  
4        0     0.0  


In [18]:
df_selected.head(5)

Unnamed: 0,Subject,Timestamp,Value,Event name,Rank,Post_5,Pre_5,Post_10,Pre_10
0,1,2024-10-31 22:21:30,,,0,0,0.0,0,0.0
1,1,2024-10-31 22:21:31,,,0,0,0.0,0,0.0
2,1,2024-10-31 22:21:32,,,0,0,0.0,0,0.0
3,1,2024-10-31 22:21:33,,,0,0,0.0,0,0.0
4,1,2024-10-31 22:21:34,,,0,0,0.0,0,0.0


In [19]:
#Save output to CSV
output_file = "/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_expanded_Rank.csv"
df_selected.to_csv(output_file, index=False, na_rep="None") 

In [183]:

Data_consolidated_expanded_Rank = pd.read_csv("/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_expanded_Rank.csv", delimiter=",")
Data_consolidated_expanded_Rank.head(3)



Unnamed: 0,Subject,Timestamp,Value,Event name,Rank,Post_5,Pre_5,Post_10,Pre_10
0,1,2024-10-31 22:21:30,,,0,0,0.0,0,0.0
1,1,2024-10-31 22:21:31,,,0,0,0.0,0,0.0
2,1,2024-10-31 22:21:32,,,0,0,0.0,0,0.0


In [None]:
import pandas as pd
import glob
import os

# 1. Grab all .xlsx files
files = glob.glob('/Users/ebenezer/Documents/SleepApnea/Alldata/4hz Data/loop/*.xlsx')
files


** Systolic **

In [109]:

all_summaries = []

for path in files:
    print(f"Processing file: {os.path.basename(path)}")
    
    sheet = "Systolic"
    
    df = pd.read_excel(
        path,
        sheet_name=sheet,
        engine='openpyxl',
        usecols=['Time','Value','Sleep stage'],
        parse_dates=['Time']
    )


    df["Time"] = df["Time"].astype(str).str.split('.').str[0]
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')    
    df['Time_str'] = df['Time'].dt.strftime('%m/%d/%Y %H:%M:%S')
    
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
    
    
    summary = (
        df
        .groupby('Time_str', as_index=False)
        .agg(
            Value_median=('Value', 'median'),
            Stage_mode=('Sleep stage', lambda x: x.mode().iat[0] if not x.mode().empty else 0)
        )
    )

    # 7. Tag with source filename
    summary['subject'] = int((os.path.basename(path)).split('_', 1)[0])
    all_summaries.append(summary)

# 8. Concatenate and write out
if all_summaries:
    final_df = pd.concat(all_summaries, ignore_index=True)
    output_csv = f'/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_{sheet}.csv'
    final_df.to_csv(output_csv, index=False, na_rep='None')
    print(f"Saved combined summary to {output_csv}")
else:
    print("No valid Systolic data found in the specified folder.")




Processing file: 014_HzData.xlsx
Processing file: 008_HzData.xlsx
Processing file: 026_HzData.xlsx
Processing file: 002_SecData.xlsx
Processing file: 018_HzData.xlsx
Processing file: 027_SecData.xlsx
Processing file: 004_HzData.xlsx
Processing file: 011_HzData.xlsx
Processing file: 023_HzData.xlsx
Processing file: 020_HzData.xlsx
Processing file: 012_HzData.xlsx
Processing file: 007_HzData.xlsx
Processing file: 030_HzData.xlsx
Processing file: 025_HzData.xlsx
Processing file: 010_HzData.xlsx
Processing file: 005_HzData.xlsx
Processing file: 019_HzData.xlsx
Processing file: 009_HzData.xlsx
Processing file: 015_HzData.xlsx
Processing file: 016_HzData.xlsx
Processing file: 001_SecData.xlsx
Processing file: 006_HzData.xlsx
Processing file: 028_HzData.xlsx
Processing file: 013_HzData.xlsx
Saved combined summary to /Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_Systolic.csv


In [None]:

all_summaries = []

for path in files:
    print(f"Processing file: {os.path.basename(path)}")
    
    sheet = "Diastolic"
    
    df = pd.read_excel(
        path,
        sheet_name=sheet,
        engine='openpyxl',
        usecols=['Time','Value','Sleep stage'],
        parse_dates=['Time']
    )


    df["Time"] = df["Time"].astype(str).str.split('.').str[0]
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')    
    df['Time_str'] = df['Time'].dt.strftime('%m/%d/%Y %H:%M:%S')
    
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
    
    
    summary = (
        df
        .groupby('Time_str', as_index=False)
        .agg(
            Value_median=('Value', 'median'),
            Stage_mode=('Sleep stage', lambda x: x.mode().iat[0] if not x.mode().empty else 0)
        )
    )

    # 7. Tag with source filename
    summary['subject'] = int((os.path.basename(path)).split('_', 1)[0])
    all_summaries.append(summary)

# 8. Concatenate and write out
if all_summaries:
    final_df = pd.concat(all_summaries, ignore_index=True)
    output_csv = f'/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_{sheet}.csv'
    final_df.to_csv(output_csv, index=False, na_rep='None')
    print(f"Saved combined summary to {output_csv}")
else:
    print("No valid Systolic data found in the specified folder.")




Processing file: 014_HzData.xlsx
Processing file: 008_HzData.xlsx
Processing file: 026_HzData.xlsx
Processing file: 002_SecData.xlsx
Processing file: 018_HzData.xlsx
Processing file: 027_SecData.xlsx
Processing file: 004_HzData.xlsx
Processing file: 011_HzData.xlsx
Processing file: 023_HzData.xlsx
Processing file: 020_HzData.xlsx
Processing file: 012_HzData.xlsx
Processing file: 007_HzData.xlsx
Processing file: 030_HzData.xlsx
Processing file: 025_HzData.xlsx
Processing file: 010_HzData.xlsx
Processing file: 005_HzData.xlsx
Processing file: 019_HzData.xlsx
Processing file: 009_HzData.xlsx


In [None]:

all_summaries = []

for path in files:
    print(f"Processing file: {os.path.basename(path)}")
    
    sheet = "Heart Rate Curve"
    
    df = pd.read_excel(
        path,
        sheet_name=sheet,
        engine='openpyxl',
        usecols=['Time','Value','Sleep stage'],
        parse_dates=['Time']
    )


    df["Time"] = df["Time"].astype(str).str.split('.').str[0]
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')    
    df['Time_str'] = df['Time'].dt.strftime('%m/%d/%Y %H:%M:%S')
    
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
    
    
    summary = (
        df
        .groupby('Time_str', as_index=False)
        .agg(
            Value_median=('Value', 'median'),
            Stage_mode=('Sleep stage', lambda x: x.mode().iat[0] if not x.mode().empty else 0)
        )
    )

    # 7. Tag with source filename
    summary['subject'] = int((os.path.basename(path)).split('_', 1)[0])
    all_summaries.append(summary)

# 8. Concatenate and write out
if all_summaries:
    final_df = pd.concat(all_summaries, ignore_index=True)
    output_csv = f'/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_{sheet}.csv'
    final_df.to_csv(output_csv, index=False, na_rep='None')
    print(f"Saved combined summary to {output_csv}")
else:
    print("No valid Systolic data found in the specified folder.")




In [None]:

all_summaries = []

for path in files:
    print(f"Processing file: {os.path.basename(path)}")
    
    sheet = "SpO2"
    
    df = pd.read_excel(
        path,
        sheet_name=sheet,
        engine='openpyxl',
        usecols=['Time','Value','Sleep stage'],
        parse_dates=['Time']
    )


    df["Time"] = df["Time"].astype(str).str.split('.').str[0]
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')    
    df['Time_str'] = df['Time'].dt.strftime('%m/%d/%Y %H:%M:%S')
    
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
    
    
    summary = (
        df
        .groupby('Time_str', as_index=False)
        .agg(
            Value_median=('Value', 'median'),
            Stage_mode=('Sleep stage', lambda x: x.mode().iat[0] if not x.mode().empty else 0)
        )
    )

    # 7. Tag with source filename
    summary['subject'] = int((os.path.basename(path)).split('_', 1)[0])
    all_summaries.append(summary)

# 8. Concatenate and write out
if all_summaries:
    final_df = pd.concat(all_summaries, ignore_index=True)
    output_csv = f'/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_{sheet}.csv'
    final_df.to_csv(output_csv, index=False, na_rep='None')
    print(f"Saved combined summary to {output_csv}")
else:
    print("No valid Systolic data found in the specified folder.")




In [177]:

all_summaries = []

for path in files:
    print(f"Processing file: {os.path.basename(path)}")
    
    sheet = "Position"
    
    df = pd.read_excel(
        path,
        sheet_name=sheet,
        engine='openpyxl',
        usecols=['Time','Value','Sleep stage'],
        parse_dates=['Time']
    )


    df["Time"] = df["Time"].astype(str).str.split('.').str[0]
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')    
    df['Time_str'] = df['Time'].dt.strftime('%m/%d/%Y %H:%M:%S')
    
    # df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
    
    
    summary = (
        df
        .groupby('Time_str', as_index=False)
        .agg(
            Stage_mode=('Sleep stage', lambda x: x.mode().iat[0] if not x.mode().empty else 0),
            Value_median = ('Value', lambda x: x.mode().iat[0] if not x.mode().empty else 0)
        )
    )

    # 7. Tag with source filename
    summary['subject'] = int((os.path.basename(path)).split('_', 1)[0])
    all_summaries.append(summary)

# 8. Concatenate and write out
if all_summaries:
    final_df = pd.concat(all_summaries, ignore_index=True)
    output_csv = f'/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_{sheet}.csv'
    final_df.to_csv(output_csv, index=False, na_rep='None')
    print(f"Saved combined summary to {output_csv}")
else:
    print("No valid Systolic data found in the specified folder.")




Processing file: 014_HzData.xlsx
Processing file: 008_HzData.xlsx
Processing file: 026_HzData.xlsx
Processing file: 002_SecData.xlsx
Processing file: 018_HzData.xlsx
Processing file: 027_SecData.xlsx
Processing file: 004_HzData.xlsx
Processing file: 011_HzData.xlsx
Processing file: 023_HzData.xlsx
Processing file: 020_HzData.xlsx
Processing file: 012_HzData.xlsx
Processing file: 007_HzData.xlsx
Processing file: 030_HzData.xlsx
Processing file: 025_HzData.xlsx
Processing file: 010_HzData.xlsx
Processing file: 005_HzData.xlsx
Processing file: 019_HzData.xlsx
Processing file: 009_HzData.xlsx
Processing file: 015_HzData.xlsx
Processing file: 016_HzData.xlsx
Processing file: 001_SecData.xlsx
Processing file: 006_HzData.xlsx
Processing file: 028_HzData.xlsx
Processing file: 013_HzData.xlsx
Saved combined summary to /Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_Position.csv


Combine

In [185]:
import pandas as pd
import glob
import os

# 1. Read the master file
master_path = '/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/Data_consolidated_expanded_Rank.csv'
master = pd.read_csv(master_path, parse_dates=['Timestamp'], delimiter=",")
master.head(4)


# # 2. Find the 5 summary files (assuming they start with "summary2_")
df_systolic = pd.read_csv('/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_Systolic.csv')
df_systolic['Timestamp'] = pd.to_datetime(df_systolic['Time_str'], format="%m/%d/%Y %H:%M:%S")

df_systolic = df_systolic.rename(columns={
    'subject': 'Subject',
    'Value_median': 'Systolic_median',
    'Stage_mode':   'Systolic_mode'
})[['Subject', 'Timestamp', 'Systolic_median', 'Systolic_mode']]
df_systolic = df_systolic[['Subject', 'Timestamp', 'Systolic_median', 'Systolic_mode']]

# SpO2: keep only median
df_spo2 = pd.read_csv('/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_SpO2.csv')
df_spo2['Timestamp'] = pd.to_datetime(df_spo2['Time_str'], format="%m/%d/%Y %H:%M:%S")

df_spo2 = df_spo2.rename(columns={
    'subject':      'Subject',
    'Value_median': 'SpO2_median'
})[['Subject', 'Timestamp', 'SpO2_median']]
df_spo2 = df_spo2[['Subject', 'Timestamp', 'SpO2_median']]

# Position: keep only median
df_position = pd.read_csv('/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_Position.csv')
df_position['Timestamp'] = pd.to_datetime(df_position['Time_str'], format="%m/%d/%Y %H:%M:%S")

df_position = df_position.rename(columns={
    'subject':      'Subject',
    'Value_median': 'Position_median'
})[['Subject', 'Timestamp', 'Position_median']]
df_position = df_position[['Subject', 'Timestamp', 'Position_median']]

# Heart Rate Curve: keep only median
df_hr = pd.read_csv('/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_Heart Rate Curve.csv')
df_hr['Timestamp'] = pd.to_datetime(df_hr['Time_str'], format="%m/%d/%Y %H:%M:%S")
df_hr = df_hr.rename(columns={
    'subject':      'Subject',
    'Value_median': 'HeartRate_median'
})[['Subject', 'Timestamp', 'HeartRate_median']]
df_hr = df_hr[['Subject', 'Timestamp', 'HeartRate_median']]

# Diastolic: keep only median
df_diastolic = pd.read_csv('/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary2_Diastolic.csv')
df_diastolic['Timestamp'] = pd.to_datetime(df_diastolic['Time_str'], format="%m/%d/%Y %H:%M:%S")
df_diastolic = df_diastolic.rename(columns={
    'subject':      'Subject',
    'Value_median': 'Diastolic_median'
})[['Subject', 'Timestamp', 'Diastolic_median']]


merged = master.merge(df_systolic,   on=['Subject','Timestamp'], how='left')
merged = merged.merge(df_spo2,       on=['Subject','Timestamp'], how='left')
merged = merged.merge(df_position,   on=['Subject','Timestamp'], how='left')
merged = merged.merge(df_hr,         on=['Subject','Timestamp'], how='left')
merged = merged.merge(df_diastolic,  on=['Subject','Timestamp'], how='left')

numeric_cols = merged.select_dtypes(include=['number']).columns
merged[numeric_cols] = merged[numeric_cols].fillna(0)

string_cols = merged.select_dtypes(include=['object']).columns
merged[string_cols] = merged[string_cols].fillna('NA')

# # 5. (Optional) Save out the merged DataFrame
merged.to_csv('/Users/ebenezer/Documents/SleepApnea/Alldata/consolidated/summary_all_df.csv', index=False)

In [167]:
df_hr.head(4)

Unnamed: 0,Subject,Timestamp,HeartRate_median
0,14,2025-01-16 21:46:30,70.5
1,14,2025-01-16 21:46:31,69.0
2,14,2025-01-16 21:46:32,69.0
3,14,2025-01-16 21:46:33,69.0


In [171]:
master.head(4)

Unnamed: 0,Subject,Timestamp,Value,Event name
0,1,2024-10-31 22:21:30,,
1,1,2024-10-31 22:21:31,,
2,1,2024-10-31 22:21:32,,
3,1,2024-10-31 22:21:33,,


In [None]:
merged.head(4)

In [169]:
df_position.head(4)

Unnamed: 0,Subject,Timestamp,Position_median
0,14,2025-01-16 21:46:30,0.0
1,14,2025-01-16 21:46:31,0.0
2,14,2025-01-16 21:46:32,0.0
3,14,2025-01-16 21:46:33,0.0


In [180]:
merged.head(200)

Unnamed: 0,Subject,Timestamp,Value,Event name,Systolic_median,Systolic_mode,SpO2_median,Position_median,HeartRate_median,Diastolic_median
0,1,2024-10-31 22:21:30,,,189.0,Wake,0.0,Supine,83.5,125.0
1,1,2024-10-31 22:21:31,,,189.5,Wake,0.0,Supine,83.0,124.0
2,1,2024-10-31 22:21:32,,,188.0,Wake,92.0,Supine,83.0,121.0
3,1,2024-10-31 22:21:33,,,183.0,Wake,92.0,Supine,83.0,120.0
4,1,2024-10-31 22:21:34,,,179.0,Wake,92.0,Supine,83.0,117.5
...,...,...,...,...,...,...,...,...,...,...
195,1,2024-10-31 22:24:45,,,189.0,Wake,95.0,Supine,92.0,124.0
196,1,2024-10-31 22:24:46,,,189.5,Wake,95.0,Supine,90.0,123.0
197,1,2024-10-31 22:24:47,,,192.5,Wake,95.0,Supine,90.5,123.0
198,1,2024-10-31 22:24:48,,,196.0,Wake,95.0,Supine,89.0,124.0
