# Initial Data Preparation for CT413 FYP

1. Mounting Google Drive
2. Read in individual data files
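3. Aggregate each table to daily averages per participant, then sample the common participants, merge the tables and save the result to CSV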

In [None]:
from psutil import virtual_memory

# Size of the runtime's memory in decimal gigabytes. The value comes from the
# `total` field, even though the message below words it as "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes reporting 20 GB or more are reported as "high-RAM".
is_high_ram = not (ram_gb < 20)
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
import tensorflow as tf

# Ask TensorFlow which GPUs it can see, using the stable
# `tf.config.list_physical_devices` entry point rather than the
# `tf.config.experimental` alias.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  1


### Mounting Google Drive

This will pop up a prompt asking for permission to access Google Drive.

In [None]:
from google.colab import drive

# Directory under which the Drive contents are exposed; the data folder used
# later in this notebook lives below '<mount point>/MyDrive/'.
DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)


Mounted at /content/drive


### Read in individual data files

In [None]:
import pandas as pd

# Path to folder of files
folder_path = '/content/drive/MyDrive/SleepHealthDataTables/'

# Columns read from every table except nap_tracker, which has its own columns.
VALUE_COLUMNS = ('participantId', 'timestamp', 'value')
NAP_COLUMNS = ('participantId', 'timestamp', 'NapDuration', 'NapQuality')


def read_table(table_name, columns=VALUE_COLUMNS):
    """Load selected columns of ``<folder_path><table_name>.csv``.

    Args:
        table_name: File name without the ``.csv`` extension.
        columns: Column names to read; must include ``timestamp``.

    Returns:
        A DataFrame holding only the requested columns.

    Note:
        ``timestamp`` is passed to ``parse_dates``, but the dtype report printed
        below shows that some tables still come back with an ``object``
        timestamp column (presumably because of mixed UTC offsets -- confirm).
        Downstream code therefore must not assume a datetime dtype here.
    """
    return pd.read_csv(
        folder_path + table_name + '.csv',
        usecols=list(columns),
        parse_dates=['timestamp'],
    )


# Read in certain columns from the data.
# Not loaded at the moment: apple_exercise_time, distance_walking_running,
# flights_climbed.
df_aeb = read_table('active_energy_burned')
df_beb = read_table('basal_energy_burned')
df_bm = read_table('body_mass')
df_hr = read_table('heart_rate')
df_h = read_table('height')
df_nt = read_table('nap_tracker', NAP_COLUMNS)
df_sqc = read_table('sleep_quality_checker')
df_sc = read_table('step_count')

print(df_sqc.head())

# Single source of truth for "which tables are loaded, and in what order".
# Deriving both lists from it keeps df_names[i] describing timestamped_dfs[i];
# previously df_names also listed the three tables that are not loaded, so
# positional lookups were labelled with the wrong table.
loaded_tables = {
    'active_energy_burned': df_aeb,
    'basal_energy_burned': df_beb,
    'body_mass': df_bm,
    'heart_rate': df_hr,
    'height': df_h,
    'nap_tracker': df_nt,
    'sleep_quality_checker': df_sqc,
    'step_count': df_sc,
}

timestamped_dfs = list(loaded_tables.values())

for i, df in enumerate(timestamped_dfs):
    print(f'Data type of timestamp column in timestamped_dfs[{i}]: {df["timestamp"].dtype}')

# Names of the dataframes, aligned index-for-index with timestamped_dfs
df_names = list(loaded_tables)



                          participantId  value                  timestamp
0  b4ebf7dd-4e30-4f7b-8ee8-5493a19c8c9f      4  2016-09-26 21:18:24-05:00
1  c3993552-69cb-45e4-b18a-5e6eecefb825      4  2016-03-07 09:16:09-05:00
2  78f60bd3-34f3-489e-a352-f9df564641c3      4  2016-03-05 17:21:46-05:00
3  9da1a89a-2145-4cca-b356-7b58aa7be8b0      4  2016-09-26 21:41:55-05:00
4  4aad9dbe-dd9e-4832-a198-3bd563457124      4  2016-03-03 10:52:42-08:00
Data type of timestamp column in timestamped_dfs[0]: datetime64[ns, UTC]
Data type of timestamp column in timestamped_dfs[1]: datetime64[ns, UTC]
Data type of timestamp column in timestamped_dfs[2]: datetime64[ns, UTC]
Data type of timestamp column in timestamped_dfs[3]: datetime64[ns, UTC]
Data type of timestamp column in timestamped_dfs[4]: datetime64[ns, UTC]
Data type of timestamp column in timestamped_dfs[5]: object
Data type of timestamp column in timestamped_dfs[6]: object
Data type of timestamp column in timestamped_dfs[7]: datetime64[ns, UTC

In [None]:
def daily_average(df):
    """Average every numeric column per participant and per index value.

    Rows are grouped by the ``participantId`` column together with the frame's
    index (in this notebook the index holds calendar dates), and the mean of
    each numeric column is taken within every group. Non-numeric columns are
    left out of the result.

    Args:
        df: Frame with a ``participantId`` column whose index is the second
            grouping key.

    Returns:
        A new frame with a fresh integer index and one row per
        (participantId, index value) pair. When the input index has no name,
        ``reset_index`` labels the former index column ``level_1``.
    """
    # The string alias 'number' selects the same columns as np.number without
    # needing numpy to be imported before this function is called.
    numeric_columns = df.select_dtypes(include='number').columns

    # Each (participantId, index value) pair forms exactly one group, so the
    # aggregated rows are already unique and no de-duplication pass is needed.
    return df.groupby(['participantId', df.index])[numeric_columns].mean().reset_index()


In [None]:
def adjust_date(df, cutoff='2016-07-01'):
    """Reduce timestamps to UTC calendar dates and drop rows on/after ``cutoff``.

    The frame is normalized **in place** (callers that ignore the return value
    rely on this):

    * ``timestamp`` is parsed as UTC and replaced by its calendar date;
    * the index is replaced by those same dates (left unnamed);
    * rows are sorted by ``participantId``.

    The cutoff filter cannot be applied in place by boolean indexing, which
    previously made it a silent no-op: the filtered frame was bound to a local
    name and discarded. It is now returned instead.

    Args:
        df: Frame with ``participantId`` and ``timestamp`` columns. Mutated.
        cutoff: Anything ``pd.Timestamp`` accepts; rows whose date is on or
            after this day are excluded from the returned frame.

    Returns:
        A new frame containing only the rows dated strictly before ``cutoff``.
        ``df`` itself keeps all rows, so a caller must use the return value to
        get the filtered data.
    """
    # Parse once with utc=True so strings carrying different UTC offsets end up
    # on one common timeline before the time-of-day is discarded.
    parsed = pd.to_datetime(df['timestamp'], utc=True)
    df['timestamp'] = parsed.dt.date

    # Assigning a plain array gives an unnamed index of date objects.
    df.index = df['timestamp'].to_numpy()

    # Ensure the file is sorted by participantId
    df.sort_values('participantId', inplace=True)

    # Filter out some of the later data
    cutoff_date = pd.Timestamp(cutoff).date()
    return df[df.index < cutoff_date]


In [None]:
import numpy as np

# Collect one daily-average frame per input table.
avg_dfs = []

for table in timestamped_dfs:
    # Tables without a timestamp cannot be reduced to dates; report and skip.
    if 'timestamp' not in table.columns:
        print("'timestamp' column does not exist in the DataFrame.")
        print(table.columns)
        continue

    # adjust_date is called for its in-place effect on `table`; its return
    # value is not used here.
    adjust_date(table)
    avg_dfs.append(daily_average(table))

#print(df_sqc.head())



[                             participantId     level_1     value
0     00d6d2ee-ccea-45c7-9772-b19fd9bef2bf  2018-04-20  0.457336
1     00fd4039-9b5e-4bbb-8295-4983a3f58371  2018-08-15  0.162529
2     017702d7-e540-4e1a-ab19-ca5f1c34e70b  2017-12-23  0.321469
3     01866570-f91f-4d3b-9d41-0a8cdcbfa922  2018-02-25  0.244880
4     02778d46-6618-4915-97d9-dc000b39a088  2018-05-13  0.725437
...                                    ...         ...       ...
2131  ff4f01fb-1636-4687-934d-58f18fae4f6a  2018-12-26  0.760850
2132  ff6df296-8405-4398-804e-be282447795c  2018-09-04  0.221592
2133  ff724463-e3c3-4da9-83cc-3f57d81b095a  2018-09-29  0.488800
2134  ff724463-e3c3-4da9-83cc-3f57d81b095a  2019-04-27  0.358377
2135  fffd204f-a681-45e7-87d6-48156361675f  2017-11-10  0.678459

[2136 rows x 3 columns]]
[                             participantId     level_1     value
0     00d6d2ee-ccea-45c7-9772-b19fd9bef2bf  2018-04-20  0.457336
1     00fd4039-9b5e-4bbb-8295-4983a3f58371  2018-08-15  0.1625

In [None]:
import numpy as np

# Seed for the participant sample, so that re-running the notebook selects the
# same subset every time.
SAMPLE_SEED = 42

# Determine the set of participants common to all DataFrames
common_participants = set(avg_dfs[0]['participantId'].unique())
for df in avg_dfs[1:]:
    common_participants = common_participants.intersection(df['participantId'].unique())

# Report the size only; printing the whole set floods the output with ids.
print(f'Participants present in every DataFrame: {len(common_participants)}')

# Fraction of the common participants to keep
keep_probability = 0.3
n_to_keep = int(len(common_participants) * keep_probability)

# Sample from a sorted list with a seeded generator. Sorting matters: the
# iteration order of a set of strings can differ between kernel sessions, so a
# seed alone would not make the draw reproducible.
rng = np.random.default_rng(SAMPLE_SEED)
participants_to_keep = rng.choice(sorted(common_participants), size=n_to_keep, replace=False)

# Filter each averaged DataFrame down to the sampled participants
filtered_dfs = [df[df['participantId'].isin(participants_to_keep)] for df in avg_dfs]

print('Index of original DataFrame:', avg_dfs[0].index)

# Checking values
for i, df in enumerate(filtered_dfs):

    print(f'Columns of filtered_df[{i}]: {df.columns}')

    unique_participants = df['participantId'].unique()
    print(f"Unique participants in DataFrame {i}:")
    print(unique_participants)
    print(f"Number of unique participants: {len(unique_participants)}")
    print("\n" + "-"*30 + "\n")

    print(f"DataFrame {i}:")
    print(f"Number of rows: {len(df)}")
    print("\n" + "-"*30 + "\n")

# Drop the list of raw tables. Guarded so that re-running this cell does not
# raise NameError once the name is already gone.
# NOTE(review): any other variable still bound to the same frames keeps them
# alive, so this only releases memory if no such references remain.
if 'timestamped_dfs' in globals():
    del timestamped_dfs


{'8ba24d82-17f7-45aa-8b94-9664816d9035', '46a1eb30-cc2a-4a13-9bc9-56feb6b7a490', 'abb460fd-7620-4792-9f22-381061b176e1', 'e80cb956-01ff-4992-bb12-25472eda057d', 'f6b0f802-6212-4824-a63f-987e4a796bb5', '83decec2-6588-48b2-8adc-1d55d82abb3b', 'd96f3530-9573-4e62-ba04-e669013e32f4', 'd6ccf53c-7720-453c-a378-47bea7248d6a', 'dc21e43d-f04b-4b41-9dff-454ad433a4ec', '56a4512f-5929-4e11-9dea-faa958a3e6dd', 'd779a516-f476-4094-9495-21acd7688a4e', 'c58a5380-9d00-4c26-b7ee-a2460378e96a', 'ed42ffc9-6f83-4c12-b3c8-852be9ed049a', '6c95a170-a888-4f60-bf2e-ff433e68d901', 'ee0cb856-6ad2-433b-94b1-99ff01807551', '55d49397-8aae-4c41-be90-a892217a9d1d', 'e72fbbd0-36c3-460a-b597-5a8fcc365dae', '977c3462-62c6-43b8-adcf-145e7bc6d16e', 'd02538ab-c95e-4ba3-99ee-65dc48ec76a5', 'b55a1704-aa26-4592-9b83-6f74f8dd435e', '6c52af7f-cef0-4ad2-bad4-74d3d99daee8', '692b3029-d138-42ed-8ef2-a323206b69ca', '86bd116f-9074-4200-a481-52b351f0cf53', 'ff2dd69c-be97-4a41-8b0a-8711988f9155', 'fbd4d6f8-4471-4757-9a94-b187ce9b71b2',

In [None]:

# Merge the per-table daily averages into one wide frame and print it.

# Name given to each table's generic 'value' column, in the order the tables
# were placed in timestamped_dfs: active energy burned, basal energy burned,
# body mass, heart rate, height, nap tracker, sleep quality checker, step count.
# Without distinct names, every merge after the first yields a second
# 'value_y' column: older pandas only warns and writes indistinguishable
# columns, newer pandas raises MergeError.
value_labels = ['aeb', 'beb', 'bm', 'hr', 'h', 'nt', 'sqc', 'sc']
if len(value_labels) != len(filtered_dfs):
    raise ValueError(
        f'Expected {len(value_labels)} DataFrames to merge but got {len(filtered_dfs)}; '
        'update value_labels to match the loaded tables.'
    )

for i, df in enumerate(filtered_dfs):
    print(f'Data type of timestamp column in filtered_df[{i}]: {df["level_1"].dtype}')

merged_dfs = None
for label, df in zip(value_labels, filtered_dfs):
    # rename returns a new frame, so filtered_dfs is left untouched; a table
    # without a 'value' column (nap_tracker reads NapDuration/NapQuality
    # instead) passes through unchanged.
    labelled = df.rename(columns={'value': label})

    if merged_dfs is None:
        merged_dfs = labelled
        continue

    # Outer join keeps every (participant, date) pair seen in any table.
    merged_dfs = pd.merge(merged_dfs, labelled, on=['participantId', 'level_1'], how='outer')
    print('merged')


print("Merged Data:")
print(merged_dfs)


Data type of timestamp column in filtered_df[0]: object
Data type of timestamp column in filtered_df[1]: object
Data type of timestamp column in filtered_df[2]: object
Data type of timestamp column in filtered_df[3]: object
Data type of timestamp column in filtered_df[4]: object
Data type of timestamp column in filtered_df[5]: object
Data type of timestamp column in filtered_df[6]: object
Data type of timestamp column in filtered_df[7]: object
merged
merged
merged
merged
merged
merged
merged
Merged Data:
                            participantId     level_1     value    value_y  \
0    29e0bb9f-d896-4a30-af5f-5928f925fda9  2018-10-13  0.525918  41.146600   
1    8ba24d82-17f7-45aa-8b94-9664816d9035  2018-09-06  0.194807  14.680886   
2    977c3462-62c6-43b8-adcf-145e7bc6d16e  2017-11-29  0.165386  17.050136   
3    a42a874b-1895-401e-8ab7-aaac4b1e0501  2017-11-08  0.199212        NaN   
4    a42a874b-1895-401e-8ab7-aaac4b1e0501  2017-11-10  0.461262  18.623250   
..                    

In [None]:
# Disabled exploratory cell: the whole body is wrapped in a string literal, so
# none of the statements inside it run. Before re-enabling it, note that:
#   * `timestamped_dfs` is deleted by an earlier cell (`del timestamped_dfs`),
#     so this loop would raise NameError unless the tables are reloaded;
#   * `df_names[idx-1]` labels a frame correctly only if `df_names` lists
#     exactly the tables held in `timestamped_dfs`, in the same order;
#   * no `sq_score` column is requested by the `read_csv` calls in this
#     notebook, so that branch cannot match the frames loaded here.
'''# Create summary statistics for each dataframe for value column
for idx, curr_df in enumerate(timestamped_dfs, start=1):
    # Print the name of the dataframe
    print(f"\nDataFrame {idx} Name: {df_names[idx-1]}")
    if 'value' in curr_df.columns:
        print(curr_df['value'].describe())
    elif 'sq_score' in curr_df.columns:
        print(curr_df['sq_score'].describe())
    elif 'NapQuality' in curr_df.columns:
        print(curr_df['NapQuality'].describe())
        print(curr_df['NapDuration'].describe())'''

In [None]:
# Write the merged frame into the same Drive folder the input tables were
# read from.
merged_csv_path = folder_path + 'merged_data.csv'

# index=True also writes the frame's row index as the first CSV column.
merged_dfs.to_csv(merged_csv_path, index=True)

