## Total Energy Burned
Active energy burned and basal energy burned

In [None]:
import pandas as pd

# Read the active and basal energy tables from the project data folder
folder_path = 'c:/Users/aoife/Documents/Project/DataTables/'

# Both files are read with the same column selection and date parsing
df_aeb, df_beb = (
    pd.read_csv(
        folder_path + file_name,
        usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'],
        parse_dates=['timestamp', 'startTime', 'endTime'],
    )
    for file_name in ('active-energy-burned.csv', 'basal-energy-burned.csv')
)

# Preview each table
print(df_aeb.head())
print('----------------\n')
print(df_beb.head())

# Keep both tables together so later cells can process them in one loop
energy_dfs = [df_aeb, df_beb]


In [None]:
# Add a 'duration' column holding endTime - startTime in seconds

for df in energy_dfs:
    # Both columns are required. The previous test
    # `'startTime' and 'endTime' in df.columns` only checked 'endTime',
    # because a non-empty string literal is always truthy.
    if {'startTime', 'endTime'}.issubset(df.columns):
        df['duration'] = (df['endTime'] - df['startTime']).dt.total_seconds()

        print(df.head())





In [None]:
# Derive a date-only 'startDate' column from startTime

for df in energy_dfs:
    if 'startTime' in df.columns:
        # Make sure startTime is a datetime before using the .dt accessor
        df['startTime'] = pd.to_datetime(df['startTime'], format='%Y-%m-%dT%H:%M:%SZ')

        df['startDate'] = df['startTime'].dt.date

        print(df.head())
        print('----------------\n')

In [None]:
# Get the total duration and total value per day for each participant

for df in energy_dfs:
    if 'duration' in df.columns:
        # Group once and reuse the grouping for both sums
        per_day = df.groupby(['participantId', 'startDate'])
        df['duration'] = per_day['duration'].transform('sum')
        df['value'] = per_day['value'].transform('sum')

        # Every row of a (participant, day) group now carries the same totals,
        # so keep just one row per group
        df.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

        print(df.head())
        print('----------------\n')

In [None]:
# Combine active and basal energy into one per-participant, per-day table

# Order each table by participant and give its value column a distinct name
for frame, energy_name in ((df_aeb, 'active_energy'), (df_beb, 'basal_energy')):
    frame.sort_values(by=['participantId'], inplace=True)
    frame.rename(columns={'value': energy_name}, inplace=True)

# Join the two tables on participant and day; columns present in both
# (duration, timestamp, startTime, endTime) receive the _x / _y suffixes
df_energy = df_aeb.merge(df_beb, on=['participantId', 'startDate'])

# Daily totals across both energy types
df_energy['total_energy'] = df_energy['active_energy'] + df_energy['basal_energy']
df_energy['total_duration'] = df_energy['duration_x'] + df_energy['duration_y']

# Give the suffixed duration columns descriptive names
df_energy.rename(
    columns={'duration_x': 'active_duration', 'duration_y': 'basal_duration'},
    inplace=True,
)

# The raw time columns are no longer needed after aggregation
df_energy.drop(
    columns=['timestamp_x', 'startTime_x', 'endTime_x', 'timestamp_y', 'startTime_y', 'endTime_y'],
    inplace=True,
)

print(df_energy.head())
print('----------------\n')
print(df_energy['total_duration'].head())



## BMI

Body Mass and Height

In [None]:
# Load body mass and height, then reduce each to one averaged row per participant

df_bm, df_h = (
    pd.read_csv(
        folder_path + file_name,
        usecols=['participantId', 'timestamp', 'value'],
        parse_dates=['timestamp'],
    )
    for file_name in ('body-mass.csv', 'height.csv')
)

bmi_dfs = [df_bm, df_h]

for df in bmi_dfs:
    if 'value' not in df.columns:
        continue

    # Ensure timestamp is a datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')

    # Replace every measurement with the participant's mean value
    df['value'] = df.groupby(['participantId'])['value'].transform('mean')

    # Keep a single row per participant
    df.drop_duplicates(subset=['participantId'], inplace=True)

    print(df.head())
    print('----------------\n')

In [None]:
# Build df_bmi: one row per participant with body mass, height and BMI

# Sort each table by participant and rename its value column
for frame, measure_name in ((df_bm, 'bodyMass_kg'), (df_h, 'height_m')):
    frame.sort_values(by=['participantId'], inplace=True)
    frame.rename(columns={'value': measure_name}, inplace=True)

# Join on participant; the two timestamp columns become timestamp_x / timestamp_y
df_bmi = df_bm.merge(df_h, on='participantId')

# Use the date of the body-mass timestamp as this row's startDate
df_bmi['startDate'] = df_bmi['timestamp_x'].dt.date

# BMI = mass / height squared
df_bmi['bmi'] = df_bmi['bodyMass_kg'] / df_bmi['height_m'].pow(2)

# Remove the raw timestamps and any repeated participants
df_bmi.drop(columns=['timestamp_x', 'timestamp_y'], inplace=True)
df_bmi.drop_duplicates(subset=['participantId'], inplace=True)

print(df_bmi.head())


## Heart Rate

In [None]:

# Load the heart-rate samples, parsing all three time columns as dates
df_hr = pd.read_csv(
    folder_path + 'heart-rate.csv',
    usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'],
    parse_dates=['timestamp', 'startTime', 'endTime'],
)


In [None]:
# Make sure both time columns are datetimes, then add a date-only startDate
for time_column in ('startTime', 'endTime'):
    df_hr[time_column] = pd.to_datetime(df_hr[time_column], format='%Y-%m-%dT%H:%M:%SZ')

df_hr['startDate'] = df_hr['startTime'].dt.date
print(df_hr.head())

In [None]:


# Keep only the first row for each (participantId, startTime, endTime) combination

df_hr = df_hr.drop_duplicates(subset=['participantId', 'startTime', 'endTime'])

print(df_hr.head())


In [None]:
# Per participant and day: sample count, max, min and mean heart rate

daily_hr = df_hr.groupby(['participantId', 'startDate'])['value']

# Number of rows per participant per day, stored as float
df_hr['hr_duration_(s)'] = daily_hr.transform('count').astype(float)

# Daily extremes
df_hr['max_hr'] = daily_hr.transform('max')
df_hr['min_hr'] = daily_hr.transform('min')

# Daily mean replaces the raw value (done last so the other stats use raw values)
df_hr['value'] = daily_hr.transform('mean')

# One row per participant per day, without the raw time columns,
# and with the averaged value column renamed
df_hr = (
    df_hr.drop_duplicates(subset=['participantId', 'startDate'])
    .drop(columns=['timestamp', 'startTime', 'endTime'])
    .rename(columns={'value': 'mean_hr/s'})
)

print(df_hr)

# Nap Tracker

Total duration

Average nap quality

In [None]:
# Load the nap tracker and reduce it to one row per participant per day
df_nt = pd.read_csv(
    folder_path + 'nap-tracker.csv',
    usecols=['participantId', 'timestamp', 'NapDuration', 'NapQuality'],
    parse_dates=['timestamp'],
)

# Date of each entry (UTC)
df_nt['timestamp'] = pd.to_datetime(df_nt['timestamp'], utc=True)
df_nt['startDate'] = df_nt['timestamp'].dt.date

# Remove rows where startDate, participantId and NapDuration are duplicated
df_nt.drop_duplicates(subset=['participantId', 'startDate', 'NapDuration'], inplace=True)

# Total nap duration and average nap quality per participant per day
daily_naps = df_nt.groupby(['participantId', 'startDate'])
df_nt['totalNapTime_(s)'] = daily_naps['NapDuration'].transform('sum')
df_nt['avgNapQuality'] = daily_naps['NapQuality'].transform('mean')

# Every row of a (participant, day) group now carries the same aggregates, so
# keep a single row per group. Without this, days with several naps stay
# duplicated, unlike the other per-day tables in this notebook.
df_nt.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

# Drop unnecessary columns
df_nt.drop(columns=['timestamp', 'NapDuration', 'NapQuality'], inplace=True)

# If NaN is present in the avgNapQuality or totalNapTime column, fill with 0
df_nt['avgNapQuality'] = df_nt['avgNapQuality'].fillna(0)
df_nt['totalNapTime_(s)'] = df_nt['totalNapTime_(s)'].fillna(0)

print(df_nt.head())

# Step Count

In [None]:
# Load step counts and total them per participant per day
df_sc = pd.read_csv(
    folder_path + 'step-count.csv',
    usecols=['participantId', 'startTime', 'endTime', 'value'],
    parse_dates=['startTime', 'endTime'],
)

# Normalise both time columns to UTC datetimes
for time_column in ('startTime', 'endTime'):
    df_sc[time_column] = pd.to_datetime(df_sc[time_column], utc=True)

df_sc['startDate'] = df_sc['startTime'].dt.date

# Length of each record in seconds
df_sc['duration'] = (df_sc['endTime'] - df_sc['startTime']).dt.total_seconds()

# Daily totals of steps (as float) and of recorded duration
daily_steps = df_sc.groupby(['participantId', 'startDate'])
df_sc['totalSteps'] = daily_steps['value'].transform('sum').astype(float)
df_sc['stepsTotalDuration'] = daily_steps['duration'].transform('sum')

# One row per participant per day, keeping only the aggregated columns
df_sc = (
    df_sc.drop_duplicates(subset=['participantId', 'startDate'])
    .drop(columns=['value', 'duration', 'startTime', 'endTime'])
)

print(df_sc.head())

# Distance Walking/Running

In [None]:
# Load walking/running distance and total it per participant per day

df_dwr = pd.read_csv(
    folder_path + 'distance-walking-running.csv',
    usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'],
    parse_dates=['timestamp'],
)

# Convert all three time columns to UTC datetimes
for time_column in ('startTime', 'endTime', 'timestamp'):
    df_dwr[time_column] = pd.to_datetime(df_dwr[time_column], utc=True)

df_dwr['startDate'] = df_dwr['startTime'].dt.date

# Length of each record in seconds
df_dwr['duration'] = (df_dwr['endTime'] - df_dwr['startTime']).dt.total_seconds()

# Daily totals of distance and of recorded duration
daily_distance = df_dwr.groupby(['participantId', 'startDate'])
df_dwr['totalDistance'] = daily_distance['value'].transform('sum')
df_dwr['dwrTotalDuration'] = daily_distance['duration'].transform('sum')

# One row per participant per day, keeping only the aggregated columns
df_dwr = (
    df_dwr.drop_duplicates(subset=['participantId', 'startDate'])
    .drop(columns=['timestamp', 'startTime', 'endTime', 'duration', 'value'])
)


print(df_dwr.head())


# Sleep Quality

For sleep quality, I will change the timestamp to just include the date, and ensure there are no duplicates

In [None]:
# Load the sleep-quality scores and keep one entry per participant per day
df_sq = pd.read_csv(
    folder_path + 'sleep-quality-checker.csv',
    usecols=['participantId', 'timestamp', 'value'],
    parse_dates=['timestamp'],
)

# Reduce the timestamp to a UTC date
df_sq['timestamp'] = pd.to_datetime(df_sq['timestamp'], utc=True)
df_sq['startDate'] = df_sq['timestamp'].dt.date

# First entry per participant per day, without the raw timestamp,
# and with the score column given a descriptive name
df_sq = (
    df_sq.drop_duplicates(subset=['participantId', 'startDate'])
    .drop(columns=['timestamp'])
    .rename(columns={'value': 'ssq_score'})
)

print(df_sq.head())


# Sampling Participants

The number of rows in these dataframes is still too large, so I will randomly sample a portion of the participants.

I will first of all find the participants which have data in each dataframe, on the same days (except BMI).

Then I use a probability to keep a certain portion of the data.

In [None]:
import numpy as np

def random_sampling(dfs_list=None, keep_probability=0.7, seed=None):
    """Randomly keep a share of the participants common to every DataFrame.

    A participant is eligible only if they appear in *all* DataFrames and,
    for any DataFrame that has an 'ssq_score' column, have at least one
    non-null score there.

    Args:
        dfs_list: DataFrames to sample from; each needs a 'participantId'
            column. Defaults to the notebook's module-level tables
            (df_bmi, df_energy, df_hr, df_nt, df_sc, df_dwr, df_sq).
        keep_probability: Fraction (0..1) of eligible participants to keep.
        seed: Optional seed for a reproducible draw. When None, NumPy's
            global random state is used.

    Returns:
        A list with one filtered DataFrame per input, in the same order,
        containing only rows of the sampled participants.

    Raises:
        ValueError: If keep_probability is outside [0, 1].
    """
    if not 0 <= keep_probability <= 1:
        raise ValueError('keep_probability must be between 0 and 1')

    if dfs_list is None:
        dfs_list = [df_bmi, df_energy, df_hr, df_nt, df_sc, df_dwr, df_sq]
    dfs_list = list(dfs_list)
    if not dfs_list:
        return []

    # Intersect the participant sets of every table
    participants = None
    for df in dfs_list:
        ids = set(df['participantId'].dropna().unique())
        if 'ssq_score' in df.columns:
            # Only participants with an actual sleep quality score qualify
            ids &= set(df.loc[df['ssq_score'].notna(), 'participantId'].unique())
        participants = ids if participants is None else participants & ids

    # Sort so that a given seed always draws from the same ordering
    candidates = sorted(participants)
    n_keep = int(len(candidates) * keep_probability)

    if n_keep == 0:
        # Nothing to draw; avoids calling choice() on an empty pool
        participants_to_keep = []
    else:
        choose = np.random.choice if seed is None else np.random.RandomState(seed).choice
        participants_to_keep = choose(candidates, size=n_keep, replace=False)

    # Filter each original DataFrame to keep only the sampled participants
    return [df[df['participantId'].isin(participants_to_keep)] for df in dfs_list]




# Alternative to Randomly Sampling

For this alternative, rather than randomly sampling from the pool of participants that are common across all dataframes, I will first sort participants alphabetically in each dataframe, and add them to a list. 

Then, I will iterate through the first $i$ participants in that list, and merge the data present for those.

I will then get the next $i$ participants and perform a merge, and continue until there are $i$ or fewer participants left - then I will merge those.

With each of these subsets, I will remove rows where there is missing data. 

Then, I will merge all of the complete subsets together, RAM permitting.
    If the data is still too large to perform a merge, I will randomly sample participants from the complete dataset.

In [None]:
# merge function

def merge_dfs(dfs_list, participants):
    """Outer-merge the given DataFrames for a subset of participants.

    Each DataFrame is first restricted to rows whose 'participantId' is in
    `participants`; the restricted frames are then merged left to right on
    ['participantId', 'startDate']. Clashing column names from the
    right-hand frame get a '_y' suffix.

    Returns the merged DataFrame, or None when `dfs_list` is empty.
    """
    combined = None
    for frame in dfs_list:
        # Rows belonging to the participants currently being processed
        subset = frame[frame['participantId'].isin(participants)]
        combined = (
            subset
            if combined is None
            else pd.merge(
                combined,
                subset,
                on=['participantId', 'startDate'],
                how='outer',
                suffixes=('', '_y'),
            )
        )
    return combined


In [None]:
# Another method of gathering a smaller dataset

def get_separate_tables(num_participants):
    """Merge the activity tables in batches of `num_participants` participants.

    The participant pool is taken from the first table (df_bmi) and sorted.
    Consecutive groups of at most `num_participants` ids are then merged
    across df_bmi, df_hr, df_sc, df_dwr and df_sq with merge_dfs().

    Args:
        num_participants: Batch size; must be at least 1.

    Returns:
        A list with one merged DataFrame per batch, in participant order.

    Raises:
        ValueError: If num_participants is smaller than 1 (a batch size of 0
            would otherwise never consume the participant list).
    """
    if num_participants < 1:
        raise ValueError('num_participants must be at least 1')

    # Tables to merge for every batch
    dfs_list = [df_bmi, df_hr, df_sc, df_dwr, df_sq]

    # Sorted, de-duplicated participant ids from the first table
    participants = sorted(set(dfs_list[0]['participantId'].unique()))

    result_dfs = []

    for start in range(0, len(participants), num_participants):
        # Next group of participants (the last group may be smaller)
        next_group = participants[start:start + num_participants]

        merge_df = merge_dfs(dfs_list, next_group)

        print(merge_df.head())

        result_dfs.append(merge_df)

    return result_dfs


In [None]:
# Build the per-batch tables (20 participants each) and preview every one

subset_dfs = get_separate_tables(20)

for position, table in enumerate(subset_dfs):
    print(f"DataFrame {position}:")
    print(table.head())
    print(f"Number of rows: {len(table)}")
    print("\n" + "-"*30 + "\n")


In [None]:
# Copy each participant's body mass, height and BMI onto all of that participant's rows

for df in subset_dfs:

    for participant in df['participantId'].unique():
        own_rows = df['participantId'] == participant

        # Rows of this participant where all three measurements are present
        complete = (
            own_rows
            & df['bodyMass_kg'].notna()
            & df['height_m'].notna()
            & df['bmi'].notna()
        )
        complete_index = df[complete].index

        # Nothing to copy when the participant has no complete row
        if len(complete_index) == 0:
            continue

        # Use the first complete row as the source of the values
        source_row = df.loc[complete_index[0]]
        for column in ('bodyMass_kg', 'height_m', 'bmi'):
            df.loc[own_rows, column] = source_row[column]


In [None]:
import pandas as pd

# TODO: Read in data from surveys
# For each unique participant in subset_dfs, check if they have answered both surveys. If so, copy the values to each row for that participant
# Note see above code

folder_path = 'c:/Users/aoife/Documents/Project/DataTables/'


# "About me" survey ('daily_activities' was previously listed twice in usecols)
df_about_me = pd.read_csv(
    folder_path + 'about-me.csv',
    usecols=[
        'participantId', 'timestamp', 'alcohol', 'basic_expenses', 'caffeine',
        'daily_activities', 'daily_smoking', 'education', 'flexible_work_hours',
        'gender', 'good_life', 'hispanic', 'income', 'marital', 'race',
        'smoking_status', 'menopause', 'recent_births', 'current_pregnant',
        'work_schedule',
    ],
    parse_dates=['timestamp'],
)

# "Sleep habits" survey
df_sleep_habits = pd.read_csv(
    folder_path + 'sleep-habits.csv',
    usecols=[
        'participantId', 'timestamp', 'alarm_dependency', 'driving_sleepy',
        'falling_asleep', 'morning_person', 'nap_duration', 'sleep_lost',
        'sleep_needed', 'sleep_partner', 'sleep_time_workday',
        'sleep_time_weekend', 'wake_up_choices', 'wake_ups', 'weekly_naps',
        'what_wakes_you',
    ],
    parse_dates=['timestamp'],
)

# "Sleep assessment" survey
df_sleep_assessment = pd.read_csv(
    folder_path + 'sleep-assessment.csv',
    usecols=[
        'participantId', 'timestamp', 'alcohol', 'concentrating_problem_one',
        'concentrating_problem_two', 'discomfort_in_sleep', 'exercise',
        'fatigue_limit', 'feel_tired_frequency', 'felt_alert', 'had_problem',
        'hard_times', 'medication_by_doctor', 'poor_sleep_problems',
        'sleep_aids', 'sleep_problem', 'think_clearly', 'tired_easily',
        'told_by_doctor', 'told_by_doctor_specify', 'told_to_doctor',
    ],
    parse_dates=['timestamp'],
)



In [None]:
# Both surveys have an 'alcohol' column; give each a more specific name

df_about_me.rename(columns={'alcohol': 'alcohol_consumption'}, inplace=True)
df_sleep_assessment.rename(columns={'alcohol': 'alcohol_sleep_help'}, inplace=True)

# For every survey: parse the timestamp as UTC and add a date-only column
for survey in (df_about_me, df_sleep_habits, df_sleep_assessment):
    survey['timestamp'] = pd.to_datetime(survey['timestamp'], utc=True)
    survey['date'] = survey['timestamp'].dt.date


In [None]:
# Combine the survey tables into df_surveys

surveys_list = [df_about_me, df_sleep_habits, df_sleep_assessment]

# Outer-join "about me" with "sleep habits" on participant and date;
# clashing columns from the right-hand table get a '_y' suffix
df_surveys = surveys_list[0].merge(
    surveys_list[1],
    on=['participantId', 'date'],
    how='outer',
    suffixes=('', '_y'),
)
# The sleep assessment table (surveys_list[2]) is currently not merged:
#df_surveys = pd.merge(df_surveys, surveys_list[2], on=['participantId', 'date'], how='outer', suffixes=('', '_y'))


In [None]:
# Collapse df_surveys to one row per participant and fill known gaps

# The surveys were outer-merged on (participantId, date), so a participant who
# answered them on different dates has one row per survey, each with the other
# survey's columns empty. drop_duplicates(subset=['participantId']) would keep
# only the first of those rows and discard the other survey's answers.
# GroupBy.first() instead takes the first non-null value of every column.
column_order = list(df_surveys.columns)
df_surveys = (
    df_surveys.groupby('participantId', as_index=False, sort=False)
    .first()[column_order]
)

# Default codes used where a question was left unanswered
# NOTE(review): the meaning of each code comes from the survey definitions,
# which are not visible here - confirm against the survey data dictionary.
nan_defaults = {
    'menopause': 3,
    'recent_births': 4,
    'current_pregnant': 0,
    'driving_sleepy': 6,
    'falling_asleep': 0,
    'morning_person': 3,
    'nap_duration': 6,
    'sleep_lost': 0,
    'what_wakes_you': 13,
}

for column, default in nan_defaults.items():
    df_surveys[column] = df_surveys[column].fillna(default)

In [None]:
# The timestamps and date were only needed for merging; remove them

df_surveys = df_surveys.drop(columns=['timestamp', 'date', 'timestamp_y'])

In [None]:

# Write the cleaned survey table next to the input data (no index column)
df_surveys.to_csv(f'{folder_path}surveys.csv', index=False)

# Save to CSVs

In [None]:
# Folder that the filtered/merged CSV files are written to.
# Every backslash is escaped: the previous literal contained '\D', which is an
# invalid escape sequence (it only worked because Python leaves unknown
# escapes in place, and newer versions warn about it).

save_path = 'C:\\Users\\aoife\\Documents\\Project\\filtered_data\\'

In [None]:
# Replace every subset table with a copy that has no rows containing NaN

total_rows = 0

for i, table in enumerate(subset_dfs):
    cleaned = table.dropna()
    subset_dfs[i] = cleaned

    print(cleaned.head())
    # Size of this cleaned table
    print(f"Number of rows in cleaned DataFrame {i}: {len(cleaned)}")

    # Running total across all tables
    total_rows += len(cleaned)

print(f"Total number of rows: {total_rows}")

# Concatenate Activity dataframes

In [None]:
# Stack all subset tables into one DataFrame with a fresh 0..n-1 index

concatenated_df = pd.concat(subset_dfs, ignore_index=True)

print(concatenated_df.head())

# Report the size of the combined table
print(f"Total number of rows in concatenated_df: {concatenated_df.shape[0]}")


In [None]:
# Write the concatenated activity data to the output folder (no index column)

concatenated_df.to_csv(f'{save_path}concatenated_data.csv', index=False)

# Merge Surveys with Concatenated Activity Data

In [None]:
# Attach each participant's survey answers to their activity rows

activity_and_survey_df = concatenated_df.merge(
    df_surveys,
    on=['participantId'],
    how='outer',
    suffixes=('', '_y'),
)

print(activity_and_survey_df.head())

# Size before removing incomplete rows
print(f"Number of rows in activity_and_survey_df before NaN removed: {activity_and_survey_df.shape[0]}")

# Keep only rows where every column has a value
activity_and_survey_df = activity_and_survey_df.dropna()

print(f"Number of rows in activity_and_survey_df after NaN removed: {activity_and_survey_df.shape[0]}")



# Refactor Columns

The columns 'hispanic', 'race', 'sleep_partner', and 'what_wakes_you' have multiple values. I need to refactor these columns to only include 1 value per cell.

For the 'hispanic' and 'race' columns, I will replace any multiple answers with '6', which in the survey indicated multiple races/multiple hispanic ethnicities

In [None]:
def join_multiple_race(row):
    """Return 6 for a multi-answer cell, otherwise the cell unchanged.

    A cell counts as multi-answer when it is a string containing a comma
    (e.g. '1,3'). Non-string cells (numbers, NaN) are returned as they are;
    previously `',' in row` raised TypeError for them.
    """
    if isinstance(row, str) and ',' in row:
        return 6
    return row

# Collapse multi-answer cells in both ethnicity columns
for ethnicity_column in ('hispanic', 'race'):
    activity_and_survey_df[ethnicity_column] = activity_and_survey_df[ethnicity_column].apply(join_multiple_race)

For the sleep_partner column I will add an option 'multiple' = 6, to indicate that this participant sleeps with multiple of the options in their room (e.g., they sleep with their pets and their significant other)

In [None]:
def join_multiple_sleep_partner(row):
    """Return 6 ('multiple') for a multi-answer cell, otherwise the cell unchanged.

    A cell counts as multi-answer when it is a string containing a comma.
    Non-string cells (numbers, NaN) are returned as they are; previously
    `',' in row` raised TypeError for them.
    """
    if isinstance(row, str) and ',' in row:
        return 6
    return row

# Collapse multi-answer cells in the sleep_partner column
sleep_partner_answers = activity_and_survey_df['sleep_partner']
activity_and_survey_df['sleep_partner'] = sleep_partner_answers.apply(join_multiple_sleep_partner)

In [None]:

# Write the combined activity and survey data to the output folder (no index column)

activity_and_survey_df.to_csv(f'{save_path}activity_and_survey_data.csv', index=False)

Now I check what reasons each participant provided for waking up at night.

I add them as new columns, where 1 indicates the participant reported this reason and 0 indicates they did not.

In [None]:
# Add one indicator column per wake-up reason, initialised to 0

for reason_column in (
    'noise_light',
    'stress_thinking',
    'other_person',
    'pain_discomfort',
    'nightmares',
    'bathroom_urges',
    'other_reasons',
):
    activity_and_survey_df[reason_column] = 0


In [None]:
def check_wakeup_reason(row, number):
    """Flag one wake-up reason for the given row of activity_and_survey_df.

    `number` is the answer code: 1-6 map to a specific indicator column and
    any other value maps to 'other_reasons'. The matching column is set to 1
    in the module-level activity_and_survey_df at index label `row.name`.
    """
    column_by_code = {
        1: 'noise_light',
        2: 'stress_thinking',
        3: 'other_person',
        4: 'pain_discomfort',
        5: 'nightmares',
        6: 'bathroom_urges',
    }
    target_column = column_by_code.get(number, 'other_reasons')
    activity_and_survey_df.loc[row.name, target_column] = 1

In [None]:
# Set the wake-up reason indicator columns from each row's 'what_wakes_you' answer
for _, row in activity_and_survey_df.iterrows():
    # The answer can be a comma-separated string such as '1,4', but also a bare
    # number: missing answers were filled with the integer 13, and pandas may
    # parse an all-numeric column as numbers. `',' in <number>` raises
    # TypeError, so convert to text first; split(',') covers the single-answer
    # case as well.
    for token in str(row['what_wakes_you']).split(','):
        token = token.strip()
        if token:
            # int(float(...)) also accepts numbers rendered as e.g. '13.0'
            check_wakeup_reason(row, int(float(token)))

# Remove what_wakes_you now that it is expanded into indicator columns
activity_and_survey_df.drop(columns=['what_wakes_you'], inplace=True)

print(activity_and_survey_df.head())

In [None]:

# Overwrite the combined CSV with the version that has the wake-up indicator columns

activity_and_survey_df.to_csv(f'{save_path}activity_and_survey_data.csv', index=False)