In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Summarise each user's sleep-tracking window: the earliest and the latest
# LOWER_DAYS value recorded for that USER_ID, written out as one row per user.

df_sleep = pd.read_csv("working_data/sleep_userid_day.csv")
df_sleep["LOWER_DAYS"] = pd.to_datetime(df_sleep["LOWER_DAYS"])

# Named aggregation produces the MIN_DAYS / MAX_DAYS columns directly,
# so no separate column-renaming step is needed.
days_by_user = df_sleep.groupby("USER_ID")["LOWER_DAYS"]
df_min_max = days_by_user.agg(MIN_DAYS="min", MAX_DAYS="max").reset_index()

print(df_min_max.head())
df_min_max.to_csv("working_data/sleep_userid_first_last_day.csv", index=False)

In [None]:
# Find runs of consecutive daily sleep recordings per user and measure the
# length of each run.
#
# Input:  working_data/sleep_userid_day.csv (needs USER_ID and LOWER_DAYS columns)
# Output: working_data/sleep_intervals.csv with columns
#         USER_ID, start_interval, end_interval, interval_length

df = pd.read_csv("working_data/sleep_userid_day.csv")
df["LOWER_DAYS"] = pd.to_datetime(df["LOWER_DAYS"])

# The run detection below compares every row with the previous row of the same
# user, so it is only correct when rows are in chronological order per user and
# each user-day appears once. Enforce both here instead of trusting the file:
# an out-of-order row or a repeated day would otherwise start a spurious interval.
df = (
    df.drop_duplicates(subset=["USER_ID", "LOWER_DAYS"])
    .sort_values(["USER_ID", "LOWER_DAYS"])
    .reset_index(drop=True)
)

# A row starts a new run whenever the gap to the user's previous row is not
# exactly one day (this includes the first row of each user, whose gap is
# missing). The cumulative sum of those "new run" flags gives a run id.
df["diff"] = df.groupby("USER_ID")["LOWER_DAYS"].diff().dt.days.ne(1).cumsum()

# Rows are sorted, so "first"/"last" within a run are its start and end day.
df_intervals = (
    df.groupby(["USER_ID", "diff"])
    .agg(start_interval=("LOWER_DAYS", "first"), end_interval=("LOWER_DAYS", "last"))
    .reset_index()
    .drop(columns=["diff"])
)

# +1 so that the length counts both the start day and the end day.
df_intervals["interval_length"] = (df_intervals["end_interval"] - df_intervals["start_interval"]).dt.days + 1

df_intervals.to_csv("working_data/sleep_intervals.csv", index=False)

In [None]:
# Collect the sleep records of users based in Switzerland (COUNTRY == "CH").

df = pd.read_csv("../original_data/mhs_demographics_sorted.csv")
df_country = df.loc[df["COUNTRY"] == "CH"]
user_ids = df_country["USER_ID"].unique()

# The sleep file is read in chunks of 10,000 rows (presumably because it is
# large); only the rows whose USER_ID is in the Swiss id list are kept.
sleep_chunks = pd.read_csv("../original_data/mhs_sleep_sorted.csv", chunksize=10000)
filtered_rows = [part[part["USER_ID"].isin(user_ids)] for part in sleep_chunks]

df_filtered = pd.concat(filtered_rows, ignore_index=True)
print(len(df_filtered))

In [None]:
# Join the filtered sleep records (df_filtered) with the demographics-with-age
# table on USER_ID, strip the columns that are not needed, and save the result.

df = pd.read_csv("working_data/demographics_with_age.csv")

# columns dropped because they will not be used
unused_columns = [
    "AUTO_DETECTED",
    "ADMIN_DIVISION",
    "CITY",
    "MIDPOINT_DATE",
    "BIRTHDAY",
    "FIRST_SUBMISSION_DATE",
    "LAST_SUBMISSION_DATE",
]
# columns dropped because they have NaN values
nan_columns = ["SKIN_TEMP_CELSIUS", "BLOOD_OXYGEN", "RESPIRATORY_RATE", "SLEEP_CONSISTENCY"]

df_total = (
    pd.merge(df_filtered, df, on="USER_ID", how="inner")
    .drop(columns=unused_columns)
    .drop(columns=nan_columns)
)

print(df_total.head())
df_total.to_csv("mhs_sleep_ch.csv", index=False)

In [None]:
# Save a copy of the survey data that keeps only the rows with no missing
# value in any column.
import pandas as pd

df_survey = pd.read_csv("../original_data/mhs_survey_sorted.csv").dropna()

df_survey.to_csv("mhs_survey_sorted_without_nan.csv", index=False)