In [None]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Configuration flag: when True, the merged demographics/survey table is
# restricted to rows whose COUNTRY column equals 'CH'; when False, rows from
# all countries are kept.
only_CH = True

In [None]:
# Load the demographics and survey tables, each ordered by ascending USER_ID.
# Survey rows that have a missing value in ANY column are discarded.

df_demo = pd.read_csv("../original_data/mhs_demographics_sorted.csv").sort_values("USER_ID")

df_survey = (
    pd.read_csv("../original_data/mhs_survey_sorted.csv")
    .sort_values("USER_ID")
    .dropna()
)

# number of complete survey rows that remain after dropping missing values
print(f"Number of entries in dataset: {len(df_survey)}")

In [None]:
# Combine demographics with survey answers: inner join on USER_ID, so only
# users present in both tables survive.

df_total = df_demo.merge(df_survey, on="USER_ID", how="inner")

# optional restriction to the 'CH' country code (see the only_CH flag)
if only_CH:
    is_ch = df_total["COUNTRY"] == "CH"
    df_total = df_total.loc[is_ch]

print(f"Number of entries in dataset: {len(df_total)}")

In [None]:
# Reduce the merged table to the user id plus its three date columns and
# parse those columns into datetimes.

df_dates = df_total[
    ["USER_ID", "FIRST_SUBMISSION_DATE", "LAST_SUBMISSION_DATE", "SUBMITDATE"]
].assign(
    FIRST_SUBMISSION_DATE=lambda d: pd.to_datetime(d["FIRST_SUBMISSION_DATE"]),
    LAST_SUBMISSION_DATE=lambda d: pd.to_datetime(d["LAST_SUBMISSION_DATE"]),
    # SUBMITDATE is parsed with an explicit month/day/two-digit-year format
    SUBMITDATE=lambda d: pd.to_datetime(d["SUBMITDATE"], format="%m/%d/%y"),
)

print(df_dates.head())

In [None]:
# count number of entries per user-id, i.e. number of surveys submitted

# Series of row counts indexed by USER_ID (groupby orders the index by USER_ID).
user_counts = df_dates.groupby("USER_ID").size()

# Same counts as a two-column table, ordered by descending count
# (value_counts default). The columns are named explicitly instead of being
# overwritten positionally, so the result does not depend on how a given
# pandas version labels the output of value_counts().reset_index().
df_user_counts = (
    df_dates["USER_ID"]
    .value_counts()
    .rename_axis("USER_ID")
    .reset_index(name="ENTRY_COUNT")
)

In [None]:
# count how many people have submitted a certain amount of surveys (1 to 14)
# and plot that distribution as a bar chart

# index: number of surveys submitted, value: number of users with that count
survey_counts = df_user_counts["ENTRY_COUNT"].value_counts()
print(survey_counts)

# savefig does not create missing directories; make sure the target exists so
# the cell also runs on a fresh checkout
os.makedirs("descriptive_statistics_plots", exist_ok=True)

fig, ax = plt.subplots()
ax.bar(survey_counts.index, survey_counts.values, edgecolor="black")
ax.set_xlabel("Number of Surveys filled out")
ax.set_ylabel("Count")
ax.set_title("Histogram of number of surveys")
fig.savefig("descriptive_statistics_plots/number_of_surveys_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Build one row per submitted survey that carries the user's first and last
# recorded sleep day (MIN_DAYS / MAX_DAYS), the survey's SUBMITDATE and the
# user's total number of surveys (ENTRY_COUNT).

df_sleep_min_max = pd.read_csv("../working_data/sleep_userid_first_last_day.csv")

# sleep range + per-user survey count; inner join keeps users present in both
df = df_sleep_min_max.merge(df_user_counts, on="USER_ID", how="inner")

# attach every survey date of each user, then discard the two per-user
# submission-date columns that are not needed here
df_surveys = (
    df.merge(df_dates, on="USER_ID", how="inner")
    .drop(columns=["FIRST_SUBMISSION_DATE", "LAST_SUBMISSION_DATE"])
)

# make sure all three date columns are datetimes
for date_col in ("MIN_DAYS", "MAX_DAYS", "SUBMITDATE"):
    df_surveys[date_col] = pd.to_datetime(df_surveys[date_col])

print(df_surveys.head())

In [None]:
# Express each survey's submission date as a fraction of the user's
# sleep-recording period: (SUBMITDATE - MIN_DAYS) / (MAX_DAYS - MIN_DAYS).
# A value of 0 corresponds to MIN_DAYS and 1 to MAX_DAYS.

elapsed = df_surveys["SUBMITDATE"] - df_surveys["MIN_DAYS"]
recording_span = df_surveys["MAX_DAYS"] - df_surveys["MIN_DAYS"]

# NOTE(review): rows where MAX_DAYS equals MIN_DAYS divide by a zero-length
# timedelta here -- confirm such rows cannot occur or are acceptable downstream.
df_surveys["FRAC"] = (elapsed / recording_span).astype(float)

print(df_surveys.head())

In [None]:
# For every number of submitted surveys that actually occurs in the data, plot
# where inside the sleep-recording period (FRAC: 0 = MIN_DAYS, 1 = MAX_DAYS)
# those surveys were submitted. FRAC values outside [0, 1] lie outside the bin
# edges and are therefore not drawn.
bins = np.linspace(0, 1, 11)

# savefig does not create missing directories; make sure the target exists
os.makedirs("descriptive_statistics_plots", exist_ok=True)

# Iterate over the observed counts rather than a fixed 1..14 range, so that
# counts without any user do not yield empty figures and counts above 14 are
# not silently skipped.
for i in sorted(int(n) for n in df_surveys["ENTRY_COUNT"].unique()):
    # surveys of users who submitted exactly i surveys
    df_count = df_surveys[df_surveys["ENTRY_COUNT"] == i]

    fig, ax = plt.subplots()
    ax.hist(df_count["FRAC"], bins=bins, edgecolor="black")
    ax.set_xlabel("Fraction of survey submission date between first and last time sleep was recorded")
    ax.set_ylabel("Frequency")
    ax.set_title(f"Relative distribution of survey submission date for {i} surveys submitted")

    # reference positions j / (i + 1), j = 1..i: where i surveys would fall if
    # they split the recording period into i + 1 equal parts
    vlines = [(j / (i + 1)) for j in range(1, i + 1)]
    for v in vlines:
        ax.axvline(v, linestyle='dashed', color='red', alpha=0.7, label=f"{v:.2f}")

    # the line labels are only rendered if a legend is drawn; it is placed
    # outside the axes so it does not cover the bars
    ax.legend(title="Even spacing", fontsize="small", loc="center left", bbox_to_anchor=(1.02, 0.5))

    fig.savefig(f"descriptive_statistics_plots/relative_distribution_{i}_surveys.png", dpi=300, bbox_inches="tight")
    plt.show()
    # release the figure so figures do not accumulate across loop iterations
    plt.close(fig)