In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

In [None]:
# Read the sleep CSV and convert the LOWER_DAYS column to pandas timestamps so
# that date arithmetic (differences in days) works in the following cells.
df = (
    pd.read_csv("working_data/sleep_userid_day.csv")
    .assign(LOWER_DAYS=lambda frame: pd.to_datetime(frame["LOWER_DAYS"]))
)
print(df.head(5))

In [None]:
# Find intervals (runs) of consecutive daily sleep recordings per user.

# The run detection below compares every row with the previous row of the same
# user and takes the first/last row of each run, so rows must be in
# chronological order within each user. Sort explicitly instead of relying on
# the row order of the CSV (the original index labels are kept).
# NOTE(review): assumes at most one row per user and day - a repeated day gives
# a 0-day difference and would start a new interval; confirm against the data.
df = df.sort_values(["USER_ID", "LOWER_DAYS"])

# A new run starts whenever the distance to the user's previous recording is
# not exactly one day. The first row of each user has no previous row (NaN),
# which also compares as "not 1" and therefore starts a run. The cumulative sum
# of these start flags gives every run its own id.
# The column keeps the name "diff" although it stores the run id.
day_gap = df.groupby("USER_ID")["LOWER_DAYS"].diff().dt.days
df["diff"] = day_gap.ne(1).cumsum()

# One row per run: first and last recorded day of the run.
df_intervals = (
    df.groupby(["USER_ID", "diff"])
    .agg(
        start_interval=("LOWER_DAYS", "first"),
        end_interval=("LOWER_DAYS", "last"),
    )
    .reset_index()
    .drop(columns=["diff"])
)

print(df_intervals.head())

In [None]:
# Length of every interval in days, counting both its first and its last day
# (hence the + 1: an interval that starts and ends on the same day has length 1).
df_intervals["interval_length"] = (
    df_intervals["end_interval"]
    .sub(df_intervals["start_interval"])
    .dt.days
    .add(1)
)

print(df_intervals.head(5))

In [None]:
# Bar chart of interval lengths.

# The 11 most frequent interval lengths (value_counts sorts by count, descending).
interval_counts = df_intervals["interval_length"].value_counts()
print(interval_counts.head(11))

# Left-closed bin edges: one bin per length for 1-9 days, then 10-19 days, then
# 20 days up to the longest interval. pd.cut needs strictly increasing edges,
# so fixed edges that are not below the final edge are dropped; otherwise the
# cell fails with a ValueError whenever the longest interval is under 20 days.
upper_edge = df_intervals["interval_length"].max() + 1
bins = [edge for edge in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20) if edge < upper_edge]
bins.append(upper_edge)
# Each label is the inclusive day range of its bin, e.g. "10-19".
bin_labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bins[:-1], bins[1:])]

# right=False makes every bin [lo, hi), matching the labels above.
df_intervals["bin"] = pd.cut(df_intervals["interval_length"], bins=bins, labels=bin_labels, right=False)
# reindex keeps the bars in bin order and shows empty bins with a count of 0.
interval_counts = df_intervals["bin"].value_counts().reindex(bin_labels, fill_value=0)

fig, ax = plt.subplots()
ax.bar(interval_counts.index, interval_counts.values, edgecolor="black")
ax.set_xlabel("Interval Length (Days)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Interval Lengths")
ax.tick_params(axis="x", labelrotation=45)
fig.savefig("descriptive_statistics_plots/interval_length_histogram.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Bar chart comparing intervals shorter than 7 days with intervals of 7 days or more.

# Left-closed bin edges: 1-6 days and 7 days up to the longest interval.
# pd.cut needs strictly increasing edges, so a fixed edge that is not below the
# final edge is dropped; otherwise the cell fails with a ValueError whenever
# the longest interval is shorter than 7 days.
upper_edge = df_intervals["interval_length"].max() + 1
bins = [edge for edge in (1, 7) if edge < upper_edge]
bins.append(upper_edge)
# Each label is the inclusive day range of its bin, e.g. "1-6".
bin_labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bins[:-1], bins[1:])]

# right=False makes every bin [lo, hi), matching the labels above.
# Note: this overwrites the "bin" column of df_intervals.
df_intervals["bin"] = pd.cut(df_intervals["interval_length"], bins=bins, labels=bin_labels, right=False)
# reindex keeps the bars in bin order and shows empty bins with a count of 0.
interval_counts = df_intervals["bin"].value_counts().reindex(bin_labels, fill_value=0)

fig, ax = plt.subplots()
ax.bar(interval_counts.index, interval_counts.values, edgecolor="black")
ax.set_xlabel("Interval Length (Days)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Interval Lengths")
ax.tick_params(axis="x", labelrotation=45)
fig.savefig("descriptive_statistics_plots/interval_length_histogram_7_days.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Number of days without a recording between an interval and the same user's
# previous interval ("days missed").

df_intervals_diff = df_intervals[["USER_ID", "start_interval", "end_interval", "interval_length"]].copy()

# End date of the preceding interval of the same user; the first interval of
# every user has no predecessor, so this is missing (NaT) there.
previous_end = df_intervals_diff.groupby("USER_ID")["end_interval"].shift(1)

# Days strictly between the previous end and this start (hence the - 1).
# Rows without a predecessor come out as NaN and are stored as 0.
missed_days = (df_intervals_diff["start_interval"] - previous_end).dt.days - 1
df_intervals_diff["prev_interval_diff"] = missed_days.fillna(0).astype(int)

print(df_intervals_diff.head())

In [None]:
# Bar chart of the gaps (missed days) between consecutive intervals of a user.

# The first interval of each user has no previous interval; its
# prev_interval_diff holds a filled-in 0 rather than a measured gap. Mask those
# rows (set them to NaN) so they are not counted as 0-day gaps, which would
# inflate the "0-0" bar by one per user.
has_previous = df_intervals_diff.groupby("USER_ID").cumcount() > 0
gap_days = df_intervals_diff["prev_interval_diff"].where(has_previous)

# The 11 most frequent gap lengths (value_counts sorts by count, descending).
prev_interval_counts = gap_days.dropna().astype(int).value_counts()
print(prev_interval_counts.head(11))

# Left-closed bin edges: one bin per gap length for 0-9 days, then 10-19 days,
# then 20 days up to the longest gap. pd.cut needs strictly increasing edges,
# so fixed edges that are not below the final edge are dropped; otherwise the
# cell fails with a ValueError whenever the longest gap is under 20 days.
# With no gap at all (every user has a single interval) a lone "0-0" bin remains.
longest_gap = gap_days.max()
upper_edge = 1 if pd.isna(longest_gap) else max(int(longest_gap) + 1, 1)
bins = [edge for edge in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20) if edge < upper_edge]
bins.append(upper_edge)
# Each label is the inclusive day range of its bin, e.g. "10-19".
bin_labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bins[:-1], bins[1:])]

# right=False makes every bin [lo, hi), matching the labels above.
# Masked rows (first interval of a user) get a missing bin and are not counted.
df_intervals_diff["bin"] = pd.cut(gap_days, bins=bins, labels=bin_labels, right=False)
# reindex keeps the bars in bin order and shows empty bins with a count of 0.
prev_interval_counts = df_intervals_diff["bin"].value_counts().reindex(bin_labels, fill_value=0)

fig, ax = plt.subplots()
ax.bar(prev_interval_counts.index, prev_interval_counts.values, edgecolor="black")
ax.set_xlabel("Previous Interval Difference (Days)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Previous Interval Differences")
ax.tick_params(axis="x", labelrotation=45)
fig.savefig("descriptive_statistics_plots/interval_breaks_length_histogram.png", dpi=300, bbox_inches="tight")
plt.show()