In [1]:
import pandas as pd

from src.data_cleaning import (
    create_merged_df
)

from src.data_engineering import (
    calculate_rolling_baselines,
    engineer_target_variable,
    add_risk_scores,
    generate_mock_appointments
)

from src.config import (
    RISK_THRESHOLD,
    LOOKAHEAD_DAYS,
    SUBJECTIVE_COLS,
    PHYSIOLOGICAL_CONFIG,
    participant_information
)

In [3]:
# Participant IDs configured for this run (the keys of participant_information).
list(participant_information.keys())

['60301085', '60107567', '60300310']

In [5]:
# Build one merged DataFrame per participant ID, then stack them vertically.
# ignore_index=True discards the per-participant indexes in favour of a single
# fresh 0..n-1 index on the combined frame.
dfs = list(map(create_merged_df, participant_information.keys()))
df_final = pd.concat(dfs, ignore_index=True)

In [6]:
# Inspect the combined frame: rows from every participant, under one fresh integer index.
df_final

Unnamed: 0,participant_id,dateTime,resting_heart_rate,avg_overall_sleep_score,fatigue,mood,stress,sleep_quality,very_active_minutes_sum,sedentary_minutes_sum
0,60301085,2019-11-05,,,,,,,0,1440
1,60301085,2019-11-06,,,3.0,3.0,5.0,4.0,65,1270
2,60301085,2019-11-07,55.667916,,4.0,3.0,4.0,4.0,105,563
3,60301085,2019-11-08,53.800657,82.0,3.0,3.0,3.0,4.0,29,660
4,60301085,2019-11-09,53.646626,76.0,4.0,4.0,4.0,4.0,324,383
...,...,...,...,...,...,...,...,...,...,...
438,60300310,2020-03-27,57.553473,75.0,4.0,3.0,2.0,4.0,0,855
439,60300310,2020-03-28,58.318448,75.0,4.0,3.0,2.0,4.0,22,1130
440,60300310,2020-03-29,60.342356,76.0,4.0,3.0,2.0,4.0,2,808
441,60300310,2020-03-30,58.196049,88.0,4.0,4.0,4.0,3.0,0,748


In [7]:
# Add rolling baseline statistics (mean and standard deviation) for the two
# physiological signals. With window_size=14 the recorded output shows the new
# columns suffixed `_14_mean` / `_14_std`.
df_final = calculate_rolling_baselines(
    df_final,
    ["resting_heart_rate", "avg_overall_sleep_score"],
    dateTime_column="dateTime",
    window_size=14,
    calculate_mean=True,
    calculate_std=True,
)

In [8]:
# Peek at the first five rows to check the rolling-baseline columns.
df_final.head(n=5)

Unnamed: 0_level_0,participant_id,dateTime,resting_heart_rate,avg_overall_sleep_score,fatigue,mood,stress,sleep_quality,very_active_minutes_sum,sedentary_minutes_sum,resting_heart_rate_14_mean,resting_heart_rate_14_std,avg_overall_sleep_score_14_mean,avg_overall_sleep_score_14_std
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-11-15,60107567,2019-11-15,53.468737,76.0,2.0,4.0,4.0,4.0,99,613,55.313041,3.504222,76.142857,10.316955
2019-11-16,60107567,2019-11-16,54.426253,75.0,3.0,4.0,4.0,3.0,111,546,54.852928,3.120667,75.071429,9.514881
2019-11-16,60301085,2019-11-16,55.303761,80.0,3.0,3.0,4.0,3.0,139,513,55.044874,3.05447,74.642857,9.111338
2019-11-16,60300310,2019-11-16,60.384912,81.0,5.0,4.0,2.0,5.0,34,536,55.582254,3.293276,74.857143,9.230861
2019-11-17,60107567,2019-11-17,53.575249,83.0,2.0,3.0,3.0,2.0,106,573,55.697496,3.183232,75.642857,9.434855


In [9]:
# Append the daily risk-score columns, driven by the subjective and
# physiological column configurations imported from src.config.
df_final = add_risk_scores(
    df_final,
    subjective_cols_config=SUBJECTIVE_COLS,
    physiological_cols_config=PHYSIOLOGICAL_CONFIG,
)

Successfully calculated and appended daily risk scores using refactored functions.


In [10]:
# Inspect the frame after the risk-score step.
df_final

Unnamed: 0_level_0,participant_id,dateTime,resting_heart_rate,avg_overall_sleep_score,fatigue,mood,stress,sleep_quality,very_active_minutes_sum,sedentary_minutes_sum,resting_heart_rate_14_mean,resting_heart_rate_14_std,avg_overall_sleep_score_14_mean,avg_overall_sleep_score_14_std,Subjective_Distress_Score,Physiological_Deviation_Score
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-11-15,60107567,2019-11-15,53.468737,76.0,2.0,4.0,4.0,4.0,99,613,55.313041,3.504222,76.142857,10.316955,1,0
2019-11-16,60107567,2019-11-16,54.426253,75.0,3.0,4.0,4.0,3.0,111,546,54.852928,3.120667,75.071429,9.514881,0,0
2019-11-16,60301085,2019-11-16,55.303761,80.0,3.0,3.0,4.0,3.0,139,513,55.044874,3.054470,74.642857,9.111338,0,0
2019-11-16,60300310,2019-11-16,60.384912,81.0,5.0,4.0,2.0,5.0,34,536,55.582254,3.293276,74.857143,9.230861,1,1
2019-11-17,60107567,2019-11-17,53.575249,83.0,2.0,3.0,3.0,2.0,106,573,55.697496,3.183232,75.642857,9.434855,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-30,60300310,2020-03-30,58.196049,88.0,4.0,4.0,4.0,3.0,0,748,56.320251,2.763394,78.357143,4.125104,0,0
2020-03-30,60301085,2020-03-30,55.178128,88.0,3.0,3.0,3.0,3.0,15,496,56.088858,2.709362,79.071429,4.859001,0,0
2020-03-31,60107567,2020-03-31,53.567041,72.0,2.0,4.0,4.0,2.0,95,665,55.800677,2.750342,78.357143,5.123207,2,1
2020-03-31,60301085,2020-03-31,55.457790,88.0,3.0,3.0,3.0,3.0,67,387,56.023167,2.569128,79.142857,5.709064,0,0


In [11]:
# Engineer the prediction target from the risk scores, using the threshold and
# look-ahead window configured in src.config. In the recorded run this step adds
# Composite_Risk_Score, High_Risk_State and Is_High_Risk_Next_7_Days.
df_final = engineer_target_variable(
    df_final,
    risk_threshold=RISK_THRESHOLD,
    lookahead_window=LOOKAHEAD_DAYS,
)


In [12]:
# List the columns present after target engineering.
df_final.columns

Index(['participant_id', 'dateTime', 'resting_heart_rate',
       'avg_overall_sleep_score', 'fatigue', 'mood', 'stress', 'sleep_quality',
       'very_active_minutes_sum', 'sedentary_minutes_sum',
       'resting_heart_rate_14_mean', 'resting_heart_rate_14_std',
       'avg_overall_sleep_score_14_mean', 'avg_overall_sleep_score_14_std',
       'Subjective_Distress_Score', 'Physiological_Deviation_Score',
       'Composite_Risk_Score', 'High_Risk_State', 'Is_High_Risk_Next_7_Days'],
      dtype='object')

In [21]:
# Persist the engineered frame. index=False drops the index; in the displayed
# frame the index mirrors the 'dateTime' column, which is still written.
participants_csv_path = "cleaned_data/many_participants_v2.csv"
df_final.to_csv(participants_csv_path, index=False)

In [14]:
# Inspect the fully engineered frame (baselines, risk scores and target columns).
df_final

Unnamed: 0_level_0,participant_id,dateTime,resting_heart_rate,avg_overall_sleep_score,fatigue,mood,stress,sleep_quality,very_active_minutes_sum,sedentary_minutes_sum,resting_heart_rate_14_mean,resting_heart_rate_14_std,avg_overall_sleep_score_14_mean,avg_overall_sleep_score_14_std,Subjective_Distress_Score,Physiological_Deviation_Score,Composite_Risk_Score,High_Risk_State,Is_High_Risk_Next_7_Days
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-11-15,60107567,2019-11-15,53.468737,76.0,2.0,4.0,4.0,4.0,99,613,55.313041,3.504222,76.142857,10.316955,1,0,1,0,0
2019-11-16,60107567,2019-11-16,54.426253,75.0,3.0,4.0,4.0,3.0,111,546,54.852928,3.120667,75.071429,9.514881,0,0,0,0,1
2019-11-16,60301085,2019-11-16,55.303761,80.0,3.0,3.0,4.0,3.0,139,513,55.044874,3.054470,74.642857,9.111338,0,0,0,0,1
2019-11-16,60300310,2019-11-16,60.384912,81.0,5.0,4.0,2.0,5.0,34,536,55.582254,3.293276,74.857143,9.230861,1,1,2,0,1
2019-11-17,60107567,2019-11-17,53.575249,83.0,2.0,3.0,3.0,2.0,106,573,55.697496,3.183232,75.642857,9.434855,2,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-30,60300310,2020-03-30,58.196049,88.0,4.0,4.0,4.0,3.0,0,748,56.320251,2.763394,78.357143,4.125104,0,0,0,0,1
2020-03-30,60301085,2020-03-30,55.178128,88.0,3.0,3.0,3.0,3.0,15,496,56.088858,2.709362,79.071429,4.859001,0,0,0,0,1
2020-03-31,60107567,2020-03-31,53.567041,72.0,2.0,4.0,4.0,2.0,95,665,55.800677,2.750342,78.357143,5.123207,2,1,3,1,0
2020-03-31,60301085,2020-03-31,55.457790,88.0,3.0,3.0,3.0,3.0,67,387,56.023167,2.569128,79.142857,5.709064,0,0,0,0,0


In [15]:
# Composite risk score for every row, as a Series.
df_final.loc[:, "Composite_Risk_Score"]

dateTime
2019-11-15    1
2019-11-16    0
2019-11-16    0
2019-11-16    2
2019-11-17    2
             ..
2020-03-30    0
2020-03-30    0
2020-03-31    3
2020-03-31    0
2020-03-31    0
Name: Composite_Risk_Score, Length: 412, dtype: int64

In [16]:
# First and last observation date per participant, each as a
# {participant_id: datetime.date} dictionary.
dates_by_participant = df_final.groupby("participant_id")["dateTime"]
start_dates = dates_by_participant.min().dt.date.to_dict()
end_dates = dates_by_participant.max().dt.date.to_dict()

In [17]:
# Generate mock appointments for every configured participant, bounded by the
# first and last dates that participant has in df_final.
# NOTE(review): the appointments look randomly generated; if so, consider seeding
# the RNG for reproducible runs. Confirm how generate_mock_appointments draws them.
all_appointments = []
for participant_id in participant_information.keys():
    first_day = start_dates.get(participant_id)
    last_day = end_dates.get(participant_id)
    if first_day is None or last_day is None:
        # A configured participant may have no rows left in df_final. Without this
        # guard, str(None) would hand the literal text "None" to the generator
        # as if it were a date.
        print(f"Skipping {participant_id}: no date range available in df_final")
        continue
    # The generator is called with the dates rendered as strings, as before.
    start_date = str(first_day)
    end_date = str(last_day)
    appointments = generate_mock_appointments(participant_id, start_date, end_date)
    all_appointments.extend(appointments)


Generating mock data for 60301085 participant
Generating mock data for 60107567 participant
Generating mock data for 60300310 participant


In [18]:
# Inspect the raw appointment records (a list of dicts); the next cell converts
# them to a DataFrame.
all_appointments

[{'participant_id': '60301085',
  'AppointmentDate': datetime.date(2020, 1, 22),
  'Status': 'Cancelled'},
 {'participant_id': '60301085',
  'AppointmentDate': datetime.date(2019, 11, 26),
  'Status': 'Attended'},
 {'participant_id': '60301085',
  'AppointmentDate': datetime.date(2019, 12, 17),
  'Status': 'Attended'},
 {'participant_id': '60301085',
  'AppointmentDate': datetime.date(2020, 1, 15),
  'Status': 'Cancelled'},
 {'participant_id': '60107567',
  'AppointmentDate': datetime.date(2020, 3, 21),
  'Status': 'Cancelled'},
 {'participant_id': '60107567',
  'AppointmentDate': datetime.date(2020, 2, 27),
  'Status': 'Attended'},
 {'participant_id': '60107567',
  'AppointmentDate': datetime.date(2020, 3, 19),
  'Status': 'Cancelled'},
 {'participant_id': '60107567',
  'AppointmentDate': datetime.date(2020, 1, 31),
  'Status': 'Cancelled'},
 {'participant_id': '60107567',
  'AppointmentDate': datetime.date(2020, 1, 25),
  'Status': 'Attended'},
 {'participant_id': '60107567',
  'Appo

In [19]:
# One row per appointment record; columns come from the record dictionaries' keys.
appointments_df = pd.DataFrame(data=all_appointments)

# Sanity check: which participants ended up with appointments.
appointments_df.loc[:, "participant_id"].unique()

array(['60301085', '60107567', '60300310'], dtype=object)

In [20]:
# Persist the appointments table; index=False leaves out the positional row index.
appointments_csv_path = "cleaned_data/appointments_v2.csv"
appointments_df.to_csv(appointments_csv_path, index=False)