In [None]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pad_sequence

import plotly.express as px


# Analysis of a single iteration (iteration 01)

In [None]:
# Base folder that later cells prefix to the pickle paths they read and write.
# NOTE(review): those cells append another "/Outputs_project_data_preprocessing/"
# segment after this value — confirm the folder really is nested twice on disk.
path_to_folder = 'D:/DATA_THESIS/Projekt_nw_AG_AJ/Outputs_project_data_preprocessing'

In [None]:
# Sleep records of the first iteration, read from a path relative to the
# notebook's working directory.
# NOTE(review): unpickling executes arbitrary code — only load trusted files.
sleep_data_i01 = pd.read_pickle(
    filepath_or_buffer="all_fitbits/iteration_01_sleep.pkl"
)

In [None]:
# One bar chart per patient: timeInBed for every dateOfSleep, coloured by the
# mainSleep column, with startTime shown on hover.
for patient in sleep_data_i01['patient_id'].unique():
    # Keep this patient's rows and at most one record per dateOfSleep
    # (the first occurrence wins).
    single_patient = (
        sleep_data_i01
        .loc[sleep_data_i01['patient_id'] == patient]
        .drop_duplicates(subset=['dateOfSleep'])
    )
    fig = px.bar(
        single_patient,
        x='dateOfSleep',
        y="timeInBed",
        color='mainSleep',
        hover_data=['startTime'],
        title=f'Patient: {patient}',
    )
    fig.show()

# Load data from the 1st notebook – 1_morning_evening_emotion
* Load the file with the merged evening_morning data

In [None]:
# Merged evening/morning questionnaire frame produced by the first notebook.
evening_morning = pd.read_pickle(
    filepath_or_buffer=f'{path_to_folder}/Outputs_project_data_preprocessing/1_data/evening_morning.pkl'
)

In [None]:
# Number of distinct ids in the questionnaire data.
len(evening_morning['id'].unique())

# Load the long-sleep data for all iterations from notebook 2_fitbit_sleep_data


In [None]:
# Long-sleep records of all iterations, as written by the Fitbit sleep notebook.
sleep_data = pd.read_pickle(
    filepath_or_buffer=f"{path_to_folder}/Outputs_project_data_preprocessing/2_fitbit_sleep/long_sleep_all_iterations.pkl"
)

In [None]:
# Number of distinct patient ids in the sleep data.
len(sleep_data['patient_id'].unique())

# Join evening_morning and long_sleep_data

In [None]:
# Build the shared join key `morning_date`: each timestamp floored to the start
# of its day. For sleep records it comes from the end of the sleep, for the
# questionnaires from the time the morning form was filled in.
sleep_data['morning_date'] = (
    pd.to_datetime(sleep_data['endTime']).dt.floor(freq='D')
)
evening_morning['morning_date'] = (
    pd.to_datetime(evening_morning['filledTimestamp_morning']).dt.floor(freq='D')
)

In [None]:
# Mirror patient_id under the name `id`, the key the questionnaire frame uses,
# so both frames can be merged on the same column name.
sleep_data['id'] = sleep_data.loc[:, 'patient_id']

In [None]:
# Inner join: keep only (id, morning_date) pairs present in both the sleep
# records and the questionnaires.
merged_df = sleep_data.merge(
    evening_morning,
    how='inner',
    on=['id', 'morning_date'],
)

In [None]:
# Count of rows whose minutesAfterWakeup equals 0; `.get(0, 0)` yields 0 when
# the value 0 never occurs in the column.
merged_df['minutesAfterWakeup'].value_counts().get(0, 0)

In [None]:
# List the columns available after the join.
merged_df.columns

## Save joined dataframe as m_e_sleeplength 

m - stands for morning

e - stands for evening

In [None]:
# Persist the joined frame so later cells can reload it without redoing the merge.
# The folder name must be the one the reload cell reads from
# ("3_questionnaires_fitbit"). It was misspelled "3_questtionnaires_fitbit" here,
# so the file was written to a different directory than the one it is read from.
# to_pickle does not create missing directories — the folder must already exist.
merged_df.to_pickle(f"{path_to_folder}/Outputs_project_data_preprocessing/3_questionnaires_fitbit/m_e_sleeplength.pkl")

In [None]:
# Show the joined frame.
merged_df

# Load dataframe m_e_sleeplength.pkl

In [None]:
# merged_df = merged_df.drop(['duration', 'logId', 'minutesToFallAsleep', 'minutesAfterWakeup','patient_id', 'logType','infoCode', 'type', 'levels','mainSleep', 'Unnamed: 0'  ], axis = 1)

In [None]:
# Reload the joined sleep/questionnaire frame from disk; this rebinds
# `merged_df` to the stored copy.
merged_df = pd.read_pickle(
    filepath_or_buffer=f"{path_to_folder}/Outputs_project_data_preprocessing/3_questionnaires_fitbit/m_e_sleeplength.pkl"
)

In [None]:
# Disabled clean-up of a stray 'Unnamed: 0' column, kept for reference:
# merged_df = merged_df.drop(['Unnamed: 0'], axis =1 )
# List the columns of the reloaded frame.
merged_df.columns

In [None]:
# Working copy without the columns that are not used further on. `patient_id`
# and `morning_date` are kept because the following cells group and plot by them.
# drop() raises if any listed column is missing.
clear_m_e_sl = merged_df.drop(
    columns=[
        'duration',
        'filledTimestamp_evening',
        'filledTimestamp_morning',
        'startTime',
        'endTime',
        'id',
        'rest',
        'stress',
        'composure',
        'dateOfSleep',
        'logId',
        'minutesToFallAsleep',
        'minutesAfterWakeup',
        'efficiency',
        'type',
        'infoCode',
        'logType',
        'levels',
        'mainSleep',
    ]
)

In [None]:
# Show the reduced frame.
clear_m_e_sl

In [None]:
# One bar chart per patient: timeInBed per morning_date, coloured by the
# sleepQuality column (before duplicate dates are merged).
for patient in clear_m_e_sl['patient_id'].unique():
    # Rows of the current patient only.
    single_patient = clear_m_e_sl.loc[clear_m_e_sl['patient_id'] == patient]
    fig = px.bar(
        single_patient,
        x='morning_date',
        y="timeInBed",
        color='sleepQuality',
        title=f'Patient: {patient}',
    )
    fig.show()

In [None]:
# fix the data
# Collapse several sleep records that share a patient and morning_date into one
# row. For every duplicated date the first record is kept and its minutesAsleep,
# minutesAwake and timeInBed are replaced by the sums over all records of that
# date; every other column keeps the first record's value.
# Row order is unchanged from the previous implementation: per patient, the
# collapsed rows come first, followed by the rows whose date was already unique.
summed_columns = ['minutesAsleep', 'minutesAwake', 'timeInBed']
# Pieces are collected in a list and concatenated once. Growing a frame with
# pd.concat inside the loop is quadratic, and starting from an empty DataFrame
# makes the result dtypes depend on how pandas treats empty concat entries.
cleaned_parts = []
for patient in clear_m_e_sl['patient_id'].unique():
    # filter for 1 patient
    single_patient = clear_m_e_sl[clear_m_e_sl['patient_id'] == patient]
    # True for every row whose morning_date occurs more than once for this patient
    is_duplicated = single_patient.duplicated(subset=['morning_date'], keep=False)
    duplicated_dates = single_patient[is_duplicated]
    # rows with a unique date are passed through untouched
    single_patient = single_patient[~is_duplicated]

    for i in duplicated_dates['morning_date'].unique():
        same_day = duplicated_dates[duplicated_dates['morning_date'] == i]
        # .copy() gives an independent one-row frame, so the assignments below
        # do not write into a slice (no SettingWithCopyWarning)
        result_day = same_day.head(1).copy()
        for column in summed_columns:
            result_day[column] = same_day[column].sum()
        cleaned_parts.append(result_day)

    cleaned_parts.append(single_patient)

# pd.concat raises on an empty list; fall back to an empty frame as before
new_clear = pd.concat(cleaned_parts, axis=0) if cleaned_parts else pd.DataFrame()
        


In [None]:
# Replace the index inherited from the concatenated pieces with a fresh 0..n-1
# range; the old index is discarded (drop=True).
new_clear = new_clear.reset_index(drop = True)

In [None]:
# Show the frame after duplicate dates were merged.
new_clear

In [None]:
# One bar chart per patient: timeInBed per morning_date, coloured by the
# sleepQuality column (after duplicate dates were merged).
for patient in new_clear['patient_id'].unique():
    # Rows of the current patient only.
    single_patient = new_clear.loc[new_clear['patient_id'] == patient]
    fig = px.bar(
        single_patient,
        x='morning_date',
        y="timeInBed",
        color='sleepQuality',
        title=f'Patient: {patient}',
    )
    fig.show()

In [None]:
# Index labels of `single_patient`, i.e. of whichever patient the most recently
# executed loop processed last — the result depends on cell execution order.
single_patient.index

In [None]:
# delete single data
# Remove patients that have exactly one row in new_clear.
# NOTE(review): presumably because a single observation cannot be interpolated
# over a date range — confirm the intent.
# The rows are picked with one vectorised mask instead of dropping and
# re-indexing the frame on every iteration of a loop over its own patient ids.
has_other_rows = new_clear['patient_id'].duplicated(keep=False)
# Rows with a missing patient_id are kept, as before: they never matched the
# `== patient` filter of the original loop and so were never dropped.
single_row_mask = ~has_other_rows & new_clear['patient_id'].notna()
if single_row_mask.any():
    # show what is being removed
    display(new_clear[single_row_mask])
new_clear = new_clear[~single_row_mask].reset_index(drop=True)

In [None]:
# One bar chart per remaining patient: timeInBed per morning_date, coloured by
# the sleepQuality column (after single-row patients were removed).
for patient in new_clear['patient_id'].unique():
    # Rows of the current patient only.
    single_patient = new_clear.loc[new_clear['patient_id'] == patient]
    fig = px.bar(
        single_patient,
        x='morning_date',
        y="timeInBed",
        color='sleepQuality',
        title=f'Patient: {patient}',
    )
    fig.show()

In [None]:
# List the columns that are available for interpolation.
new_clear.columns

# Interpolate the missing data

In [None]:
# Build a gap-free daily series per patient: every calendar day between the
# patient's first and last morning_date gets a row, and the numeric columns
# listed below are filled by linear interpolation between observed days.
# Columns not listed (apart from patient_id) stay NaN on the inserted days.
columns_to_interpolate = ['minutesAsleep', 'minutesAwake', 'timeInBed', 'sleepQuality', 'overwhelm', 'health', 'mood', 'unpredictability', 'time_difference']
# Per-patient frames are collected and concatenated once at the end instead of
# growing a frame with pd.concat inside the loop.
interpolated_parts = []
for patient in new_clear['patient_id'].unique():
    # filter for 1 patient; .copy() because the slice is modified below
    single_patient = new_clear[new_clear['patient_id'] == patient].copy()
    # Convert 'morning_date' to datetime (no-op if it already is) before taking
    # min/max, so the range is computed on real timestamps
    single_patient['morning_date'] = pd.to_datetime(single_patient['morning_date'])
    min_date = single_patient['morning_date'].min()
    max_date = single_patient['morning_date'].max()

    # Step 2: one row per calendar day between min and max date (inclusive)
    all_dates = pd.date_range(start=min_date, end=max_date)
    all_dates_df = pd.DataFrame({'morning_date': all_dates})

    # Step 3: left-join the observations onto the full calendar; days without
    # data become all-NaN rows.
    # A dedicated name is used so the notebook-level `merged_df` is no longer
    # overwritten by this loop's temporary frame.
    patient_daily = pd.merge(all_dates_df, single_patient, on='morning_date', how='left')

    # Step 4: fill the gaps. Whole-column assignment replaces the columns, so
    # interpolated floats never have to be written into integer-typed columns.
    patient_daily[columns_to_interpolate] = patient_daily[columns_to_interpolate].interpolate(method='linear')
    patient_daily['patient_id'] = patient_daily['patient_id'].fillna(patient)
    interpolated_parts.append(patient_daily)

# Each patient's rows keep their own 0..k index, as before (no ignore_index).
# pd.concat raises on an empty list; fall back to an empty frame as before.
interpolated_df = pd.concat(interpolated_parts, axis=0) if interpolated_parts else pd.DataFrame()
    # Resulting DataFrame will have approximate data for all dates between min and max date


In [None]:
# One bar chart per patient: timeInBed per morning_date, coloured by the
# sleepQuality column, now including the interpolated days.
for patient in interpolated_df['patient_id'].unique():
    # Rows of the current patient only.
    single_patient = interpolated_df.loc[interpolated_df['patient_id'] == patient]
    fig = px.bar(
        single_patient,
        x='morning_date',
        y="timeInBed",
        color='sleepQuality',
        title=f'Patient: {patient}',
    )
    fig.show()

In [None]:
# interpolated_df.to_csv('interpolated_m_e_l.csv')