# Synthesized Data Generator

In [3]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import math
from functools import reduce
sns.set()
%matplotlib inline
import os

In [59]:
# GLOBALS

# The dataframes need to be in range from 20:00:30PM to 09:52:34AM
START_TIME = datetime.time(20, 0, 30)
END_TIME = datetime.time(9, 52, 34)
PATIENT_ID = 'ggggg'
PATH_CLEAN = 'data/cleaned/samples50/'


In [5]:
def generate_df_skeleton(day):
    start_datetime = datetime.datetime(day.year, day.month, day.day, 
                              START_TIME.hour, START_TIME.minute, START_TIME.second, 0)
    
    next_day = day + datetime.timedelta(days=1)
    
    end_datetime = datetime.datetime(next_day.year, next_day.month, next_day.day, 
                            END_TIME.hour, END_TIME.minute, END_TIME.second, 0)
    
    timestamps = pd.date_range(start=start_datetime,
                         end=end_datetime, 
                         freq='s')
    null_values = [0 for _ in timestamps.to_list()]
    
    df = pd.DataFrame({'timestamp': timestamps, 
                       'measured_signal_strength': null_values,
                       'heart_rate': null_values,
                       'respiration_rate': null_values,
                      })
    return df


In [9]:
def fill_df(df, enter_bed_time, sleep_in_time, wake_up_time, get_up_time):
    start_datetime = df['timestamp'].iloc[0]
    day = start_datetime.date()
    
    enter_bed_datetime = datetime.datetime(day.year, day.month, day.day, 
                                           enter_bed_time.hour, 
                                           enter_bed_time.minute, 
                                           enter_bed_time.second, 0)
    
    sleep_in_datetime = datetime.datetime(day.year, day.month, day.day, 
                                          sleep_in_time.hour, 
                                          sleep_in_time.minute, 
                                          sleep_in_time.second, 0)
    
    next_day = day + datetime.timedelta(days=1)
    
    wake_up_datetime = datetime.datetime(next_day.year, next_day.month, next_day.day, 
                                          wake_up_time.hour, 
                                          wake_up_time.minute, 
                                          wake_up_time.second, 0)
    
    get_up_datetime = datetime.datetime(next_day.year, next_day.month, next_day.day, 
                                          get_up_time.hour, 
                                          get_up_time.minute, 
                                          get_up_time.second, 0)
    
    end_datetime = df['timestamp'].iloc[-1]
    
    
    # *** EMPTY BED ***
    
    phase1_index = 0
    empty_bed_size = (enter_bed_datetime - start_datetime).seconds
    phase2_index = empty_bed_size
    empty_bed_mss_values = np.random.randint(0, high=500, size=empty_bed_size)
    # Set MSS random equally distributed (0-500)
    df['measured_signal_strength'].iloc[phase1_index:phase2_index] = empty_bed_mss_values
    # Heart Rate and Respiration rate stay at 0
    
    # *** OCCUPIED BED, NOT YET ASLEEP ***
    occupied_bed_size = (sleep_in_datetime - enter_bed_datetime).seconds
    phase3_index = occupied_bed_size + phase2_index
    
    base_occupied_signal = np.random.uniform(0, 1, occupied_bed_size)
    # Map between 30k and 40k
    occupied_bed_mss_values = [x*10000 + 30000 for x in base_occupied_signal]
    # Map between 65 and 70
    occupied_bed_hr_values = [x*5 + 65 for x in base_occupied_signal]
    # Map between 15 and 18
    occupied_bed_rr_values = [x*3 + 15 for x in base_occupied_signal]
    
    df['measured_signal_strength'].iloc[phase2_index:phase3_index] = occupied_bed_mss_values
    df['heart_rate'].iloc[phase2_index:phase3_index] = occupied_bed_hr_values
    df['respiration_rate'].iloc[phase2_index:phase3_index] = occupied_bed_rr_values
    
    # *** OCCUPIED BED, ASLEEP ***
    sleeping_size = (wake_up_datetime - sleep_in_datetime).seconds
    phase4_index = phase3_index + sleeping_size
    
    # Create an cosine signal with a period of 90min (REM-Phase-Cycle)
    custom_cos = lambda x: (math.cos((2*math.pi*x)/5400) + 1) / 2
    
    base_sleeping_signal = [custom_cos(x) + np.random.uniform(-0.1, 0.1) 
                            for x in range(phase4_index - phase3_index)]
        
    sleeping_mss_values = [x*25000 + 15000 for x in base_sleeping_signal]
    sleeping_hr_values = [x*10 + 60 for x in base_sleeping_signal]
    sleeping_rr_values = [x*6 + 12 for x in base_sleeping_signal]
    
    
    df['measured_signal_strength'].iloc[phase3_index:phase4_index] = sleeping_mss_values
    df['heart_rate'].iloc[phase3_index:phase4_index] = sleeping_hr_values
    df['respiration_rate'].iloc[phase3_index:phase4_index] = sleeping_rr_values
    
    
    # *** OCCUPIED BED, WOKEN UP ***
    awake_size = (get_up_datetime - wake_up_datetime).seconds
    phase5_index = phase4_index + awake_size    
    
    base_awake_signal = np.random.uniform(0, 1, awake_size)
    # Map bewtween 15k and 40k
    awake_bed_mss_values = [x*25000 + 15000 for x in base_awake_signal]
    # Map bewtween 60 and 65
    awake_bed_hr_values = [x*5 + 60 for x in base_awake_signal]
    # Map between 12 and 17
    awake_bed_rr_values = [x*5 + 12 for x in base_awake_signal]
    
    df['measured_signal_strength'].iloc[phase4_index:phase5_index] = awake_bed_mss_values
    df['heart_rate'].iloc[phase4_index:phase5_index] = awake_bed_hr_values
    df['respiration_rate'].iloc[phase4_index:phase5_index] = awake_bed_rr_values
    
    # *** EMPTY BED (again) ***
    out_of_bed_size = (end_datetime - get_up_datetime).seconds
    phase6_index = phase5_index + out_of_bed_size
    
    out_of_bed_mss_values = np.random.randint(0, high=500, size=out_of_bed_size)
    # Set MSS random equally distributed (0-500)
    df['measured_signal_strength'].iloc[phase5_index:phase6_index] = out_of_bed_mss_values
    # Heart Rate and Respiration rate stay at 0
    return df


In [17]:
skel_df1 = generate_df_skeleton(datetime.date(2020, 4, 13))
skel_df2 = generate_df_skeleton(datetime.date(2020, 4, 14))
skel_df3 = generate_df_skeleton(datetime.date(2020, 4, 15))
skel_df4 = generate_df_skeleton(datetime.date(2020, 4, 16))

In [18]:
data_frames = []

data_frames.append(fill_df(skel_df1,
                           datetime.time(22, 0, 0),
                           datetime.time(23, 0, 0),
                           datetime.time(8, 0, 0),
                           datetime.time(8, 30, 0),
                          )
                  )

data_frames.append(fill_df(skel_df2,
                           datetime.time(20, 30, 0),
                           datetime.time(22, 0, 0),
                           datetime.time(6, 0, 0),
                           datetime.time(6, 5, 0),
                          )
                  )

data_frames.append(fill_df(skel_df3,
                           datetime.time(21, 0, 0),
                           datetime.time(21, 30, 0),
                           datetime.time(7, 0, 0),
                           datetime.time(7, 10, 0),
                          )
                  )

data_frames.append(fill_df(skel_df4,
                           datetime.time(23, 0, 0),
                           datetime.time(23, 45, 0),
                           datetime.time(9, 0, 0),
                           datetime.time(9, 52, 34),
                          )
                  )


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [21]:
data_frames[0]

Unnamed: 0,timestamp,measured_signal_strength,heart_rate,respiration_rate
0,2020-04-13 20:00:30,83.0,0.0,0.0
1,2020-04-13 20:00:31,326.0,0.0,0.0
2,2020-04-13 20:00:32,211.0,0.0,0.0
3,2020-04-13 20:00:33,463.0,0.0,0.0
4,2020-04-13 20:00:34,291.0,0.0,0.0
...,...,...,...,...
49920,2020-04-14 09:52:30,390.0,0.0,0.0
49921,2020-04-14 09:52:31,153.0,0.0,0.0
49922,2020-04-14 09:52:32,108.0,0.0,0.0
49923,2020-04-14 09:52:33,174.0,0.0,0.0


In [48]:

def adjust_time_stamp(row):
    # '2020-04-06T09:03:51+0000'
    ts = row['timestamp']
    date_str = ts.date().strftime('%Y-%m-%d')
    ts_str = f"{date_str}T{ts.hour}:{ts.minute}:{ts.second}+0000"
    ts_obj = datetime.datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%S%z')
    return ts_obj

# Create Timestamps
for _df in data_frames:
    _df['timestamp'] = _df.apply(lambda row: adjust_time_stamp(row), axis=1)


In [49]:
for _df in data_frames:
    try:
        _df.set_index('timestamp', inplace=True)
    except KeyError:
        pass

### 1 Hour granularity

In [53]:
dfs_1h = []
for _df in data_frames:
    df_1h = _df[['heart_rate', 'respiration_rate', 'measured_signal_strength']].resample('60Min').mean()
    df_1h.columns = ['heart_rate', 'respiration_rate', 'mss']
    df_1h['patient'] = df_1h.apply(lambda _: PATIENT_ID, axis=1)
    dfs_1h.append(df_1h)


In [55]:
dfs_1h[0].head()

Unnamed: 0_level_0,heart_rate,respiration_rate,mss,patient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-13 20:00:00+00:00,0.0,0.0,252.096639,ggggg
2020-04-13 21:00:00+00:00,0.0,0.0,250.547222,ggggg
2020-04-13 22:00:00+00:00,67.519253,16.511552,35038.505028,ggggg
2020-04-13 23:00:00+00:00,63.958118,14.374871,24895.29505,ggggg
2020-04-14 00:00:00+00:00,67.073352,16.244011,32683.38121,ggggg


In [58]:
for i, _df in enumerate(dfs_1h):
    nbr = i + 1    
    _df.to_csv(PATH_CLEAN + 'one_hour/' + f'{PATIENT_ID}_{nbr}.csv')

### 30 min granularity

In [60]:
dfs_30m = []
for _df in data_frames:
    df_30m = _df[['heart_rate', 'respiration_rate', 'measured_signal_strength']].resample('30Min').mean()
    df_30m.columns = ['heart_rate', 'respiration_rate', 'mss']
    df_30m['patient'] = df_30m.apply(lambda _: PATIENT_ID, axis=1)
    dfs_30m.append(df_30m)


In [62]:
dfs_30m[0].head(5)

Unnamed: 0_level_0,heart_rate,respiration_rate,mss,patient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-13 20:00:00+00:00,0.0,0.0,249.516384,ggggg
2020-04-13 20:30:00+00:00,0.0,0.0,254.633889,ggggg
2020-04-13 21:00:00+00:00,0.0,0.0,252.478889,ggggg
2020-04-13 21:30:00+00:00,0.0,0.0,248.615556,ggggg
2020-04-13 22:00:00+00:00,67.51894,16.511364,35037.879767,ggggg


In [63]:
for i, _df in enumerate(dfs_30m):
    nbr = i + 1    
    _df.to_csv(PATH_CLEAN + 'thirty_min/' + f'{PATIENT_ID}_{nbr}.csv')

### 5 min granularity

In [64]:
dfs_5m = []
for _df in data_frames:
    df_5m = _df[['heart_rate', 'respiration_rate', 'measured_signal_strength']].resample('5Min').mean()
    df_5m.columns = ['heart_rate', 'respiration_rate', 'mss']
    df_5m['patient'] = df_5m.apply(lambda _: PATIENT_ID, axis=1)
    dfs_5m.append(df_5m)


In [65]:
dfs_5m[0].head(5)

Unnamed: 0_level_0,heart_rate,respiration_rate,mss,patient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-13 20:00:00+00:00,0.0,0.0,250.948148,ggggg
2020-04-13 20:05:00+00:00,0.0,0.0,246.02,ggggg
2020-04-13 20:10:00+00:00,0.0,0.0,246.923333,ggggg
2020-04-13 20:15:00+00:00,0.0,0.0,247.553333,ggggg
2020-04-13 20:20:00+00:00,0.0,0.0,244.71,ggggg


In [66]:
for i, _df in enumerate(dfs_5m):
    nbr = i + 1    
    _df.to_csv(PATH_CLEAN + 'five_min/' + f'{PATIENT_ID}_{nbr}.csv')

### 1 min granularity

In [67]:
dfs_1m = []
for _df in data_frames:
    df_1m = _df[['heart_rate', 'respiration_rate', 'measured_signal_strength']].resample('1Min').mean()
    df_1m.columns = ['heart_rate', 'respiration_rate', 'mss']
    df_1m['patient'] = df_1m.apply(lambda _: PATIENT_ID, axis=1)
    dfs_1m.append(df_1m)


In [68]:
dfs_1m[0].head(5)

Unnamed: 0_level_0,heart_rate,respiration_rate,mss,patient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-13 20:00:00+00:00,0.0,0.0,221.833333,ggggg
2020-04-13 20:01:00+00:00,0.0,0.0,260.566667,ggggg
2020-04-13 20:02:00+00:00,0.0,0.0,256.916667,ggggg
2020-04-13 20:03:00+00:00,0.0,0.0,245.8,ggggg
2020-04-13 20:04:00+00:00,0.0,0.0,255.066667,ggggg


In [69]:
for i, _df in enumerate(dfs_1m):
    nbr = i + 1    
    _df.to_csv(PATH_CLEAN + 'one_min/' + f'{PATIENT_ID}_{nbr}.csv')