# Convert RAW JSON listener files to annotator-friendly file

* Project: AHRQ/MeTeOR/PERSEUS

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

## Create functions for assigning timestamps to signals

Data arrives in packets of 64 (ECG) or 32 (pleth) samples that share a single timestamp. Therefore, each sample in a packet must be assigned its own timestamp.

In [None]:
def expand_pleth_times(timestamp, periods=32, freq='8ms'):
    """Return one timestamp per sample of a pleth packet.

    A pleth packet carries 32 samples but only a single timestamp; this
    builds evenly spaced timestamps, 8 ms apart by default, starting at
    (and including) ``timestamp``.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; becomes the first element of the result.
    periods : int, default 32
        Number of samples in the packet.
    freq : str or offset, default '8ms'
        Spacing between consecutive samples.

    Returns
    -------
    pandas.DatetimeIndex
        ``periods`` timestamps starting at ``timestamp``.
    """
    # No ``closed="left"`` here: with only a start and a number of periods it
    # never trimmed anything, and newer pandas releases no longer accept it.
    # 'ms' is the non-deprecated spelling of the old 'L' (millisecond) alias.
    return pd.date_range(timestamp, periods=periods, freq=freq)

def expand_ecg_times(timestamp, periods=64, freq='4ms'):
    """Return one timestamp per sample of an ECG packet.

    An ECG packet carries 64 samples but only a single timestamp; this
    builds evenly spaced timestamps, 4 ms apart by default, starting at
    (and including) ``timestamp``.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; becomes the first element of the result.
    periods : int, default 64
        Number of samples in the packet.
    freq : str or offset, default '4ms'
        Spacing between consecutive samples.

    Returns
    -------
    pandas.DatetimeIndex
        ``periods`` timestamps starting at ``timestamp``.
    """
    # No ``closed="left"`` here: with only a start and a number of periods it
    # never trimmed anything, and newer pandas releases no longer accept it.
    # 'ms' is the non-deprecated spelling of the old 'L' (millisecond) alias.
    return pd.date_range(timestamp, periods=periods, freq=freq)

## Load and Clean Signals

* Load physio data
* Cleaning:
    * Timestamps/timezones -- physio data timestamps are local Eastern time with no offset stored (treated as UTC-4 via 'Etc/GMT+4'), but alarms come with a UTC offset.
        * Bokeh visualizer and pandas treat times as UTC, so must explicitly declare TZ intent.
    * Merge duplicate timestamp entries into 1 row

In [None]:
#fname = 'data//Brown datathon 3.4-5.2017 files/x00-02.1982-06-25 (de-id)'
fname = '../../original_data/x00-03.2017-05-26'

In [None]:
# Load the raw listener file as JSON Lines (lines=True: one JSON record per line).
physio_df = pd.read_json(fname, lines=True)

In [None]:
# Use the packet timestamp as the row index (modifies physio_df in place).
physio_df.set_index("timestamp",inplace=True)

In [None]:
# tz_localize returns a new frame instead of modifying physio_df, so rebind the
# name; otherwise the index stays timezone-naive. 'Etc/GMT+4' is the fixed
# UTC-04:00 zone used throughout this notebook.
physio_df = physio_df.tz_localize('Etc/GMT+4')
physio_df

In [None]:
'''
Collapse multiple entries for a single timestamp to one row.

Example, this:

        value_1 value_2 value_3
time_1    1        1      NaN
time_1    NaN     NaN      1

Becomes:
        value_1 value_2 value_3
time_1     1       1       1

'''

# Group rows that share a timestamp, take the first and the last value of each
# column within a group, and let the "last" values fill whatever the "first"
# values leave empty.
# NOTE(review): GroupBy.first() may already skip nulls, which would make the
# combine_first step redundant -- confirm before simplifying.
rows_by_timestamp = physio_df.groupby("timestamp")
first_values = rows_by_timestamp.first()
last_values = rows_by_timestamp.last()
cleaned_physio_df = first_values.combine_first(last_values)

In [None]:
# Coerce the scalar vitals to numbers; anything unparseable becomes NaN
# (errors='coerce').
numeric_columns = ['Heart Rate','Respiration Rate','SpO2','qos']
coerced = cleaned_physio_df[numeric_columns].apply(pd.to_numeric,errors='coerce')
cleaned_physio_df[numeric_columns] = coerced

In [None]:
cleaned_physio_df

In [None]:
cleaned_physio_df.isnull().sum()

## Load and Clean Alarms

* Load alarm data
* Cleaning:
    * Timestamps/timezones

In [None]:
alarms_fname = '../../original_data/x00-03_5.26.2017_alarms.csv'

In [None]:
# Load the alarm CSV; parse_dates=[0] parses the first column as datetimes.
alarms_df = pd.read_csv(alarms_fname,parse_dates=[0])

In [None]:
# Use the alarm time column ("_time") as the row index (modifies alarms_df in place).
alarms_df.set_index("_time", inplace=True)

In [None]:
# tz_localize/tz_convert return new frames, so rebind alarms_df to keep the
# result. If the parser already produced a timezone-aware index (the alarm
# strings carry a UTC offset), tz_localize would raise, so only a naive index
# is localized to UTC before converting to the notebook's UTC-04:00 zone.
if alarms_df.index.tz is None:
    alarms_df = alarms_df.tz_localize("UTC")
alarms_df = alarms_df.tz_convert('Etc/GMT+4')
alarms_df

In [None]:
# Array of Python datetime objects, one per alarm row, taken from the index.
alarms = alarms_df.index.to_pydatetime()

## Putting it together

* Pipeline:
    1. Choose an alarm
    2. Slice dataframe based on window of time around alarm (isolated_physio_df)
    3. Unpack the dictionary containing non-invasive blood pressure values

In [None]:
alarm = alarms[0]

In [None]:
# isolated_physio_df = cleaned_physio_df[alarm-pd.Timedelta("500 seconds"):alarm+pd.Timedelta("500 seconds")]
# Keep the 500 seconds leading up to the alarm. .loc makes the label-based
# slice explicit, and .copy() yields an independent frame so that columns added
# to it afterwards are not written into a slice of cleaned_physio_df.
window = pd.Timedelta("500 seconds")
isolated_physio_df = cleaned_physio_df.loc[alarm - window:alarm].copy()

In [None]:
# Unpack the blood-pressure dicts into one numeric column per reading. Each
# column is looked up by its dict key, so the result does not depend on the
# order in which the keys appear after .apply(pd.Series) (the raw records list
# them as mean, systolic, diastolic). Unparseable values become NaN.
bp_readings = isolated_physio_df["Non-invasive Blood Pressure"].apply(pd.Series)
for reading in ("diastolic", "mean", "systolic"):
    isolated_physio_df[reading + "_bp"] = pd.to_numeric(bp_readings[reading], errors='coerce')

In [None]:
isolated_physio_df

In [None]:
# Flatten the pleth packets in the window: x holds one timestamp per sample
# (each packet timestamp expanded by expand_pleth_times), y the sample values.
pleth_packets = isolated_physio_df["Pleth"].dropna()
per_packet_times = pleth_packets.index.to_series().apply(expand_pleth_times)
x = np.hstack(per_packet_times.values)
y = np.hstack(pleth_packets.values)

In [None]:
plt.plot(x,y)

In [None]:
# x = np.hstack(isolated_physio_df["SpO2"].dropna().index.to_series())
# y = np.hstack(isolated_physio_df["SpO2"].dropna().values)
# plt.plot(x,y)

In [None]:
# start = isolated_physio_df.index[0].to_pydatetime()
# increment = 8*pd.Timedelta("10 seconds")
# window_length = pd.Timedelta("10 seconds")

In [None]:
# x = isolated_physio_df[start+increment:start+increment+window_length].mean_bp.dropna().index.to_series() # need to_series for tz-aware
# y = isolated_physio_df[start+increment:start+increment+window_length].mean_bp.dropna()

In [None]:
# plt.plot(x,y)

# -----------------------------------------------
# Information about dataset
# -----------------------------------------------

I use the space below to make edits, test functions, explore the dataset, etc.

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

# Read a file

In [2]:
fname = '../../original_data/x00-03.2017-05-25'

In [None]:
alarms_fname = '../../original_data/x00-03_5.25.2017_alarms.csv'

In [3]:
df = pd.read_json(fname, lines=True)

In [4]:
df

Unnamed: 0,Airway,ECG,Heart Rate,Non-invasive Blood Pressure,Pleth,Respiration Rate,SpO2,alarms,qos,timestamp
0,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2723, 2807, 2874, 2925, 2961, 2985, 2994, 299...",,,,1,2017-05-24 23:56:16.760
1,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1718, 1695, 1673, 1652, 1611, 1560, 1524, 149...",,,,1,2017-05-24 23:56:17.016
2,"{'Respiration Rate': None, 'etCO2': None}",,Not a number,"{'mean': 88, 'systolic': 139, 'diastolic': 73}",,Not a number,,,1,2017-05-24 23:56:18.680
3,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,96.7,,1,2017-05-24 23:56:18.680
4,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,,1,2017-05-24 23:56:18.680
5,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,"{'Alarm_T_0': {'source': 'NOM_RESP', 'state': ...",1,2017-05-24 23:56:18.712
6,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1208, 1320, 1460, 1621, 1771, 1914, 2071, 222...",,,,1,2017-05-24 23:56:17.272
7,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2191, 2102, 2015, 1933, 1859, 1794, 1737, 168...",,,,1,2017-05-24 23:56:17.528
8,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1173, 1147, 1119, 1097, 1079, 1062, 1053, 106...",,,,1,2017-05-24 23:56:17.784
9,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2849, 2809, 2762, 2707, 2643, 2570, 2490, 240...",,,,1,2017-05-24 23:56:18.040


**Notes:** 
* timestamps are out of order 
* lists need to be expanded 
* timestamps need to be generated for arrays of data (e.g. Pleth) 
* timestamps carry no UTC offset or timezone information

In [None]:
alarms = pd.read_csv(alarms_fname,parse_dates=[0])

In [None]:
alarms.head()

**Notes:** 
* Alarm timestamps include a UTC offset of -0400

# Clean Alarms dataset

These steps put the alarm timestamps in the same timezone as the signal data.

In [None]:
alarms.set_index("_time").head()

In [None]:
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').head()

In [None]:
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').head().iloc[0]

In [None]:
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').index.to_pydatetime()

In [None]:
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').index.to_pydatetime().size

In [None]:
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').iloc[0]

In [None]:
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').loc['2017-05-27T23:43:27.536-0400']

# Clean Signals dataset

## Timestamps UTC/TZ

In [None]:
df.set_index("timestamp").tz_localize('Etc/GMT+4').head()

In [None]:
df.set_index("timestamp").tz_localize('Etc/GMT+4').groupby("timestamp").first().head(10)

In [None]:
df.set_index("timestamp").tz_localize('Etc/GMT+4').groupby("timestamp").last().head(10)

In [None]:
df.set_index("timestamp").tz_localize('Etc/GMT+4').groupby("timestamp").first().head(10).combine_first(df.set_index("timestamp").tz_localize('Etc/GMT+4').groupby("timestamp").last().head(10))

In [5]:
# Index by timestamp, localize to the fixed UTC-04:00 zone, then collapse rows
# sharing a timestamp: first value per column, with gaps filled from the last.
localized = df.set_index("timestamp").tz_localize('Etc/GMT+4')
rows_by_timestamp = localized.groupby("timestamp")
example = rows_by_timestamp.first().combine_first(rows_by_timestamp.last())

In [None]:
# Drop three columns by position, then show the non-null respiration rates.
cols = [0,7,8]
trimmed = example.drop(example.columns[cols],axis=1)
trimmed['Respiration Rate'].dropna()

In [None]:
# Keep only the qos column; the double brackets keep it a one-column DataFrame.
df3 = example[['qos']]

In [None]:
df3

In [None]:
# Move every qos timestamp 5 seconds earlier, then display the result.
shifted_index = df3.index - pd.Timedelta('5 seconds')
df3 = df3.set_index(shifted_index)
df3

In [None]:
# Everything except qos, to be re-joined with the time-shifted qos column.
df2 = example.drop(columns=['qos'])
df2

In [None]:
# Inner join on the timestamp index: only timestamps present in both the
# signal columns (df2) and the shifted qos column (df3) survive.
merged = df2.merge(df3, how='inner', left_index=True, right_index=True)
merged

In [6]:
example

Unnamed: 0_level_0,Airway,ECG,Heart Rate,Non-invasive Blood Pressure,Pleth,Respiration Rate,SpO2,alarms,qos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-05-24 23:56:16.760000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2723, 2807, 2874, 2925, 2961, 2985, 2994, 299...",,,,1
2017-05-24 23:56:17.016000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1718, 1695, 1673, 1652, 1611, 1560, 1524, 149...",,,,1
2017-05-24 23:56:17.272000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1208, 1320, 1460, 1621, 1771, 1914, 2071, 222...",,,,1
2017-05-24 23:56:17.528000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2191, 2102, 2015, 1933, 1859, 1794, 1737, 168...",,,,1
2017-05-24 23:56:17.784000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1173, 1147, 1119, 1097, 1079, 1062, 1053, 106...",,,,1
2017-05-24 23:56:18.040000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2849, 2809, 2762, 2707, 2643, 2570, 2490, 240...",,,,1
2017-05-24 23:56:18.296000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[1397, 1382, 1368, 1350, 1330, 1309, 1287, 126...",,,,1
2017-05-24 23:56:18.552000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2914, 2934, 2943, 2942, 2933, 2917, 2894, 286...",,,,1
2017-05-24 23:56:18.680000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,Not a number,"{'mean': 88, 'systolic': 139, 'diastolic': 73}",,Not a number,,,1
2017-05-24 23:56:18.712000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,"{'Alarm_T_0': {'source': 'NOM_RESP', 'state': ...",1


In [7]:
example["Pleth"].dropna()

timestamp
2017-05-24 23:56:16.760000-04:00    [2723, 2807, 2874, 2925, 2961, 2985, 2994, 299...
2017-05-24 23:56:17.016000-04:00    [1718, 1695, 1673, 1652, 1611, 1560, 1524, 149...
2017-05-24 23:56:17.272000-04:00    [1208, 1320, 1460, 1621, 1771, 1914, 2071, 222...
2017-05-24 23:56:17.528000-04:00    [2191, 2102, 2015, 1933, 1859, 1794, 1737, 168...
2017-05-24 23:56:17.784000-04:00    [1173, 1147, 1119, 1097, 1079, 1062, 1053, 106...
2017-05-24 23:56:18.040000-04:00    [2849, 2809, 2762, 2707, 2643, 2570, 2490, 240...
2017-05-24 23:56:18.296000-04:00    [1397, 1382, 1368, 1350, 1330, 1309, 1287, 126...
2017-05-24 23:56:18.552000-04:00    [2914, 2934, 2943, 2942, 2933, 2917, 2894, 286...
2017-05-24 23:56:18.808000-04:00    [1521, 1508, 1493, 1476, 1457, 1437, 1416, 139...
2017-05-24 23:56:19.064000-04:00    [2387, 2497, 2589, 2664, 2746, 2819, 2861, 288...
2017-05-24 23:56:19.320000-04:00    [1600, 1568, 1540, 1515, 1493, 1472, 1452, 143...
2017-05-24 23:56:19.576000-04:00    [1310, 1

In [13]:
example["Pleth"].dropna().values

array([ list([2723, 2807, 2874, 2925, 2961, 2985, 2994, 2992, 2986, 2973, 2954, 2929, 2898, 2860, 2816, 2765, 2707, 2641, 2568, 2499, 2430, 2352, 2270, 2187, 2107, 2032, 1964, 1904, 1852, 1807, 1769, 1740]),
       list([1718, 1695, 1673, 1652, 1611, 1560, 1524, 1495, 1470, 1448, 1427, 1407, 1392, 1378, 1361, 1342, 1321, 1300, 1279, 1258, 1236, 1214, 1191, 1166, 1146, 1127, 1103, 1080, 1064, 1062, 1081, 1129]),
       list([1208, 1320, 1460, 1621, 1771, 1914, 2071, 2227, 2373, 2503, 2616, 2710, 2787, 2847, 2891, 2920, 2936, 2938, 2930, 2917, 2899, 2874, 2843, 2806, 2762, 2711, 2653, 2586, 2511, 2438, 2364, 2280]),
       ...,
       list([2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047]),
       list([2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 20

In [8]:
def expand_pleth_times(timestamp, periods=32, freq='8ms'):
    """Return one timestamp per sample of a pleth packet.

    A pleth packet carries 32 samples but only a single timestamp; this
    builds evenly spaced timestamps, 8 ms apart by default, starting at
    (and including) ``timestamp``.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; becomes the first element of the result.
    periods : int, default 32
        Number of samples in the packet.
    freq : str or offset, default '8ms'
        Spacing between consecutive samples.

    Returns
    -------
    pandas.DatetimeIndex
        ``periods`` timestamps starting at ``timestamp``.
    """
    # No ``closed="left"`` here: with only a start and a number of periods it
    # never trimmed anything, and newer pandas releases no longer accept it.
    # 'ms' is the non-deprecated spelling of the old 'L' (millisecond) alias.
    return pd.date_range(timestamp, periods=periods, freq=freq)

In [9]:
def expand_ecg_times(timestamp, periods=64, freq='4ms'):
    """Return one timestamp per sample of an ECG packet.

    An ECG packet carries 64 samples but only a single timestamp; this
    builds evenly spaced timestamps, 4 ms apart by default, starting at
    (and including) ``timestamp``.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; becomes the first element of the result.
    periods : int, default 64
        Number of samples in the packet.
    freq : str or offset, default '4ms'
        Spacing between consecutive samples.

    Returns
    -------
    pandas.DatetimeIndex
        ``periods`` timestamps starting at ``timestamp``.
    """
    # No ``closed="left"`` here: with only a start and a number of periods it
    # never trimmed anything, and newer pandas releases no longer accept it.
    # 'ms' is the non-deprecated spelling of the old 'L' (millisecond) alias.
    return pd.date_range(timestamp, periods=periods, freq=freq)

In [10]:
# Flatten every pleth packet: x holds one timestamp per sample (each packet
# timestamp expanded by expand_pleth_times), y the sample values.
pleth_packets = example["Pleth"].dropna()
per_packet_times = pleth_packets.index.to_series().apply(expand_pleth_times)
x = np.hstack(per_packet_times.values)
y = np.hstack(pleth_packets.values)

In [12]:
y

array([2723, 2807, 2874, ..., 2047, 2047, 2047])

In [11]:
print(len(x))
print(len(y))

8371840
8371840


In [None]:
x

In [None]:
y

In [None]:
# Per-sample timestamps for the ECG packets.
# NOTE(review): this expands with expand_pleth_times (32 stamps, 8 ms apart)
# rather than expand_ecg_times -- presumably so the ECG can be thinned to the
# pleth spacing and merged with it; confirm this is intended.
ecg_packet_times = example["ECG"].dropna().index.to_series()
X = np.hstack(ecg_packet_times.apply(expand_pleth_times).values)
X

In [None]:
Y = np.hstack(example["ECG"].dropna().values)
# Keep every second ECG sample so the values line up with timestamps spaced
# 8 ms apart. ECG samples are 4 ms apart starting at the packet timestamp, so
# the even positions (0, 2, 4, ...) are the ones that fall on t0, t0 + 8 ms, ...;
# taking the odd positions would pair each value with a timestamp 4 ms too early.
Y = Y[::2]
print(len(Y))

In [None]:
ecg_df = pd.DataFrame(y, index=x, columns = ['ECG'])
ecg_df.index.name = 'timestamp'
ecg_df

In [None]:
ppg_df = pd.DataFrame(Y, index = X, columns = ['PPG'])
ppg_df.index.name = 'timestamp'
ppg_df

In [None]:
# Inner join on the timestamp index: keep only instants that have both an ECG
# and a PPG sample.
signals_df = ecg_df.merge(ppg_df, how='inner', left_index=True, right_index=True)
signals_df

In [None]:
plt.plot(X,Y)

In [None]:
plt.plot(x,y)

In [None]:
plt.plot(y)

## Expand BP

In [None]:
df["Non-invasive Blood Pressure"].head(10).apply(pd.Series).apply(pd.to_numeric, errors="coerce")

In [None]:
# takes a long time to run on whole df
%time df["Non-invasive Blood Pressure"].apply(pd.Series).apply(pd.to_numeric, errors="coerce")