# Convert RAW JSON listener files to annotator-friendly file

* Project: AHRQ/MeTeOR/PERSEUS

## Imports

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

## Create functions for assigning timestamps to signals

Data arrives in packets of 64 (ECG) or 32 (pleth) samples, with a single timestamp per packet. Therefore, each sample in a packet must be assigned its own timestamp.

In [7]:
def expand_pleth_times(timestamp):
    """Return per-sample timestamps for one Pleth packet.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; it becomes the time of the first sample.

    Returns
    -------
    pandas.DatetimeIndex
        32 timestamps spaced 8 ms apart, starting at ``timestamp``.
    """
    # 'ms' replaces the deprecated 'L' alias. The old closed="left" argument
    # is dropped: it had no effect when only start and periods are given, and
    # it was removed from pandas 2.0.
    return pd.date_range(timestamp, periods=32, freq='8ms')

def expand_ecg_times(timestamp):
    """Return per-sample timestamps for one ECG packet.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; it becomes the time of the first sample.

    Returns
    -------
    pandas.DatetimeIndex
        64 timestamps spaced 4 ms apart, starting at ``timestamp``.
    """
    # 'ms' replaces the deprecated 'L' alias. The old closed="left" argument
    # is dropped: it had no effect when only start and periods are given, and
    # it was removed from pandas 2.0.
    return pd.date_range(timestamp, periods=64, freq='4ms')

## Load and Clean Signals

* Load physio data
* Cleaning:
    * Timestamps/timezones -- physio data timestamps are in local time (EDT, UTC-4) with no offset recorded, but alarms come with a UTC offset.
        * Bokeh visualizer and pandas treat times as UTC, so must explicitly declare TZ intent.
    * Merge duplicate timestamp entries into 1 row

In [17]:
# Path of the raw listener file to convert; it is read as line-delimited JSON
# by pd.read_json(fname, lines=True).
#fname = 'data//Brown datathon 3.4-5.2017 files/x00-02.1982-06-25 (de-id)'
fname = 'data/x00-03.2017-05-27'

In [18]:
# Load the listener file: one JSON record per line becomes one row.
physio_df = pd.read_json(fname, lines=True)

In [19]:
# Index the rows by their packet timestamp (the "timestamp" column leaves the
# regular columns and becomes the index).
physio_df = physio_df.set_index("timestamp")

In [20]:
# Declare the naive timestamps as UTC-4 local time. 'Etc/GMT+4' follows the
# POSIX sign convention, so it means four hours *behind* UTC (shown as -04:00).
# tz_localize returns a new frame, so the result is assigned back instead of
# relying on copy=False to mutate physio_df as a side effect.
physio_df = physio_df.tz_localize('Etc/GMT+4')
physio_df

Unnamed: 0_level_0,Airway,ECG,Heart Rate,Non-invasive Blood Pressure,Pleth,Respiration Rate,SpO2,alarms,qos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-05-26 23:56:11.024000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,Not a number,"{'mean': 'Not a number', 'systolic': 'Not a nu...",,Not a number,,,0
2017-05-26 23:56:11.024000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,Not a number,,0
2017-05-26 23:56:11.024000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,,0
2017-05-26 23:56:11.024000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,{'Alarm_T_2': {'source': 'NOM_PULS_OXIM_SAT_O2...,0
2017-05-26 23:56:09.680000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:09.936000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:10.192000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:10.448000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:12.048000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,Not a number,"{'mean': 'Not a number', 'systolic': 'Not a nu...",,Not a number,,,0
2017-05-26 23:56:12.048000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,Not a number,,0


In [21]:
# Collapse multiple entries for a single timestamp into one row.
#
# Example, this:
#
#         value_1 value_2 value_3
# time_1    1        1      NaN
# time_1    NaN     NaN      1
#
# becomes:
#
#         value_1 value_2 value_3
# time_1     1       1       1
#
# Group once, take the first and the last entry of every column within each
# timestamp, and fill anything still missing in the former from the latter.
rows_by_timestamp = physio_df.groupby("timestamp")
first_per_timestamp = rows_by_timestamp.first()
last_per_timestamp = rows_by_timestamp.last()
cleaned_physio_df = first_per_timestamp.combine_first(last_per_timestamp)

In [22]:
# Convert the scalar vitals to numbers; entries that cannot be parsed
# (e.g. the 'Not a number' strings) become NaN because of errors='coerce'.
numeric_columns = ['Heart Rate', 'Respiration Rate', 'SpO2', 'qos']
numeric_values = cleaned_physio_df[numeric_columns].apply(pd.to_numeric, errors='coerce')
cleaned_physio_df[numeric_columns] = numeric_values

In [39]:
# Display the cleaned frame (one row per timestamp).
cleaned_physio_df

Unnamed: 0_level_0,Airway,ECG,Heart Rate,Non-invasive Blood Pressure,Pleth,Respiration Rate,SpO2,alarms,qos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-05-26 23:56:09.680000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:09.936000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:10.192000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:10.448000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:10.704000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:10.960000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:11.024000-04:00,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': 'Not a number', 'systolic': 'Not a nu...",,,,{'Alarm_T_2': {'source': 'NOM_PULS_OXIM_SAT_O2...,0
2017-05-26 23:56:11.216000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:11.472000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0
2017-05-26 23:56:11.728000-04:00,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0


In [38]:
# Count the missing values in each column.
cleaned_physio_df.isnull().sum()

Airway                              0
ECG                             63614
Heart Rate                     224743
Non-invasive Blood Pressure         0
Pleth                           63616
Respiration Rate               223005
SpO2                           244713
alarms                         208714
qos                                 0
dtype: int64

## Load and Clean Alarms

* Load alarm data
* Cleaning:
    * Timestamps/timezones

In [None]:
# Path of the alarms CSV; its first column is parsed as dates when loaded.
# NOTE(review): this names an x00-05 / 1982-06-11 file, while the physio file
# is 'data/x00-03.2017-05-27' -- confirm the alarms belong to that recording.
alarms_fname = 'alarms/x00-05.1982-06-11_alarms.csv'

In [None]:
# Load the alarms, parsing the first column as datetimes.
alarms_df = pd.read_csv(alarms_fname,parse_dates=[0])

In [None]:
# Index the alarms by their "_time" column.
alarms_df = alarms_df.set_index("_time")

In [None]:
# Mark the alarm times as UTC, then express them in the UTC-4 zone used for
# the physio data. Both calls return new frames, so the result is assigned
# back instead of relying on copy=False to mutate alarms_df as a side effect.
alarms_df = alarms_df.tz_localize("UTC").tz_convert('Etc/GMT+4')
alarms_df

In [None]:
# Alarm times as an array of Python datetime objects, taken from the index.
alarms = alarms_df.index.to_pydatetime()

## Putting it together

* Pipeline:
    1. Choose an alarm
    2. Slice dataframe based on window of time around alarm (isolated_physio_df)
    3. Unpack the dictionary containing non-invasive blood pressure values

In [None]:
# Pick the alarm to inspect (here: the first one).
alarm = alarms[0]

In [None]:
# Keep the physio rows within 500 seconds either side of the alarm.
# .copy() makes the window an independent frame, so columns added to it later
# do not trigger SettingWithCopyWarning or write into a view of
# cleaned_physio_df.
half_window = pd.Timedelta("500 seconds")
isolated_physio_df = cleaned_physio_df.loc[alarm - half_window:alarm + half_window].copy()

In [None]:
# Unpack the blood-pressure dicts into numeric diastolic/mean/systolic columns.
# Components are selected by key rather than by position: apply(pd.Series)
# orders its columns by the dict keys (the records list 'mean', 'systolic',
# 'diastolic'), so a positional assignment could put values under the wrong
# name. Non-numeric entries such as 'Not a number' become NaN.
bp_keys = ["diastolic", "mean", "systolic"]
bp_values = (
    isolated_physio_df["Non-invasive Blood Pressure"]
    .apply(pd.Series)
    .reindex(columns=bp_keys)
    .apply(pd.to_numeric, errors='coerce')
)
for bp_key in bp_keys:
    isolated_physio_df[bp_key + "_bp"] = bp_values[bp_key]

In [None]:
# x = np.hstack(isolated_physio_df["Pleth"].dropna().index.to_series().apply(expand_pleth_times).values)
# y = np.hstack(isolated_physio_df["Pleth"].dropna().values)

In [None]:
# plt.plot(x,y)

In [None]:
# x = np.hstack(isolated_physio_df["SpO2"].dropna().index.to_series())
# y = np.hstack(isolated_physio_df["SpO2"].dropna().values)
# plt.plot(x,y)

In [None]:
# start = isolated_physio_df.index[0].to_pydatetime()
# increment = 8*pd.Timedelta("10 seconds")
# window_length = pd.Timedelta("10 seconds")

In [None]:
# x = isolated_physio_df[start+increment:start+increment+window_length].mean_bp.dropna().index.to_series() # need to_series for tz-aware
# y = isolated_physio_df[start+increment:start+increment+window_length].mean_bp.dropna()

In [None]:
# plt.plot(x,y)

# -----------------------------------------------
# Information about dataset
# -----------------------------------------------

I use space below to make edits, test functions, explore dataset, etc.

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

# Read a file

In [2]:
# Raw listener file (line-delimited JSON) used for the exploration below.
fname = 'data/x00-03.2017-05-27'

In [3]:
# Alarms CSV used for the exploration below.
alarms_fname = 'data/x00-03_5.27.2017_alarms.csv'

In [4]:
# Load the listener file: one JSON record per line becomes one row.
df = pd.read_json(fname, lines=True)

In [5]:
# Display the raw frame as loaded.
df

Unnamed: 0,Airway,ECG,Heart Rate,Non-invasive Blood Pressure,Pleth,Respiration Rate,SpO2,alarms,qos,timestamp
0,"{'Respiration Rate': None, 'etCO2': None}",,Not a number,"{'mean': 'Not a number', 'systolic': 'Not a nu...",,Not a number,,,0,2017-05-26 23:56:11.024
1,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,Not a number,,0,2017-05-26 23:56:11.024
2,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,,0,2017-05-26 23:56:11.024
3,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,,{'Alarm_T_2': {'source': 'NOM_PULS_OXIM_SAT_O2...,0,2017-05-26 23:56:11.024
4,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0,2017-05-26 23:56:09.680
5,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0,2017-05-26 23:56:09.936
6,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0,2017-05-26 23:56:10.192
7,"{'Respiration Rate': None, 'etCO2': None}","[-40.96, -40.96, -40.96, -40.96, -40.96, -40.9...",,"{'mean': None, 'systolic': None, 'diastolic': ...","[2047, 2047, 2047, 2047, 2047, 2047, 2047, 204...",,,,0,2017-05-26 23:56:10.448
8,"{'Respiration Rate': None, 'etCO2': None}",,Not a number,"{'mean': 'Not a number', 'systolic': 'Not a nu...",,Not a number,,,0,2017-05-26 23:56:12.048
9,"{'Respiration Rate': None, 'etCO2': None}",,,"{'mean': None, 'systolic': None, 'diastolic': ...",,,Not a number,,0,2017-05-26 23:56:12.048


**Notes:** 
* timestamps are out of order 
* lists need to be expanded 
* timestamps need to be generated for arrays of data (e.g. Pleth) 
* no UTC tz code present

In [None]:
# Load the alarms, parsing the first column as datetimes.
alarms = pd.read_csv(alarms_fname,parse_dates=[0])

In [None]:
# Preview the first alarm rows.
alarms.head()

**Notes:** 
* Alarm timestamps carry a -0400 UTC offset

# Clean Alarms dataset

This set of actions ensures all input data are expressed in the same time zone.

In [None]:
# Step 1: index the alarms by "_time" (returns a new frame; alarms is unchanged).
alarms.set_index("_time").head()

In [None]:
# Step 2: mark the index as UTC, then convert it to the 'Etc/GMT+4' zone.
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').head()

In [None]:
# First row of the converted preview.
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').head().iloc[0]

In [None]:
# Converted alarm times as an array of Python datetime objects.
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').index.to_pydatetime()

In [None]:
# Number of alarm timestamps.
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').index.to_pydatetime().size

In [None]:
# First row of the converted alarms frame.
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').iloc[0]

In [None]:
# Look up alarms by a timestamp label that includes its UTC offset.
alarms.set_index("_time").tz_localize("UTC").tz_convert('Etc/GMT+4').loc['2017-05-27T23:43:27.536-0400']

# Clean Signals dataset

## Timestamps UTC/TZ

In [None]:
# Index the signals by timestamp and declare the times as 'Etc/GMT+4'.
df.set_index("timestamp").tz_localize('Etc/GMT+4').head()

In [None]:
# First entry of each column per timestamp (first 10 timestamps shown).
df.set_index("timestamp").tz_localize('Etc/GMT+4').groupby("timestamp").first().head(10)

In [None]:
# Last entry of each column per timestamp (first 10 timestamps shown).
df.set_index("timestamp").tz_localize('Etc/GMT+4').groupby("timestamp").last().head(10)

In [None]:
# Merge the per-timestamp "first" and "last" views for the first 10
# timestamps: gaps in the former are filled from the latter.
localized_signals = df.set_index("timestamp").tz_localize('Etc/GMT+4')
signals_by_timestamp = localized_signals.groupby("timestamp")
first_ten = signals_by_timestamp.first().head(10)
last_ten = signals_by_timestamp.last().head(10)
first_ten.combine_first(last_ten)

In [None]:
# Keep the merged 10-timestamp sample as `example` for the cells that follow.
localized_signals = df.set_index("timestamp").tz_localize('Etc/GMT+4')
signals_by_timestamp = localized_signals.groupby("timestamp")
first_ten = signals_by_timestamp.first().head(10)
last_ten = signals_by_timestamp.last().head(10)
example = first_ten.combine_first(last_ten)

In [None]:
# Drop columns by position, then list the non-missing respiration rates.
cols = [0, 7, 8]
columns_to_drop = example.columns[cols]
trimmed_example = example.drop(columns_to_drop, axis=1)
trimmed_example['Respiration Rate'].dropna()

In [None]:
# One-column frame holding only 'qos' (double brackets keep it a DataFrame).
df3 = example[['qos']]

In [None]:
# Display the 'qos' frame.
df3

In [None]:
# Shift every index timestamp 5 seconds earlier; the result is only displayed,
# it is not assigned to df3.
df3.set_index(df3.index - pd.Timedelta('5 seconds'))

In [None]:
# Pleth packets only (rows without a Pleth value are dropped).
example["Pleth"].dropna()

In [None]:
def expand_pleth_times(timestamp):
    """Return per-sample timestamps for one Pleth packet.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; it becomes the time of the first sample.

    Returns
    -------
    pandas.DatetimeIndex
        32 timestamps spaced 8 ms apart, starting at ``timestamp``.
    """
    # 'ms' replaces the deprecated 'L' alias. The old closed="left" argument
    # is dropped: it had no effect when only start and periods are given, and
    # it was removed from pandas 2.0.
    return pd.date_range(timestamp, periods=32, freq='8ms')

In [None]:
def expand_ecg_times(timestamp):
    """Return per-sample timestamps for one ECG packet.

    Parameters
    ----------
    timestamp : datetime-like
        Timestamp of the packet; it becomes the time of the first sample.

    Returns
    -------
    pandas.DatetimeIndex
        64 timestamps spaced 4 ms apart, starting at ``timestamp``.
    """
    # 'ms' replaces the deprecated 'L' alias. The old closed="left" argument
    # is dropped: it had no effect when only start and periods are given, and
    # it was removed from pandas 2.0.
    return pd.date_range(timestamp, periods=64, freq='4ms')

In [None]:
# Flatten the Pleth packets into two parallel arrays:
#   x -- one timestamp per sample, expanded from each packet's timestamp
#   y -- the sample values of all packets, concatenated in the same order
pleth_packets = example["Pleth"].dropna()
packet_times = pleth_packets.index.to_series()
x = np.hstack(packet_times.apply(expand_pleth_times).values)
y = np.hstack(pleth_packets.values)

In [None]:
# Plot the Pleth samples against their expanded timestamps.
plt.plot(x,y)

In [None]:
# Plot the Pleth samples against their position in the array.
plt.plot(y)

## Expand BP

In [None]:
# Expand the blood-pressure dicts of the first 10 rows into one column per
# key; values that are not numeric become NaN (errors="coerce").
df["Non-invasive Blood Pressure"].head(10).apply(pd.Series).apply(pd.to_numeric, errors="coerce")

In [None]:
# takes a long time to run on whole df
# (pd.Series is applied to every row; %time reports how long the statement took)
%time df["Non-invasive Blood Pressure"].apply(pd.Series).apply(pd.to_numeric, errors="coerce")