In [2]:
# Imports
import pandas as pd
import numpy as np

In [3]:
# Read the raw long-format time-series export into a single DataFrame.
# The path is relative to the notebook's working directory.
time_series_csv = 'data/time_series_data.csv'
all_data = pd.read_csv(time_series_csv)

In [4]:
# Keep only the rows whose 'origin' column is 'aperiodic'.
# reset_index() (without drop=True) stores the original row labels in a new
# 'index' column; the following cell removes that column by name.
is_aperiodic = all_data['origin'] == 'aperiodic'
aperiodic_data = all_data[is_aperiodic].reset_index()

# Rescale the offsets by 1/60 (presumably minutes -> hours, consistent with
# the hourly binning done further down -- TODO confirm the source unit).
aperiodic_data['observationoffset'] = aperiodic_data['observationoffset'] / 60.0

aperiodic_data.head(10)

Unnamed: 0,index,patientunitstayid,observationoffset,Key,Value,origin
0,3,143103,0.15,noninvasivesystolic,103.0,aperiodic
1,4,143103,0.15,noninvasivediastolic,63.0,aperiodic
2,5,143103,0.15,noninvasivemean,76.0,aperiodic
3,61,143103,0.65,noninvasivesystolic,115.0,aperiodic
4,62,143103,0.65,noninvasivediastolic,70.0,aperiodic
5,63,143103,0.65,noninvasivemean,85.0,aperiodic
6,141,143103,1.15,noninvasivesystolic,125.0,aperiodic
7,142,143103,1.15,noninvasivediastolic,81.0,aperiodic
8,143,143103,1.15,noninvasivemean,98.0,aperiodic
9,399,143103,4.216667,noninvasivesystolic,111.0,aperiodic


In [23]:
# Build one wide row per (patient, observation offset) holding the three
# non-invasive blood-pressure readings side by side.
def _extract_measurement(long_frame, key):
    """Return the rows of ``long_frame`` whose 'Key' equals ``key``.

    The bookkeeping columns ('index', 'Key', 'origin') are dropped and the
    'Value' column is renamed to ``key`` so the result can be merged with the
    other measurements on patient id and observation offset.

    Parameters
    ----------
    long_frame : pandas.DataFrame
        Long-format frame with at least the columns 'index', 'Key', 'Value'
        and 'origin'.
    key : str
        Measurement name to select; also used as the new name of 'Value'.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``long_frame`` is not modified.
    """
    selected = long_frame.loc[long_frame['Key'] == key]
    return (
        selected
        .drop(columns=['index', 'Key', 'origin'])
        .rename(columns={'Value': key})
    )


systolic = _extract_measurement(aperiodic_data, 'noninvasivesystolic')
diastolic = _extract_measurement(aperiodic_data, 'noninvasivediastolic')
mean = _extract_measurement(aperiodic_data, 'noninvasivemean')

# Left joins anchored on the systolic rows: a (patient, offset) pair that has
# no systolic reading does not appear in the result.
# NOTE(review): if such pairs should be kept, an outer join would be needed.
join_keys = ['patientunitstayid', 'observationoffset']
tmp_joined = pd.merge(systolic, diastolic, how='left', on=join_keys)
patient_joined = pd.merge(tmp_joined, mean, how='left', on=join_keys)

by_patient = patient_joined.groupby('patientunitstayid')

# Stack the per-patient blocks into one 2-D array. Iterating a groupby yields
# the groups in sorted key order by default, so rows end up ordered by patient
# id. to_numpy() on a frame with mixed numeric columns produces one common
# dtype, which is why the ids show up as floats in the frame below.
patient_time_series = np.vstack([group.to_numpy() for _, group in by_patient])
patient_time_series_df = pd.DataFrame(patient_time_series, columns=patient_joined.columns)

# Keep observations with a non-negative offset only. The explicit .copy()
# makes the result an independent frame, so later cells can assign to its
# columns without triggering pandas' SettingWithCopyWarning.
# Note that `patient_time_series` (the array) is left unfiltered.
patient_time_series_df = patient_time_series_df.loc[
    patient_time_series_df['observationoffset'] >= 0
].copy()

patient_time_series_df.head(10)

Unnamed: 0,patientunitstayid,observationoffset,noninvasivesystolic,noninvasivediastolic,noninvasivemean
0,143103.0,0.15,103.0,63.0,76.0
1,143103.0,0.65,115.0,70.0,85.0
2,143103.0,1.15,125.0,81.0,98.0
3,143103.0,4.216667,111.0,72.0,87.0
4,143103.0,15.966667,164.0,88.0,123.0
5,143103.0,16.966667,157.0,90.0,119.0
6,143103.0,17.966667,144.0,84.0,107.0
7,143103.0,18.966667,134.0,77.0,105.0
8,143103.0,20.966667,138.0,73.0,96.0
9,143103.0,21.283333,136.0,78.0,101.0


In [28]:
# Report the (rows, columns) dimensions of the stacked array built above.
print(np.shape(patient_time_series))
# Disabled: binary dump of the raw array.
#np.save('aperiodic_time_series_cleaned.npy', patient_time_series)
# Write the filtered per-observation table as a tab-separated file
# (the file keeps a .csv extension even though the separator is a tab).
patient_time_series_df.to_csv(path_or_buf='aperiodic_time_series_granular.csv', sep='\t')

(492547, 5)


In [29]:
# Bin the observations by whole hour and average the readings in each bin.
#
# Truncating the offset to an integer gives the hour bucket. A vectorised
# astype() replaces the per-element Python int() loop, and .assign() builds a
# new frame instead of writing into a filtered slice (which can raise
# SettingWithCopyWarning). The name is rebound so that, as before, the
# frame holds integer offsets after this cell has run.
patient_time_series_df = patient_time_series_df.assign(
    observationoffset=patient_time_series_df['observationoffset'].astype('int64')
)

# One row per (patient, hour bucket); each reading is the mean of all
# observations that fell into that bucket.
binned_time_series = (
    patient_time_series_df
    .groupby(['patientunitstayid', 'observationoffset'])
    .mean()
    .reset_index()
)
binned_time_series.head(100)

Unnamed: 0,patientunitstayid,observationoffset,noninvasivesystolic,noninvasivediastolic,noninvasivemean
0,143103.0,0,109.0,66.5,80.5
1,143103.0,1,125.0,81.0,98.0
2,143103.0,4,111.0,72.0,87.0
3,143103.0,15,164.0,88.0,123.0
4,143103.0,16,157.0,90.0,119.0
...,...,...,...,...,...
95,143103.0,117,131.2,74.6,97.8
96,143103.0,118,136.5,75.5,104.0
97,143103.0,119,122.0,73.0,92.0
98,143103.0,120,120.0,77.5,96.0


In [30]:
# Persist the binned table as a tab-separated file
# (the file keeps a .csv extension even though the separator is a tab).
binned_time_series.to_csv(path_or_buf='binned_aperiodic_data.csv', sep='\t')