In [None]:
import os
import os.path as osp
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import collections

In [None]:
# Base directory of the SHHS data on this machine; `work_dir` is its `datasets` subdirectory.
data_dir = '/home/akara/Workspace/sleep_data/shhs'
work_dir = osp.join(data_dir, 'datasets')

In [None]:
# List the CSV files found directly inside `work_dir`.
glob.glob(osp.join(work_dir, '*.csv'))

In [None]:
# Paths of the two CSV files read in the next cell: the SHHS1 dataset table and
# the data-dictionary variables table (both version 0.19.0 according to the file names).
shhs1_csv = osp.join(work_dir, 'shhs1-dataset-0.19.0.csv')
shhs_var_csv = osp.join(work_dir, 'shhs-data-dictionary-0.19.0-variables.csv')

In [None]:
# ann_df: the SHHS1 dataset table; var_df: the data-dictionary variables table.
ann_df = pd.read_csv(shhs1_csv)
var_df = pd.read_csv(shhs_var_csv)

In [None]:
# Display the table loaded from `shhs1_csv`.
ann_df

In [None]:
# Display the table loaded from `shhs_var_csv`.
var_df

# Randomly pick subjects

In [None]:
def AHI_class(v):
    '''
    Assign each subject's AHI to one of obstructive sleep apnea (OSA) severity categories.
    Ref: https://jcsm.aasm.org/doi/pdf/10.5664/jcsm.7916

    Parameters
    ----------
    v : number
        AHI value of one subject.

    Returns
    -------
    int or float
        0 (normal, v < 5), 1 (mild, 5 <= v < 15), 2 (moderate, 15 <= v < 30)
        or 3 (severe, v >= 30). A missing value (NaN / None) yields NaN
        instead of a category.
    '''
    # NaN compares False against every threshold below, so without this guard a
    # missing AHI would fall through to the final branch and be labelled severe.
    if pd.isna(v):
        return np.nan
    # Normal
    if v < 5:
        return 0
    # Mild
    elif v < 15:
        return 1
    # Moderate
    elif v < 30:
        return 2
    # Severe
    else:
        return 3
    
# Apnea-Hypopnea Index (AHI), the AASM recommended definition (3%A)
target_c = 'ahi_o0h3a'
# Store the category returned by AHI_class for every row in a new `osa_cat` column of ann_df.
ann_df['osa_cat'] = ann_df[target_c].apply(AHI_class)

In [None]:
# Number of subjects drawn for each of the two samples below (normal and OSA).
n_samples = 100

In [None]:
# Draw `n_samples` subjects whose category is 0 (normal).
# A fixed random_state makes the draw repeatable after a kernel restart.
nor_df = ann_df[ann_df['osa_cat'] == 0].sample(n_samples, random_state=42)
nor_df

In [None]:
# Draw `n_samples` subjects whose category is above 0 (any OSA severity).
# A fixed random_state makes the draw repeatable after a kernel restart.
osa_df = ann_df[ann_df['osa_cat'] > 0].sample(n_samples, random_state=42)
osa_df

In [None]:
# Sanity check: the normal sample was filtered on osa_cat == 0, so only that value should appear.
nor_df['osa_cat'].value_counts()

In [None]:
# Number of sampled OSA subjects per severity category.
osa_df['osa_cat'].value_counts()

In [None]:
# Disabled on purpose: uncomment to write the two samples to the CSV files that
# the "Create Symbolic Link" section reads back (this overwrites existing files).
# nor_df.to_csv('shhs1_normal.csv', index=False)
# osa_df.to_csv('shhs1_osa.csv', index=False)

# Create Symbolic Link

In [None]:
# Reload the subject samples from CSV files in the current working directory,
# replacing the in-memory nor_df / osa_df. The file names match the commented-out
# to_csv calls above, so the files must already exist from an earlier run.
nor_df = pd.read_csv('shhs1_normal.csv')
osa_df = pd.read_csv('shhs1_osa.csv')

In [None]:
# Show every entry directly under `data_dir` (glob.glob already returns a list).
glob.glob(osp.join(data_dir, '*'))

In [None]:
# List the entries of the U-Time `processed` directory.
glob.glob('/home/akara/Workspace/U-Time/processed/*')

In [None]:
# NOTE: from here on `data_dir` points at the processed SHHS1 recordings, not at
# the raw SHHS directory it was bound to at the top of the notebook.
data_dir = '/home/akara/Workspace/U-Time/processed/shhs1'


def _link_subjects(subject_ids, src_root, dst_root):
    """Create one symlink per subject in `dst_root`, pointing into `src_root`.

    Parameters
    ----------
    subject_ids : iterable
        Subject ids; each one is formatted into the entry name ``shhs1-<id>``.
    src_root : str
        Directory the links point into.
    dst_root : str
        Directory that receives the links; created if it does not exist.
    """
    os.makedirs(dst_root, exist_ok=True)
    for sid in subject_ids:
        entry = f"shhs1-{sid}"
        src = osp.join(src_root, entry)
        dst = osp.join(dst_root, entry)
        # lexists (unlike exists) is also True for a dangling symlink. Without it
        # a stale link survives the check and os.symlink raises FileExistsError.
        if osp.lexists(dst):
            os.unlink(dst)
        os.symlink(src, dst)


# Normal
out_dir = '/home/akara/Workspace/U-Time/processed/shhs1_nor'
_link_subjects(nor_df['nsrrid'], data_dir, out_dir)

# OSA
out_dir = '/home/akara/Workspace/U-Time/processed/shhs1_osa'
_link_subjects(osa_df['nsrrid'], data_dir, out_dir)

# Study SHHS1 dataset

In [None]:
# Ids of the data-dictionary variables flagged `commonly_used`, restricted to
# those that are actually columns of ann_df.
is_commonly_used = var_df['commonly_used'] == True
commonly_used_ids = var_df.loc[is_commonly_used]['id'].values
freq_used_cols = list(ann_df.columns.intersection(list(commonly_used_ids)))
freq_used_cols

In [None]:
# Show only the ann_df columns whose names are in `freq_used_cols`.
ann_df.loc[:, ann_df.columns.isin(freq_used_cols)]

In [None]:
# Print "<id>: <display name>" for every data-dictionary row listed in freq_used_cols.
freq_used_vars = var_df[var_df['id'].isin(freq_used_cols)]
for _, row in freq_used_vars.iterrows():
    print(f"{row['id']}: {row['display_name']}")

In [None]:
# Count the rows for each value of `visitnumber`.
ann_df['visitnumber'].value_counts()

In [None]:
# Regex matching the whole word "apnea" or "Apnea".
pat = '|'.join(r"\b{}\b".format(x) for x in ['apnea','Apnea'])
# na=False treats a missing display_name as "no match". Without it the mask would
# contain NaN and boolean indexing would raise instead of filtering.
apnea_mask = var_df['display_name'].str.contains(pat, na=False)
# Keep only the descriptive columns of the matching data-dictionary rows.
sleep_apnea_cols = var_df[apnea_mask][['id','display_name','description','type']]
sleep_apnea_cols

In [None]:
# Names of the ann_df columns that appear among the apnea-related variable ids.
list(ann_df.columns[ann_df.columns.isin(sleep_apnea_cols['id'])])

In [None]:
# For each apnea-related variable of type 'choices': print its description, then
# its value counts and a histogram if the variable is a column of ann_df.
choice_vars = sleep_apnea_cols[sleep_apnea_cols['type']=='choices']
for _, row in choice_vars.iterrows():
    var_id = row['id']
    print(f"{var_id} ({row['type']}): {row['description']}")
    if var_id not in ann_df.columns:
        print('Not exists')
        continue
    print(ann_df[var_id].value_counts())
    ann_df[var_id].hist()
    plt.show()
    # Release the figure before the next iteration draws a new one.
    plt.close('all')

In [None]:
# Rows where `sa15` equals 1.
# NOTE(review): the filter used to OR in a second condition on ann_df[''], but an
# empty column name raises KeyError, so that unfinished clause is dropped.
# TODO: add the intended second column here if one was meant.
ann_df.loc[ann_df['sa15'] == 1]

In [None]:
# Histogram of the `ahi_o0h4a` column.
ann_df['ahi_o0h4a'].hist()

In [None]:
# Histogram of the `ahi_a0h4a` column.
ann_df['ahi_a0h4a'].hist()

In [None]:
def AHI_class(v):
    '''
    Map one AHI value to an OSA severity category.

    This cell re-binds the AHI_class name already defined in the
    "Random pick subjects" section, using the same cut-offs.

    Parameters
    ----------
    v : number
        AHI value of one subject.

    Returns
    -------
    int or float
        0 for v < 5, 1 for v < 15, 2 for v < 30, otherwise 3.
        A missing value (NaN / None) yields NaN instead of a category.
    '''
    # NaN compares False against every threshold, so without this guard a missing
    # AHI would reach the final branch and be counted in category 3.
    if pd.isna(v):
        return np.nan
    if v < 5:
        return 0
    elif v < 15:
        return 1
    elif v < 30:
        return 2
    else:
        return 3

In [None]:
# AHI columns to summarise; the commented names are other candidate columns.
target_cols = [
    # 'ahi_a0h3', 'ahi_a0h4', 'ahi_o0h3', 'ahi_o0h4',
    # 'ahi_a0h3a', 'ahi_a0h4a', 'ahi_o0h3a', 'ahi_o0h4a'
    'ahi_a0h3a', 'ahi_o0h3a', 
]
# For each column print: name, number of counted subjects, and the number of
# subjects per severity category ordered by category.
for col in target_cols:
    severity = ann_df[col].apply(AHI_class)
    per_category = severity.value_counts().to_dict()
    total = sum(per_category.values())
    ordered = collections.OrderedDict(sorted(per_category.items()))
    print(f"{col} ({total}): {ordered}")