In [1]:
# --- Baseline packages ---
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# --- Statistical packages ---
from sklearn.preprocessing import MultiLabelBinarizer

# --- Utility packages ---
import os

# **A.** Preliminaries

In [2]:
# Both metadata files sit in the sibling "metadata" folder.
metadata_dir = os.path.join("..", "metadata")

# Original patient diagnostics workbook (Excel).
original_metadata_path = os.path.join(metadata_dir, "patient_diagnostics.xlsx")
meta_ori_df = pd.read_excel(original_metadata_path)

# Cleaned patient diagnostics table (CSV); the profiling cells read from this frame.
clean_metadata_path = os.path.join(metadata_dir, "patient_diagnostics_clean.csv")
meta_clean_df = pd.read_csv(clean_metadata_path)

# **B.** Profiling

### **B.1.** Rhythm-related Condition Distribution 
*Absolute figures*

In [3]:
# Number of rows for every (Rhythm_L1, Rhythm_L2) label pair, most frequent first.
meta_clean_df.value_counts(subset=['Rhythm_L1', 'Rhythm_L2'])

Rhythm_L1  Rhythm_L2
SB         SB           3888
SR         SR           1825
AFIB       AFIB         1780
GSVT       ST           1564
           SVT           544
AFIB       AF            438
SR         SI            397
GSVT       AT            121
           AVNRT          16
           AVRT            8
           SAAWR           7
Name: count, dtype: int64

### **B.2.** Co-occurrence Distribution
*Percentage of recordings within each merged rhythm label, by number of co-occurring beat-related conditions (`BeatCount`)*

In [4]:
# Non-null FileName entries per (Rhythm_L1, BeatCount) pair and per Rhythm_L1 label.
pair_counts = meta_clean_df.groupby(['Rhythm_L1', 'BeatCount'])['FileName'].count()
rhythm_counts = meta_clean_df.groupby('Rhythm_L1')['FileName'].count()

# Percentage of each label's recordings that fall into each BeatCount value;
# the division is matched on the shared 'Rhythm_L1' index level.
pct_by_pair = pair_counts * 100 / rhythm_counts

# Reshape to one row per BeatCount and one column per Rhythm_L1.
pct_by_pair.to_frame().reset_index().pivot(
    index='BeatCount',
    columns='Rhythm_L1',
    values='FileName'
).round(2)

Rhythm_L1,AFIB,GSVT,SB,SR
BeatCount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,22.5,45.27,56.56,74.44
1,34.63,31.77,29.84,16.88
2,24.21,13.05,8.8,5.58
3,12.17,6.02,3.19,1.89
4,4.55,2.43,0.93,0.86
5,1.53,0.88,0.44,0.23
6,0.36,0.49,0.23,
7,0.05,0.04,0.03,0.09
8,,,,0.05
9,,0.04,,


### **B.3.** Beat-related Condition Distribution

In [5]:
# Extract beat-related condition column: whitespace-separated codes become one list per row.
# NOTE(review): assumes 'Beat' has no missing values (a NaN entry cannot be iterated
# by the binarizer) - confirm against the cleaning step.
splitArray = meta_clean_df['Beat'].str.split()

# Encode beat conditions as one 0/1 indicator column per condition code.
mlb = MultiLabelBinarizer()
oneHot = mlb.fit_transform(splitArray)
# Reuse the metadata index so the column-wise concat below pairs each indicator row
# with its own metadata row even when that index is not a default RangeIndex.
oneHot_df = pd.DataFrame(oneHot, columns=mlb.classes_, index=meta_clean_df.index)
beat_df = pd.concat([meta_clean_df[['Rhythm_L1', 'BeatCount']], oneHot_df], axis=1)

# Calculate the occurrences of each beat condition by merged-rhythm label.
beat_long = beat_df.melt(id_vars='Rhythm_L1', value_vars=beat_df.columns.drop(['Rhythm_L1', 'BeatCount']), var_name='Beat', value_name='Count')
beat_pivot = pd.pivot_table(beat_long, values='Count', index='Beat', columns='Rhythm_L1', aggfunc='sum')
beat_pivot.columns.name = None
# Remember the rhythm columns before 'Total' is appended, so the normalisation below
# covers whichever labels are present instead of a hard-coded list.
rhythm_labels = list(beat_pivot.columns)
beat_pivot['Total'] = beat_pivot.sum(axis=1)

# Calculate as share of total recordings within each merged-rhythm label.
# size() gives one row count per label, indexed by that label.
rhythm_tot = meta_clean_df.groupby('Rhythm_L1').size()
for rhythm_label in rhythm_labels:
    beat_pivot[rhythm_label] = beat_pivot[rhythm_label] / rhythm_tot.loc[rhythm_label]
beat_pivot['Total'] = beat_pivot['Total'] / rhythm_tot.sum()

# Ten most frequent beat conditions overall.
beat_pivot.sort_values('Total', ascending=False).head(10)

Unnamed: 0_level_0,AFIB,GSVT,SB,SR,Total
Beat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NONE,0.224977,0.452655,0.565586,0.744374,0.50765
TWC,0.327773,0.15531,0.154064,0.10216,0.179826
LVHV,0.190261,0.079204,0.15535,0.048605,0.124008
STTC,0.224977,0.177876,0.043467,0.045005,0.110502
RBBB,0.08927,0.047345,0.031379,0.014401,0.043351
STDD,0.095131,0.045133,0.015175,0.015302,0.038345
ALS,0.062669,0.040708,0.027263,0.021602,0.036362
VPB,0.074391,0.031858,0.010031,0.014851,0.029184
APB,0.0,0.05354,0.025977,0.025203,0.026256
1AVB,0.0,0.026106,0.040638,0.015752,0.023801


### **B.3.1.** Beat-related Condition Distribution: AFIB

In [16]:
# Percentage of AFIB recordings showing selected beat conditions.
# Scale to percent first and round afterwards, so the values carry no floating-point
# residue from multiplying an already-rounded number by 100.
print((beat_pivot.loc[['ALS', 'LBBB', 'CR'], 'AFIB'] * 100).round(1), "\n")

print("\nAll conditions:")
# Every beat condition for AFIB, most frequent first, as a percentage.
(beat_pivot[['AFIB']].sort_values('AFIB', ascending=False) * 100).round(1)

Beat
ALS     6.3
LBBB    1.9
CR      1.0
Name: AFIB, dtype: float64 


All conditions:


Unnamed: 0_level_0,AFIB
Beat,Unnamed: 1_level_1
TWC,32.8
NONE,22.5
STTC,22.5
LVHV,19.0
STDD,9.5
RBBB,8.9
VPB,7.4
AVB,6.7
ALS,6.3
IDC,5.5
