In [1]:
!pip install wfdb

Collecting wfdb
  Downloading wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Downloading wfdb-4.3.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wfdb
Successfully installed wfdb-4.3.0


In [2]:
def flatten_feature_dict(d, keep_sums_for=None, keep_means_for=None):
    if keep_sums_for is None:
        keep_sums_for = {'T_inversion', 'Premature_beat', 'Bigeminy', 'Trigeminy'}
    if keep_means_for is None:
        keep_means_for = {'T_inversion', 'Premature_beat', 'Bigeminy', 'Trigeminy'}

    flat = {}
    for k, v in d.items():
        if isinstance(v, dict):
            for subk, subv in v.items():
                if pd.isna(subv):
                    continue  # Skip NaNs entirely
                # Always keep mean values
                if subk == 'mean':
                    flat[f'{k}_{subk}'] = subv
                # Keep sum only for specified features
                elif subk == 'sum' and k in keep_sums_for:
                    flat[f'{k}_{subk}'] = subv
                # Keep min, max, std for non-binary columns
                elif subk in {'min', 'max', 'std'} and k not in keep_sums_for:
                    flat[f'{k}_{subk}'] = subv
        else:
            if not pd.isna(v):
                flat[k] = v
    return flat

In [3]:
from collections import Counter
def extract_signal_features(df_signal):
    # Aggregation
    agg_funcs = {
        'Duree_QRS_ms': ['mean', 'std', 'min', 'max'],
        'Duree_P_ms': ['mean', 'std', 'min', 'max'],
        'Duree_T_ms': ['mean', 'std', 'min', 'max'],
        'Intervalle_QT_ms': ['mean', 'std', 'min', 'max'],
        'Intervalle_PR_ms': ['mean', 'std', 'min', 'max'],
        'Intervalle_ST_ms': ['mean', 'std', 'min', 'max'],
        'Amplitude_P': ['mean', 'std', 'min', 'max'],
        'Amplitude_Q': ['mean', 'std', 'min', 'max'],
        'Amplitude_R': ['mean', 'std', 'min', 'max'],
        'Amplitude_S': ['mean', 'std', 'min', 'max'],
        'Amplitude_T': ['mean', 'std', 'min', 'max'],
        'T/R_ratio': ['mean', 'std', 'min', 'max'],
        'P/R_ratio': ['mean', 'std', 'min', 'max'],
        'QRS_area': ['mean', 'std', 'min', 'max'],
        'Slope_QR': ['mean', 'std', 'min', 'max'],
        'Slope_RS': ['mean', 'std', 'min', 'max'],
        'Heart_rate_bpm': ['mean', 'std', 'min', 'max'],
        'Local_RMSSD': ['mean', 'std', 'min', 'max'],
        'T_inversion': ['sum', 'mean'],
        'Premature_beat': ['sum'],
        'Bigeminy': ['sum'],
        'Trigeminy': ['sum'],
    }

    agg_df = df_signal.agg(agg_funcs)
    agg_df.columns = [''.join(col).strip() for col in agg_df.columns.values]
    agg_features = agg_df.to_dict()

    beat_types = df_signal['Type'].tolist()
    type_counts = Counter(beat_types)
    total_beats = len(df_signal)

    type_features = {
        f'count_{t}': type_counts.get(t, 0)
        for t in ['N', 'L', 'R', '/', 'V', 'else']
    }
    type_features.update({
        f'ratio_{t}': type_counts.get(t, 0) / total_beats if total_beats > 0 else 0
        for t in ['N', 'L', 'R', '/', 'V', 'else']
    })

    derived_features = {
        'percent_T_inversion': df_signal['T_inversion'].mean(),
        'QRS_prolonged_ratio': (df_signal['Duree_QRS_ms'] > 120).mean(),
        'QT_prolonged_ratio': (df_signal['Intervalle_QT_ms'] > 450).mean(),
        'PVC_ratio': type_counts.get('V', 0) / total_beats if total_beats > 0 else 0,
        'num_beats': total_beats,
        'std_Intervalle_RR_ms': df_signal['Intervalle_RR_ms'].std(skipna=True)
    }

    signal_features = {
        **agg_features,
        **type_features,
        **derived_features
    }

    
    # Then standardize the keys
    return flatten_feature_dict(signal_features)


In [4]:

import pandas as pd
beats1_df = pd.read_csv("/kaggle/input/comp-2021-features-extraction-0-to-29417/ecg-comp-2021-features.csv")
beats1_df = beats1_df[beats1_df['Duree_QRS_ms'] >= 40]
beats1_df.count()["record_name"]

433741

In [5]:
beats1_df.groupby('record_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d486ec39990>

In [6]:
import pandas as pd
paths = {
    "/kaggle/input/comp-2021-features-extraction-0-to-29417",
    "/kaggle/input/comp-2021-features-extraction-v2-29417-to-58834",
    "/kaggle/input/comp-2021-features-extraction-v3-58834-to-73543",
    "/kaggle/input/comp-2021-features-extraction-v4-73543-to-end"
}
p = 0
for path in paths:
    p+=1
    beats1_df = pd.read_csv(path+"/ecg-comp-2021-features.csv")
    beats1_df = beats1_df[beats1_df['Duree_QRS_ms'] >= 40]
    beats1_df.count()["record_name"]
    signal_feature_list = []
    
    for record_name, group_df in beats1_df.groupby('record_name'):
        features = extract_signal_features(group_df)
        features['record_name'] = record_name
    
        signal_feature_list.append(features)
    
    df_signal_features = pd.DataFrame(signal_feature_list)
    df_signal_features = df_signal_features[['record_name'] + [col for col in df_signal_features.columns if col != 'record_name']]
    df_signal_features.to_csv(f"comp_2021_v{p}_signal_features.csv",index=False)

In [7]:
df_signal_features.columns

Index(['record_name', 'Duree_QRS_ms_mean', 'Duree_QRS_ms_std',
       'Duree_QRS_ms_min', 'Duree_QRS_ms_max', 'Duree_P_ms_mean',
       'Duree_P_ms_std', 'Duree_P_ms_min', 'Duree_P_ms_max', 'Duree_T_ms_mean',
       'Duree_T_ms_std', 'Duree_T_ms_min', 'Duree_T_ms_max',
       'Intervalle_QT_ms_mean', 'Intervalle_QT_ms_std', 'Intervalle_QT_ms_min',
       'Intervalle_QT_ms_max', 'Intervalle_PR_ms_mean', 'Intervalle_PR_ms_std',
       'Intervalle_PR_ms_min', 'Intervalle_PR_ms_max', 'Intervalle_ST_ms_mean',
       'Intervalle_ST_ms_std', 'Intervalle_ST_ms_min', 'Intervalle_ST_ms_max',
       'Amplitude_P_mean', 'Amplitude_P_std', 'Amplitude_P_min',
       'Amplitude_P_max', 'Amplitude_Q_mean', 'Amplitude_Q_std',
       'Amplitude_Q_min', 'Amplitude_Q_max', 'Amplitude_R_mean',
       'Amplitude_R_std', 'Amplitude_R_min', 'Amplitude_R_max',
       'Amplitude_S_mean', 'Amplitude_S_std', 'Amplitude_S_min',
       'Amplitude_S_max', 'Amplitude_T_mean', 'Amplitude_T_std',
       'Amplitude_T_mi

In [8]:
print(df_signal_features.iloc[0])

record_name             training/chapman_shaoxing/g1/JS00001
Duree_QRS_ms_mean                                 227.333333
Duree_QRS_ms_std                                   94.729336
Duree_QRS_ms_min                                        40.0
Duree_QRS_ms_max                                       380.0
                                        ...                 
QRS_prolonged_ratio                                 0.777778
QT_prolonged_ratio                                  0.166667
PVC_ratio                                                0.0
num_beats                                                 18
std_Intervalle_RR_ms                              151.173838
Name: 0, Length: 96, dtype: object


In [9]:
import wfdb
import pandas as pd
from tqdm import tqdm  # optional: for progress bar

# Load signal features DataFrame
sign1_df = pd.read_csv("/kaggle/working/comp_2021_v1_signal_features.csv")
sign2_df = pd.read_csv("/kaggle/working/comp_2021_v2_signal_features.csv")
sign3_df = pd.read_csv("/kaggle/working/comp_2021_v3_signal_features.csv")
sign4_df = pd.read_csv("/kaggle/working/comp_2021_v4_signal_features.csv")

arr = [sign1_df,sign2_df,sign3_df,sign4_df]
origin_comp_df = pd.concat(arr, ignore_index=True)

# Path to the signal f3les
signals_path = "/kaggle/input/signal-classification-data/challenge-2021/"

# Create new columns
origin_comp_df["Age"] = None
origin_comp_df["Sex"] = None
origin_comp_df["Dx"] = None
origin_comp_df["Class"] = None

origin_comp_df.iloc[0]

record_name             training/ningbo/g34/JS44560
Duree_QRS_ms_mean                        166.117647
Duree_QRS_ms_std                          33.320193
Duree_QRS_ms_min                              140.0
Duree_QRS_ms_max                              272.0
                                   ...             
std_Intervalle_RR_ms                       6.961801
Age                                            None
Sex                                            None
Dx                                             None
Class                                          None
Name: 0, Length: 100, dtype: object

In [10]:

# Iterate over each record and extract metadata
for i in tqdm(range(len(origin_comp_df))):
#for i in tqdm(range(1)):
    
    name = str(origin_comp_df.loc[i, "record_name"])
    
    
    try:
        record = wfdb.rdheader(signals_path + name)
        info = {line.split(':')[0].strip(): line.split(':')[1].strip()
                for line in record.comments if ':' in line}
        
        # Store Dx as list
        dx_list = info.get("Dx", "").split(',') if "Dx" in info else []
        origin_comp_df.at[i, "Dx"] = dx_list

        origin_comp_df.loc[i, "Age"] = info.get("Age")
        origin_comp_df.loc[i, "Sex"] = info.get("Sex")
        if "426783006" in dx_list:
            origin_comp_df.loc[i, "Class"] = 0
        
        elif "426177001" in dx_list : 
            origin_comp_df.loc[i, "Class"] = 1
            
        else : 
            origin_comp_df.loc[i, "Class"] = 2
            
    except Exception as e:
        print(f"Failed to read {name}: {e}")
        continue
    

100%|██████████| 88230/88230 [14:35<00:00, 100.80it/s]


In [11]:


origin_comp_df.to_csv("ecg-comp-2021-signals-features.csv",index=False)



In [12]:
! rm -f "/kaggle/working/comp_2021_v1_signal_features.csv" "/kaggle/working/comp_2021_v2_signal_features.csv" "/kaggle/working/comp_2021_v3_signal_features.csv" "/kaggle/working/comp_2021_v4_signal_features.csv"

In [13]:
beats1_df = pd.read_csv("/kaggle/input/comp-2021-features-extraction-0-to-29417/ecg-comp-2021-features.csv")
beats2_df = pd.read_csv("/kaggle/input/comp-2021-features-extraction-v2-29417-to-58834/ecg-comp-2021-features.csv")
beats3_df = pd.read_csv("/kaggle/input/comp-2021-features-extraction-v3-58834-to-73543/ecg-comp-2021-features.csv")
beats4_df = pd.read_csv("/kaggle/input/comp-2021-features-extraction-v4-73543-to-end/ecg-comp-2021-features.csv")

arr1 = [beats1_df,beats2_df,beats3_df,beats4_df]
origin_beats_df = pd.concat(arr1, ignore_index=True)
origin_beats_df.to_csv("ecg-comp-2021-beats-features.csv", index=False)