In [2]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wfdb
import random
import neurokit2 as nk
import tsfresh as tsf
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features

In [3]:
dataset = pd.read_csv('patient_scp.csv')
hrv_dataset = pd.read_csv('ECG_Cardiac_Features.csv')
directory = 'physionet.org/files/ptb-xl/1.0.3/'

In [13]:
time_features = pd.DataFrame()

In [14]:
settings = ComprehensiveFCParameters()
print(settings)
print(len(settings))

{'variance_larger_than_standard_deviation': None, 'has_duplicate_max': None, 'has_duplicate_min': None, 'has_duplicate': None, 'sum_values': None, 'abs_energy': None, 'mean_abs_change': None, 'mean_change': None, 'mean_second_derivative_central': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variation_coefficient': None, 'variance': None, 'skewness': None, 'kurtosis': None, 'root_mean_square': None, 'absolute_sum_of_changes': None, 'longest_strike_below_mean': None, 'longest_strike_above_mean': None, 'count_above_mean': None, 'count_below_mean': None, 'last_location_of_maximum': None, 'first_location_of_maximum': None, 'last_location_of_minimum': None, 'first_location_of_minimum': None, 'percentage_of_reoccurring_values_to_all_values': None, 'percentage_of_reoccurring_datapoints_to_all_datapoints': None, 'sum_of_reoccurring_values': None, 'sum_of_reoccurring_data_points': None, 'ratio_value_number_to_time_series_length': None, 'sample_entropy': None, 

In [None]:
print(settings)
features = 20
for index, row in hrv_dataset.iterrows():
    print(f"{index}/{len(hrv_dataset)}")
    ecg_id = row['ecg_id']
    record = wfdb.rdrecord(directory + dataset[dataset['ecg_id'] == ecg_id]['filename_hr'].values[0])
    ecg_signal = record.p_signal[:,0]
    ecg_signal = nk.ecg_clean(ecg_signal, sampling_rate=500)

    ecg_df = pd.DataFrame({
        'id': [ecg_id] * len(ecg_signal),
        'time': range(len(ecg_signal)),
        'value': ecg_signal
    })

    extracted_features = extract_features(
        ecg_df,
        column_id='id',
        column_sort='time',
        column_value='value',
        n_jobs=1,
        default_fc_parameters=settings
    )

    extracted_features['ecg_id'] = ecg_id
    print(extracted_features)
    print(extracted_features.shape)
    time_features = pd.concat([time_features, extracted_features], ignore_index=True)


    

In [None]:
time_features.to_csv('time_features.csv', index=False)

In [4]:
from tsfresh.feature_selection.relevance import calculate_relevance_table

time_features = pd.read_csv('time_features.csv')
y = pd.read_csv('patient_scp.csv')
y = y[['ecg_id', 'label']]

time_features = time_features.merge(y, on='ecg_id')
time_features.head()

X = time_features.drop(columns=['ecg_id', 'label'])  # Feature columns
y = time_features['label']  # Target column

X=X.drop(columns=X.columns[X.isna().any()])

relevance_table = calculate_relevance_table(X, y, ml_task='auto')

sorted_table = relevance_table.sort_values(by='p_value')
top_n_features = sorted_table.head(100)
with open('top_n_features_relevance.csv', 'w') as f:
    f.write(top_n_features.to_csv(index=False))



In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

# Load the time features and target data
time_features = pd.read_csv('time_features.csv')
y = pd.read_csv('patient_scp.csv')
y = y[['ecg_id', 'label']]

# Merge features with labels
time_features = time_features.merge(y, on='ecg_id')

# Separate features and target
X = time_features.drop(columns=['ecg_id', 'label'])  # Feature columns
y = time_features['label']  # Target column

# Drop features with NaN values
X = X.drop(columns=X.columns[X.isna().any()])

# Use SelectKBest to select the top 50 features
selector = SelectKBest(score_func=f_classif, k=50)  # Use ANOVA F-value for scoring
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]
print(selected_feature_names)

with open('top_n_features_selectkbest.txt', 'w') as f:
    for i in selected_feature_names:
        f.write(i + '\n')


Index(['value__abs_energy', 'value__median', 'value__standard_deviation',
       'value__variance', 'value__kurtosis', 'value__root_mean_square',
       'value__maximum', 'value__absolute_maximum', 'value__minimum',
       'value__benford_correlation', 'value__c3__lag_1', 'value__c3__lag_2',
       'value__c3__lag_3', 'value__symmetry_looking__r_0.05',
       'value__quantile__q_0.1', 'value__quantile__q_0.6',
       'value__quantile__q_0.9', 'value__partial_autocorrelation__lag_2',
       'value__binned_entropy__max_bins_10',
       'value__spkt_welch_density__coeff_2',
       'value__spkt_welch_density__coeff_5',
       'value__change_quantiles__f_agg_"mean"__isabs_True__qh_0.2__ql_0.0',
       'value__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.0',
       'value__change_quantiles__f_agg_"mean"__isabs_True__qh_0.6__ql_0.0',
       'value__change_quantiles__f_agg_"var"__isabs_False__qh_0.4__ql_0.2',
       'value__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.2

  53  54  55  56  57  58  59  60  61  62  63  64  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  91 104 364 668 669 670 757] are constant.
  f = msb / msw
