In [315]:
import keras
import pandas as pd

In [316]:
data_1 = pd.read_csv('mit-bih-data-clear-data.csv')
data_2 = pd.read_csv('ptb-diagnostic-clear-data.csv')


# Feature cleanup
df = pd.concat([data_1, data_2], ignore_index=True)
df.to_csv('heart_rate_final_data.csv', index=False)
columns_to_drop = [
    'patient',
    'Max_hr_slope',
    'mean_deviation',
    'outliers_percentage'
]

columns_to_reduce = [
    'vlf_power',
    'lf_power',
    'hf_power',
    'highest_heart_rate',
    'lowest_heart_rate',
    'mean_heart_rate',
    'median_heart_rate'
    
]

df[columns_to_reduce] = df[columns_to_reduce].apply(lambda x: x/100)
df.drop(columns_to_drop, axis=1, inplace=True)


df.value_counts('diagnosis')


diagnosis
1    487
0     83
Name: count, dtype: int64

In [317]:
# Let's check for missing values

df.isnull().sum()


diagnosis                      0
highest_heart_rate             0
lowest_heart_rate              0
mean_heart_rate                0
median_heart_rate              0
standard_deviation_hr          0
minimum_hrv                    0
maximum_hrv                    0
mean_hrv                       0
median_hrv                     0
standard_deviation_hrv         0
mean_hr_slope                  0
tendency_slope                 0
standard_deviation_hr_slope    0
vlf_power                      0
lf_power                       0
hf_power                       0
mean_rr                        0
standard_deviation_rr          0
minimum_rr                     0
maximum_rr                     0
tendency_standard_deviation    0
approximation_entropy          0
sample_entropy                 2
dtype: int64

In [318]:
# Replace missing and infinite values with median
from sklearn.impute import SimpleImputer
import numpy as np
df.replace([np.inf, -np.inf], np.nan, inplace=True)
imputer = SimpleImputer(strategy='median')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [319]:
# Resample the data

# Scenario 1: This works fine, however, this duplicates data and can lead to overfitting

# from sklearn.utils import resample
# 
# df_majority = df[df.diagnosis==1]
# df_minority = df[df.diagnosis==0]
# 
# df_minority_upsampled = resample(df_minority, replace=True, n_samples=487)
# df_upsampled = pd.concat([df_majority, df_minority_upsampled])
# df_upsampled.value_counts('diagnosis')

# Scenario 2: Let's use smote to generate synthetic data

from imblearn.over_sampling import SMOTE
smote = SMOTE()
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']
X_resampled, y_resampled = smote.fit_resample(X, y)

df_upsampled = pd.concat([X_resampled, y_resampled], axis=1)




In [320]:
df_upsampled.value_counts('diagnosis')

diagnosis
0.0    487
1.0    487
Name: count, dtype: int64

In [321]:
X = df_upsampled.drop(columns=['diagnosis'])
y = df_upsampled['diagnosis']

X.head()

Unnamed: 0,highest_heart_rate,lowest_heart_rate,mean_heart_rate,median_heart_rate,standard_deviation_hr,minimum_hrv,maximum_hrv,mean_hrv,median_hrv,standard_deviation_hrv,...,vlf_power,lf_power,hf_power,mean_rr,standard_deviation_rr,minimum_rr,maximum_rr,tendency_standard_deviation,approximation_entropy,sample_entropy
0,1.13089,1.028571,1.081274,1.085427,2.022095,0.501672,5.685571,0.645095,0.0,0.926676,...,4.893324,61.726334,221.350711,3.906391,2.105712,1.069444,12.505556,0.702512,1.408457,1.177573
1,1.136842,0.847059,0.963004,0.951542,5.026822,0.346988,12.139094,1.013533,0.410101,1.456583,...,17.896511,145.833155,367.016413,6.039207,3.036317,1.075,33.636111,4.329784,1.282741,1.121386
2,0.903766,0.64095,0.758583,0.757895,4.681641,0.19895,12.804878,1.090299,0.0,1.755623,...,42.477758,192.530932,311.683614,10.013405,6.048494,2.15,30.163889,3.849098,1.307559,1.043753
3,0.84375,0.708197,0.755022,0.75,2.64487,0.237619,5.898876,0.629692,0.0,0.959347,...,16.714324,68.667396,209.085037,6.649409,2.325983,1.527778,18.361111,2.302985,1.2261,1.048676
4,0.54,0.478936,0.507868,0.508235,0.903975,0.109323,2.010028,0.251715,0.111317,0.363182,...,7.649133,28.05919,67.323863,10.148943,5.530223,2.288889,30.327778,0.644782,1.55832,1.377851


In [322]:
from sklearn.impute import SimpleImputer
import numpy as np
X.replace([np.inf, -np.inf], np.nan, inplace=True)
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

df.head()

Unnamed: 0,diagnosis,highest_heart_rate,lowest_heart_rate,mean_heart_rate,median_heart_rate,standard_deviation_hr,minimum_hrv,maximum_hrv,mean_hrv,median_hrv,...,vlf_power,lf_power,hf_power,mean_rr,standard_deviation_rr,minimum_rr,maximum_rr,tendency_standard_deviation,approximation_entropy,sample_entropy
0,1.0,1.13089,1.028571,1.081274,1.085427,2.022095,0.501672,5.685571,0.645095,0.0,...,4.893324,61.726334,221.350711,3.906391,2.105712,1.069444,12.505556,0.702512,1.408457,1.177573
1,1.0,1.136842,0.847059,0.963004,0.951542,5.026822,0.346988,12.139094,1.013533,0.410101,...,17.896511,145.833155,367.016413,6.039207,3.036317,1.075,33.636111,4.329784,1.282741,1.121386
2,1.0,0.903766,0.64095,0.758583,0.757895,4.681641,0.19895,12.804878,1.090299,0.0,...,42.477758,192.530932,311.683614,10.013405,6.048494,2.15,30.163889,3.849098,1.307559,1.043753
3,1.0,0.84375,0.708197,0.755022,0.75,2.64487,0.237619,5.898876,0.629692,0.0,...,16.714324,68.667396,209.085037,6.649409,2.325983,1.527778,18.361111,2.302985,1.2261,1.048676
4,1.0,0.54,0.478936,0.507868,0.508235,0.903975,0.109323,2.010028,0.251715,0.111317,...,7.649133,28.05919,67.323863,10.148943,5.530223,2.288889,30.327778,0.644782,1.55832,1.377851


In [323]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [324]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [325]:
from keras import Sequential
from keras.api.layers import Dense, Dropout, Activation

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [326]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])


from keras.api.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint('model.keras', save_best_only=True)

history = model.fit(X_train_resampled, y_train_resampled, validation_data=(X_test, y_test), epochs=1000, callbacks=[checkpoint])


Epoch 1/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4713 - loss: 0.7375 - val_accuracy: 0.3487 - val_loss: 0.7491
Epoch 2/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4246 - loss: 0.7569 - val_accuracy: 0.3590 - val_loss: 0.7462
Epoch 3/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4654 - loss: 0.7372 - val_accuracy: 0.3692 - val_loss: 0.7434
Epoch 4/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4550 - loss: 0.7322 - val_accuracy: 0.3641 - val_loss: 0.7408
Epoch 5/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4482 - loss: 0.7417 - val_accuracy: 0.3744 - val_loss: 0.7382
Epoch 6/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4371 - loss: 0.7344 - val_accuracy: 0.3795 - val_loss: 0.7360
Epoch 7/1000
[1m25/25[0m 

In [327]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Loss: {loss:.4f}')

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 845us/step - accuracy: 0.8012 - loss: 0.4671
Test Accuracy: 0.8308
Test Loss: 0.4223


In [328]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = (model.predict(X_test) > 0.5).astype("int32")
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", cr)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
Confusion Matrix:
 [[91  6]
 [27 71]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.77      0.94      0.85        97
         1.0       0.92      0.72      0.81        98

    accuracy                           0.83       195
   macro avg       0.85      0.83      0.83       195
weighted avg       0.85      0.83      0.83       195

