# Exploratory Neural Network (may take an hour to run)

In [1]:
import wfdb
import pandas as pd
import numpy as np
import ast
import pyspark.sql.functions as F
from pyspark.sql.functions import col,sum, isnan
from scipy.signal import find_peaks
from scipy import sparse
from scipy.sparse.linalg import spsolve
import matplotlib.pyplot as plt 
import biosppy
from biosppy.signals import ecg

In [None]:
def load_raw_data(df, sampling_rate, path):
    """Read the waveform of every record listed in *df* into one array.

    Args:
        df: Table exposing ``filename_lr`` and ``filename_hr`` columns that
            hold record names relative to *path*.
        sampling_rate: ``100`` selects the ``filename_lr`` records; any other
            value selects the ``filename_hr`` records.
        path: Directory the record names are resolved against. A trailing
            separator is optional.

    Returns:
        ``np.array`` built from the signal part of each ``wfdb.rdsamp``
        result, in the row order of *df*. The per-record metadata returned
        by ``wfdb.rdsamp`` is discarded.
    """
    # Local import so the function does not depend on which notebook cell
    # happens to import ``os``.
    import os

    filenames = df.filename_lr if sampling_rate == 100 else df.filename_hr
    # os.path.join gives the same result as ``path + f`` when ``path`` already
    # ends with a separator, and also works when it does not.
    records = (wfdb.rdsamp(os.path.join(path, f)) for f in filenames)
    return np.array([signal for signal, _meta in records])

In [None]:
# Directory that the record file names in the feature table are resolved against.
path = "ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/"

# 100 makes load_raw_data read the `filename_lr` records; any other value
# makes it read the `filename_hr` records.
sampling_rate = 100

In [None]:
X = load_raw_data(Y, sampling_rate, path)

In [None]:
Y = pd.read_csv('ECG_features.csv', index_col='ecg_id')
Y.reset_index(drop=False, inplace=True)
Y.index += 1
Y.head()
len(Y.columns)
Y = Y.replace(float("nan"), 0)

In [None]:
# Collapse each diagnostic-superclass column to a 0/1 integer indicator:
# any value that is truthy as a bool becomes 1, everything else 0.
for label in ('NORM', 'STTC', 'MI', 'HYP', 'CD'):
    Y[label] = Y[label].astype(bool).astype(int)

In [None]:
from pyspark.sql import SparkSession
from pyspark import SQLContext
import os

# Obtain a Spark session that runs with the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("mllib_classifier")
    .getOrCreate()
)
sc = spark.sparkContext

# SQLContext wrapper around the same SparkContext.
sql = SQLContext(sc)

In [None]:
# The schema classes were never imported anywhere in the notebook, so this
# cell raised NameError; import them where they are used.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit Spark schema for the feature table; every field is nullable.
# NOTE(review): presumably the field order has to match the column order of
# the pandas frame passed to createDataFrame — confirm against ECG_features.csv.
mySchema = StructType([
    # Record / patient metadata
    StructField("ecg_id", IntegerType(), True),
    StructField("patient_id", FloatType(), True),
    StructField("age", FloatType(), True),
    StructField("sex", IntegerType(), True),
    StructField("height", FloatType(), True),
    StructField("weight", FloatType(), True),
    StructField("nurse", FloatType(), True),
    StructField("site", StringType(), True),
    StructField("device", StringType(), True),
    StructField("recording_date", StringType(), True),
    StructField("report", StringType(), True),
    StructField("scp_codes", StringType(), True),
    StructField("heart_axis", StringType(), True),
    StructField("infarction_stadium1", StringType(), True),
    StructField("infarction_stadium2", StringType(), True),
    StructField("validated_by", FloatType(), True),
    StructField("second_opinion", StringType(), True),
    StructField("initial_autogenerated_report", StringType(), True),
    StructField("validated_by_human", StringType(), True),
    StructField("baseline_drift", StringType(), True),
    StructField("static_noise", StringType(), True),
    StructField("burst_noise", StringType(), True),
    StructField("electrodes_problems", StringType(), True),
    StructField("extra_beats", StringType(), True),
    StructField("pacemaker", StringType(), True),
    StructField("strat_fold", StringType(), True),
    StructField("filename_lr", StringType(), True),
    StructField("filename_hr", StringType(), True),
    StructField("diagnostic_superclass", StringType(), True),
    # 0/1 diagnostic-superclass indicators
    StructField("NORM", IntegerType(), True),
    StructField("MI", IntegerType(), True),
    StructField("STTC", IntegerType(), True),
    StructField("HYP", IntegerType(), True),
    StructField("CD", IntegerType(), True),
    # Numeric features used as model inputs later in the notebook
    StructField("bpm", FloatType(), True),
    StructField("bif", FloatType(), True),
    StructField("bif2", FloatType(), True),
    StructField("TRinterval", FloatType(), True),
    StructField("TRratio", FloatType(), True),
    StructField("PRinterval", FloatType(), True),
    StructField("PRratio", FloatType(), True),
    StructField("QRinterval", FloatType(), True),
])

In [None]:
# Convert the pandas frame Y into a Spark DataFrame using the explicit schema
# instead of letting Spark infer the column types.
df = spark.createDataFrame(Y,schema=mySchema)

In [None]:
# Columns carried forward: the superclass string, the ten candidate
# predictors, the five 0/1 label flags and the fold column.
vars_to_keep = [
    "diagnostic_superclass",
    "age", "sex",
    "bpm", "bif", "bif2",
    "TRinterval", "TRratio", "PRinterval", "PRratio", "QRinterval",
    "NORM", "MI", "STTC", "HYP", "CD",
    "strat_fold",
]

# Project the Spark DataFrame onto those columns and preview the result.
df2 = df.select(*vars_to_keep)
df2.show()

In [None]:
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

# Pack the ten predictor columns into a single vector column named "features".
predictor_columns = [
    "age", "sex",
    "bpm", "bif", "bif2",
    "TRinterval", "TRratio", "PRinterval", "PRratio", "QRinterval",
]
assembler = VectorAssembler(outputCol="features", inputCols=predictor_columns)

df2 = assembler.transform(df2)
df2.show()

In [None]:
import pandas as pd
import pyspark.sql.functions as F
import pyspark.mllib.regression as reg
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint

In [None]:
from pyspark.ml.feature import StandardScaler

# Scaler configuration: divide by the standard deviation (withStd=True)
# but do not subtract the mean (withMean=False).
standardScaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="features_scaled",
)

# Learn the scaling statistics from df2 ...
scaler = standardScaler.fit(df2)

# ... and append the "features_scaled" column to it.
scaled_df = scaler.transform(df2)

scaled_df.show()

In [None]:
import tensorflow as tf
from pyspark.sql import functions as F

# Binary label column: 1 where the NORM flag equals 1, otherwise 0.
norm_flag_set = col('NORM') == 1
scaled_df2 = scaled_df.withColumn("Target", F.when(norm_flag_set, 1).otherwise(0))

# Pull each column of interest back to the driver as a NumPy array.
target = np.array(scaled_df2.select('Target').collect())
features = np.array(scaled_df2.select('features').collect())           # unscaled vectors
features2 = np.array(scaled_df2.select('features_scaled').collect())   # scaled vectors

# Second feature set: the same pipeline as above, but without "age" and "sex".
df3 = df.select(*vars_to_keep)

assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "bpm", "bif", "bif2",
        "TRinterval", "TRratio", "PRinterval", "PRratio", "QRinterval",
    ],
)
df3 = assembler.transform(df3)

standardScaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="features_scaled",
)

# Learn the scaling statistics from df3.
scaler = standardScaler.fit(df3)

# Append the scaled vectors to df3, then add the binary NORM label.
scaled_df3 = scaler.transform(df3).withColumn(
    "Target", F.when(col('NORM') == 1, 1).otherwise(0)
)

target3 = np.array(scaled_df3.select('Target').collect())
features3 = np.array(scaled_df3.select('features_scaled').collect())

from tensorflow import keras
from tensorflow.keras import layers

# Fixed positional split: the first 16000 rows train, rows 16000..21836 test.
train_rows = slice(0, 16000)
test_rows = slice(16000, 21837)

# Labels shared by the two 10-feature variants.
y_train, y_test = target[train_rows], target[test_rows]

# Scaled 10-feature vectors.
x_train, x_test = features2[train_rows], features2[test_rows]

# Unscaled 10-feature vectors.
x_train2, x_test2 = features[train_rows], features[test_rows]

# Scaled 8-feature vectors and their labels.
x_train3, x_test3 = features3[train_rows], features3[test_rows]
y_train3, y_test3 = target3[train_rows], target3[test_rows]

def neural_net(x_train, y_train, x_test, y_test, epochs, batch_size, num_classes=10):
    """Train a small fully connected classifier and evaluate it on held-out data.

    The network flattens each sample, applies two 256-unit ReLU layers with a
    10% dropout in between, and ends in a softmax layer of ``num_classes``
    units. It is trained with Adam and sparse categorical cross-entropy.

    Args:
        x_train: Training inputs; the first axis indexes samples.
        y_train: Integer class labels for ``x_train``.
        x_test: Held-out inputs with the same per-sample shape as ``x_train``.
        y_test: Integer class labels for ``x_test``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.
        num_classes: Width of the softmax output layer. Defaults to 10, the
            value that used to be hard-coded; labels must be smaller than it.

    Returns:
        Whatever ``model.evaluate(x_test, y_test)`` returns for a model
        compiled with the ``accuracy`` metric.
    """
    model = tf.keras.models.Sequential([
        # Use every per-sample axis: ``shape[1:3]`` silently dropped axes
        # beyond the second for inputs with more than two feature dimensions.
        tf.keras.layers.Flatten(input_shape=x_train.shape[1:]),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

    return model.evaluate(x_test, y_test)

# Experiment runs. Each call trains a fresh model for 1000 epochs; the trailing
# comments record the test accuracy observed by the original author.

# Scaled 10-feature vectors, batch size 256.
neural_net(x_train, y_train, x_test, y_test, 1000, 256) #Test Accuracy: 0.5988

# Scaled 10-feature vectors, batch size 128.
neural_net(x_train, y_train, x_test, y_test, 1000, 128) #Test Accuracy: 0.6087

# Unscaled 10-feature vectors, batch size 256.
neural_net(x_train2, y_train, x_test2, y_test, 1000, 256) #Test Accuracy: 0.6356

# Scaled 8-feature vectors (no age/sex), batch size 128.
neural_net(x_train3, y_train3, x_test3, y_test3, 1000, 128) #Test Accuracy: 0.5220

# Raw signals returned by load_raw_data, split at the same row positions.
x_train4, y_train4 = X[0:16000], target[0:16000]

x_test4, y_test4 = X[16000:21837], target[16000:21837]

neural_net(x_train4, y_train4, x_test4, y_test4, 1000, 128) #Test Accuracy: 0.5102