In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
# Base names of the dataset files; the ".csv" extension is appended when a
# path is built. `incart_csv` is the most relevant one at the time of writing.
incart_csv, arr_csv, supra_csv, scdh_csv = (
    'INCART 2-lead Arrhythmia Database',
    'MIT-BIH Arrhythmia Database',
    'MIT-BIH Supraventricular Arrhythmia Database',
    'Sudden Cardiac Death Holter Database',
)

In [32]:
# Path of the selected dataset: <parent dir>/raw_data/<dataset name>.csv
csv_path = os.path.join('..', 'raw_data', incart_csv + '.csv')
# Load the whole csv into memory as a DataFrame.
df = pd.read_csv(csv_path)

In [4]:
# Show the relative path that was built for the dataset file.
print(csv_path)

../raw_data/INCART 2-lead Arrhythmia Database.csv


In [5]:
# df.shape is (number of rows, number of columns); report both.
rows, cols = df.shape
print(f'The "{incart_csv}" has {rows} rows and {cols} columns.')

The "INCART 2-lead Arrhythmia Database" has 175729 rows and 34 columns.


In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,record,type,0_pre-RR,0_post-RR,0_pPeak,0_tPeak,0_rPeak,0_sPeak,0_qPeak,0_qrs_interval,...,1_qPeak,1_qrs_interval,1_pq_interval,1_qt_interval,1_st_interval,1_qrs_morph0,1_qrs_morph1,1_qrs_morph2,1_qrs_morph3,1_qrs_morph4
0,I01,N,163,165,0.06961,-0.083281,0.614133,-0.392761,0.047159,15,...,-0.02337,14,3,23,6,-0.02337,-0.01165,0.082608,0.101373,-0.183387
1,I01,N,165,166,-0.09703,0.597254,-0.078704,-0.078704,-0.137781,3,...,0.081637,15,5,27,7,0.081637,0.102992,0.191225,0.217544,-0.068248
2,I01,N,166,102,0.109399,0.680528,-0.010649,-0.010649,-0.72062,6,...,-0.148539,33,13,52,6,-0.148539,-0.06062,0.08108,0.2044,0.335172
3,I01,VEB,102,231,0.176376,0.256431,-0.101098,-0.707525,-0.101098,4,...,0.046898,21,9,34,4,0.046898,0.083728,0.279512,0.526785,0.450969
4,I01,N,231,165,0.585577,0.607461,-0.083499,-0.083499,-0.167858,3,...,-0.112552,32,5,43,6,-0.112552,0.012989,0.091491,0.134004,0.265232


In [7]:
def load_data(folder_path, folder_name):
    """
    Read one csv file into a DataFrame.

    folder_path: string,
                 directory that contains the csv file, with or without a
                 trailing separator ('raw_data/' and 'raw_data' both work)

    folder_name: string,
                 file name of the csv inside that directory, including the
                 '.csv' extension

           Method:
               load_data('raw_data/', 'Arythmia_monitor.csv') return (pandas.core.frame.DataFrame)
    """
    # os.path.join adds the separator only when it is missing; plain string
    # concatenation produced 'raw_dataArythmia_monitor.csv' for 'raw_data'.
    return pd.read_csv(os.path.join(folder_path, folder_name))

In [8]:
def clean():
    """
    Load the INCART csv from '../raw_data/' and drop the 'record' column.

    Every other column is returned unchanged. The 'type' column that
    preprocess() later uses as the target holds these beat labels:
        N    -> Normal
        SVEB -> Supraventricular ectopic beat
        VEB  -> Ventricular ectopic beat
        F    -> Fusion beat
        Q    -> Unknown beat

    return: pandas.core.frame.DataFrame without the 'record' column
    """
    data = load_data('../raw_data/', 'INCART 2-lead Arrhythmia Database.csv')
    # 'record' is a string column (values such as 'I01'), so it cannot go
    # through the numeric scaling done later in preprocess().
    return data.drop(columns=['record'])

In [9]:
def preprocess(subset_size=10000):
    """
    Prepare the data for training: encode labels, split, scale, rebalance
    and subsample.

    subset_size: int or None,
                 number of resampled training rows to keep so that training
                 stays light on memory. None (or a value at least as large
                 as the resampled training set) keeps every row.

    return: (X_train_subsample, y_train_subsample, X_test, y_test)
            The training part is scaled, SMOTE-balanced and subsampled.
            The test part is scaled only.

    raises: ValueError if the 'type' column holds a label that is not in
            the label mapping.

    NOTE(review): the fitted MinMaxScaler is local to this function and is
    not returned, so nothing downstream can apply the same scaling to new
    samples.
    """
    data = clean()

    X = data.drop('type', axis=1)
    type_mapping = {'N': 0, 'SVEB': 1, 'VEB': 2, 'F': 3, 'Q': 4}
    y = data['type'].map(type_mapping)

    # .map() silently turns every label missing from type_mapping into NaN;
    # fail here with the offending labels instead of deep inside sklearn.
    unmapped = y.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'type'].astype(str).unique())
        raise ValueError(f'Unmapped beat types in "type" column: {unknown}')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # Scaling the data before training (fit on the training split only)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Resampling and rebalance the data
    smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Subsampling to get light training memory.
    if subset_size is None or subset_size >= len(y_train_resampled):
        return X_train_resampled, y_train_resampled, X_test, y_test

    # The over-sampler returns the original rows first and appends the
    # synthetic ones, so a head slice shorter than the original training set
    # contains no SMOTE output and stays imbalanced. Draw a seeded,
    # stratified subsample instead so the class balance is kept.
    X_train_subsample, _, y_train_subsample, _ = train_test_split(
        X_train_resampled,
        y_train_resampled,
        train_size=subset_size,
        stratify=y_train_resampled,
        random_state=42,
    )

    return X_train_subsample, y_train_subsample, X_test, y_test

In [10]:
def initialize_model():
    """
    Build the unfitted classifier in one place, so later changes to the
    model only touch this function.

    return: RandomForestClassifier with 50 trees and random_state 101
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=101)
    return forest

In [11]:
def main_stream():
    """
    Run the whole flow: preprocess the data, fit a fresh model on the
    training part and predict the test part.

    return: the model's predictions for X_test
            (y_test is received but not scored here; the accuracy
            computation is currently switched off)
    """
    X_train, y_train, X_test, y_test = preprocess()

    classifier = initialize_model()
    classifier.fit(X_train, y_train)

    return classifier.predict(X_test)

In [12]:
# Run the full pipeline once and keep the test-set predictions.
main_output = main_stream()
# Display the predicted class codes.
main_output

array([0, 2, 0, ..., 0, 2, 0])

In [70]:
# Text form of the column Index from the third column onwards.
# NOTE(review): this is the printed repr ("Index([...], dtype='object')"),
# not a list of names, and it rebinds `cols`, which an earlier cell used for
# the column count.
cols = str(df.columns[2:])
# Remove every hyphen from that text (e.g. '0_pre-RR' -> '0_preRR').
cleanX = ''.join(cols.split('-'))
cleanX

"Index(['0_preRR', '0_postRR', '0_pPeak', '0_tPeak', '0_rPeak', '0_sPeak',\n       '0_qPeak', '0_qrs_interval', '0_pq_interval', '0_qt_interval',\n       '0_st_interval', '0_qrs_morph0', '0_qrs_morph1', '0_qrs_morph2',\n       '0_qrs_morph3', '0_qrs_morph4', '1_preRR', '1_postRR', '1_pPeak',\n       '1_tPeak', '1_rPeak', '1_sPeak', '1_qPeak', '1_qrs_interval',\n       '1_pq_interval', '1_qt_interval', '1_st_interval', '1_qrs_morph0',\n       '1_qrs_morph1', '1_qrs_morph2', '1_qrs_morph3', '1_qrs_morph4'],\n      dtype='object')"

In [73]:
# Inspect the dtype pandas inferred for every column.
df.dtypes

record             object
type               object
0_pre-RR            int64
0_post-RR           int64
0_pPeak           float64
0_tPeak           float64
0_rPeak           float64
0_sPeak           float64
0_qPeak           float64
0_qrs_interval      int64
0_pq_interval       int64
0_qt_interval       int64
0_st_interval       int64
0_qrs_morph0      float64
0_qrs_morph1      float64
0_qrs_morph2      float64
0_qrs_morph3      float64
0_qrs_morph4      float64
1_pre-RR            int64
1_post-RR           int64
1_pPeak           float64
1_tPeak           float64
1_rPeak           float64
1_sPeak           float64
1_qPeak           float64
1_qrs_interval      int64
1_pq_interval       int64
1_qt_interval       int64
1_st_interval       int64
1_qrs_morph0      float64
1_qrs_morph1      float64
1_qrs_morph2      float64
1_qrs_morph3      float64
1_qrs_morph4      float64
dtype: object

In [58]:
# Same dtype information as a plain {column name: dtype} dictionary.
print(df.dtypes.to_dict())


{'record': dtype('O'), 'type': dtype('O'), '0_pre-RR': dtype('int64'), '0_post-RR': dtype('int64'), '0_pPeak': dtype('float64'), '0_tPeak': dtype('float64'), '0_rPeak': dtype('float64'), '0_sPeak': dtype('float64'), '0_qPeak': dtype('float64'), '0_qrs_interval': dtype('int64'), '0_pq_interval': dtype('int64'), '0_qt_interval': dtype('int64'), '0_st_interval': dtype('int64'), '0_qrs_morph0': dtype('float64'), '0_qrs_morph1': dtype('float64'), '0_qrs_morph2': dtype('float64'), '0_qrs_morph3': dtype('float64'), '0_qrs_morph4': dtype('float64'), '1_pre-RR': dtype('int64'), '1_post-RR': dtype('int64'), '1_pPeak': dtype('float64'), '1_tPeak': dtype('float64'), '1_rPeak': dtype('float64'), '1_sPeak': dtype('float64'), '1_qPeak': dtype('float64'), '1_qrs_interval': dtype('int64'), '1_pq_interval': dtype('int64'), '1_qt_interval': dtype('int64'), '1_st_interval': dtype('int64'), '1_qrs_morph0': dtype('float64'), '1_qrs_morph1': dtype('float64'), '1_qrs_morph2': dtype('float64'), '1_qrs_morph3':

In [49]:
# _0preRR = '0_pre-RR'
# _0postRR = '0_post-RR'
# _0pPeak = '0_pPeak'
# _0tPeak = '0_tPeak'
# _0rPeak = '0_rPeak'
# _0sPeak = '0_sPeak'
# _0qPeak = '0_qPeak'
# _0qrsinterval = '0_qrs_interval'
# _0pqinterval = '0_pq_interval'
# _0qtinterval = '0_qt_interval'
# _0stinterval = '0_st_interval'
# _0qrsmorph0 = '0_qrs_morph0'
# _0qrsmorph1 = '0_qrs_morph1'
# _0qrsmorph2 = '0_qrs_morph2'
# _0qrsmorph3 = '0_qrs_morph3'
# _0qrsmorph4 = '0_qrs_morph4'
# _1preRR = '1_pre-RR'
# _1postRR = '1_post-RR'
# _1pPeak = '1_pPeak'
# _1tPeak = '1_tPeak'
# _1rPeak = '1_rPeak'
# _1sPeak = '1_sPeak'
# _1qPeak ='1_qPeak'
# _1qrsinterval = '1_qrs_interval'
# _1pqinterval = '1_pq_interval'
# _1qtinterval = '1_qt_interval'
# _1stinterval = '1_st_interval'
# _1qrsmorph0 = '1_qrs_morph0'
# _1qrsmorph1 = '1_qrs_morph1'
# _1qrsmorph2 = '1_qrs_morph2'
# _1qrsmorph3 = '1_qrs_morph3'
# _1qrsmorph4 = '1_qrs_morph4'


dtype('O')

In [None]:
# Expected dtype of each feature column, keyed by its csv column name.
# The cell used to be unrunnable: the closing brace sat inside a comment and
# pandas has no `pd.dtype` attribute. np.dtype builds the same dtype objects
# that `df.dtypes` reports.
feature_dtypes = {
    '0_pre-RR': np.dtype('int64'),
    '0_post-RR': np.dtype('int64'),
    '0_pPeak': np.dtype('float64'),
    '0_tPeak': np.dtype('float64'),
    '0_rPeak': np.dtype('float64'),
    '0_sPeak': np.dtype('float64'),
    '0_qPeak': np.dtype('float64'),
    '0_qrs_interval': np.dtype('int64'),
    '0_pq_interval': np.dtype('int64'),
    '0_qt_interval': np.dtype('int64'),
    '0_st_interval': np.dtype('int64'),
    '0_qrs_morph0': np.dtype('float64'),
    '0_qrs_morph1': np.dtype('float64'),
    '0_qrs_morph2': np.dtype('float64'),
    '0_qrs_morph3': np.dtype('float64'),
    '0_qrs_morph4': np.dtype('float64'),
    '1_pre-RR': np.dtype('int64'),
    '1_post-RR': np.dtype('int64'),
    '1_pPeak': np.dtype('float64'),
    '1_tPeak': np.dtype('float64'),
    '1_rPeak': np.dtype('float64'),
    '1_sPeak': np.dtype('float64'),
    '1_qPeak': np.dtype('float64'),
    '1_qrs_interval': np.dtype('int64'),
    '1_pq_interval': np.dtype('int64'),
    '1_qt_interval': np.dtype('int64'),
    '1_st_interval': np.dtype('int64'),
    '1_qrs_morph0': np.dtype('float64'),
    '1_qrs_morph1': np.dtype('float64'),
    '1_qrs_morph2': np.dtype('float64'),
    '1_qrs_morph3': np.dtype('float64'),
    '1_qrs_morph4': np.dtype('float64'),
}
feature_dtypes

In [None]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import pickle

# instantiate
app = FastAPI()

# filename variable in case our pickle file changes
pkl_file = "local_model" # or whatever the pickle filename is

# load model from pickle file
# important to do it once here, outside the endpoint functions, so that
# requests don't have to wait for it to load
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that this project produced itself.
# NOTE(review): the path is relative, so it is resolved against the directory
# the server process is started from.
with open(f'heartbd/models/{pkl_file}.pkl','rb') as file:
    # the path specified is where the file is being loaded from
    # ('rb' is 'read binary')
    app.state.model = pickle.load(file)

# implementing FastApi middleware because it is the recommended best practice
# allow_credentials stays False: the FastAPI CORS documentation states that
# the "*" wildcards below cannot be combined with allow_credentials=True.
# List explicit origins/methods/headers first if cookies or auth headers are
# ever needed cross-origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"]
)

# root endpoint of the API ('/')
@app.get('/')
def root():
    """Return the fixed greeting payload served at '/'."""
    greeting = {'greeting': 'ground control to major tom...'}
    return greeting

# define our prediction endpoint; every feature arrives as a query parameter
@app.get('/predict')
def predict(_0preRR: int,
    _0postRR: int,
    _0pPeak: float,
    _0tPeak: float,
    _0rPeak: float,
    _0sPeak: float,
    _0qPeak: float,
    _0qrsinterval: int,
    _0pqinterval: int,
    _0qtinterval: int,
    _0stinterval: int,
    _0qrsmorph0: float,
    _0qrsmorph1: float,
    _0qrsmorph2: float,
    _0qrsmorph3: float,
    _0qrsmorph4: float,
    _1preRR: int,
    _1postRR: int,
    _1pPeak: float,
    _1tPeak: float,
    _1rPeak: float,
    _1sPeak: float,
    _1qPeak: float,
    _1qrsinterval: int,
    _1pqinterval: int,
    _1qtinterval: int,
    _1stinterval: int,
    _1qrsmorph0: float,
    _1qrsmorph1: float,
    _1qrsmorph2: float,
    _1qrsmorph3: float,
    _1qrsmorph4: float,
):
    '''
    our predict function in the API

    Takes the 32 features of one sample (16 prefixed "_0", 16 prefixed "_1")
    and returns the model's prediction for that sample.

    return: dictionary formatted as {"result": <float>}

    NOTE(review): preprocess() in this notebook fits a MinMaxScaler before
    training. If the pickled model was trained on scaled features, the raw
    values received here need the same scaling first - confirm what the
    pickle contains.
    '''
    model = app.state.model # call the model from the pickle
    assert model is not None # assuming the model exists in the first place

    # One sample, with the features in the same order as the csv columns.
    sample = [
        _0preRR,
        _0postRR,
        _0pPeak,
        _0tPeak,
        _0rPeak,
        _0sPeak,
        _0qPeak,
        _0qrsinterval,
        _0pqinterval,
        _0qtinterval,
        _0stinterval,
        _0qrsmorph0,
        _0qrsmorph1,
        _0qrsmorph2,
        _0qrsmorph3,
        _0qrsmorph4,
        _1preRR,
        _1postRR,
        _1pPeak,
        _1tPeak,
        _1rPeak,
        _1sPeak,
        _1qPeak,
        _1qrsinterval,
        _1pqinterval,
        _1qtinterval,
        _1stinterval,
        _1qrsmorph0,
        _1qrsmorph1,
        _1qrsmorph2,
        _1qrsmorph3,
        _1qrsmorph4,
    ]

    # predict() expects a single 2-D argument of shape (n_samples, n_features).
    # Passing the 32 values as separate positional arguments raised TypeError,
    # so the one sample is wrapped in an outer list instead.
    prediction = model.predict([sample])
    y_pred = float(prediction[0])
    return {"result": y_pred} # return a dictionary formatted as {result: <float>}