In [1]:
from typing import List, Tuple
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
FEATURE_COLUMNS: List["str"] = ["Age", "Cholesterol"]
TARGET_COLUMN: "str" = "HeartDisease"

In [3]:
# Load data

def load_data() -> pd.DataFrame:
    return pd.read_csv("./assets/heart.csv")

df_raw: pd.DataFrame = load_data()
df_raw.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [4]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:

    df = df.dropna()
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    features = FEATURE_COLUMNS + [TARGET_COLUMN]
    df = df[features]

    return df

df_cleaned: pd.DataFrame = clean_data(df_raw)
df_cleaned.head(2)

Unnamed: 0,Age,Cholesterol,HeartDisease
0,40,289,0
1,49,180,1


In [5]:
df_cleaned[TARGET_COLUMN]

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64

In [6]:
def split_data(df: pd.DataFrame):
    X_train, X_test, y_train, y_test = train_test_split(df[FEATURE_COLUMNS], df[TARGET_COLUMN], test_size=0.33, random_state=1)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_cleaned)
y_train

246    1
354    1
799    0
40     0
89     0
      ..
767    0
72     1
908    1
235    0
37     0
Name: HeartDisease, Length: 615, dtype: int64

In [7]:
from ngboost import NGBClassifier
import pickle as pkl
import joblib


def fit_model(X_train: pd.DataFrame, y_train: pd.DataFrame):

    classifier = NGBClassifier()
    classifier.fit(X_train, y_train)
    
    return classifier

def save_model(model: NGBClassifier) -> None:
    with open("ng_boost_model.pkl", "wb") as f:
        pkl.dump(model, f, protocol=pkl.HIGHEST_PROTOCOL)
    joblib.dump(model, "ng_boost_model.joblib", protocol=1, compress=0)

classifier = fit_model(X_train, y_train)
save_model(classifier)

[iter 0] loss=0.6913 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5288 val_loss=0.0000 scale=2.0000 norm=3.6419
[iter 200] loss=0.5013 val_loss=0.0000 scale=1.0000 norm=1.8405
[iter 300] loss=0.4948 val_loss=0.0000 scale=2.0000 norm=3.6981
[iter 400] loss=0.4883 val_loss=0.0000 scale=2.0000 norm=3.6973


In [8]:
def load_model() -> NGBClassifier:
    model = joblib.load("ng_boost_model.joblib")
    return model
    # with open("ng_boost_model.pkl", "rb") as f:
    #     return pkl.load(f)
    
loaded_model = load_model()

In [9]:
def predict(model: NGBClassifier, X_test: pd.DataFrame):
    return model.predict_proba(X_test)

predictions = predict(loaded_model, X_test)
predictions[:2]

array([[0.36334095, 0.63665905],
       [0.55905487, 0.44094513]])

In [11]:
def predict():
    pass

In [12]:
def test_load_data() -> None:
    interesting_columns: List["str"] = FEATURE_COLUMNS.copy()
    interesting_columns.append(TARGET_COLUMN)
    
    df_loaded = load_data()

    assert not df_loaded.empty, "The dataframe should not be empty"

    for column in interesting_columns:
        assert column in df_loaded.columns, f"The column {column} is not beeing loaded"
    
    assert df_loaded["Age"].dtype == "int64", "'Age' column type should be int64"
    assert df_loaded["Cholesterol"].dtype == "int64", "'Cholesterol' column type should be int64"
    assert df_loaded["HeartDisease"].dtype == "int64", "'HeartDisease' column type should be int64"

def test_clean_data() -> None:

    #TODO: create mocked data
    df_raw = load_data()
    df_clean = clean_data(df_raw)

    assert set(df_clean.columns) == set(FEATURE_COLUMNS + [TARGET_COLUMN]), f"Columns should be {FEATURE_COLUMNS + [TARGET_COLUMN]}"
    assert df_clean.shape[0] == df_raw.shape[0], f"Dataframe should have the same number of rows as the raw dataset"

def test_split_data() -> None:

    #TODO: create mocked data
    df_raw = load_data().copy()
    df_clean = clean_data(df_raw)

    X_train, X_test, y_train, y_test = split_data(df_clean)

    print(X_train.shape[0])
    assert X_train.shape[0] != 0, f"X_train should not be empty"
    assert X_test.shape[0] != 0, f"X_test should not be empty"
    assert y_train.shape[0] != 0, f"y_train should not be empty"
    assert y_test.shape[0] != 0, f"y_test should not be empty"

def test_fit_model() -> None:
    #TODO: create mocked data
    _df_raw = load_data()
    display(_df_raw)
    df_clean = clean_data(_df_raw)
    display(df_clean)

    X_train, X_test, y_train, y_test = split_data(df_clean)
    display(X_train.head(2))
    display(y_train.head(2))
    fit_model(X_train, y_train)

def run_test():
    test_load_data()
    test_clean_data()
    test_split_data()
    test_fit_model()


run_test()

615


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


Unnamed: 0,Age,Cholesterol,HeartDisease
0,40,289,0
1,49,180,1
2,37,283,0
3,48,214,1
4,54,195,0
...,...,...,...
913,45,264,1
914,68,193,1
915,57,131,1
916,57,236,1


Unnamed: 0,Age,Cholesterol
246,54,237
354,55,0


246    1
354    1
Name: HeartDisease, dtype: int64

[iter 0] loss=0.6913 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5288 val_loss=0.0000 scale=2.0000 norm=3.6419
[iter 200] loss=0.5013 val_loss=0.0000 scale=1.0000 norm=1.8405
[iter 300] loss=0.4948 val_loss=0.0000 scale=2.0000 norm=3.6981
[iter 400] loss=0.4883 val_loss=0.0000 scale=2.0000 norm=3.6973
