In [89]:
import numpy as np
import pandas as pd 
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.metrics import brier_score_loss
import warnings

warnings.filterwarnings("ignore")

In [90]:
trainData = pd.read_csv("./heart.csv") 
testData = pd.read_csv("./hdp.csv")
for i in range(trainData.shape[1]): #renames the columns of the second dataset to make combining easier
    testData.rename(columns={str(testData.columns[i]) : str(trainData.columns[i])}, inplace=True)
#change the presence abcense text to match 0 and 1 in trainData
testData["target"] = np.where(testData["target"] == "Absence", 0, 1)


In [91]:
#PRERPOCESSING
#delete duplicates
trainData = trainData.drop_duplicates()
testData = testData.drop_duplicates()
#split train
X = trainData.drop(columns=['target'], axis=1)  # Replace 'target' with your actual target column name
Y = trainData['target']
# Split the data into training and test sets 80-20
#dataSetWithoutTargetTrain, dataSetWithoutTargetTest, targetTrain, targetTest
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2, stratify=Y)
#standardizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [94]:
#train models
LRmodel = LogisticRegression(C = 0.1, class_weight= 'balanced', fit_intercept= True, l1_ratio= 0.1, max_iter=100, penalty='elasticnet', solver = 'saga')
LRmodel.fit(X_train, Y_train) #tries to find a pattern between all medical stats and whether or not the person has heart disease

KNNmodel = KNeighborsClassifier(algorithm='auto', leaf_size = 10, metric = 'manhattan',  n_neighbors=16, p =1, weights='uniform')
KNNmodel.fit(X_train, Y_train)

RFmodel = RandomForestClassifier(
    n_estimators = 75,  # Reduce number of trees
    min_samples_split = 30,  # Increase minimum samples for split
    min_samples_leaf = 10,   # Increase minimum samples for leaf
    max_features = 'sqrt',   # Limit features to sqrt of total
    max_depth = 15,  # Shallower trees
    class_weight = 'balanced', 
    bootstrap = True
)
RFmodel.fit(X_train, Y_train)

SVMmodel = SVC(C = 0.1, class_weight = 'balanced', degree = 2, gamma = 'scale', kernel='linear', max_iter = 200)
SVMmodel.fit(X_train, Y_train)

MLPmodel = MLPClassifier(learning_rate= 'adaptive', learning_rate_init=0.1, activation = 'logistic', alpha = 0.0001, batch_size = 32, early_stopping = False, hidden_layer_sizes = 100, max_iter = 500, solver= 'sgd')
MLPmodel.fit(X_train, Y_train)

votingClassifier = VotingClassifier(
    estimators=[('lr', LRmodel), ('knn', KNNmodel), ('rf', RFmodel), ('svm', SVMmodel), ('mlp', MLPmodel)],
    voting='soft'
)
votingClassifier.fit(X_train, Y_train)


Test Accuracy

In [41]:
trainingDatasetPrediction = LRmodel.predict(X_train)
trainingDataAccuracy = accuracy_score(trainingDatasetPrediction, Y_train)
print("Training Data Accuracy: " + str(trainingDataAccuracy))
testingDataPrediction = LRmodel.predict(X_test)
testingDataAccuracy = accuracy_score(testingDataPrediction, Y_test)
score_briar_LR = brier_score_loss(Y_test, testingDataPrediction)
print("Testing Data Accuracy", testingDataAccuracy)
print(classification_report(Y_test, testingDataPrediction))
print("Briar Score LR: ", score_briar_LR)

Training Data Accuracy: 0.8589211618257261
Testing Data Accuracy 0.7868852459016393
              precision    recall  f1-score   support

           0       0.83      0.68      0.75        28
           1       0.76      0.88      0.82        33

    accuracy                           0.79        61
   macro avg       0.79      0.78      0.78        61
weighted avg       0.79      0.79      0.78        61

Briar Score LR:  0.21311475409836064


In [42]:
knnTrainPred = KNNmodel.predict(X_train)
knnTrainAcc = accuracy_score(knnTrainPred, Y_train)
print("Training Data Accuracy", knnTrainAcc)

knnTestPred = KNNmodel.predict(X_test)
knnTestAcc = accuracy_score(knnTestPred, Y_test)
score_briar_KNN = brier_score_loss(Y_test, knnTestPred)


print("Testing Data Accuracy", knnTestAcc)
print(classification_report(Y_test, knnTestPred))
print("Briar Score KNN:", score_briar_KNN)

Training Data Accuracy 0.8423236514522822
Testing Data Accuracy 0.8032786885245902
              precision    recall  f1-score   support

           0       0.86      0.68      0.76        28
           1       0.77      0.91      0.83        33

    accuracy                           0.80        61
   macro avg       0.82      0.79      0.80        61
weighted avg       0.81      0.80      0.80        61

Briar Score KNN: 0.19672131147540983


In [43]:
rfModelTrainPred = RFmodel.predict(X_train)
trainAccuracy = accuracy_score(rfModelTrainPred, Y_train)
print("Training Data Accuracy: ", trainAccuracy)

rfModelTestPred = RFmodel.predict(X_test)
testAccuracy = accuracy_score(rfModelTestPred, Y_test)
score_briar_RF = brier_score_loss(Y_test, rfModelTestPred)

print("Testing Data Accuracy: ", testAccuracy)
print(classification_report(Y_test, rfModelTestPred))
print("Brier Score RF: ", score_briar_RF)

Training Data Accuracy:  0.8838174273858921
Testing Data Accuracy:  0.8360655737704918
              precision    recall  f1-score   support

           0       0.88      0.75      0.81        28
           1       0.81      0.91      0.86        33

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.83        61

Brier Score RF:  0.16393442622950818


In [44]:
svmModelTrainPred = SVMmodel.predict(X_train)
svmTrainAccuracy = accuracy_score(svmModelTrainPred, Y_train)
print("Training Data Accuracy: ", svmTrainAccuracy)

svmModelTestPred = SVMmodel.predict(X_test)
svmTestAccuracy = accuracy_score(svmModelTestPred, Y_test)
brier_score_SVM = brier_score_loss(Y_test, svmModelTestPred)

print("Training Data Accuracy: ", svmTestAccuracy)

print(classification_report(Y_test, svmModelTestPred))
print("Brier Score SVM: ", brier_score_SVM)



Training Data Accuracy:  0.8796680497925311
Training Data Accuracy:  0.819672131147541
              precision    recall  f1-score   support

           0       0.95      0.64      0.77        28
           1       0.76      0.97      0.85        33

    accuracy                           0.82        61
   macro avg       0.85      0.81      0.81        61
weighted avg       0.85      0.82      0.81        61

Brier Score SVM:  0.18032786885245902


In [45]:
#MLP
MLPtrainingDatasetPrediction = MLPmodel.predict(X_train)
MLPtrainingDataAccuracy = accuracy_score(MLPtrainingDatasetPrediction, Y_train)
print("Training Data Accuracy: " + str(MLPtrainingDataAccuracy))

MLPtestingDataPrediction = MLPmodel.predict(X_test)
MLPtestingDataAccuracy = accuracy_score(MLPtestingDataPrediction, Y_test)
brier_score_MLP = brier_score_loss(Y_test, MLPtestingDataPrediction)

print("Testing Data Accuracy", MLPtestingDataAccuracy)
print(classification_report(Y_test, MLPtestingDataPrediction))
print("Brier Score SVM: ", brier_score_MLP)

Training Data Accuracy: 0.8506224066390041
Testing Data Accuracy 0.7868852459016393
              precision    recall  f1-score   support

           0       0.83      0.68      0.75        28
           1       0.76      0.88      0.82        33

    accuracy                           0.79        61
   macro avg       0.79      0.78      0.78        61
weighted avg       0.79      0.79      0.78        61

Brier Score SVM:  0.21311475409836064


In [95]:
#Voting Classifier
VCtrainingDatasetPrediction = votingClassifier.predict(X_train)
VCtrainingDataAccuracy = accuracy_score(VCtrainingDatasetPrediction, Y_train)
print("Training Data Accuracy: " + str(VCtrainingDataAccuracy))

VCtestingDataPrediction = votingClassifier.predict(X_test)
VCtestingDataAccuracy = accuracy_score(VCtestingDataPrediction, Y_test)
brier_score_VC = brier_score_loss(Y_test, VCtestingDataPrediction)

print("Testing Data Accuracy", VCtestingDataAccuracy)
print(classification_report(Y_test, VCtestingDataPrediction))
print("Brier Score VC: ", brier_score_VC)


AttributeError: This 'SVC' has no attribute 'predict_proba'