In [3]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load data into a dataframe
conn = sqlite3.connect('original_heart_database.db')
query = "SELECT * FROM cut_heart_disease;"
data = pd.read_sql_query(query, conn)
conn.close()

# Split data into training and test sets
X = data.drop(columns=['target'])
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Dictionary to store results
results = []

# Training and evaluating models
for name, clf in classifiers.items():
    # Model training
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Model evaluation using 5-fold cross-validation
    scores = cross_val_score(clf, X_train, y_train)

    # Collect results
    result = {
        "Model": name,
        "Test Accuracy": test_accuracy,
        "CV Scores": scores,
        "Mean CV Score": scores.mean(),
        "Standard Deviation CV Score": scores.std()
    }
    results.append(result)

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Determine the best model based on mean CV score
best_model = results_df.loc[results_df['Mean CV Score'].idxmax()]

# Adding explanation
explanation = f"The best model is {best_model['Model']} with a test set accuracy of {best_model['Test Accuracy']:.2f} and a mean cross-validation score of {best_model['Mean CV Score']:.2f}. This model was selected based on the highest mean cross-validation score, indicating better generalization to unseen data."

# Print results in a cleaner format
print("Model Performance:")
print(results_df.to_string(index=False))
print("\nBest Model:")
print(explanation)

# Save results and explanation to a file
output_file = "model_performance_results.txt"
with open(output_file, 'w') as f:
    f.write("Model Performance:\n")
    f.write(results_df.to_string(index=False))
    f.write("\n\nBest Model:\n")
    f.write(explanation)


# Display the file path for confirmation
output_file



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model Performance:
              Model  Test Accuracy                                                                                            CV Scores  Mean CV Score  Standard Deviation CV Score
Logistic Regression       0.844444  [0.7962962962962963, 0.7654320987654321, 0.808641975308642, 0.8148148148148148, 0.7639751552795031]       0.789832                     0.021371
      Decision Tree       0.722222 [0.7283950617283951, 0.6728395061728395, 0.6728395061728395, 0.6604938271604939, 0.6956521739130435]       0.686044                     0.024031
      Random Forest       0.811111 [0.7839506172839507, 0.7592592592592593, 0.8148148148148148, 0.7962962962962963, 0.8074534161490683]       0.792355                     0.019565
                SVM       0.711111  [0.7654320987654321, 0.6111111111111112, 0.6604938271604939, 0.691358024691358, 0.7204968944099379]       0.689778                     0.052375

Best Model:
The best model is Random Forest with a test set accuracy of 0.81 and

'model_performance_results.txt'