In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier  # For combining models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
df  = pd.read_csv('/content/drive/MyDrive/ML/heart.csv')

In [12]:
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(C=0.9, class_weight='balanced', max_iter=100000))
])

pipeline_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(C=1, probability=True, random_state=42))
])

ensemble_model = VotingClassifier(estimators=[
    ('lr', pipeline_lr),
    ('svm', pipeline_svc)
], voting='soft')

In [10]:
# Select features and target
X = df.drop('target', axis=1)
y = df['target']

   # Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

   # Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
lr_model = LogisticRegression(solver='lbfgs', max_iter=100000, C=0.9, penalty="l2", class_weight='balanced')

   # Initialize SVM model
svm_model = SVC(kernel='rbf', C=1, gamma='scale', probability=True, random_state=42) # Note: probability=True is important for VotingClassifier

   # Train individual models
lr_model.fit(X_train_scaled, y_train)
svm_model.fit(X_train_scaled, y_train)

In [15]:
# Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(estimators=[('lr', lr_model), ('svm', svm_model)], voting='soft')  # voting='soft' uses probabilities

   # Train the ensemble model
ensemble_model.fit(X_train_scaled, y_train)

In [18]:
# Create a VotingClassifier ensemble using the pipelines
ensemble_model = VotingClassifier(estimators=[('lr', pipeline_lr), ('svm', pipeline_svc)], voting='soft')  # voting='soft' uses probabilities

# Train the ensemble model
ensemble_model.fit(X_train_scaled, y_train)

from sklearn.model_selection import GridSearchCV

param_grid = {
    'lr__lr__C': [0.5, 0.9, 1.2],  # Nested parameter names: pipeline name -> model name -> parameter
    'svm__svc__C': [0.5, 1, 1.5]
}

grid_search = GridSearchCV(ensemble_model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)  # Now GridSearchCV will handle scaling internally

print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Params: {'lr__lr__C': 0.5, 'svm__svc__C': 1.5}
Best Score: 0.9654105502960679


In [19]:
# Make predictions on the test set
predictions = ensemble_model.predict(X_test_scaled)


from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

predictions = grid_search.predict(X_test)
print("Accuracy:", grid_search.score(X_test, y_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))
print("ROC-AUC Score:", roc_auc_score(y_test, grid_search.predict_proba(X_test)[:, 1]))


Accuracy: 0.9739047619047619
Confusion Matrix:
 [[92  8]
 [ 6 99]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       100
           1       0.93      0.94      0.93       105

    accuracy                           0.93       205
   macro avg       0.93      0.93      0.93       205
weighted avg       0.93      0.93      0.93       205

ROC-AUC Score: 0.9739047619047619


In [29]:
# Define the expected features (update these names if your dataset uses different ones)
expected_features = ['age', 'sex', 'cp', 'trestbps', 'chol',
                     'fbs', 'restecg', 'thalach', 'exang',
                     'oldpeak', 'slope', 'ca', 'thal']

def validate_input(df):
    # Check for missing columns
    missing_cols = set(expected_features) - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing columns in input data: {missing_cols}")

    # Domain Validations:
    # Age must be non-negative
    if (df['age'] < 0).any():
        raise ValueError("Invalid input: 'age' must be non-negative.")

    # Sex must be either 0 or 1
    if not df['sex'].isin([0, 1]).all():
        raise ValueError("Invalid input: 'sex' should only be 0 or 1.")

    # Chest Pain (cp) should be one of 0, 1, 2, or 3
    if not df['cp'].isin([0, 1, 2, 3]).all():
        raise ValueError("Invalid input: 'cp' (chest pain) must be one of [0, 1, 2, 3].")

    # Resting Blood Pressure (trestbps) must be > 0
    if (df['trestbps'] <= 0).any():
        raise ValueError("Invalid input: 'trestbps' (resting blood pressure) must be > 0.")

    # Cholesterol (chol) must be > 0
    if (df['chol'] <= 0).any():
        raise ValueError("Invalid input: 'chol' (cholesterol) must be > 0.")

    # Fasting Blood Sugar (fbs) must be 0 or 1
    if not df['fbs'].isin([0, 1]).all():
        raise ValueError("Invalid input: 'fbs' (fasting blood sugar) must be 0 or 1.")

    # Resting ECG (restecg) should be one of 0, 1, or 2
    if not df['restecg'].isin([0, 1, 2]).all():
        raise ValueError("Invalid input: 'restecg' must be one of [0, 1, 2].")

    # Maximum Heart Rate Achieved (thalach) must be > 0
    if (df['thalach'] <= 0).any():
        raise ValueError("Invalid input: 'thalach' (max heart rate) must be > 0.")

    # Exercise-induced Angina (exang) must be 0 or 1
    if not df['exang'].isin([0, 1]).all():
        raise ValueError("Invalid input: 'exang' must be 0 or 1.")

    # ST depression (oldpeak) must be non-negative
    if (df['oldpeak'] < 0).any():
        raise ValueError("Invalid input: 'oldpeak' must be non-negative.")

    # Slope should be one of [1, 2, 3] (adjust if your encoding is different)
    if not df['slope'].isin([1, 2, 3]).all():
        raise ValueError("Invalid input: 'slope' must be one of [1, 2, 3].")

    # Number of vessels colored by fluoroscopy (ca) should be one of 0, 1, 2, or 3
    if not df['ca'].isin([0, 1, 2, 3]).all():
        raise ValueError("Invalid input: 'ca' must be one of [0, 1, 2, 3].")

    # Thal should be one of [0, 1, 2] (adjust this based on your dataset's encoding)
    if not df['thal'].isin([0, 1, 2]).all():
        raise ValueError("Invalid input: 'thal' must be one of [0, 1, 2].")

    # If all validations pass, return True
    return True

# Example new data (try changing these values to test the validations!)
new_data = {
    'age': 58,          # Non-negative
    'sex': 1,           # Must be 0 or 1
    'cp': 2,            # Must be one of [0, 1, 2, 3]
    'trestbps': 140,    # Must be > 0
    'chol': 240,        # Must be > 0
    'fbs': 0,           # Must be 0 or 1
    'restecg': 1,       # Must be one of [0, 1, 2]
    'thalach': 150,     # Must be > 0
    'exang': 0,         # Must be 0 or 1
    'oldpeak': 1.5,     # Non-negative
    'slope': 2,         # Must be one of [1, 2, 3]
    'ca': 0,            # Must be one of [0, 1, 2, 3]
    'thal': 2           # Must be one of [0, 1, 2]
}

# Convert the dictionary to a DataFrame
new_df = pd.DataFrame([new_data])

# Validate input
try:
    validate_input(new_df)
except Exception as e:
    print("Input validation error:", e)
    # Optionally, handle the error (e.g., prompt for new input or exit)

# If valid, predict using the fitted model (assuming grid_search is your trained pipeline)
try:
    prediction = grid_search.predict(new_df)
    probability = grid_search.predict_proba(new_df)
    print("Prediction (0 = No Heart Disease, 1 = Heart Disease):", prediction[0])
    print("Probability Distribution:", probability[0])
except Exception as e:
    print("Error during prediction:", e)


Prediction (0 = No Heart Disease, 1 = Heart Disease): 1
Probability Distribution: [0.18624468 0.81375532]
