In [6]:
!pip install imblearn

Collecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Using cached imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Using cached imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.2 imblearn-0.0


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import os

In [10]:
# Load data
data = pd.read_csv('~/data/heart_disease_cleaned.csv')

# Applying transformations
data['sex'] = np.where(data['sex'] == 0, 0, 1)  # Ensuring binary encoding is correct
data['smoke'] = np.log1p(data['smoke'])
data['fbs'] = np.log1p(data['fbs'])
data['prop'] = np.log1p(data['prop'])
data['nitr'] = np.log1p(data['nitr'])
data['pro'] = np.log1p(data['pro'])
data['diuretic'] = np.log1p(data['diuretic'])
data['oldpeak'] = np.sqrt(data['oldpeak'])
data['cdc_smoke_rate'] = np.sqrt(data['cdc_smoke_rate'])

# High correlation interaction terms
data['age_trestbps'] = data['age'] * data['trestbps']
data['smoke_oldpeak'] = data['smoke'] * data['oldpeak']

# Split data into features and target
X = data.drop(['target'], axis=1)
y = data['target']

# Splitting the data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, stratify=y, random_state=42)

# Handling imbalanced data for the 'sex' variable
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Setup parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

# Initialize GridSearchCV
model = LogisticRegression(max_iter=10000)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best model
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.3f}")
print("Test Set Performance:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Best Model Parameters: {'C': 0.1, 'class_weight': None, 'solver': 'liblinear'}
Best CV score: 0.818
Test Set Performance:
Accuracy: 0.744
Confusion Matrix:
[[28 12]
 [11 39]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.70      0.71        40
           1       0.76      0.78      0.77        50

    accuracy                           0.74        90
   macro avg       0.74      0.74      0.74        90
weighted avg       0.74      0.74      0.74        90

