# Heart Disease Classification Project

This notebook implements a comprehensive analysis and classification of heart disease data using multiple machine learning approaches.

## Setup and Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Configure visualizations
# The matplotlib style sheet is applied first and the seaborn palette second.
# NOTE(review): presumably the palette is set last so the style sheet does not
# override it -- confirm before reordering these two calls.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
# IPython line magic: this line only runs inside an IPython/Jupyter kernel.
%matplotlib inline

## 1. Dataset Description

The Heart Disease dataset contains various medical attributes that can be used to predict the presence of heart disease. Here are the features:

1. **age**: Age in years
2. **sex**: Sex (1 = male, 0 = female)
3. **cp**: Chest pain type
   - Value 0: Typical angina
   - Value 1: Atypical angina
   - Value 2: Non-anginal pain
   - Value 3: Asymptomatic
4. **trestbps**: Resting blood pressure (in mm Hg)
5. **chol**: Serum cholesterol in mg/dl
6. **fbs**: Fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. **restecg**: Resting electrocardiographic results
   - Value 0: Normal
   - Value 1: Having ST-T wave abnormality
   - Value 2: Showing probable or definite left ventricular hypertrophy
8. **thalach**: Maximum heart rate achieved
9. **exang**: Exercise induced angina (1 = yes; 0 = no)
10. **oldpeak**: ST depression induced by exercise relative to rest
11. **slope**: The slope of the peak exercise ST segment
12. **ca**: Number of major vessels colored by fluoroscopy (0-3)
13. **thal**: Thalassemia
    - Value 1: Normal
    - Value 2: Fixed defect
    - Value 3: Reversible defect
14. **target**: Diagnosis of heart disease (0 = absence, 1 = presence)

In [7]:
# Load the dataset
DATA_PATH = 'heart.csv'
# Columns that later cells index by name; checked up front so a wrong file
# fails here with a clear message instead of further down the notebook.
REQUIRED_COLUMNS = ('age', 'target')

try:
    df = pd.read_csv(DATA_PATH)
except pd.errors.ParserError as exc:
    # A tokenizing error means the file is not a plain comma-separated table.
    # Re-raise the same exception type with a hint about what to check.
    raise pd.errors.ParserError(
        f"Could not parse {DATA_PATH!r} as CSV ({exc}). "
        "Check that the file is the raw comma-separated dataset with a single "
        "header row, not a saved web page or a file with a different delimiter."
    ) from exc

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{DATA_PATH!r} is missing required columns {missing_columns}; "
        f"found columns: {list(df.columns)}"
    )

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()

# Left as the final expression so the notebook renders the preview as a table
print("\nFirst few rows:")
df.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 42, saw 36


## 2. Data Preprocessing and Cleaning

Let's examine and clean our dataset:

In [None]:
# Per-column count of null entries
null_counts = df.isnull().sum()
print("Missing values:")
print(null_counts)

# Number of rows flagged by DataFrame.duplicated()
n_duplicate_rows = df.duplicated().sum()
print("\nNumber of duplicate rows:", n_duplicate_rows)

# Summary statistics; kept as the cell's last expression so it is displayed
print("\nBasic statistics:")
df.describe()

In [None]:
# Visualize distributions and relationships
# One row with two panels: the original 2x2 grid only filled its top row,
# so half of the figure was blank. Panel size is unchanged (15x5 for one row).
fig, (ax_target, ax_age) = plt.subplots(1, 2, figsize=(15, 5))

# Distribution of target variable
sns.countplot(data=df, x='target', ax=ax_target)
ax_target.set_title('Distribution of Heart Disease')

# Age distribution by target
sns.boxplot(data=df, x='target', y='age', ax=ax_age)
ax_age.set_title('Age Distribution by Heart Disease')

fig.tight_layout()
plt.show()

# Correlation heatmap (separate figure)
fig_corr, ax_corr = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', linewidths=0.5, ax=ax_corr)
ax_corr.set_title('Correlation Matrix')
fig_corr.tight_layout()
plt.show()

### Data Preprocessing Steps

In [None]:
# Feature matrix is every column except the label; the label is `target`
y = df['target']
X = df.drop('target', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_label, split_matrix in (("Training", X_train_scaled), ("Testing", X_test_scaled)):
    print(f"{split_label} set shape:", split_matrix.shape)

## 3. Model Implementation and Evaluation

We'll implement and evaluate multiple classification models:

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit a fresh copy of ``model`` and report its performance on the test split.

    The estimator passed in is cloned before fitting, so the caller's object is
    never refitted. This matters when the same tuned estimator is evaluated on
    more than one feature set: without the clone, the second evaluation would
    overwrite the fitted state (coefficients, feature importances, expected
    feature count) that other cells still read.

    Parameters
    ----------
    model : scikit-learn estimator
        Classifier to evaluate. Must be clonable with ``sklearn.base.clone``.
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    model_name : str
        Label used in the printed report and in the plot title.

    Returns
    -------
    tuple
        ``(fitted_model, accuracy)`` where ``fitted_model`` is the fitted clone
        and ``accuracy`` is the accuracy score on the test split.
    """
    # Train an unfitted copy with the same hyperparameters
    fitted_model = clone(model)
    fitted_model.fit(X_train, y_train)

    # Make predictions
    y_pred = fitted_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print results
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix on an explicit axes so the title and labels cannot
    # land on some other currently active figure
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

    return fitted_model, accuracy

### A. Logistic Regression

In [None]:
# Baseline logistic regression: default hyperparameters, fixed seed
lr_model, lr_accuracy = evaluate_model(
    model=LogisticRegression(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)

### B. Decision Tree

In [None]:
# Baseline decision tree: default hyperparameters, fixed seed
dt_model, dt_accuracy = evaluate_model(
    model=DecisionTreeClassifier(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Decision Tree",
)

### C. Random Forest

In [None]:
# Baseline random forest: default hyperparameters, fixed seed
rf_model, rf_accuracy = evaluate_model(
    model=RandomForestClassifier(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)

### D. SGD Classifier

In [None]:
# Baseline SGD classifier: default hyperparameters, fixed seed
sgd_model, sgd_accuracy = evaluate_model(
    model=SGDClassifier(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="SGD Classifier",
)

### E. Support Vector Machine

In [None]:
# Baseline support vector classifier: default hyperparameters, fixed seed
svm_model, svm_accuracy = evaluate_model(
    model=SVC(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Support Vector Machine",
)

## 4. Hyperparameter Tuning with GridSearchCV

In [None]:
def perform_grid_search(model, param_grid, X_train, y_train, model_name,
                        cv=5, scoring='accuracy', n_jobs=-1):
    """Run a cross-validated grid search and return the best estimator found.

    Parameters
    ----------
    model : scikit-learn estimator
        Unfitted estimator whose hyperparameters are searched.
    param_grid : dict or list of dict
        Passed straight to ``GridSearchCV``.
    X_train, y_train : array-like
        Training features and labels used for the search.
    model_name : str
        Label used in the printed summary.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Metric used to rank parameter combinations.
    n_jobs : int, default -1
        Parallelism setting forwarded to ``GridSearchCV``.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    # Create GridSearchCV object
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    print(f"\n{model_name} Grid Search Results:")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

### Logistic Regression Tuning

In [None]:
# Search space: regularisation strength, solver and iteration cap
lr_param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
    max_iter=[100, 200, 300],
)

lr_best = perform_grid_search(
    model=LogisticRegression(random_state=42),
    param_grid=lr_param_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    model_name="Logistic Regression",
)

# Report the tuned model on the held-out test split
evaluate_model(
    model=lr_best,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Tuned Logistic Regression",
)

### Decision Tree Tuning

In [None]:
# Search space: tree depth and the minimum sample counts for splits and leaves
dt_param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt_best = perform_grid_search(
    model=DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    model_name="Decision Tree",
)

# Report the tuned model on the held-out test split
evaluate_model(
    model=dt_best,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Tuned Decision Tree",
)

### Random Forest Tuning

In [None]:
# Search space: forest size, tree depth and the minimum sample counts
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

rf_best = perform_grid_search(
    model=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    model_name="Random Forest",
)

# Report the tuned model on the held-out test split
evaluate_model(
    model=rf_best,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Tuned Random Forest",
)

### SGD Classifier Tuning

In [None]:
# The 'constant' and 'adaptive' schedules use `eta0` as their initial learning
# rate. In scikit-learn releases where `eta0` defaults to 0.0 those fits are
# rejected, so two thirds of a single flat grid would only produce failed
# (NaN-scored) candidates. The grid is therefore split in two: 'optimal' runs
# without `eta0`, and the other schedules get explicit positive values.
sgd_losses = ['hinge', 'log_loss', 'modified_huber']
sgd_alphas = [0.0001, 0.001, 0.01]

sgd_param_grid = [
    {
        'loss': sgd_losses,
        'alpha': sgd_alphas,
        'learning_rate': ['optimal'],
    },
    {
        'loss': sgd_losses,
        'alpha': sgd_alphas,
        'learning_rate': ['constant', 'adaptive'],
        'eta0': [0.01, 0.1],
    },
]

sgd_best = perform_grid_search(SGDClassifier(random_state=42),
                              sgd_param_grid,
                              X_train_scaled,
                              y_train,
                              "SGD Classifier")

# Evaluate tuned model
evaluate_model(sgd_best, X_train_scaled, X_test_scaled, y_train, y_test, "Tuned SGD Classifier")

### SVM Tuning

In [None]:
# `gamma` only affects the RBF kernel, so it is searched for 'rbf' alone.
# Crossing it with the linear kernel would refit identical linear models once
# per gamma value and could report a meaningless `gamma` in the best params.
svm_c_values = [0.1, 1, 10]

svm_param_grid = [
    {
        'kernel': ['linear'],
        'C': svm_c_values,
    },
    {
        'kernel': ['rbf'],
        'C': svm_c_values,
        'gamma': ['scale', 'auto', 0.1, 1],
    },
]

svm_best = perform_grid_search(SVC(random_state=42),
                              svm_param_grid,
                              X_train_scaled,
                              y_train,
                              "Support Vector Machine")

# Evaluate tuned model
evaluate_model(svm_best, X_train_scaled, X_test_scaled, y_train, y_test, "Tuned SVM")

## 5. Feature Selection Analysis

In [None]:
# Get feature importance from Random Forest
rf_importances = rf_best.feature_importances_
if len(rf_importances) != X.shape[1]:
    # Happens when rf_best has since been refitted on a different feature set
    raise ValueError(
        f"rf_best was fitted on {len(rf_importances)} features but X has "
        f"{X.shape[1]}; re-run the Random Forest tuning cell before this one."
    )

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_importances
}).sort_values('importance', ascending=False)

# Plot feature importance
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance from Random Forest')
plt.show()

# Select the top-ranked features
N_TOP_FEATURES = 8
top_features = feature_importance['feature'].head(N_TOP_FEATURES).tolist()
X_selected = X[top_features]

# Reuse the existing train/test split rather than splitting again: picking
# columns from X_train / X_test guarantees the same rows as the full-feature
# experiments and leaves y_train / y_test untouched.
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

# A separate scaler object, so the full-feature `scaler` is not overwritten
selected_scaler = StandardScaler()
X_train_selected_scaled = selected_scaler.fit_transform(X_train_selected)
X_test_selected_scaled = selected_scaler.transform(X_test_selected)

print("\nSelected features:", top_features)

### Evaluate Models with Selected Features

In [None]:
# Evaluate all models with selected features
print("\nModel Performance with Selected Features:")

# Tuned estimators keyed by the label used in the report and plot titles
tuned_models = {
    "Logistic Regression": lr_best,
    "Decision Tree": dt_best,
    "Random Forest": rf_best,
    "SGD Classifier": sgd_best,
    "SVM": svm_best,
}

# Keep each accuracy so the models can be compared side by side afterwards
selected_feature_accuracy = {}
for tuned_name, tuned_model in tuned_models.items():
    _, selected_feature_accuracy[tuned_name] = evaluate_model(
        tuned_model, X_train_selected_scaled, X_test_selected_scaled,
        y_train, y_test, f"{tuned_name} (Selected Features)")

# Final expression: accuracy per model, best first
pd.Series(selected_feature_accuracy, name="accuracy").sort_values(ascending=False)

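## Conclusions

> **TODO:** The points below are an outline only. Fill them in with the measured accuracies and observations once the notebook runs end to end; the data-loading cell currently fails with a `ParserError`, so no model results have been produced yet.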

1. **Model Performance Comparison**:
   - Initial vs Tuned models
   - Full features vs Selected features
   
2. **Feature Importance**:
   - Most significant features for prediction
   - Impact of feature selection on model performance
   
3. **Best Performing Model**:
   - Which model performed best overall
   - Trade-offs between different models