<a href="https://colab.research.google.com/github/anejalakshya2005-spec/CodeAlpha_DiseasePrediction/blob/main/CodeAlpha_DiseasePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd

# URL for the Heart Disease dataset from UCI ML Repository
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Column names based on the dataset description
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
    "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Load the dataset
df = pd.read_csv(url, names=column_names, na_values="?")

# Display the first 5 rows
display(df.head())

# Display information about the DataFrame
display(df.info())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


None

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

# 1. Address missing values
# Impute missing values in 'ca' and 'thal' columns with the most frequent value (mode)
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[['ca', 'thal']] = imputer.fit_transform(df[['ca', 'thal']])

# 2. Identify and encode categorical features
# Categorical features identified from the dataset description and exploration:
# sex, cp, fbs, restecg, exang, slope, ca, thal, target (although target is the label)
# For encoding features, we select: sex, cp, fbs, restecg, exang, slope, ca, thal
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Apply one-hot encoding to the identified categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(df[categorical_features])

# Create a DataFrame from encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# Drop the original categorical columns and concatenate the encoded features
df_processed = df.drop(columns=categorical_features)
df_processed = pd.concat([df_processed, encoded_df], axis=1)

# 3. Separate features (X) and target variable (y)
X = df_processed.drop('target', axis=1)
y = df_processed['target']

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets to verify the split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (242, 28)
Shape of X_test: (61, 28)
Shape of y_train: (242,)
Shape of y_test: (61,)


In [9]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Instantiate the models
svm_model = SVC()
lr_model = LogisticRegression(max_iter=1000) # Increased max_iter for convergence
rf_model = RandomForestClassifier()
xgb_model = XGBClassifier()

# Train the models
svm_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

print("Models trained successfully.")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Models trained successfully.


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create a dictionary to store the evaluation results
results = {}

# List of models to evaluate
models = {
    "SVM": svm_model,
    "Logistic Regression": lr_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model
}

# Evaluate each model
for model_name, model in models.items():
    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    }

# Print the evaluation results
print("Model Evaluation Results:")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Model Evaluation Results:

SVM:
  Accuracy: 0.4754
  Precision: 0.2260
  Recall: 0.4754
  F1-score: 0.3064

Logistic Regression:
  Accuracy: 0.4918
  Precision: 0.4563
  Recall: 0.4918
  F1-score: 0.4722

Random Forest:
  Accuracy: 0.4918
  Precision: 0.3762
  Recall: 0.4918
  F1-score: 0.4253

XGBoost:
  Accuracy: 0.4754
  Precision: 0.3873
  Recall: 0.4754
  F1-score: 0.4264


In [11]:
# Find the best performing model based on F1-score
best_f1_score = 0
best_model_name = None

for model_name, metrics in results.items():
    if metrics['F1-score'] > best_f1_score:
        best_f1_score = metrics['F1-score']
        best_model_name = model_name

# Get the best model object
best_model = models[best_model_name]

# Print the name of the best model
print(f"The best performing model based on F1-score is: {best_model_name}")

The best performing model based on F1-score is: Logistic Regression


In [12]:
# Use the best model to predict on the test data
y_pred_best_model = best_model.predict(X_test)

# Display the first few predictions
print("First 10 predictions from the best model:")
print(y_pred_best_model[:10])

First 10 predictions from the best model:
[0 1 2 1 1 3 2 3 0 1]
