In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# This code lets you upload the file from your computer
from google.colab import files
uploaded = files.upload()

# Load the data into a pandas DataFrame
# The original file doesn't have a header, so we add column names manually
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df = pd.read_csv('heart_disease.csv', header=None, names=column_names)

# Let's see the first 5 rows of our data
print(df.head())

Saving heart_disease.csv to heart_disease.csv
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  target  
0    3.0  0.0  6.0       0  
1    2.0  3.0  3.0       2  
2    2.0  2.0  7.0       1  
3    3.0  0.0  3.0       0  
4    1.0  0.0  3.0       0  


In [2]:
# Replace '?' with a standard missing value marker (NaN)
df.replace('?', np.nan, inplace=True)

# Drop rows with any missing values to keep it simple
df.dropna(inplace=True)

# Check the data types to make sure they are all numbers
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   sex       297 non-null    float64
 2   cp        297 non-null    float64
 3   trestbps  297 non-null    float64
 4   chol      297 non-null    float64
 5   fbs       297 non-null    float64
 6   restecg   297 non-null    float64
 7   thalach   297 non-null    float64
 8   exang     297 non-null    float64
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    float64
 11  ca        297 non-null    object 
 12  thal      297 non-null    object 
 13  target    297 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 34.8+ KB
None


In [3]:
# 'X' is all columns EXCEPT the target
X = df.drop('target', axis=1)

# 'y' is ONLY the target column
y = df['target']

In [4]:
# Split the data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using our training data
model.fit(X_train, y_train)

In [5]:
# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 60.00%


In [6]:
# 1. Import the new model
from sklearn.linear_model import LogisticRegression

# 2. Create an instance of the new model
# We add max_iter=1000 to give the model enough time to find the best solution
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)

# 3. Train the new model on the same training data
log_reg_model.fit(X_train, y_train)

# 4. Make predictions with the new model
log_reg_predictions = log_reg_model.predict(X_test)

# 5. Calculate the new accuracy
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)

print(f"Random Forest Model Accuracy: {accuracy * 100:.2f}%") # This is your old score
print(f"Logistic Regression Model Accuracy: {log_reg_accuracy * 100:.2f}%")

Random Forest Model Accuracy: 60.00%
Logistic Regression Model Accuracy: 63.33%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Import the scaler
from sklearn.preprocessing import StandardScaler

# Create a scaler object
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# IMPORTANT: Only transform the test data (using the rules learned from the training data)
X_test_scaled = scaler.transform(X_test)

# Now, create and train a new model on the SCALED data
scaled_model = LogisticRegression(random_state=42, max_iter=1000)
scaled_model.fit(X_train_scaled, y_train)

# Make predictions and check the accuracy
scaled_predictions = scaled_model.predict(X_test_scaled)
scaled_accuracy = accuracy_score(y_test, scaled_predictions)

print(f"Logistic Regression Accuracy (on scaled data): {scaled_accuracy * 100:.2f}%")

Logistic Regression Accuracy (on scaled data): 61.67%


In [8]:
from sklearn.model_selection import GridSearchCV

# 1. Define the settings (parameters) we want to test
param_grid = {
    'C': [0.1, 1, 10, 100],  # A parameter that controls the model's complexity
    'solver': ['liblinear', 'saga'] # Different optimization algorithms
}

# 2. Create the GridSearchCV object
# This will test every combination of our parameters using 5-fold cross-validation
# verbose=2 will print updates so you can see it working
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42, max_iter=2000),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1
)

# 3. Run the search on our scaled training data
grid_search.fit(X_train_scaled, y_train)

# 4. Print the best settings it found
print(f"Best parameters found: {grid_search.best_params_}")

# 5. Evaluate the best model on the test data
best_model = grid_search.best_estimator_
best_predictions = best_model.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test, best_predictions)

print(f"Accuracy with Tuned Model: {best_accuracy * 100:.2f}%")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found: {'C': 10, 'solver': 'liblinear'}
Accuracy with Tuned Model: 63.33%


In [9]:
# First, we may need to install XGBoost
!pip install xgboost

# --- Import all the necessary tools ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer # Our new tool for handling missing data
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier # Our new powerful model

# --- 1. Reload and do initial cleaning ---
# We do this again to make sure we're starting fresh before imputing
df.replace('?', np.nan, inplace=True)
# Convert columns to numeric, forcing errors will turn non-numbers into NaN
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# --- 2. Separate features and target BEFORE imputing ---
X = df.drop('target', axis=1)
y = df['target']

# We need to drop rows where the 'target' itself is missing
y.dropna(inplace=True)
X = X.loc[y.index] # Keep only the X rows that correspond to the remaining y values


# --- 3. Impute missing values ---
# We'll fill missing values ('NaN') with the most frequent value in that column
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)


# --- 4. Split the data ---
# We use the imputed data now
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)


# --- 5. Scale the data (still a best practice) ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- 6. Train the XGBoost Model ---
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)


# --- 7. Evaluate the final model ---
xgb_predictions = xgb_model.predict(X_test_scaled)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

print(f"XGBoost Model Accuracy: {xgb_accuracy * 100:.2f}%")

XGBoost Model Accuracy: 58.33%


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
