In [None]:
# Heart-disease classification on cleaveland.csv: Random Forest vs. XGBoost, compared by AUC.



In [None]:
# Colab-only step: prompt for a file upload. The recorded output shows the file
# being saved as "cleaveland.csv", which is the name the loading cell reads.
from google.colab import files
uploaded = files.upload()


Saving cleaveland.csv to cleaveland.csv


In [None]:
import numpy as np
import pandas as pd

# Column names for the header-less CSV; the last column is the label.
cols = ['age','sex','cp','trestbps','chol','fbs','restecg',
        'thalach','exang','oldpeak','slope','ca','thal','target']

df_raw = pd.read_csv("cleaveland.csv", header=None, names=cols)

# "?" marks a missing/undefined entry: turn it into NaN, then force every
# column to a numeric dtype. errors='coerce' replaces the deprecated
# errors='ignore'; any other unparsable token becomes NaN (and is counted and
# dropped below) instead of silently leaving a string column behind.
df_numeric = df_raw.replace('?', np.nan).apply(pd.to_numeric, errors='coerce')

# Count how many cells are missing after cleaning
num_missing = df_numeric.isna().sum().sum()
print(f"Total missing or undefined values replaced: {num_missing}")

# Drop rows with NaN (as per labsheet). .copy() gives later cells an
# independent frame to assign columns on; re-running this cell rebuilds df
# from df_raw, so the cell is idempotent.
df = df_numeric.dropna().copy()


Total missing or undefined values replaced: 6


In [None]:
# Binarize the label: 0 stays 0 (no disease), any value above 0 becomes 1 (disease present)
df['target'] = df['target'].gt(0).astype('int64')
print(df['target'].value_counts())


target
0    160
1    137
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Label vector and predictor matrix
y = df['target']
X = df.drop('target', axis=1)

# 80/20 hold-out split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
print(X_train.shape, X_test.shape)


(237, 13) (60, 13)


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Columns that are standardized vs. columns that are one-hot encoded
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Numeric branch: median imputation followed by standardization
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then dense one-hot encoding.
# handle_unknown='ignore' is set so categories unseen during fit do not raise at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
)

# Preprocessing and classifier chained into one estimator
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

rf_pipeline.fit(X_train, y_train)


In [None]:
from sklearn.base import clone
from xgboost import XGBClassifier

# Gradient-boosted trees with a fixed seed.
# `use_label_encoder` is a deprecated XGBoost parameter, so it is no longer
# passed; the labels here are already integer 0/1.
xgb_model = XGBClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, gamma=1,
    eval_metric='logloss', random_state=42
)

# Pipeline fits the step objects it is given, so reusing the very same
# `preprocessor` instance as rf_pipeline would make the two pipelines share
# (and refit) one transformer. clone() gives this pipeline its own unfitted
# copy with identical settings.
xgb_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', xgb_model)
])

xgb_pipeline.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score

def evaluate(name, model, X=None, y=None):
    """Score a fitted classifier on a hold-out set and print a short report.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : estimator
        Fitted object exposing ``predict`` and ``predict_proba``.
    X, y : optional
        Features and true labels to score against. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, so existing
        ``evaluate(name, model)`` calls behave exactly as before.

    Returns
    -------
    dict
        Metric values under the keys 'Accuracy', 'Precision', 'F1', 'AUC'.
    """
    # Fall back to the notebook's hold-out split when no data is passed in
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model.predict(X)
    # Column 1 of predict_proba is used as the positive-class score for AUC
    y_prob = model.predict_proba(X)[:, 1]

    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    auc = roc_auc_score(y, y_prob)

    print(f"\n{name} Results:")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"F1 Score : {f1:.4f}")
    print(f"AUC      : {auc:.4f}")
    return {'Accuracy': acc, 'Precision': prec, 'F1': f1, 'AUC': auc}

# Score both fitted pipelines; each call prints its report and returns the
# metric dict that the AUC comparison cell reads.
rf_results = evaluate("Random Forest", rf_pipeline)
xgb_results = evaluate("XGBoost", xgb_pipeline)



Random Forest Results:
Accuracy : 0.8333
Precision: 0.8462
F1 Score : 0.8148
AUC      : 0.9397

XGBoost Results:
Accuracy : 0.8833
Precision: 0.8889
F1 Score : 0.8727
AUC      : 0.9297


In [None]:
print("\nComparison based on AUC:")
print(f"Random Forest AUC: {rf_results['AUC']:.4f}")
print(f"XGBoost AUC      : {xgb_results['AUC']:.4f}")

# Two models can land on exactly the same AUC; report that as a tie rather
# than letting it fall through to "Random Forest performs better".
if xgb_results['AUC'] > rf_results['AUC']:
    print("XGBoost performs better based on AUC.")
elif xgb_results['AUC'] < rf_results['AUC']:
    print(" Random Forest performs better based on AUC.")
else:
    print("Both models achieve the same AUC.")



Comparison based on AUC:
Random Forest AUC: 0.9397
XGBoost AUC      : 0.9297
 Random Forest performs better based on AUC.
