In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Pull scikit-learn's bundled breast cancer dataset into pandas containers
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Baseline classifiers, keyed by the display name used in reports and tables.
# RandomForest and DecisionTree draw random numbers while fitting, so both are
# pinned to random_state=42 (the same seed the train/test split uses); without
# it their scores change on every "Restart Kernel -> Run All".
models = {
    "LogisticRegression": LogisticRegression(max_iter=10000),
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
}

# Wrap every classifier in a pipeline that standardizes the features first.
# The step names ('scaler', 'classifier') are what the hyper-parameter grids
# refer to through the 'classifier__<param>' prefix.
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model),
    ])

In [4]:
def evaluate_model(name, pipeline, X_test, y_test):
    """Print a labelled classification report for an already-fitted pipeline.

    Parameters
    ----------
    name : label printed in the "Model: ..." header line.
    pipeline : fitted object exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    None. The header and the report are written to stdout.
    """
    predictions = pipeline.predict(X_test)
    header = f"\nModel: {name}"
    print(header)
    report = classification_report(y_test, predictions)
    print(report)

# Train each baseline pipeline on the training split and print its report
# on the held-out split straight away.
for name, pipe in pipelines.items():
    # fit() hands back the fitted pipeline itself, so it can be passed on directly
    evaluate_model(name, pipe.fit(X_train, y_train), X_test, y_test)


Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Model: RandomForest
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114


Model: SVM
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg  

In [5]:
# Exhaustive search over a small random-forest grid (3 x 3 x 2 = 18 candidates).
# Keys use the '<step>__<param>' form to reach the pipeline's 'classifier' step.
param_grid_rf = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

# Every candidate is scored by 5-fold cross-validated F1 on the training split
grid_rf = GridSearchCV(
    estimator=pipelines['RandomForest'],
    param_grid=param_grid_rf,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Report the winning settings, then score the refit best pipeline on the test split
print("\n[GridSearchCV] Best RandomForest Params:", grid_rf.best_params_)
evaluate_model(
    "RandomForest (GridSearch)",
    grid_rf.best_estimator_,
    X_test,
    y_test,
)


[GridSearchCV] Best RandomForest Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}

Model: RandomForest (GridSearch)
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [6]:
# Candidate SVM settings: 10 log-spaced C values between 1e-2 and 1e2,
# three kernels and two gamma modes (60 combinations in total).
param_dist_svm = dict(
    classifier__C=np.logspace(start=-2, stop=2, num=10),
    classifier__kernel=['linear', 'rbf', 'poly'],
    classifier__gamma=['scale', 'auto'],
)

# Sample 10 of those combinations (seeded) and score each with 5-fold CV F1
random_svm = RandomizedSearchCV(
    estimator=pipelines['SVM'],
    param_distributions=param_dist_svm,
    n_iter=10,
    scoring='f1',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_svm.fit(X_train, y_train)

# Report the winning settings, then score the refit best pipeline on the test split
print("\n[RandomizedSearchCV] Best SVM Params:", random_svm.best_params_)
evaluate_model(
    "SVM (RandomizedSearch)",
    random_svm.best_estimator_,
    X_test,
    y_test,
)


[RandomizedSearchCV] Best SVM Params: {'classifier__kernel': 'linear', 'classifier__gamma': 'scale', 'classifier__C': np.float64(0.0774263682681127)}

Model: SVM (RandomizedSearch)
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [7]:
def get_metrics(model, X_test, y_test, pos_label=1):
    """Compute headline binary-classification metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels aligned with ``X_test``.
    pos_label : label treated as the positive class by precision, recall and
        F1. Defaults to 1, which is also scikit-learn's default, so existing
        three-argument calls behave exactly as before. Per the scikit-learn
        dataset documentation, ``load_breast_cancer`` encodes 0 as malignant
        and 1 as benign; pass ``pos_label=0`` to score malignant detection.

    Returns
    -------
    dict mapping "Accuracy", "Precision", "Recall" and "F1-Score" to the
    corresponding score. Accuracy does not depend on ``pos_label``.
    """
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, pos_label=pos_label),
        "Recall": recall_score(y_test, y_pred, pos_label=pos_label),
        "F1-Score": f1_score(y_test, y_pred, pos_label=pos_label),
    }

# Score the five baseline pipelines first, in the same order as before ...
results = {
    name: get_metrics(pipelines[name], X_test, y_test)
    for name in ("LogisticRegression", "RandomForest", "SVM", "KNN", "DecisionTree")
}
# ... then append the estimators picked by the two hyper-parameter searches.
results["RandomForest_Tuned"] = get_metrics(grid_rf.best_estimator_, X_test, y_test)
results["SVM_Tuned"] = get_metrics(random_svm.best_estimator_, X_test, y_test)

# One row per model, best F1 first; kept as the cell's last expression so the
# notebook renders it as a table.
pd.DataFrame(results).T.sort_values("F1-Score", ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
SVM,0.982456,0.972603,1.0,0.986111
SVM_Tuned,0.982456,0.972603,1.0,0.986111
LogisticRegression,0.973684,0.972222,0.985915,0.979021
RandomForest,0.964912,0.958904,0.985915,0.972222
RandomForest_Tuned,0.964912,0.958904,0.985915,0.972222
DecisionTree,0.947368,0.957746,0.957746,0.957746
KNN,0.947368,0.957746,0.957746,0.957746
