In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# raw_data = pd.concat([X, y], axis=1).head()
# raw_data

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=92, stratify=y)

In [4]:
# X_train.describe()

In [5]:
# sns.pairplot(data=raw_data, kind='scatter', diag_kind='kde', hue='target')

# raw_data.corr()

Now let us do some preparatory work to lay the foundation of training the model(s) and evaluating them. This is where we form the preprocessing pileline, which will be used to transform the data before training the model. We will also define the evaluation metrics that we will use to evaluate the model's performance.

Setup the estimators that we want to use for training the model. We will use a pipeline to chain the preprocessing steps and the model training step together. This will allow us to easily apply the same preprocessing steps to the test data when we evaluate the model.

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

classifiers = [
    ('SGD Classifier', SGDClassifier(), {'classifier__max_iter': [100, 500, 1000], 'classifier__tol': [1e-4, 1e-3, 1e-2], 'classifier__alpha': [0.0001, 0.001, 0.01]}),
    ('Logistic Regression', LogisticRegression(max_iter=1000), {'classifier__max_iter': [100, 500, 1000], 'classifier__C': [0.01, 0.1, 1, 10]}),
    ('Ridge Classifier', RidgeClassifier(), {'classifier__tol': [1e-4, 1e-3, 1e-2], 'classifier__alpha': [0.0001, 0.001, 0.01], 'classifier__max_iter': [100, 500, 1000]}),
    ('Random Forest Classifier', RandomForestClassifier(), {'classifier__n_estimators': [100, 500, 1000]}),
    ('Gradient Boosting Classifier', GradientBoostingClassifier(), {'classifier__n_estimators': [100, 500, 1000], 'classifier__learning_rate': [0.001, 0.01, 0.1, 1]})
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

grid_search_results = pd.DataFrame(columns=['Best Score', 'Test Score', 'Model'])

print("Starting grid search for classifiers...")

for name, clf, params in classifiers:
    
    # print(f"Training {name}...")
    
    pipeline = Pipeline(steps=[
        ('preprocessor', StandardScaler()),
        ('classifier', clf)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    score = grid_search.score(X_test, y_test)
    # print(f"{name} test score: {score:.4f}")
    # print(f"{name} best score: {grid_search.best_score_:.4f}")
    # print(f"{name} best parameters: {grid_search.best_params_}")

    grid_search_results.loc[name, 'Best Score'] = grid_search.best_score_
    grid_search_results.loc[name, 'Test Score'] = score

    print('\n\n')

Starting grid search for classifiers...

















In [12]:
grid_search_results.sort_values(by='Test Score', ascending=False, inplace=True)
grid_search_results.sort_values(by='Best Score', ascending=True, inplace=True)
grid_search_results

Unnamed: 0,Best Score,Test Score,Model
Gradient Boosting Classifier,0.947253,0.973684,GradientBoostingClassifier()
Random Forest Classifier,0.956044,0.964912,RandomForestClassifier()
Ridge Classifier,0.956044,0.947368,RidgeClassifier()
SGD Classifier,0.978022,0.973684,SGDClassifier()
Logistic Regression,0.98022,0.973684,LogisticRegression(max_iter=1000)


In [None]:
# import pickle

# with open('best_model_breast_cancer.pkl', 'wb') as f:
#     pickle.dump(grid_search.best_estimator_, f)
