In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets

# Load the breast cancer dataset
# This dataset is used for classification tasks, but can also be adapted for regression tasks.
data = datasets.load_breast_cancer()

df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['target'] = data['target']

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [29]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Dataset shape: {X.shape}")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Dataset shape: (569, 30)
Training set shape: (455, 30)
Testing set shape: (114, 30)


In [30]:
# Convert the data to contiguous arrays with specific data types

X_train = np.ascontiguousarray(X_train, dtype=np.float64)
y_train = np.ascontiguousarray(y_train, dtype=np.int64)
X_test = np.ascontiguousarray(X_test, dtype=np.float64)
y_test = np.ascontiguousarray(y_test, dtype=np.int64)

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to train and evaluate multiple models
def model_trainer(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, models: dict) -> pd.DataFrame:
    
    """
    Train and evaluate multiple classification models.
    
    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        X_test (pd.DataFrame): Testing features.
        y_test (pd.Series): Testing labels.
        models (dict): Dictionary of model names and their corresponding sklearn model instances.
    
    Returns:
        pd.DataFrame: DataFrame containing model names and their evaluation metrics.
    """
    
    results = {
        'Model': [],
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': []
    }
    
    for model_name, model in models.items():
        
        print(f"Training model: {model_name}")
        # Fit the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        results['Model'].append(model_name)
        results['Accuracy'].append(accuracy_score(y_test, y_pred))
        results['Precision'].append(precision_score(y_test, y_pred))
        results['Recall'].append(recall_score(y_test, y_pred))
        results['F1 Score'].append(f1_score(y_test, y_pred))
        
    return pd.DataFrame(results)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Define the models to be evaluated
# We defined the model with default hyperparameters for simplicity, but they can be tuned further.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier()
}

results_df = model_trainer(X_train, y_train, X_test, y_test, models)

Training model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training model: Decision Tree
Training model: Random Forest
Training model: Support Vector Machine
Training model: K-Nearest Neighbors
Training model: Naive Bayes
Training model: Gradient Boosting
Training model: XGBoost
Training model: LightGBM
[LightGBM] [Info] Number of positive: 286, number of negative: 169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628571 -> initscore=0.526093
[LightGBM] [Info] Start training from score 0.526093


In [33]:
results_df.sort_values(by='Accuracy', ascending=False, inplace=True)

results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
5,Naive Bayes,0.973684,0.959459,1.0,0.97931
2,Random Forest,0.964912,0.958904,0.985915,0.972222
8,LightGBM,0.964912,0.958904,0.985915,0.972222
0,Logistic Regression,0.95614,0.945946,0.985915,0.965517
4,K-Nearest Neighbors,0.95614,0.934211,1.0,0.965986
6,Gradient Boosting,0.95614,0.958333,0.971831,0.965035
7,XGBoost,0.95614,0.958333,0.971831,0.965035
3,Support Vector Machine,0.947368,0.922078,1.0,0.959459
1,Decision Tree,0.929825,0.956522,0.929577,0.942857
