In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer-wisconsin-data/data.csv


# Importing the necessary libraries
I'm just testing good old traditional machine learning models

In [6]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import xgboost as xgb

# Load Data

In [7]:
# Load data
data_path = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
data = pd.read_csv(data_path)

# Basic Data Quality Check

In [9]:
# Check for missing values
print(data.isnull().sum())

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

# Drop Unnecessary Columns

In [10]:
# Drop unnecessary columns
data = data.drop(['id', 'Unnamed: 32'], axis=1)

# Converting Labels To Number

In [12]:
# Convert diagnosis to 0 (B) and 1 (M)
data['diagnosis'] = data['diagnosis'].map({'B': 0, 'M': 1})

# Train Test Split

In [15]:
# Split data into train and test sets
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 10-fold Cross-Validation

In [16]:
clf = xgb.XGBClassifier()
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)

In [18]:
print("10-fold cross-validation scores: ", cross_val_scores)
print("Mean CV Accuracy: ", np.mean(cross_val_scores))

10-fold cross-validation scores:  [0.97826087 0.93478261 0.97826087 0.93478261 1.         0.97777778
 0.97777778 0.95555556 0.97777778 0.93333333]
Mean CV Accuracy:  0.9648309178743961


# A simple try at Hyperparameter Tuning

In [19]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1]
}

grid_clf = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
grid_clf.fit(X_train, y_train)

# Best Hyperparameter and Corresponding Accuracy

In [20]:
# Best parameters and corresponding accuracy
print("Best parameters: ", grid_clf.best_params_)
print("Best CV accuracy: ", grid_clf.best_score_)

Best parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best CV accuracy:  0.9779227053140097


# Evaluation of model with the best parameters on the test set

In [21]:
y_pred = grid_clf.predict(X_test)

# Metrics

In [23]:
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [accuracy, precision, recall, f1]
})

print(metrics_df)

      Metric     Value
0   Accuracy  0.964912
1  Precision  0.975610
2     Recall  0.930233
3   F1 Score  0.952381
