# Classification Algorithms with Breast Cancer Dataset
This notebook demonstrates several classification algorithms on the **Breast Cancer dataset** from `sklearn.datasets`. We'll train, evaluate, and compare the following models:
- Logistic Regression
- Decision Tree
- Random Forest
- SVM
- KNN

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the dataset bundled with scikit-learn and wrap it in pandas containers:
# X holds the feature matrix with columns named after data.feature_names,
# y holds the target vector.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

# Report the dimensions, then preview the first rows as the cell output.
print('Shape of X:', X.shape)
print('Shape of y:', y.shape)
X.head()

In [None]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which the models are iterated by the cells that loop over `models`.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=500)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier()),
])

# Display name -> test accuracy; populated by the training loop.
results = dict()

In [None]:
# Fit every candidate on the scaled training split, score it on the held-out
# split, record the accuracy, and print a per-class report under its name.
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    results[name] = acc = accuracy_score(y_test, y_pred)
    print(f"\n{name}:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Draw one confusion-matrix heatmap per fitted model, side by side in one row.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a `models` dict with a single entry yields a bare Axes object and the
# zip() below raises TypeError. Taking row 0 restores the 1-D `axes` array.
# The figure width scales with the number of panels (4 in. each, i.e. the
# original 20 in. for five models).
n_models = len(models)
fig, axes_grid = plt.subplots(1, n_models, figsize=(4 * n_models, 4), squeeze=False)
axes = axes_grid[0]

# Label ticks with the dataset's class names instead of bare 0/1 so each panel
# can be read on its own; data.target_names[i] is the name of class i.
class_names = list(data.target_names)

for ax, (name, model) in zip(axes, models.items()):
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the test accuracy recorded for each model in `results`.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=list(results), y=list(results.values()), ax=ax)
# Tilt the model names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison on Breast Cancer Dataset')
plt.show()

## Hyperparameter Tuning with GridSearchCV
We'll perform hyperparameter tuning for **Random Forest** and **SVM** to see if we can improve their performance.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random forest: every combination of these values is
# evaluated with 5-fold cross-validation on the training split.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# fit() returns the search object itself, so construction and fitting chain.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)

print('Best Parameters for Random Forest:', grid_rf.best_params_)
print('Best CV Score:', grid_rf.best_score_)

# Score the selected configuration on the held-out test split.
y_pred_rf = grid_rf.predict(X_test_scaled)
print('Test Accuracy:', accuracy_score(y_test, y_pred_rf))

In [None]:
# Search space for the SVM, split per kernel.
#
# `gamma` is the coefficient of the RBF kernel and is ignored when
# kernel='linear'. A single flat grid would therefore fit each linear
# candidate once per gamma value (identical models, wasted CV fits) and could
# report an arbitrary, meaningless gamma in best_params_ when the linear
# kernel wins. A list of grids searches the same C / kernel / gamma values
# without those duplicates; a winning linear model simply has no 'gamma' key.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# probability=True is kept so the refit best estimator still exposes
# predict_proba, matching the baseline SVC configuration.
grid_svm = GridSearchCV(SVC(probability=True, random_state=42), param_grid_svm, cv=5, scoring='accuracy')
grid_svm.fit(X_train_scaled, y_train)

print('Best Parameters for SVM:', grid_svm.best_params_)
print('Best CV Score:', grid_svm.best_score_)

# Score the selected configuration on the held-out test split.
y_pred_svm = grid_svm.predict(X_test_scaled)
print('Test Accuracy:', accuracy_score(y_test, y_pred_svm))