In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset (replace 'data.csv' with your dataset file)
data = pd.read_csv('breast_cancer_wisconsin_diagnostic/wdbc.data')

# Extract features and target variable
X = data.iloc[:, 2:].values  # Assuming your features start from the third column
y = data.iloc[:, 1].values   # Assuming the diagnosis column is in the second column

# Encode the target variable (Malignant = 1, Benign = 0)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (optional, but often improves performance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)  # Cross-validation scores
    report = classification_report(y_test, y_pred, target_names=['Benign', 'Malignant'])

    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Cross-Validation Scores: {cross_val_scores}")
    print("Classification Report:\n", report)
    print("\n")

# You can also add more classifiers to the 'classifiers' dictionary and compare their performance.


Classifier: Logistic Regression
Accuracy: 0.96
Cross-Validation Scores: [0.98901099 0.96703297 0.98901099 0.95604396 1.        ]
Classification Report:
               precision    recall  f1-score   support

      Benign       0.96      0.99      0.97        68
   Malignant       0.98      0.93      0.96        46

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



Classifier: Support Vector Machine
Accuracy: 0.96
Cross-Validation Scores: [1.         0.96703297 0.95604396 0.94505495 0.97777778]
Classification Report:
               precision    recall  f1-score   support

      Benign       0.94      0.99      0.96        68
   Malignant       0.98      0.91      0.94        46

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



Classifier: Random Forest
Accurac