In [16]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# 1. Loading and Preprocessing
# Load the breast cancer dataset and pull out the feature matrix and labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Explanation of the preprocessing step (one message per output line)
print(
    "\nStandardScaler is used to normalize the features, which is important because:",
    " \n1.It ensures all features are on the same scale",
    "\n2.It improves the performance of algorithms like SVM and Logistic Regression",
    "\n3.It speeds up the convergence of many machine learning algorithms",
    sep="\n",
)

# Accuracy per algorithm name; filled in by the model cells that follow
results = dict()



StandardScaler is used to normalize the features, which is important because:
 
1.It ensures all features are on the same scale

2.It improves the performance of algorithms like SVM and Logistic Regression

3.It speeds up the convergence of many machine learning algorithms


In [17]:
# 2. Classification Algorithm Implementation

# Logistic Regression: train on the scaled training split, predict on the
# scaled test split, and record the test accuracy under its display name.
print("\nLogistic Regression: A linear classifier that works well with numerical features")
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
results.update({'Logistic Regression': accuracy_score(y_test, lr_pred)})


Logistic Regression: A linear classifier that works well with numerical features


In [18]:
# Decision Tree: train on the scaled training split, predict on the scaled
# test split, and record the test accuracy under its display name.
print("\nDecision Tree: Creates a tree-like model of decisions, good for capturing non-linear patterns ")
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
dt_pred = dt_model.predict(X_test_scaled)
results.update({'Decision Tree': accuracy_score(y_test, dt_pred)})


Decision Tree: Creates a tree-like model of decisions, good for capturing non-linear patterns 


In [19]:
# Random Forest: train on the scaled training split, predict on the scaled
# test split, and record the test accuracy under its display name.
print("\nRandom Forest: An ensemble of decision trees, generally provides better accuracy than a single tree ")
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)
results.update({'Random Forest': accuracy_score(y_test, rf_pred)})


Random Forest: An ensemble of decision trees, generally provides better accuracy than a single tree 


In [20]:
# Support Vector Machine: train on the scaled training split, predict on the
# scaled test split, and record the test accuracy under its display name.
print("\nSVM: Creates a hyperplane to separate classes, works well with high-dimensional data")
svm_model = SVC(random_state=42).fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)
results.update({'SVM': accuracy_score(y_test, svm_pred)})


SVM: Creates a hyperplane to separate classes, works well with high-dimensional data


In [21]:
# k-Nearest Neighbors: train on the scaled training split, predict on the
# scaled test split, and record the test accuracy under its display name.
print("\nk-Nearest Neighbours: Classifications based on nearest neighbors, good for non-linear patterns ")
knn_model = KNeighborsClassifier().fit(X_train_scaled, y_train)
knn_pred = knn_model.predict(X_test_scaled)
results.update({'k-NN': accuracy_score(y_test, knn_pred)})


k-Nearest Neighbours: Classifications based on nearest neighbors, good for non-linear patterns 


In [22]:
# 3. Model Comparison

# Build one row per trained model and rank the rows from highest to lowest
# test accuracy.
# A stable sort (mergesort) is requested so that models with identical accuracy
# come out in a reproducible order; the default quicksort is not documented as
# stable, and the cell that reports the best/worst model reads the first and
# last rows of this frame.
results_df = pd.DataFrame({
    'Algorithm': list(results.keys()),
    'Accuracy': list(results.values())
}).sort_values('Accuracy', ascending=False, kind='mergesort')

# Summary of the dataset the models were trained and evaluated on
print("\nDataset Information:")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Feature names: {data.feature_names}")


Dataset Information:
Number of samples: 569
Number of features: 30
Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [23]:
# Show the ranked accuracy table under its heading (heading and table are
# emitted on separate lines by a single print call).
print("\nModel Comparison Results:", results_df, sep="\n")


Model Comparison Results:
             Algorithm  Accuracy
3                  SVM  0.982456
0  Logistic Regression  0.973684
2        Random Forest  0.964912
1        Decision Tree  0.947368
4                 k-NN  0.947368


In [24]:
# Report the top- and bottom-ranked models, then print a detailed
# classification report for the top-ranked one.
# results_df is sorted by accuracy in descending order, so the first row holds
# the best model and the last row holds the worst.
# NOTE: when several models share the same accuracy, only one of them is named.
best_model = results_df.iloc[0]['Algorithm']
worst_model = results_df.iloc[-1]['Algorithm']

# Explicit lookup from algorithm name to its stored test-set predictions.
# An unrecognised name raises instead of silently falling back to the k-NN
# predictions, which would print a report under the wrong model's name.
predictions_by_model = {
    'Logistic Regression': lr_pred,
    'Decision Tree': dt_pred,
    'Random Forest': rf_pred,
    'SVM': svm_pred,
    'k-NN': knn_pred,
}
if best_model not in predictions_by_model:
    raise KeyError(f"No stored predictions for model {best_model!r}")
best_pred = predictions_by_model[best_model]

print(f"\nThe best model is :{best_model}")
print(f"\nThe worst model is :{worst_model}")
print(f"\nDetailed Classification Report for {best_model}:")
print(classification_report(y_test, best_pred))


The best model is :SVM

The worst model is :k-NN

Detailed Classification Report for SVM:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

