In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the breast-cancer dataset as one DataFrame; the label lives in the
# 'target' column alongside the feature columns.
data = load_breast_cancer(as_frame=True)
df = data.frame
print(df.head())

# Report per-column missing-value counts, then enforce the assumption.
# The completion message below states there are no missing values, so fail
# loudly here if that is ever untrue instead of printing a false claim.
missing_counts = df.isnull().sum()
print(missing_counts)
assert missing_counts.sum() == 0, "Dataset contains missing values; handle them before splitting/scaling."

# Separate features and target
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE: the split is not stratified (no `stratify=` argument is passed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit the scaler on the training rows only, then apply the
# same fitted transform to the test rows so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Preprocessing completed: No missing values, data split, and scaling applied.")


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build the logistic-regression classifier (iteration cap set to 10000) and
# fit it on the standardized training features in one step.
lr = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

# Predict on the held-out split and measure the fraction of correct labels.
y_pred_lr = lr.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")

# Logistic Regression is a linear model used for binary classification.
# It is a good fit here because it is simple and efficient, and it performs
# well when the decision boundary is approximately linear.


Logistic Regression Accuracy: 0.9737


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Build a decision tree with a fixed seed (reproducible tie-breaking) and fit
# it on the standardized training features.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict on the held-out split and measure the fraction of correct labels.
y_pred_dt = dt.predict(X_test_scaled)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")


Decision Tree Accuracy: 0.9474


In [10]:
# Decision Trees are non-linear models that recursively split the data into subsets based on feature values.
# They can capture complex relationships but tend to overfit unless properly tuned (e.g., by limiting tree depth).

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest with a fixed seed for reproducibility and fit it on
# the standardized training features.
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict on the held-out split and measure the fraction of correct labels.
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")


Random Forest Accuracy: 0.9649


In [14]:
# Random Forests are ensembles of Decision Trees that reduce overfitting by averaging the predictions of many trees.
# They are robust and often perform well on a wide variety of datasets.

In [16]:
from sklearn.svm import SVC

# Build a support vector classifier with library-default settings and fit it
# on the standardized training features.
svm = SVC().fit(X_train_scaled, y_train)

# Predict on the held-out split and measure the fraction of correct labels.
y_pred_svm = svm.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")
# SVMs are powerful classifiers that find the hyperplane maximizing the margin between classes.
# They are effective in high-dimensional spaces, including cases where the number of
# dimensions exceeds the number of samples.

SVM Accuracy: 0.9825


In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier with library-default settings and
# fit it on the standardized training features.
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Predict on the held-out split and measure the fraction of correct labels.
y_pred_knn = knn.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(f"k-NN Accuracy: {accuracy_knn:.4f}")
# k-NN is a non-parametric method that classifies each data point by the majority class of its neighbors.
# It is simple and effective but can become computationally expensive as the dataset grows.



k-NN Accuracy: 0.9474


In [20]:
# Gather model labels and their test-set accuracies in matching order.
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'k-NN']
accuracies = [accuracy_lr, accuracy_dt, accuracy_rf, accuracy_svm, accuracy_knn]

# Build the comparison table and rank it from most to least accurate in one chain.
comparison_df = (
    pd.DataFrame({'Model': models, 'Accuracy': accuracies})
    .sort_values(by='Accuracy', ascending=False)
)

# Show the ranked table.
print("\nModel Comparison:")
print(comparison_df)





Model Comparison:
                 Model  Accuracy
3                  SVM  0.982456
0  Logistic Regression  0.973684
2        Random Forest  0.964912
1        Decision Tree  0.947368
4                 k-NN  0.947368


In [22]:
# Best Performing Model: SVM achieved the highest accuracy (0.9825), followed by Logistic Regression (0.9737) and Random Forest (0.9649).


# Worst Performing Models: k-NN and the Decision Tree tied for the lowest accuracy (0.9474).
# While k-NN is simple, it can struggle with high-dimensional data and requires careful tuning of the number of neighbors.