# Breast Cancer Detection with Logistic Regression


In [29]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [30]:
csv_path = Path("breast_cancer_bd.csv")
if not csv_path.exists():
    raise FileNotFoundError("breast_cancer_bd.csv not found in this folder.")

df = pd.read_csv(csv_path)
print("Using file:", csv_path)
print("Raw shape:", df.shape)
print(df.head())

Using file: breast_cancer_bd.csv
Raw shape: (699, 11)
   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  
0 

In [31]:
df = df.replace('?', np.nan)
non_info_cols = [c for c in df.columns if c.lower().startswith('unnamed')] + (["id"] if "id" in df.columns else [])
df = df.drop(columns=non_info_cols, errors="ignore")

if "diagnosis" in df.columns:
    y = df["diagnosis"].map({"M": 1, "B": 0})
    X = df.drop(columns=["diagnosis"])
elif "Class" in df.columns:
    y = df["Class"]
    X = df.drop(columns=["Class"])
else:
    y = df.iloc[:, -1]
    X = df.iloc[:, :-1]

X = X.apply(pd.to_numeric, errors='coerce')
clean = pd.concat([X, y], axis=1).dropna()
X = clean.iloc[:, :-1]
y = clean.iloc[:, -1]

print("After cleaning:", X.shape, y.shape)
print("Columns:", X.columns.tolist())

After cleaning: (683, 10) (683,)
Columns: ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (478, 10) Test shape: (205, 10)


In [33]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

print("Trained. Sample predictions:", y_pred[:10])

Trained. Sample predictions: [2 2 2 2 4 2 2 2 2 2]


In [34]:
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

Accuracy: 0.9561

Confusion Matrix:
 [[128   5]
 [  4  68]]

Classification Report:
               precision    recall  f1-score   support

           2       0.97      0.96      0.97       133
           4       0.93      0.94      0.94        72

    accuracy                           0.96       205
   macro avg       0.95      0.95      0.95       205
weighted avg       0.96      0.96      0.96       205

