# Breast Cancer Detection with Logistic Regression
Objective: load the dataset, clean it, train a logistic regression model, and evaluate it using accuracy, a confusion matrix, and a classification report.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

: 

In [None]:
# Locate the dataset. Candidates are tried in order and the first one that is
# an existing regular file wins:
#   1. the current working directory,
#   2. the original author's absolute Downloads path (kept for compatibility),
#   3. the current user's Downloads folder (portable form of #2),
#   4. a Downloads folder three directory levels up.
# Candidate 4 is assembled from path components rather than a string with
# hard-coded backslashes, so it resolves on every operating system instead of
# only on Windows.
candidate_paths = [
    Path("breast_cancer_bd.csv"),
    Path(r"C:\Users\asus zenbook\Downloads\breast_cancer_bd.csv"),
    Path.home() / "Downloads" / "breast_cancer_bd.csv",
    Path("..", "..", "..", "Downloads", "breast_cancer_bd.csv"),
]

# is_file() (rather than exists()) so that a directory which happens to carry
# the dataset's name is not handed to read_csv.
csv_path = next((p for p in candidate_paths if p.is_file()), None)
if csv_path is None:
    raise FileNotFoundError("breast_cancer_bd.csv not found; place it in this folder or update the path.")

# Load the raw table and show a quick preview for a sanity check.
df = pd.read_csv(csv_path)
print("Using file:", csv_path)
print("Raw shape:", df.shape)
print(df.head())

In [None]:
# Drop non-informative columns if present: any column whose name starts with
# "unnamed", plus an "id" column. Both checks are case-insensitive, and str()
# keeps the test safe for non-string column labels.
non_info_cols = [
    c for c in df.columns
    if str(c).lower().startswith("unnamed") or str(c).lower() == "id"
]
df = df.drop(columns=non_info_cols, errors="ignore")
print("After dropping non-informative cols:", df.shape)
print("Columns:", df.columns.tolist())


# Identify target column
if "diagnosis" in df.columns:
    # Encode M -> 1 and B -> 0. Values outside the mapping (for example labels
    # that are already numeric) fall back to their original value via fillna.
    y = df["diagnosis"].map({"M": 1, "B": 0}).fillna(df["diagnosis"])
    X = df.drop(columns=["diagnosis"])
elif "Class" in df.columns:
    y = df["Class"]
    X = df.drop(columns=["Class"])
else:
    # No recognised target name: treat the last column as the target.
    y = df.iloc[:, -1]
    X = df.iloc[:, :-1]


# Handle missing values
# The features are standardized later with StandardScaler, so they must be
# numeric. Placeholder strings such as "?" are not NaN and would survive
# dropna(), so non-numeric feature columns are coerced first: anything that
# cannot be parsed as a number becomes NaN and is removed together with the
# genuinely missing rows below.
non_numeric_cols = list(X.select_dtypes(exclude=["number", "bool"]).columns)
if non_numeric_cols:
    X = X.copy()  # X may be a slice of df; avoid writing through to it
    for col in non_numeric_cols:
        X[col] = pd.to_numeric(X[col], errors="coerce")

    # A column with no numeric content at all cannot be scaled; dropping it
    # avoids discarding every row in the dropna() step.
    unusable_cols = [c for c in non_numeric_cols if X[c].isna().all()]
    if unusable_cols:
        X = X.drop(columns=unusable_cols)
        print("Dropped non-numeric feature columns:", unusable_cols)

# Drop incomplete rows from features and target together so they stay aligned;
# the target is the last column of the concatenated frame.
df_clean = pd.concat([X, y], axis=1).dropna()
y = df_clean.iloc[:, -1]
X = df_clean.iloc[:, :-1]


print("After cleaning:", X.shape, y.shape)

In [None]:
# Hold out 30% of the rows for testing. The split is seeded for
# reproducibility and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
# Train the classifier on the standardized training features; the solver is
# allowed up to 1000 iterations.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)


# Predict labels for the held-out, standardized test features.
y_pred = log_reg.predict(X_test_scaled)


# Show the first ten predictions as a quick sanity check.
print("Model trained. Sample predictions:", y_pred[:10], sep="\n")

In [None]:
# Compare the test-set predictions against the true labels with three
# metrics: overall accuracy, the confusion matrix, and the text report
# produced by classification_report.
acc, cm, report = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)


# Report the results; accuracy is shown to four decimal places.
print(f"Accuracy: {acc:.4f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)