In [None]:
!pip install medmnist

# STEP 1: Data Exploration 

In [None]:
from medmnist import BreastMNIST
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Fetch the train and test splits (downloaded on first use).
train_ds = BreastMNIST(split='train', download=True)
test_ds = BreastMNIST(split='test', download=True)

# One row per image (all pixels flattened into features) and a 1-D label vector.
X_train_full, y_train_full = (
    train_ds.imgs.reshape((len(train_ds), -1)),
    train_ds.labels.flatten(),
)
X_test, y_test = (
    test_ds.imgs.reshape((len(test_ds), -1)),
    test_ds.labels.flatten(),
)


## Train/Validation Split

In [None]:
# Hold out 20% of the training rows for validation. Stratifying on the labels
# keeps the class proportions the same in both parts; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=0.2,
    random_state=42,
)

## Normalization 

In [None]:
# Learn the per-feature mean/std on the training rows only, then apply the same
# transformation to every split so no validation/test statistics leak in.
scaler = StandardScaler().fit(X_train)

X_train, X_val, X_test = [
    scaler.transform(split_features)
    for split_features in (X_train, X_val, X_test)
]

In [None]:
# Report feature-matrix and label-vector shapes for each split.
for split_label, split_features, split_targets in (
    ("Train:", X_train, y_train),
    ("Validation:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    print(split_label, split_features.shape, split_targets.shape)

## Class distribution

In [None]:
import matplotlib.pyplot as plt

# BreastMNIST encodes label 0 as malignant and label 1 as normal/benign
# (see the label mapping in the MedMNIST dataset metadata).
# Count each class explicitly and draw bars centred on the label values;
# a 2-bin histogram over [0, 1] would centre its bars on 0.25 and 0.75 instead.
class_counts = np.bincount(y_train.astype(int), minlength=2)

plt.bar([0, 1], class_counts, edgecolor="black")
plt.xticks([0, 1])
plt.title("Class distribution in training set")
plt.xlabel("Label: 0=malignant, 1=benign (incl. normal)")
plt.ylabel("Count")
plt.show()

print("Malignant (label 0):", class_counts[0])
print("Benign incl. normal (label 1):", class_counts[1])

In [None]:
# NOTE(review): disabled sample-image preview, kept for reference only.
# At this point in the notebook X_train holds flattened, standardized rows, so
# plt.imshow(X_train[idx]) would need each row reshaped to 28x28 (the shape used
# by the reshape calls further down) before this cell could be re-enabled.
# import matplotlib.pyplot as plt
# import random

# plt.figure(figsize=(10, 4))
# for i in range(12):
#     idx = random.randint(0, len(X_train)-1)
#     plt.subplot(2, 6, i+1)
#     plt.imshow(X_train[idx], cmap="gray")
#     plt.title(f"Label: {y_train[idx]}")
#     plt.axis("off")
# plt.show()


## Pixel intensity statistics

This shows overall brightness and contrast differences.

In [None]:
# X_train was standardized per feature above, so its overall mean and std are
# ~0 and ~1 by construction and say nothing about brightness or contrast.
# Undo the scaling to report the statistics in original pixel-intensity units.
X_train_pixels = scaler.inverse_transform(X_train)

print("Min pixel:", X_train_pixels.min())
print("Max pixel:", X_train_pixels.max())
print("Mean pixel:", X_train_pixels.mean())
print("Std pixel:", X_train_pixels.std())


In [None]:
# Plot the distribution in original pixel-intensity units: X_train itself holds
# standardized values (z-scores), which would not match the "Pixel value" axis.
plt.hist(scaler.inverse_transform(X_train).ravel(), bins=50)
plt.title("Pixel intensity distribution")
plt.xlabel("Pixel value")
plt.ylabel("Count")
plt.show()


## Compare class-wise intensity differences

Sometimes malignant vs benign images differ subtly:

In [None]:
# BreastMNIST encodes label 0 as malignant and label 1 as normal/benign
# (see the label mapping in the MedMNIST dataset metadata), so select the
# rows accordingly. Later cells reuse these two arrays for per-class plots.
malignant = X_train[y_train == 0]
benign = X_train[y_train == 1]

# Mean over all pixels of all images in each class (standardized units).
print("Benign mean:", benign.mean())
print("Malignant mean:", malignant.mean())

In [None]:
# Overlay one semi-transparent histogram per class; density=True puts classes of
# different sizes on a common vertical scale.
plt.figure(figsize=(8, 5))

for class_pixels, class_label in ((benign, "Benign"), (malignant, "Malignant")):
    plt.hist(class_pixels.ravel(), bins=50, alpha=0.5, label=class_label, density=True)

plt.title("Normalized Pixel Intensity Distribution by Class")
plt.xlabel("Pixel Value")
plt.ylabel("Probability Density")
plt.legend()
plt.show()

In [None]:
import seaborn as sns

# Kernel density estimate of all pixel values, one curve per class.
plt.figure(figsize=(8, 5))
for class_pixels, class_label in ((benign, "Benign"), (malignant, "Malignant")):
    sns.kdeplot(class_pixels.ravel(), label=class_label)
plt.legend()
plt.title("KDE Pixel Intensity Distribution")
plt.show()

## Check image similarity using PCA (2D visualization)

Flatten images → reduce to 2 principal components → scatter plot.

In [None]:
from sklearn.decomposition import PCA

# Make sure every sample is a single feature row before projecting.
X_flat = X_train.reshape(len(X_train), -1)

# Project the training samples onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_flat)

plt.figure(figsize=(7, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_train, cmap="coolwarm", s=2)
plt.title("PCA of BreastMNIST")
plt.xlabel("PC1")
plt.ylabel("PC2")
# BreastMNIST label mapping per the MedMNIST metadata: 0 = malignant, 1 = normal/benign.
plt.colorbar(label="Label (0=malignant, 1=benign incl. normal)")
plt.show()

## Average images per class

This is extremely insightful for medical data.

In [None]:
# Per-pixel mean over all training images of each class.
avg_benign, avg_malignant = benign.mean(axis=0), malignant.mean(axis=0)

# Show both mean images side by side as 28x28 grayscale pictures.
plt.figure(figsize=(8, 4))
for panel_index, (mean_image, panel_title) in enumerate(
    ((avg_benign, "Average Benign"), (avg_malignant, "Average Malignant")),
    start=1,
):
    plt.subplot(1, 2, panel_index)
    plt.imshow(mean_image.reshape((28, 28)), cmap="gray")
    plt.title(panel_title)

plt.show()

### Difference Map

In [None]:
# Signed per-pixel difference of the class means. The title and the diverging
# "seismic" colormap both imply a signed quantity, so the sign is kept (no abs)
# and the colour scale is made symmetric so that zero maps to the neutral colour.
diff_map = (avg_malignant - avg_benign).reshape((28, 28))
diff_limit = np.abs(diff_map).max()

plt.imshow(diff_map, cmap="seismic", vmin=-diff_limit, vmax=diff_limit)
plt.title("Difference Map (Malignant - Benign)")
plt.colorbar()
plt.show()

In [None]:
import numpy as np
from scipy.stats import sem

# Means
# Per-pixel mean and standard error of the mean for each class.
mean_b, sem_b = benign.mean(axis=0), sem(benign, axis=0)
mean_m, sem_m = malignant.mean(axis=0), sem(malignant, axis=0)

# 95% confidence bounds (normal approximation): mean -/+ 1.96 standard errors.
ci_low_b, ci_high_b = mean_b - 1.96 * sem_b, mean_b + 1.96 * sem_b
ci_low_m, ci_high_m = mean_m - 1.96 * sem_m, mean_m + 1.96 * sem_m


In [None]:
# Width of the per-pixel confidence interval, laid out as 28x28 images.
ci_width_b = (ci_high_b - ci_low_b).reshape(28, 28)
ci_width_m = (ci_high_m - ci_low_m).reshape(28, 28)

# One heat map per class, each with its own colour bar.
plt.figure(figsize=(10, 4))
for panel_index, (width_map, panel_title) in enumerate(
    ((ci_width_b, "CI Width (Benign)"), (ci_width_m, "CI Width (Malignant)")),
    start=1,
):
    plt.subplot(1, 2, panel_index)
    plt.imshow(width_map, cmap="magma")
    plt.title(panel_title)
    plt.colorbar()

plt.show()


# STEP 2: Model Training

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Quick baseline: fit on the training split and report test accuracy.
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)

print("Accuracy:", accuracy_score(y_test, logreg.predict(X_test)))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# --- Training Time messen ---
# --- Fit the model and measure how long construction + fitting takes ---
start = time.time()
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)
train_time = time.time() - start

# --- Predict on the test split ---
y_pred = logreg.predict(X_test)

# --- Collect the metrics in a dictionary ---
logreg_results = dict(
    accuracy=accuracy_score(y_test, y_pred),
    precision=precision_score(y_test, y_pred),
    recall=recall_score(y_test, y_pred),
    f1_score=f1_score(y_test, y_pred),
    train_time_sec=train_time,
)

print(logreg_results)


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# --- Training Time messen ---
# --- Fit the model and measure how long construction + fitting takes ---
start = time.time()

# random_state pins the bootstrap samples and feature sub-sampling, so the
# reported metrics (and the tree plotted below) are the same on every run.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

end = time.time()
train_time = end - start

# --- Predict on the test split ---
y_pred = rf.predict(X_test)

# --- Collect the metrics in a dictionary ---
rf_results = {
    "accuracy":  accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall":    recall_score(y_test, y_pred),
    "f1_score":  f1_score(y_test, y_pred),
    "train_time_sec": train_time
}

print(rf_results)


In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Nimm z.B. den ersten Entscheidungsbaum aus dem Random Forest
# Visualise one member of the ensemble, e.g. the first decision tree.
estimator = rf.estimators_[0]

plt.figure(figsize=(20, 10))
plot_tree(estimator,
          filled=True,
          feature_names=[f"pixel_{i}" for i in range(X_train.shape[1])],
          # class_names are matched to the classes in ascending order, and
          # BreastMNIST encodes 0 = malignant, 1 = normal/benign.
          class_names=["Malignant", "Benign"],
          max_depth=5,     # only draw the top levels to keep the figure readable
          fontsize=6)
plt.show()


## XGBoost

In [None]:
# NOTE(review): disabled hyper-parameter grid search for an XGBoost classifier.
# `xgb` is not imported anywhere in the visible notebook, so re-enabling this
# cell also needs an import (presumably `import xgboost as xgb` — confirm).
# from sklearn.model_selection import GridSearchCV

# params = {
#     "max_depth": [3, 4, 5],
#     "learning_rate": [0.01, 0.05, 0.1],
#     "n_estimators": [200, 400],
#     "subsample": [0.7, 0.9],
# }

# grid = GridSearchCV(
#     xgb.XGBClassifier(eval_metric="logloss"),
#     param_grid=params,
#     cv=3,
#     scoring="accuracy",
#     n_jobs=-1
# )

# grid.fit(X_train, y_train)

# print("Beste Parameter:", grid.best_params_)
# print("Beste Accuracy:", grid.best_score_)


## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Training Time messen ---
# --- Fit the model and measure how long construction + fitting takes ---
start = time.time()
svm = SVC(kernel="rbf", C=5, gamma="scale")
svm.fit(X_train, y_train)
train_time = time.time() - start

# --- Predict on the test split ---
y_pred = svm.predict(X_test)

# --- Collect the metrics in a dictionary (insertion order = column order later) ---
svm_results = {
    metric_name: metric_fn(y_test, y_pred)
    for metric_name, metric_fn in (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
}
svm_results["train_time_sec"] = train_time

print(svm_results)


## KNN

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Candidate neighbourhood sizes: k = 1 .. 50.
k_values = range(1, 51)
accuracies = []

# Choosing k is model selection, so each candidate is scored on the held-out
# validation split. Scoring on the test split here would leak test information
# into the choice of k and make the final test accuracy optimistic.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracies.append(accuracy_score(y_val, knn.predict(X_val)))

plt.figure(figsize=(8, 5))
plt.plot(k_values, accuracies, marker="o")
plt.xticks(k_values)
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Validation accuracy")
plt.title("KNN Elbow Criterion")
plt.grid()
plt.show()


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Final KNN model with a fixed k = 20, evaluated once on the test split.
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train, y_train)
y_pred = knn.predict(X_test)

knn_test_accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", knn_test_accuracy)


## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Training Time messen ---
# --- Fit the model and measure how long construction + fitting takes ---
start = time.time()
nb = GaussianNB().fit(X_train, y_train)
end = time.time()
train_time = end - start

# --- Predict on the test split ---
y_pred = nb.predict(X_test)

# --- Collect the metrics in a dictionary ---
nb_results = dict(
    zip(
        ("accuracy", "precision", "recall", "f1_score", "train_time_sec"),
        (
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            f1_score(y_test, y_pred),
            train_time,
        ),
    )
)

print(nb_results)

## PCA + SVM

In [None]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# --- PCA ---
# --- Reduce to 50 principal components (fitted on the training split only) ---
pca = PCA(n_components=50)
X_train_pca, X_test_pca = pca.fit_transform(X_train), pca.transform(X_test)

# --- Fit the SVM on the reduced features; only the SVM fit is timed ---
start = time.time()
svm_pca = SVC(kernel="rbf", C=5, gamma="scale").fit(X_train_pca, y_train)
train_time = time.time() - start

# --- Predict on the reduced test split ---
y_pred = svm_pca.predict(X_test_pca)

# --- Collect the metrics in a dictionary ---
svm_pca_results = dict(
    accuracy=accuracy_score(y_test, y_pred),
    precision=precision_score(y_test, y_pred),
    recall=recall_score(y_test, y_pred),
    f1_score=f1_score(y_test, y_pred),
    train_time_sec=train_time,
)

print(svm_pca_results)

## CNN

In [None]:
import numpy as np
from medmnist import BreastMNIST
from sklearn.model_selection import train_test_split

# Datensätze laden
# Reload the raw splits: the CNN works on unflattened images.
train_ds = BreastMNIST(split='train', download=True)
test_ds  = BreastMNIST(split='test',  download=True)

# Scale pixel values to [0, 1] as float32 and append a trailing channel axis,
# turning (N, 28, 28) into (N, 28, 28, 1). Labels become flat int32 vectors.
X_full     = np.expand_dims(train_ds.imgs.astype("float32") / 255.0, axis=-1)
X_test_cnn = np.expand_dims(test_ds.imgs.astype("float32") / 255.0, axis=-1)

y_full     = train_ds.labels.flatten().astype("int32")
y_test_cnn = test_ds.labels.flatten().astype("int32")

print("Full train shape:", X_full.shape)
print("Test shape:", X_test_cnn.shape)

# Same stratified 80/20 split (and seed) as used for the classical models.
X_train_cnn, X_val_cnn, y_train_cnn, y_val_cnn = train_test_split(
    X_full,
    y_full,
    stratify=y_full,
    test_size=0.2,
    random_state=42,
)

print("Train:", X_train_cnn.shape, "Val:", X_val_cnn.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.metrics import AUC

# Small CNN: two conv/pool stages followed by a dense classifier head.
model = Sequential()

model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))   # single output unit for binary classification

# Binary cross-entropy matches the single sigmoid output; track accuracy and AUC.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', AUC(name='auc')],
)

model.summary()


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import AUC

import tensorflow as tf

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling repeat across runs.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Small CNN: two conv/pool stages followed by a dense classifier head with a
# single sigmoid output unit for binary classification.
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)),
    MaxPooling2D((2,2)),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D((2,2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', AUC(name='auc')]
)

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen so far.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Additionally keep the best model (by validation loss) on disk.
checkpoint = ModelCheckpoint(
    "best_cnn_breastmnist.keras",
    monitor="val_loss",
    save_best_only=True
)


In [None]:
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ---- Training + Zeitmessung ----
# ---- Train (timed) with early stopping and checkpointing on the validation split ----
start = time.time()
history = model.fit(
    X_train_cnn,
    y_train_cnn,
    batch_size=64,
    epochs=30,
    validation_data=(X_val_cnn, y_val_cnn),
    callbacks=[early_stop, checkpoint],
    verbose=1,
)
train_time = time.time() - start

# ---- Evaluate on the test split: threshold the sigmoid outputs at 0.5 ----
y_prob = model.predict(X_test_cnn)
y_pred = (y_prob.flatten() > 0.5).astype(int)

cnn_results = dict(
    accuracy=accuracy_score(y_test_cnn, y_pred),
    precision=precision_score(y_test_cnn, y_pred),
    recall=recall_score(y_test_cnn, y_pred),
    f1_score=f1_score(y_test_cnn, y_pred),
    train_time_sec=train_time,
)

print(cnn_results)


In [None]:
import matplotlib.pyplot as plt

# Training vs. validation curves per epoch: accuracy on the left, loss on the right.
plt.figure(figsize=(10, 4))

curve_panels = (
    ("accuracy", "val_accuracy", "Train Acc", "Val Acc", "Accuracy"),
    ("loss", "val_loss", "Train Loss", "Val Loss", "Loss"),
)
for panel_index, (train_key, val_key, train_label, val_label, panel_title) in enumerate(
    curve_panels, start=1
):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.legend()

plt.show()


## Results

In [None]:
import pandas as pd

# Alle Ergebnisse in eine Liste packen
# Gather the per-model metric dictionaries under readable model names.
all_results = dict(
    zip(
        (
            "Logistic Regression",
            "Random Forest",
            "SVM",
            "Naive Bayes (Gaussian)",
            "SVM + PCA",
            "CNN",
        ),
        (
            logreg_results,
            rf_results,
            svm_results,
            nb_results,
            svm_pca_results,
            cnn_results,
        ),
    )
)

# Build the table and transpose it so each model is a row and each metric a column.
results_df = pd.DataFrame(all_results).transpose()

results_df
