<a href="https://colab.research.google.com/github/amirmohammadkalateh/breast-cancer-2/blob/main/Breast_Cancer_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [3]:
# Load the dataset from the uploaded CSV file.
try:
    df = pd.read_csv('Breast_cancer_dataset.csv')
except FileNotFoundError:
    print("Error: The file 'Breast_cancer_dataset.csv' was not found.")
    exit()

print("Dataset loaded successfully.")
print("---")
print("First 5 rows of the dataset:")
print(df.head())
print("---")
print("Dataset information:")
df.info()
print("---")

Dataset loaded successfully.
---
First 5 rows of the dataset:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   



In [4]:
#Data Preprocessing

X = df.drop(columns=['id', 'diagnosis'], axis=1)
y = df['diagnosis']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Original 'diagnosis' categories: {label_encoder.classes_}")
print(f"Encoded 'diagnosis' values: {np.unique(y_encoded)}")
print("---")

Original 'diagnosis' categories: ['B' 'M']
Encoded 'diagnosis' values: [0 1]
---


In [5]:
# Split the data into training and testing sets.
# We use 80% of the data for training and 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


In [6]:
# Scale the features using StandardScaler.
# This is important for many algorithms, especially deep learning and regularized models,
# as it gives all features an equal weighting by standardizing them to a mean of 0 and a standard deviation of 1.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Data has been split and scaled successfully.")
print(f"Shape of training data (X_train_scaled): {X_train_scaled.shape}")
print(f"Shape of testing data (X_test_scaled): {X_test_scaled.shape}")
print("---")

Data has been split and scaled successfully.
Shape of training data (X_train_scaled): (455, 31)
Shape of testing data (X_test_scaled): (114, 31)
---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [7]:

# Define the model.
ml_model = RandomForestClassifier(random_state=42)
# Define the parameter grid for GridSearchCV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None]
}
print("Starting GridSearchCV for RandomForestClassifier...")
grid_search = GridSearchCV(estimator=ml_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)
print("\nGridSearchCV completed.")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")


Starting GridSearchCV for RandomForestClassifier...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

GridSearchCV completed.
Best parameters found: {'max_depth': 10, 'n_estimators': 50}
Best cross-validation score: 0.9604


In [8]:
# Evaluate the best model on the test data.
best_ml_model = grid_search.best_estimator_
y_pred_ml = best_ml_model.predict(X_test_scaled)
print("\nMachine Learning Model Evaluation on Test Data:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_ml):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_ml, target_names=label_encoder.classes_))
print("---")


Machine Learning Model Evaluation on Test Data:
Accuracy: 0.9649
Classification Report:
              precision    recall  f1-score   support

           B       0.95      1.00      0.97        72
           M       1.00      0.90      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114

---


In [9]:
def build_dl_model(input_shape):
    model = Sequential()

    # First hidden layer
    model.add(Dense(128, activation='relu', input_shape=(input_shape,),
                    kernel_initializer='he_uniform',
                    kernel_regularizer=l1_l2(l1=0.001, l2=0.001),
                    kernel_constraint=MaxNorm(3)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # Second hidden layer
    model.add(Dense(64, activation='relu',
                    kernel_initializer='he_uniform',
                    kernel_regularizer=l1_l2(l1=0.001, l2=0.001),
                    kernel_constraint=MaxNorm(3)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # Output layer
    # Sigmoid activation is used for binary classification.
    model.add(Dense(1, activation='sigmoid'))

    return model

# Build the model.
input_shape = X_train_scaled.shape[1]
dl_model = build_dl_model(input_shape)

# Compile the model.
dl_model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

# Print a summary of the model architecture.
print("Deep Learning Model Summary:")
dl_model.summary()
print("---")

# --- Callbacks for training ---
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
tensorboard_callback = TensorBoard(log_dir="./logs")
# Train the deep learning model.
print("Training the Deep Learning Model...")
history = dl_model.fit(X_train_scaled, y_train,
                       epochs=100,
                       batch_size=32,
                       validation_data=(X_test_scaled, y_test),
                       callbacks=[early_stopping_callback, tensorboard_callback],
                       verbose=1)

print("---")
print("Deep Learning Model Training finished.")

#Evaluate the Deep Learning Model

loss, accuracy = dl_model.evaluate(X_test_scaled, y_test, verbose=0)
print("\nDeep Learning Model Evaluation on Test Data:")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Deep Learning Model Summary:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


---
Training the Deep Learning Model...
Epoch 1/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.6285 - loss: nan - val_accuracy: 0.6316 - val_loss: nan
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5942 - loss: nan - val_accuracy: 0.6316 - val_loss: nan
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6425 - loss: nan - val_accuracy: 0.6316 - val_loss: nan
Epoch 4/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6065 - loss: nan - val_accuracy: 0.6316 - val_loss: nan
Epoch 5/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6086 - loss: nan - val_accuracy: 0.6316 - val_loss: nan
Epoch 6/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6161 - loss: nan - val_accuracy: 0.6316 - val_loss: nan
Epoch 7/100
[1m15/15[0m [3