<a href="https://colab.research.google.com/github/Yashmaini30/Breast-Cancer-Detection/blob/main/ResNet50_with_ML_classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

from __future__ import absolute_import, print_function, division, unicode_literals
import os
import glob
import shutil
import pandas as pd
from PIL import Image
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, Flatten, MaxPooling2D, Dropout, Dense, Activation, GlobalAveragePooling2D
from keras import regularizers
import numpy as np
import matplotlib.pyplot as plt
import pathlib
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, confusion_matrix

!pip install imagehash
import imagehash

!pip install lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl.metadata (12 kB)
Collecting nvidia-nccl-cu12 (from xgboost->lazypredict)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, lazypredict
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu121 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", which

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
data_dir = "/content/drive/MyDrive/archive"
data_dir = pathlib.Path(data_dir)
train_path = data_dir / 'train'
test_path = data_dir / 'test'

In [4]:
BATCH_SIZE = 8
IMG_HEIGHT = 224
IMG_WIDTH = 224

In [5]:
image_train_gen = ImageDataGenerator(
    rescale=1./255,
    zoom_range=0.70,
    rotation_range=60,
    horizontal_flip=True,
    vertical_flip=True,
    width_shift_range=0.20,
    height_shift_range=0.20
)
train_data_gen = image_train_gen.flow_from_directory(
    train_path,
    shuffle=True,
    batch_size=BATCH_SIZE,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='categorical'
)

Found 1075 images belonging to 3 classes.


In [6]:
img_val_gen = ImageDataGenerator(rescale=1./255)
val_data_gen = img_val_gen.flow_from_directory(
    test_path,
    batch_size=BATCH_SIZE,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='categorical'
)

Found 281 images belonging to 3 classes.


In [7]:
from tensorflow.keras.applications import ResNet50

base_model = ResNet50(input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
                      include_top=False,
                      weights='imagenet')

# Set layers to non-trainable
base_model.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [8]:
feature_extractor = Model(inputs=base_model.input, outputs=GlobalAveragePooling2D()(base_model.output))

In [10]:
def extract_features_and_save(data_gen, output_file):
    features = []
    labels = []

    for inputs_batch, labels_batch in data_gen:
        features_batch = feature_extractor.predict(inputs_batch)
        features.append(features_batch)
        labels.append(labels_batch)
        if len(features) * BATCH_SIZE >= data_gen.samples:
            break

    features = np.vstack(features)
    labels = np.vstack(labels)
    labels = np.argmax(labels, axis=1)  # Convert one-hot encoding to class indices

    feature_df = pd.DataFrame(features)
    feature_df['label'] = labels
    feature_df.to_csv(output_file, index=False)

extract_features_and_save(train_data_gen, 'train_features.csv')
extract_features_and_save(val_data_gen, 'val_features.csv')

# Load the features from CSV files
train_features = pd.read_csv('train_features.csv')
val_features = pd.read_csv('val_features.csv')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━

In [15]:
from sklearn.utils import resample
# Resampling
majority_class = train_features[train_features['label'] == 0]
minority_class_1 = train_features[train_features['label'] == 1]
minority_class_2 = train_features[train_features['label'] == 2]

# Upsample minority classes
minority_upsampled_1 = resample(minority_class_1,
                               replace=True,     # Sample with replacement
                               n_samples=len(majority_class),  # Match the number of samples in the majority class
                               random_state=123)  # Set random state for reproducibility

minority_upsampled_2 = resample(minority_class_2,
                               replace=True,     # Sample with replacement
                               n_samples=len(majority_class),  # Match the number of samples in the majority class
                               random_state=123)  # Set random state for reproducibility

In [16]:
upsampled_data = pd.concat([majority_class, minority_upsampled_1, minority_upsampled_2])

In [17]:
X_train = upsampled_data.drop('label', axis=1).values
y_train = upsampled_data['label'].values
X_test = val_features.drop('label', axis=1).values
y_test = val_features['label'].values

In [19]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

 97%|█████████▋| 28/29 [03:34<00:11, 11.21s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170769
[LightGBM] [Info] Number of data points in the train set: 1905, number of used features: 1085
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


100%|██████████| 29/29 [04:06<00:00,  8.49s/it]


In [20]:
print(models)

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
BernoulliNB                        0.43               0.60    None      0.50   
NearestCentroid                    0.43               0.60    None      0.52   
KNeighborsClassifier               0.56               0.58    None      0.61   
SGDClassifier                      0.61               0.56    None      0.63   
SVC                                0.46               0.53    None      0.49   
NuSVC                              0.53               0.51    None      0.54   
XGBClassifier                      0.73               0.50    None      0.71   
LGBMClassifier                     0.75               0.48    None      0.71   
BaggingClassifier                  0.73               0.48    None      0.71   
DecisionTreeClassifier             0.70               0.48    None      0.67   
RandomForestClassifier             0.75 