# **Import All Necessary Libraries**

In [1]:
import os  # Helps us work with files and directories on our computer (like loading images from folders)
import numpy as np  # A go-to library for handling data in arrays and performing math operations on them
import tensorflow as tf  # The core library for building and training deep learning models, like neural networks
#from tensorflow.keras.preprocessing.image import ImageDataGenerator  # Used for image preprocessing and augmentation to make our models more robust
from sklearn.model_selection import train_test_split  # Splits our data into training and testing sets so we can evaluate model performance
from PIL import Image  # A handy tool for opening, editing, and manipulating images (like converting to grayscale or resizing)


In [None]:
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm before reuse.
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook ('/content/drive/MyDrive/MammographyDataset') lives under it.
drive.mount('/content/drive')

# **Mount The Drive And Load Data**

In [2]:
# Google Drive must already be mounted at /content/drive before this cell runs.
# Dataset layout: one sub-folder per class beneath the dataset root.
dataset_path = '/content/drive/MyDrive/MammographyDataset'
benign_path = os.path.join(dataset_path, 'Benign Masses')
malignant_path = os.path.join(dataset_path, 'Malignant Masses')


def _count_files(folder):
    """Return the number of regular files (sub-folders excluded) directly inside `folder`."""
    return sum(
        1
        for entry in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, entry))
    )


# Data analysis: how many samples does each class contribute?
benign_count = _count_files(benign_path)
malignant_count = _count_files(malignant_path)

print(f"Number of Benign Samples: {benign_count}")
print(f"Number of Malignant Samples: {malignant_count}")


Number of Benign Samples: 10866
Number of Malignant Samples: 13719


# **Load Images Function**
**Load the images**

**Convert to Greyscale**

**Normalize Pixel Values**

**Resize all images to the same size**

**Save the labels**

In [3]:
def load_images_and_labels(folder_path, label, target_size=(224, 224), dtype=np.float32):
    """Load every file directly inside `folder_path` as a normalized greyscale image.

    Each regular file is opened with PIL, converted to 8-bit greyscale
    (mode 'L'), resized to `target_size`, and scaled from [0, 255] to [0, 1].
    Sub-directories are skipped; the folder is not searched recursively.

    Args:
        folder_path: Directory holding the image files of a single class.
        label: Class label recorded once for every loaded image.
        target_size: (width, height) handed to `Image.resize`. Defaults to
            (224, 224).
        dtype: NumPy dtype the pixels are converted to before scaling.
            Defaults to float32, which halves the memory needed compared with
            NumPy's float64 default; pass np.float64 to get double precision.

    Returns:
        A tuple `(images, labels)` of two equally long lists: `images` holds
        one 2-D array per file, `labels` holds `label` repeated once per image.
        Both lists are empty when the folder contains no regular files.

    Raises:
        Any error raised by `os.listdir` or by PIL while opening or decoding
        a file propagates to the caller unchanged.
    """
    images = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting fixes the sample
    # order so that a seeded train/test split is reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip anything that is not a regular file (e.g. nested folders).
        if not os.path.isfile(file_path):
            continue
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been converted, instead of leaving it to the GC.
        with Image.open(file_path) as img:
            resized = img.convert('L').resize(target_size)
        # Scale 8-bit intensities into [0, 1].
        images.append(np.asarray(resized, dtype=dtype) / 255.0)
        labels.append(label)
    return images, labels


**Call the load function for both class folders**

**Combine the samples from both classes**

**Split into training, validation, and test data**

In [2]:
# Load each class folder with its integer label (0 = benign, 1 = malignant).
benign_images, benign_labels = load_images_and_labels(benign_path, 0)
malignant_images, malignant_labels = load_images_and_labels(malignant_path, 1)

# Stack both classes into one image array and one aligned label array.
images = np.array(benign_images + malignant_images)
labels = np.array(benign_labels + malignant_labels)
assert len(images) == len(labels), "every image needs exactly one label"

# Split proportions: 30% of all samples are held out, and 33% of that holdout
# becomes the test set (the remaining 67% of the holdout is validation).
holdout_fraction = 0.3
test_fraction_of_holdout = 0.33
split_seed = 42

# Stratifying on the labels keeps the benign/malignant ratio approximately the
# same in every partition, so the class imbalance cannot skew one of them.
X_train, X_temp, y_train, y_temp = train_test_split(
    images,
    labels,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=labels,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=test_fraction_of_holdout,
    random_state=split_seed,
    stratify=y_temp,
)

# The three partitions must account for every sample exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(images)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}, Test samples: {len(X_test)}")

Training samples: 17210, Validation samples: 2431, Test samples: 4945
