In [8]:
get_ipython().system('pip install opencv-python')
get_ipython().system('pip install scikit-learn pillow tqdm')
get_ipython().system('pip install torch torchvision numpy matplotlib')
get_ipython().system('pip install tqdm')
get_ipython().system('pip install Augmentor')
get_ipython().system('pip install tensorflow')
get_ipython().system('pip install faiss-cpu')
get_ipython().system('pip install pillow')



In [11]:
import sys
import os

# Add the ProtoPNet directory to the Python path
proto_path = "/home/alan/Documents/SelfReferencing_CNN_Project/ProtoPNetRepoClone/ProtoPNet"
if proto_path not in sys.path:
    sys.path.append(proto_path)
    
import cv2   
import torch
import PIL
import pandas as pd
import numpy as np
import Augmentor
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import tensorflow as tf
import torch.nn as nn
import torch.optim as optim
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
from torchvision import models
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from resnet_features import resnet50_features
from model import PPNet
from model import construct_PPNet



In [12]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
print(torch.version.cuda)
print(f"Number of GPUs available: {torch.cuda.device_count()}")
print(f"GPU Name: {torch.cuda.get_device_name(0)}")


cuda:0
12.1
Number of GPUs available: 1
GPU Name: NVIDIA GeForce RTX 4070


In [13]:
# Load data set from local folder by class
def load_images(dataset_path):
    data = []
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        if os.path.isdir(class_path):
            for img_file in glob(f"{class_path}/*.jpg"):  # Adjust the file extension if needed
                data.append({"file_path": img_file, "class": class_folder})
    df = pd.DataFrame(data)
    return df

# Apply preprocessing to each image
def preprocess_image(image_path, target_size=(224, 224)):
    image = cv2.imread(image_path)
    image = cv2.resize(image, target_size)
    #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB if needed
    return image

# Create dataframe with ground truth
dataset_path = "/home/alan/Documents/SelfReferencing_CNN_Project/ChestXrays/Curated Dataset for COVID-19 Posterior-Anterior Chest Radiography Images (X-Rays)/Curated X-Ray Dataset"
df = load_images(dataset_path)
print(df.tail())

# Add images to dataframe
df['image'] = df['file_path'].apply(preprocess_image)
print(df.head())



                                              file_path     class
9204  /home/alan/Documents/SelfReferencing_CNN_Proje...  COVID-19
9205  /home/alan/Documents/SelfReferencing_CNN_Proje...  COVID-19
9206  /home/alan/Documents/SelfReferencing_CNN_Proje...  COVID-19
9207  /home/alan/Documents/SelfReferencing_CNN_Proje...  COVID-19
9208  /home/alan/Documents/SelfReferencing_CNN_Proje...  COVID-19
                                           file_path                class  \
0  /home/alan/Documents/SelfReferencing_CNN_Proje...  Pneumonia-Bacterial   
1  /home/alan/Documents/SelfReferencing_CNN_Proje...  Pneumonia-Bacterial   
2  /home/alan/Documents/SelfReferencing_CNN_Proje...  Pneumonia-Bacterial   
3  /home/alan/Documents/SelfReferencing_CNN_Proje...  Pneumonia-Bacterial   
4  /home/alan/Documents/SelfReferencing_CNN_Proje...  Pneumonia-Bacterial   

                                               image  
0  [[[21, 21, 21], [17, 17, 17], [22, 22, 22], [2...  
1  [[[140, 140, 140], [140, 140

In [14]:
class_distribution = df['class'].value_counts()
print(class_distribution)

# Get the size of the smallest class
min_class_count = class_distribution.min()

# Function to undersample a class to the minimum class count
def undersample_class(df, class_name, n_samples):
    return df[df['class'] == class_name].sample(n_samples, random_state=42)

# Create a balanced DataFrame by undersampling each class
balanced_df = pd.concat([
    undersample_class(df, class_name, min_class_count)
    for class_name in class_distribution.index
])

# Shuffle the balanced DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the new class distribution
balanced_class_distribution = balanced_df['class'].value_counts()
print(balanced_class_distribution)

class
Normal                 3271
Pneumonia-Bacterial    3001
Pneumonia-Viral        1656
COVID-19               1281
Name: count, dtype: int64
class
Normal                 1281
Pneumonia-Bacterial    1281
COVID-19               1281
Pneumonia-Viral        1281
Name: count, dtype: int64


In [15]:
# Convert images to numpy array
X = np.stack(balanced_df['image'].values)

# Convert class labels to one-hot encoding
y = pd.get_dummies(balanced_df['class']).values

# Normalize pixel values between 0 and 1
X = X / 255.0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shapes of the resulting arrays
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Convert one-hot encoded labels back to class labels
y_train_labels = np.argmax(y_train, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Get class names from one-hot encoding
class_names = pd.get_dummies(balanced_df['class']).columns

# Create DataFrames for train and test sets
train_df = pd.DataFrame({'image': list(X_train), 'class': [class_names[i] for i in y_train_labels]})
test_df = pd.DataFrame({'image': list(X_test), 'class': [class_names[i] for i in y_test_labels]})

# Define the directories
train_dir = '/home/alan/Documents/SelfReferencing_CNN_Project/datasets/train'
test_dir = '/home/alan/Documents/SelfReferencing_CNN_Project/datasets/test'

# Ensure the directories exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Function to save images to the appropriate directory
def save_images(df, base_dir):
    for idx, row in df.iterrows():
        class_dir = os.path.join(base_dir, row['class'])
        os.makedirs(class_dir, exist_ok=True)
        img = (row['image'] * 255).astype(np.uint8)  # Convert back to uint8 format
        img = PIL.Image.fromarray(img)
        img.save(os.path.join(class_dir, f'image_{idx}.jpg'))

# Save training and testing images
save_images(train_df, train_dir)
save_images(test_df, test_dir)





# Convert your numpy arrays to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# Create data loaders
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)



X_train shape: (4099, 224, 224, 3)
y_train shape: (4099, 4)
X_test shape: (1025, 224, 224, 3)
y_test shape: (1025, 4)


In [16]:
import Augmentor
import os

def makedir(path):
    '''
    if path does not exist in the file system, create it
    '''
    if not os.path.exists(path):
        os.makedirs(path)

datasets_root_dir = '/home/alan/Documents/SelfReferencing_CNN_Project/datasets/'
train_dir = datasets_root_dir + 'train/'
target_dir = datasets_root_dir + 'train_augmented/'

makedir(target_dir)

folders = [os.path.join(train_dir, folder) for folder in next(os.walk(train_dir))[1]]
target_folders = [os.path.join(target_dir, folder) for folder in next(os.walk(train_dir))[1]]

for i in range(len(folders)):
    fd = folders[i]
    tfd = target_folders[i]
    makedir(tfd)
    
    # Create a pipeline for each folder
    p = Augmentor.Pipeline(source_directory=fd, output_directory=tfd)
    
    # Define augmentation operations
    p.rotate(probability=1, max_left_rotation=15, max_right_rotation=15)
    p.flip_left_right(probability=0.5)
    p.skew(probability=1, magnitude=0.2)
    p.shear(probability=1, max_shear_left=10, max_shear_right=10)
    # add random distortion
    # p.random_distortion(probability=1.0, grid_width=10, grid_height=10, magnitude=5)
    
    # Set the number of augmented samples to be generated
    num_samples = 1281  # Adjust the number based on your dataset size
    
    # Generate the augmented images
    p.sample(num_samples)

print("Data augmentation completed.")

Initialised with 1027 image(s) found.
Output directory set to /home/alan/Documents/SelfReferencing_CNN_Project/datasets/train_augmented/Pneumonia-Bacterial.

Processing <PIL.Image.Image image mode=RGB size=224x224 at 0x774CD3D0FFE0>: 100%|██████████| 1281/1281 [00:02<00:00, 606.76 Samples/s]


Initialised with 1036 image(s) found.
Output directory set to /home/alan/Documents/SelfReferencing_CNN_Project/datasets/train_augmented/Pneumonia-Viral.

Processing <PIL.Image.Image image mode=RGB size=224x224 at 0x774CD536B920>: 100%|██████████| 1281/1281 [00:02<00:00, 582.40 Samples/s]


Initialised with 1014 image(s) found.
Output directory set to /home/alan/Documents/SelfReferencing_CNN_Project/datasets/train_augmented/Normal.

Processing <PIL.Image.Image image mode=RGB size=224x224 at 0x774CD3D0C260>: 100%|██████████| 1281/1281 [00:02<00:00, 578.80 Samples/s]


Initialised with 1022 image(s) found.
Output directory set to /home/alan/Documents/SelfReferencing_CNN_Project/datasets/train_augmented/COVID-19.

Processing <PIL.Image.Image image mode=RGB size=224x224 at 0x774E67A5F140>: 100%|██████████| 1281/1281 [00:02<00:00, 574.36 Samples/s] 

Data augmentation completed.





In [18]:
!python3 /home/alan/Documents/SelfReferencing_CNN_Project/ProtoPNetRepoClone/ProtoPNet/main.py -gpuid=0

0
Traceback (most recent call last):
  File "/home/alan/Documents/SelfReferencing_CNN_Project/ProtoPNetRepoClone/ProtoPNet/main.py", line 37, in <module>
    shutil.copy(src=os.path.join(os.getcwd(), 'settings.py'), dst=model_dir)
  File "/home/alan/.config/jupyterlab-desktop/jlab_server/lib/python3.12/shutil.py", line 435, in copy
    copyfile(src, dst, follow_symlinks=follow_symlinks)
  File "/home/alan/.config/jupyterlab-desktop/jlab_server/lib/python3.12/shutil.py", line 260, in copyfile
    with open(src, 'rb') as fsrc:
         ^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '/home/alan/Documents/SelfReferencing_CNN_Project/JupyterNotebooks/settings.py'
