<a href="https://colab.research.google.com/github/ashleyghaie1/LungCancerAIDetection/blob/main/(Ashley_Ghaie)_Lung_Cancer_Image_Detection_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import torch
from torchvision.models.resnet import resnet101
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import numpy as np

# Mount Google Drive and Begin Import of Datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

#Data Set path for testing and trial of code
testFilePath = "/content/drive/MyDrive/Colab Notebooks/Lung Cancer Image Detection Project/Lung_Cancer_Test"

#Data Set path for dataset 1
dataset1FilePath = "/content/drive/MyDrive/Colab Notebooks/Lung Cancer Image Detection Project/dataset 1"

#Data Set path for dataset 2
dataset2FilePath = "/content/drive/MyDrive/Colab Notebooks/Lung Cancer Image Detection Project/dataset 2"

#Data Set path for dataset 3
dataset3FilePath = "/content/drive/MyDrive/Colab Notebooks/Lung Cancer Image Detection Project/dataset 3"

Mounted at /content/drive


# Custom Made Function for Loading in Different Datasets

In [None]:
def loadDataset(trainPath, testPath, FLAG=False):
  #Adjusts the images to align with the input ResNet101 expects for images
  transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to match ResNet input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize with ImageNet statistics
  ])

  if FLAG:
    fullDataset = datasets.ImageFolder(root=trainPath, transform=transform)

    trainSize = int(0.8 * len(fullDataset))
    testSize = len(fullDataset) - trainSize

    trainDataset, testDataset = random_split(fullDataset, [trainSize, testSize])
    trainDataset.indices

    trainLoader = DataLoader(trainDataset, batch_size=32, shuffle=True)
    testLoader = DataLoader(datasets.ImageFolder(root=testPath, transform=transform), batch_size=32, shuffle=False)

    print("Classes [0 - 1]:", fullDataset.classes)
    print("Number of images in the dataset:", len(fullDataset))
    print("Number of training images:", len(trainDataset))
    print("Number of testing images:", len(testDataset))

    return trainLoader, testLoader

  datasetTrain = datasets.ImageFolder(root=trainPath, transform=transform)
  print(datasetTrain.classes)
  print(datasetTrain.class_to_idx)

  datasetTest = datasets.ImageFolder(root=testPath, transform=transform)
  print(datasetTest.classes)
  print(datasetTest.class_to_idx)

  trainLoader = DataLoader(datasetTrain, batch_size=32, shuffle=True)
  testLoader = DataLoader(datasetTest, batch_size=32, shuffle=False)

  return trainLoader, testLoader

# ResNet101 Model and Training for Cancerous and Normal Lung Images Function

In [None]:
#If there is a GPU use that and if not, use the CPU for processing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def loadResNetModel(trainLoader):
  model = resnet101(pretrained=False)
  numClasses = 2 #cancer and normal
  model.fc = nn.Linear(model.fc.in_features, numClasses)

  #Loss and Optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)

  #Training
  num_epochs = 1 #Number of runs for the entire dataset (10 times usually)
  model.to(device)

  for epoch in range(num_epochs):
    model.train()
    for inputs, labels in trainLoader:
      inputs, labels = inputs.to(device), labels.to(device)

      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
  return model

# Testing the Model Function

In [None]:
def testModel(model, testLoader):
  # Testing and evaluation
  allLabels = []
  allPredictions = []

  with torch.no_grad():
      model.eval()
      for images, labels in testLoader:
          images, labels = images.to(device), labels.to(device)
          outputs = model(images)
          _, predicted = torch.max(outputs.data, 1)
          allLabels.extend(labels.cpu().numpy())
          allPredictions.extend(predicted.cpu().numpy())

  # Calculate metrics
  f1 = f1_score(allLabels, allPredictions, average='weighted')
  accuracy = accuracy_score(allLabels, allPredictions)
  confMatrix = confusion_matrix(allLabels, allPredictions)

  print(f"The Accuracy is: {accuracy}")
  print(f"The Calculated F1 Score is: {f1}")
  print("Confusion Matrix:   [True Positive, False Negative]\n",
    "                   [False Positive, True Negative]\n")
  print(confMatrix)

#Training and Testing the Test Dataset for Debugging

In [None]:
#trainLoader, testLoader = loadDataset(testFilePath + "/train/", testFilePath + "/test/")

In [None]:
#testResNetModel = loadResNetModel(trainLoader)

In [None]:
#testModel(testResNetModel, testLoader)

#Training and Testing Dataset 1
https://www.kaggle.com/datasets/diayruldip/carinocroma

In [None]:
dataset1TrainLoader, dataset1TestLoader = loadDataset(dataset1FilePath + "/train/", dataset1FilePath + "/test/")

['cancer', 'normal']
{'cancer': 0, 'normal': 1}
['cancer', 'normal']
{'cancer': 0, 'normal': 1}


In [None]:
dataset1ResNetModel = loadResNetModel(dataset1TrainLoader)



Epoch 1, Loss: 1.9852302074432373


In [None]:
testModel(dataset1ResNetModel, dataset1TestLoader)

The Accuracy is: 0.834920634920635
The Calculated F1 Score is: 0.765754106520657
Confusion Matrix:   [True Positive, False Negative]
                    [False Positive, True Negative]

[[261   0]
 [ 52   2]]


#Training and Testing Dataset 2 (Manual image splitting of 80% training and 20% test)
https://data.mendeley.com/datasets/p2r42nm2ty/2

In [None]:
dataset2TrainLoader, dataset2TestLoader = loadDataset(dataset2FilePath + "/train/", dataset2FilePath, FLAG=True)

Classes [0 - 1]: ['cancer', 'normal']
Number of images in the dataset: 364
Number of training images: 291
Number of testing images: 73


In [None]:
dataset2ResNetModel = loadResNetModel(dataset2TrainLoader)



Epoch 1, Loss: 0.060320913791656494


In [None]:
testModel(dataset2ResNetModel, dataset2TestLoader)

The Accuracy is: 0.11650485436893204
The Calculated F1 Score is: 0.024314056563951032
Confusion Matrix:   [True Positive, False Negative]
                    [False Positive, True Negative]

[[ 48   0]
 [364   0]]


#Training and Testing Dataset 3 (Big Dataset with manual image splitting of 80% training and 20% test)
https://www.kaggle.com/datasets/rm1000/lung-cancer-histopathological-images?utm_source

In [None]:
dataset3TrainLoader, dataset3TestLoader = loadDataset(dataset3FilePath + "/train/", dataset3FilePath, FLAG=True)

Classes [0 - 1]: ['cancer', 'normal']
Number of images in the dataset: 15000
Number of training images: 12000
Number of testing images: 3000


In [None]:
dataset3ResNetModel = loadResNetModel(dataset3TrainLoader)



Epoch 1, Loss: 0.009985128417611122


In [None]:
testModel(dataset3ResNetModel, dataset3TestLoader)

The Accuracy is: 0.6496666666666666
The Calculated F1 Score is: 0.787633865427359
Confusion Matrix:   [True Positive, False Negative]
                    [False Positive, True Negative]

[[9745 5255]
 [   0    0]]


#Cross Testing of Datasets

##Testing Dataset 1 ResNet101 model

In [None]:
print("Testing ResNet101 Model Trained with Dataset 1:\n\n")
print("Testing with Dataset 2 Images\n")
testModel(dataset1ResNetModel, dataset2TestLoader)

print("Testing with Dataset 3 Images\n")
testModel(dataset1ResNetModel, dataset3TestLoader)

Testing ResNet101 Model Trained with Dataset 1:


Testing with Dataset 2 Images

The Accuracy is: 0.12378640776699029
The Calculated F1 Score is: 0.03891772976494376
Confusion Matrix:   [True Positive, False Negative]
                    [False Positive, True Negative]

[[ 48   0]
 [361   3]]
Testing with Dataset 3 Images



##Testing Dataset 2 ResNet101 model

In [None]:
print("Testing ResNet101 Model Trained with Dataset 2:\n\n")
print("Testing with Dataset 1 Images\n")
testModel(dataset2ResNetModel, dataset1TestLoader)

print("Testing with Dataset 3 Images\n")
testModel(dataset2ResNetModel, dataset3TestLoader)

##Testing Dataset 3 ResNet101 model

In [None]:
print("Testing ResNet101 Model Trained with Dataset 3:\n\n")
print("Testing with Dataset 1 Images\n")
testModel(dataset3ResNetModel, dataset1TestLoader)

print("Testing with Dataset 2 Images\n")
testModel(dataset3ResNetModel, dataset2TestLoader)