In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import sklearn

In [None]:
# Load the input embedding tensor back from the file.
# map_location='cpu' keeps the load working on CPU-only runtimes even if the
# tensor was saved from a GPU session, and keeps the data on the same device
# as the model, which this notebook never moves off the CPU.
TENSOR_PATH = '/content/drive/My Drive/tensor.pth'
X = torch.load(TENSOR_PATH, map_location='cpu')

# Check the loaded tensor
print(X.size())

# Recorded output from the original run:
#torch.Size([25000, 1000])

In [None]:
# Build the label column for the embedding rows. Rows are assumed to be stored
# class after class in the order below (TODO confirm against the script that
# produced the tensor); each entry is the exclusive end row of that class.
#Colon_aca = 0  -> 5048
#Colon_n = 1    -> 10048
#lung_aca = 2   -> 15048
#lung_n = 3     -> 20000
#lung_scc = 4   -> 25000
class_end_rows = (5048, 10048, 15048, 20000, 25000)

# Float tensor of shape (25000, 1), as in the original cell; every row is
# overwritten below, so the uninitialised memory from torch.empty never leaks.
input_label = torch.empty(class_end_rows[-1], 1)
label_count = 0

# One slice assignment per class instead of 25,000 element-wise writes.
class_first_row = 0
for class_id, class_end_row in enumerate(class_end_rows):
    input_label[class_first_row:class_end_row] = class_id
    class_first_row = class_end_row

In [None]:
#Splitting the data into Train and Test Datasets
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split (and every metric computed from it) is the same on each Restart & Run
# All; torch.manual_seed does not seed scikit-learn's shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    input_label,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
#Define a Custom Dataset Class to Create Dataloaders
class CustomDataset(Dataset):
    """Pairs a feature container with a label container, row by row.

    Args:
        X: Indexable collection of features (one entry per sample).
        y: Indexable collection of labels, aligned with ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Fail fast on misaligned inputs instead of surfacing an index error
        # (or silently unused labels) part-way through an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap each split in a CustomDataset so it can be handed to a DataLoader.
train_dataset, test_dataset = (
    CustomDataset(X_train, y_train),
    CustomDataset(X_test, y_test),
)

In [None]:
# Create DataLoader instances for training and testing.
# Only the training loader shuffles; the test loader keeps a fixed order.
BATCH_SIZE = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# len() of a DataLoader is its number of batches, not its number of samples.
n_train_batches = len(train_loader)
n_test_batches = len(test_loader)
print(f"Size of Train Loader: {n_train_batches}")
print(f"Size of Test Loader: {n_test_batches}")

# Recorded output from the original run:
#Size of Train Loader: 625
#Size of Test Loader: 157

In [None]:
# Pull a single batch from the training loader to inspect its shapes.
first_train_batch = next(iter(train_loader))
train_features_batch, train_labels_batch = first_train_batch
(train_features_batch.shape, train_labels_batch.shape)

# Recorded output from the original run:
#(torch.Size([32, 1000]), torch.Size([32, 1]))

In [None]:
#Starting to create a Baseline model
class CancerHistModel(nn.Module):
    """Baseline fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The network returns raw, unnormalised class scores (logits). No softmax is
    applied anywhere: ``nn.CrossEntropyLoss`` applies log-softmax itself, and a
    softmax used as a *hidden* activation squashes the hidden features to sum
    to 1 (and, without an explicit ``dim``, triggers PyTorch's
    implicit-dimension deprecation warning).

    Args:
        input_shape: Number of features in each input vector.
        hidden_units: Width of both hidden layers.
        output_shape: Number of classes (size of the logit vector).
    """

    def __init__(self,
                 input_shape: int,
                 hidden_units: int,
                 output_shape: int):
        super().__init__()
        # Layer indices 0/2/4 are kept for the Linear layers, so state_dict
        # keys stay the same as with the previous activation at index 3.
        self.layers = nn.Sequential(
            nn.Linear(in_features=input_shape, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=output_shape)
        )

    def forward(self, x):
        """Return logits with ``output_shape`` entries for each input vector in ``x``."""
        return self.layers(x)

In [None]:
# Hyper-parameters of the baseline network.
input_embedding_shape = 1000  # features per input embedding
hidden_units = 100            # width of each hidden layer
output_shape = 5              # number of classes

# Arguments are passed positionally in the constructor's declared order:
# (input_shape, hidden_units, output_shape).
Model = CancerHistModel(input_embedding_shape, hidden_units, output_shape)
Model

# Recorded output from the original run:
# CancerHistModel(
#   (layers): Sequential(
#     (0): Linear(in_features=1000, out_features=100, bias=True)
#     (1): ReLU()
#     (2): Linear(in_features=100, out_features=100, bias=True)
#     (3): Softmax(dim=None)
#     (4): Linear(in_features=100, out_features=5, bias=True)
#   )
# )

In [None]:
#Accuracy and LOSS FUNCTION DEFINITION FOR MODEL TRAINING
def accuracy_fn(y_true,y_pred):
  """Return the fraction (0.0 to 1.0) of predictions that equal their labels.

  Both tensors are flattened before comparison. Without that, labels of shape
  (N, 1) compared with predictions of shape (N,) broadcast to an (N, N)
  matrix, which inflates the "correct" count and can yield values above 1.

  Args:
    y_true: Tensor of ground-truth class labels (any shape).
    y_pred: Tensor of predicted class labels with the same number of elements.

  Returns:
    Accuracy as a Python float in [0, 1]; 0.0 when there are no samples.

  Raises:
    ValueError: If the two tensors hold a different number of elements.
  """
  y_true_flat = y_true.reshape(-1)
  y_pred_flat = y_pred.reshape(-1)
  n_samples = y_true_flat.numel()
  if n_samples != y_pred_flat.numel():
    raise ValueError(
      f"y_true and y_pred must hold the same number of elements, got {n_samples} and {y_pred_flat.numel()}"
    )
  if n_samples == 0:
    return 0.0
  correct = torch.eq(y_true_flat,y_pred_flat).sum().item()
  acc = (correct / n_samples)
  return acc


# Cross-entropy loss on the model's outputs, optimised with plain SGD
# (learning rate 0.1) over all of Model's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(Model.parameters(), lr=0.1)

In [None]:
from timeit import default_timer

def print_train_time(start: float,
                     end: float,
                     device: torch.device = None):
  """Print and return the elapsed time between two timestamps.

  Args:
    start: Timestamp taken before the work started, in seconds.
    end: Timestamp taken after the work finished, in seconds.
    device: Accepted for call-site compatibility; not used by this function.

  Returns:
    The elapsed time ``end - start`` in seconds.
  """
  # Subtraction, not the chained assignment `end=start`, which returned `start`.
  total_time = end - start
  print(f"total time: {total_time:.3f} seconds")
  return total_time

In [None]:
#Creating a training loop and training the model on batches of data

from tqdm.auto import tqdm #to get the progress bar
import time

#Set the seed and start the timer
torch.manual_seed(42)
train_time_start_on_cpu = time.time()  # call the function: store a timestamp, not the function object

#Set number of epochs
epochs = 4

#Create training and test loop
for epoch in tqdm(range(epochs)):
  print(f"Epoch: {epoch} \n---")

  #Training
  train_loss = 0.0
  Model.train()  # once per epoch is enough; Model.eval() below switches it back for testing

  # Batch variables are named X_batch / y_batch so the loop does not overwrite
  # the notebook-level `X` tensor that earlier cells depend on.
  for batch, (X_batch, y_batch) in enumerate(train_loader):
    #Forward Pass
    y_pred = Model(X_batch)

    # CrossEntropyLoss expects integer class indices of shape (batch,).
    # reshape(-1) keeps that shape even for a batch of one sample, where a
    # bare squeeze() would collapse the tensor to a 0-d scalar.
    targets = y_batch.reshape(-1).long()

    #Calculate loss
    loss = loss_fn(y_pred, targets)
    # .item() accumulates a plain float so each batch's autograd graph can be freed.
    train_loss += loss.item()

    #Optimizer zero grad
    optimizer.zero_grad()

    #Loss backward
    loss.backward()

    #Optimizer step
    optimizer.step()

    #Print out what's happening
    if batch % 100 == 0:
      print(f"Lookd at { batch * len(X_batch)}/{len(train_loader.dataset)} samples.")

  #Divide total train loss by length of train dataloader
  train_loss /= len(train_loader)

  #Testing
  test_loss, test_acc = 0.0, 0.0
  Model.eval()
  with torch.inference_mode():
    for X_batch, y_batch in test_loader:
      test_pred = Model(X_batch)
      targets = y_batch.reshape(-1).long()
      test_loss += loss_fn(test_pred, targets).item()

      # Labels and predictions are both passed with shape (batch,): comparing
      # (batch, 1) labels against (batch,) predictions would broadcast to a
      # (batch, batch) matrix and report accuracies above 1.
      test_acc += accuracy_fn(y_true=targets, y_pred=test_pred.argmax(dim=1))

    #Calculate the test loss average per batch
    test_loss /= len(test_loader)

    #Calculate test acc average per batch (a smaller final batch carries the same weight as a full one)
    test_acc /= len(test_loader)

  print(f"\nTrain Loss {train_loss:.4f} | Test loss: {test_loss:.4f}, Test acc: {test_acc:.3f}")

# Timestamp taken once, after the final epoch.
train_time_end_on_cpu = time.time()

Epoch: 0
---
Lookd at 0/20000 samples.
Lookd at 3200/20000 samples.
Lookd at 6400/20000 samples.
Lookd at 9600/20000 samples.
Lookd at 12800/20000 samples.
Lookd at 16000/20000 samples.
Lookd at 19200/20000 samples.

Train Loss 0.9771 | Test loss: 0.6682, Test acc: 6.810
Epoch: 1
---
Lookd at 0/20000 samples.
Lookd at 3200/20000 samples.
Lookd at 6400/20000 samples.
Lookd at 9600/20000 samples.
Lookd at 12800/20000 samples.
Lookd at 16000/20000 samples.
Lookd at 19200/20000 samples.

Train Loss 0.6590 | Test loss: 0.6431, Test acc: 6.778
Epoch: 2
---
Lookd at 0/20000 samples.
Lookd at 3200/20000 samples.
Lookd at 6400/20000 samples.
Lookd at 9600/20000 samples.
Lookd at 12800/20000 samples.
Lookd at 16000/20000 samples.
Lookd at 19200/20000 samples.

Train Loss 0.6148 | Test loss: 0.5911, Test acc: 6.850
Epoch: 3
---
Lookd at 0/20000 samples.
Lookd at 3200/20000 samples.
Lookd at 6400/20000 samples.
Lookd at 9600/20000 samples.
Lookd at 12800/20000 samples.
Lookd at 16000/20000 samples.
Lookd at 19200/20000 samples.

Train Loss 0.5500 | Test loss: 0.5308, Test acc: 6.915




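As can be observed here, both the train and test loss decrease as we train for more epochs, while the reported test accuracy value increases only slightly, from 6.810 to 6.915. Note that `accuracy_fn` returns a fraction that should lie between 0 and 1, so a value of about 6.9 is not an accuracy of 6.91%: it indicates a problem in how the accuracy is computed (the labels have shape `[batch, 1]` while the predicted classes have shape `[batch]`, so the equality check broadcasts to a `[batch, batch]` comparison). The accuracy metric therefore needs to be corrected and the run repeated before concluding how well the Baseline Model classifies the images by cancer type.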