## 1. Import needed libraries and Set Device

In [None]:
# --- General Libraries ---
import os
import random
import warnings
import time

# --- Data Handling Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --- Image Handling and Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# --- PyTorch Libraries ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# --- Progress and Profiling ---
from tqdm import tqdm  # For displaying progress during training
import torch.profiler  # For tracking GPU usage during training

# --- Warnings Handling ---
warnings.filterwarnings('ignore')  # Ignore warnings for cleaner output


In [None]:
# Select the compute device. `torch.accelerator` is only present in recent
# PyTorch releases, so fall back to a plain CUDA check when it is missing.
_accelerator = getattr(torch, "accelerator", None)
if _accelerator is not None and _accelerator.is_available():
    device = _accelerator.current_accelerator().type
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# CUDA diagnostics. The per-device queries raise when no CUDA device is
# usable, so they only run when CUDA is actually available.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

## 2. Data Loading and Preprocessing

In this section, we use a custom loader function to collect the file paths and class labels of the lung images (lung_aca, lung_n, lung_scc) into a pandas DataFrame.
Later sections apply the necessary transformations (resizing, conversion to tensors), split the dataset into training, validation, and test sets,
and prepare DataLoaders whose batches are moved to the selected device during training.


In [None]:
# Function to load the dataset from the specified directory
def loading_the_data(data_dir):
    """Build a DataFrame of image file paths and their class labels.

    Every visible immediate subdirectory of ``data_dir`` is treated as one
    class; its name becomes the label of each visible regular file inside it.
    Entries whose name starts with ``.`` (e.g. ``.DS_Store``,
    ``.ipynb_checkpoints``), stray files next to the class folders, and nested
    directories inside a class folder are skipped.

    Args:
        data_dir: Path of the directory that holds one subfolder per class.

    Returns:
        pandas.DataFrame with the columns ``filepaths`` and ``labels``, one
        row per file, ordered by class folder name and then by file name.

    Raises:
        OSError: If ``data_dir`` cannot be listed (for example
            ``FileNotFoundError`` when it does not exist).
    """
    filepaths = []  # Paths of the images
    labels = []     # Class label (folder name) of each image

    # Sorted listings make the row order independent of the filesystem, which
    # keeps any seeded shuffle/split downstream reproducible across machines.
    for fold in sorted(os.listdir(data_dir)):
        foldpath = os.path.join(data_dir, fold)

        # Only visible directories are classes.
        if fold.startswith('.') or not os.path.isdir(foldpath):
            continue

        for file in sorted(os.listdir(foldpath)):
            fpath = os.path.join(foldpath, file)

            # Only visible regular files are samples.
            if file.startswith('.') or not os.path.isfile(fpath):
                continue

            filepaths.append(fpath)
            labels.append(fold)

    return pd.DataFrame({'filepaths': filepaths, 'labels': labels})

In [None]:
# Define the path to the dataset directory
data_dir = './lung_colon_image_set/lung_image_sets'  # Update the path based on your current directory structure

# Load the data using the function
df = loading_the_data(data_dir)

# Fail fast with a clear message: an empty frame would otherwise only surface
# later as an unrelated error in the sampling / splitting cells.
if df.empty:
    raise ValueError(
        f"No images found under {data_dir!r}; "
        "check that the directory contains one subfolder per class."
    )

# Display the first few rows of the DataFrame to verify the data
df.head()

## 3. Data Visualization

##### Random Images from Three Classes

In [None]:
# Get unique class names
class_names = sorted(df['labels'].unique())
num_classes = len(class_names)

# Show a few random images for each of the first three classes.
selected_classes = class_names[:3]
num_images_per_class = 3

plt.figure(figsize=(15, 10))

for idx, class_name in enumerate(selected_classes):
    class_df = df[df['labels'] == class_name]

    # Never ask for more images than the class contains (sample() would raise).
    random_samples = class_df.sample(min(num_images_per_class, len(class_df)))

    for i, (_, row) in enumerate(random_samples.iterrows()):
        image_path = row['filepaths']

        image = plt.imread(image_path)

        # Grid layout: one row per class, one column per sampled image. The
        # row/column counts must come from those two quantities so the
        # position index stays valid when either of them changes.
        plt.subplot(len(selected_classes), num_images_per_class,
                    idx * num_images_per_class + i + 1)
        plt.imshow(image)
        plt.title(f"Class: {class_name}")
        plt.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Number of samples per class label, most frequent first.
data_balance = df.labels.value_counts()


def custom_autopct(pct):
    """Label a pie slice with its percentage and the matching sample count."""
    sample_count = int(round(pct * data_balance.sum() / 100.0))
    return "{:.1f}%\n({:d})".format(pct, sample_count)


# Pie chart of the class shares.
pie_fig, pie_ax = plt.subplots(figsize=(6, 4))
pie_ax.pie(
    data_balance,
    labels=data_balance.index,
    autopct=custom_autopct,
    colors=["#2092E6", "#6D8CE6", "#20D0E6"],
)
pie_ax.set_title("Training Data Balance")
pie_ax.axis("equal")
plt.show()

# Bar chart of the absolute counts per class.
bar_fig, bar_ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=data_balance.index, y=data_balance.values, palette="Blues_d", ax=bar_ax)
bar_ax.set_title("Training Data Distribution (Bar Chart)")
bar_ax.set_xlabel("Category")
bar_ax.set_ylabel("Number of Samples")
bar_ax.tick_params(axis="x", labelrotation=45)
plt.show()

# Summary statistics of the per-class counts (count, mean, std, quartiles...).
print("Statistical Summary of the Data Balance:")
print(data_balance.describe())

## 4. Data Preprocessing and Splitting

##### Dataset Preparation
We will use the custom LungDataset class to wrap our DataFrame objects and prepare them for PyTorch models using DataLoader.

In [None]:
# Custom PyTorch Dataset class
class LungDataset(Dataset):
    """Map-style dataset serving ``(image, class_index)`` pairs from a DataFrame.

    Args:
        dataframe: DataFrame with a ``filepaths`` column (image paths) and a
            ``labels`` column (class names).
        transform: Optional callable applied to the loaded PIL image.
        label2idx: Optional mapping from class name to integer index. Pass the
            same mapping to every split so that all splits agree on the class
            indices even when a split happens to miss a class. When omitted,
            the mapping is built from the sorted unique labels of
            ``dataframe``.

    Raises:
        ValueError: If ``label2idx`` is given but lacks a label that occurs in
            ``dataframe``.
    """

    def __init__(self, dataframe, transform=None, label2idx=None):
        self.df = dataframe.reset_index(drop=True)  # continuous 0..n-1 index for .loc lookups
        self.transform = transform

        present_labels = sorted(self.df['labels'].unique())
        if label2idx is None:
            label2idx = {label: idx for idx, label in enumerate(present_labels)}
        else:
            missing = [label for label in present_labels if label not in label2idx]
            if missing:
                raise ValueError(f"label2idx has no entry for labels: {missing}")
            label2idx = dict(label2idx)  # private copy; later caller-side edits do not leak in
        self.label2idx = label2idx

        print(f"Dataset loaded with {len(self.df)} samples.")

    def __len__(self):
        """Return the total number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and numeric label at ``idx``."""
        image_path = self.df.loc[idx, 'filepaths']
        label = self.label2idx[self.df.loc[idx, 'labels']]

        # Force three channels so grayscale / RGBA / palette files still match
        # a 3-channel model input. convert() returns a fully loaded copy, so
        # the file handle can be closed as soon as the block exits.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label


##### Image Transforms
We use torchvision.transforms to resize the images and convert them into PyTorch tensors. `ToTensor` also scales 8-bit pixel values to the [0, 1] range; no mean/std normalization is applied.

In [None]:
# Preprocessing pipeline: resize every image to a fixed square, then convert
# it to a tensor. No mean/std normalization step is part of this pipeline.
img_size = 224
_resize_step = transforms.Resize((img_size, img_size))
_to_tensor_step = transforms.ToTensor()
transform = transforms.Compose([_resize_step, _to_tensor_step])

##### Splitting the Dataset
We split the original DataFrame into train, validation, and test sets (80/10/10 split).

In [None]:
# 80% train, 20% temp (validation + test).
# Stratifying on the class label keeps the class proportions identical in
# every split and guarantees each split contains every class, so the
# per-split label -> index mappings built later agree with each other.
# (Stratification needs at least two samples per class in the frame being split.)
train_df, temp_df = train_test_split(
    df, train_size=0.8, shuffle=True, random_state=42, stratify=df['labels']
)

# 10% validation, 10% test
valid_df, test_df = train_test_split(
    temp_df, train_size=0.5, shuffle=True, random_state=42, stratify=temp_df['labels']
)

##### Data Split Visualization
This pie chart shows the proportion of the dataset split into training, validation, and test sets.

In [None]:
# Pair each split name with its DataFrame, then derive labels and sizes.
split_frames = {'Train': train_df, 'Validation': valid_df, 'Test': test_df}
labels = list(split_frames)
sizes = [len(frame) for frame in split_frames.values()]

# Draw the split proportions as a pie chart.
split_fig, split_ax = plt.subplots(figsize=(4, 4))
split_ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#66b3ff', '#99ff99', '#ffcc99'],
)
split_ax.set_title('Data Split Visualization')
split_ax.axis('equal')  # Equal aspect ratio keeps the pie circular
plt.show()

##### Creating Dataset and DataLoaders
Now we wrap each split in the custom dataset and use DataLoader for batch processing.

In [None]:
batch_size = 64

# Wrap each split in a LungDataset; all three share the same transform.
train_dataset, valid_dataset, test_dataset = [
    LungDataset(split_df, transform=transform)
    for split_df in (train_df, valid_df, test_df)
]

# Options common to every loader; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
class CNNModel(nn.Module):
    """Small convolutional classifier for square RGB images.

    Three ``Conv2d -> ReLU -> MaxPool2d(2)`` stages (32, 64, 128 channels)
    followed by a two-layer fully connected head with dropout. The output is
    one raw logit per class (no softmax).

    Args:
        num_classes: Number of output classes.
        input_size: Side length, in pixels, of the square input images. It
            fixes the width of the first fully connected layer. When omitted,
            the notebook-level ``img_size`` is used, as before.

    Raises:
        ValueError: If ``input_size`` is smaller than 8, which would leave no
            spatial features after the three pooling stages.
    """

    def __init__(self, num_classes, input_size=None):
        super().__init__()
        if input_size is None:
            # Backward-compatible default: the global used by the transforms.
            input_size = img_size

        # Each of the three 2x2 poolings halves the side (rounding down).
        feature_side = input_size // 8
        if feature_side < 1:
            raise ValueError(f"input_size must be at least 8, got {input_size}")

        self.conv_stack = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.flatten = nn.Flatten()
        self.fc = nn.Sequential(
            nn.Linear(128 * feature_side * feature_side, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        x = self.conv_stack(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

In [None]:
# Build the network with one output per class, move it to the selected
# device, and print its layer structure.
model = CNNModel(num_classes=len(class_names))
model = model.to(device)
print(model)

In [None]:
# Cross-entropy loss on the model outputs, optimised with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=1e-3)

In [None]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one training epoch over ``dataloader``.

    Puts the model in training mode and performs one optimizer step per
    batch. The batch loss and the number of samples processed so far are
    printed every 100 batches; the elapsed time is printed at the end.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches exposing a
            ``dataset`` attribute with a length.
        model: Module to train.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer that holds the model's parameters.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so the function does not rely on a notebook-level global.

    Returns:
        None.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    size = len(dataloader.dataset)
    model.train()
    print("Starting training loop...")

    seen = 0  # running sample count; stays correct even when a batch is short
    epoch_start_time = time.time()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients so they do not accumulate into the next batch

        seen += len(X)
        if batch % 100 == 0:
            loss_val = loss.item()
            print(f"loss: {loss_val:>7f}  [{seen:>5d}/{size:>5d}]")

    epoch_duration = time.time() - epoch_start_time
    print(f"Epoch completed in {epoch_duration:.2f} seconds")

In [None]:
def evaluate(dataloader, model, loss_fn, name="Test", device=None):
    """Evaluate ``model`` on ``dataloader`` and print accuracy, loss and timing.

    Puts the model in evaluation mode and runs without gradient tracking. The
    reported loss is the mean of the per-batch losses; the accuracy is the
    share of correctly classified samples in the whole dataset.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches exposing a
            ``dataset`` attribute with a length.
        model: Module to evaluate.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        name: Prefix used in the printed report (e.g. "Validation", "Test").
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so the function does not rely on a notebook-level global.

    Returns:
        None.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()

    epoch_start_time = time.time()  # record when the evaluation starts
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    epoch_end_time = time.time()  # record when the evaluation ends
    epoch_duration = epoch_end_time - epoch_start_time  # elapsed evaluation time

    # An empty loader has nothing to average; report NaN instead of dividing by zero.
    test_loss = test_loss / num_batches if num_batches else float("nan")
    correct = correct / size if size else float("nan")
    print(f"{name} Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:.4f}")
    print(f"{name} evaluation completed in {epoch_duration:.2f} seconds\n")

In [None]:
# Train for a fixed number of epochs, checking validation metrics after each one.
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader=train_loader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    evaluate(dataloader=valid_loader, model=model, loss_fn=loss_fn, name="Validation")


In [None]:
# Report the metrics of the trained model on the held-out test split.
evaluate(dataloader=test_loader, model=model, loss_fn=loss_fn, name="Test")
print("Testing complete!")