In [1]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from utils_cells import get_images_list, transform_image, transform_target, resize_with_padding
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
import torchvision.transforms.functional as F
import torch
from torchvision import transforms
from torchvision.transforms import functional as F
import cv2
from sklearn.model_selection import train_test_split
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
from torchmetrics import Precision, Recall
import numpy as np
import datetime
import random
import time
import torchvision.models as models

import random

class ImageDataset(Dataset):
    """Dataset of cell images labelled by the name of their parent directory.

    Every file found under ``cells_final/<class_name>/`` becomes one sample.
    ``__getitem__`` returns the image as a float tensor resized to 32x32 and a
    4-element one-hot target in the order
    ``normal, inflammatory, tumor, other``.

    NOTE(review): the directory is hard-coded; the ``data_path`` constructor
    argument is accepted but not used to locate the images. Confirm whether
    it should replace ``cells_final``.
    """

    # Directory scanned by ``load_dataset``; one sub-directory per class.
    _DATA_ROOT = 'cells_final'
    # Class name -> position of the 1 in the one-hot target vector.
    _CLASS_TO_INDEX = {'normal': 0, 'inflammatory': 1, 'tumor': 2, 'other': 3}

    def __init__(self, data_path, transform=None, target_transform=None, reduce=False):
        """Build the sample table.

        Args:
            data_path: Forwarded to ``load_dataset``, which currently ignores it.
            transform: Optional callable invoked as ``transform(image=img)``;
                its result is indexed with ``['image']``
                (presumably an albumentations-style pipeline — TODO confirm).
            target_transform: Stored on the instance but never applied.
            reduce: Unused; kept for interface compatibility.
        """
        self.transform = transform
        self.target_transform = target_transform
        # NOTE(review): ``__getitem__`` looks rows up with ``.loc`` (by index
        # label). If ``shuffle`` keeps the original labels, the permutation
        # does not change which sample a given ``idx`` maps to — confirm intent.
        self.dataset = shuffle(self.load_dataset(data_path))

    @staticmethod
    def _encode_target(label):
        """Return the one-hot list for ``label`` (surrounding whitespace ignored).

        Raises:
            ValueError: If the label is not one of the four known classes.
        """
        index = ImageDataset._CLASS_TO_INDEX.get(label.strip())
        if index is None:
            raise ValueError(
                f'Unknown class label {label!r}; expected one of '
                f'{sorted(ImageDataset._CLASS_TO_INDEX)}'
            )
        one_hot = [0, 0, 0, 0]
        one_hot[index] = 1
        return one_hot

    def load_dataset(self, path):
        """Scan the image directory and return a DataFrame of samples.

        Args:
            path: Currently ignored (see class docstring); the scan always
                starts at ``cells_final``.

        Returns:
            DataFrame with columns ``path`` (``cells_final/<class>/<file>``)
            and ``class`` (the sub-directory name).
        """
        image_paths = []
        classes = []
        for image_class in os.listdir(self._DATA_ROOT):
            for img in os.listdir(f'{self._DATA_ROOT}/{image_class}'):
                image_paths.append(f'{self._DATA_ROOT}/{image_class}/{img}')
                classes.append(image_class)

        return pd.DataFrame({'path': image_paths, 'class': classes})

    def __len__(self):
        """Number of samples in the table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Load, preprocess and label the sample whose index label is ``idx``.

        Returns:
            Tuple ``(image, target)``: ``image`` is a float tensor produced by
            ``F.to_tensor`` from the 32x32 RGB array, ``target`` is a float
            tensor holding the 4-element one-hot class vector.

        Raises:
            OSError: If OpenCV cannot read the image file.
            ValueError: If the sample's class name is not recognised.
        """
        image_path = self.dataset['path'].loc[idx]
        image = cv2.imread(f'{image_path}')
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cv2.imread could not read image: {image_path}')

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (32, 32), interpolation=cv2.INTER_CUBIC)

        # Scale to float32; the division by 255 assumes 8-bit pixel data.
        image = image.astype(np.float32)
        image = image / 255.0
        if self.transform is not None:
            image = self.transform(image=image)['image']

        target_ = self._encode_target(self.dataset['class'].loc[idx])

        image = F.to_tensor(image)

        return image.float(), torch.Tensor(np.array(target_, dtype=np.float32))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn as nn
from einops import rearrange

import torch
import torch.nn as nn
import torch.optim as optim

import torch
import torch.nn as nn
import torch.optim as optim

class ConvAutoencoder(nn.Module):
    """Convolutional autoencoder for 3x32x32 inputs with a 128-value bottleneck.

    ``encoder`` halves the spatial size three times with strided convolutions
    and projects the flattened 64x4x4 map to 128 values; ``decoder`` mirrors
    that path with transposed convolutions and ends in a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # Shared hyper-parameters of every (transposed) convolution stage.
        down = dict(kernel_size=3, stride=2, padding=1)
        up = dict(kernel_size=3, stride=2, padding=1, output_padding=1)
        bottleneck_in = 64 * 4 * 4
        latent_dim = 128

        # Encoder: [3, 32, 32] -> [16, 16, 16] -> [32, 8, 8] -> [64, 4, 4] -> 128
        encoder_layers = [
            nn.Conv2d(3, 16, **down),
            nn.ReLU(),
            nn.Conv2d(16, 32, **down),
            nn.ReLU(),
            nn.Conv2d(32, 64, **down),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(bottleneck_in, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: 128 -> [64, 4, 4] -> [32, 8, 8] -> [16, 16, 16] -> [3, 32, 32]
        decoder_layers = [
            nn.Linear(latent_dim, bottleneck_in),
            nn.Unflatten(dim=1, unflattened_size=(64, 4, 4)),
            nn.ConvTranspose2d(64, 32, **up),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, **up),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, **up),
            nn.Sigmoid(),  # keeps reconstructed pixel values in [0, 1]
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the latent vector and return its reconstruction."""
        latent = self.encoder(x)
        return self.decoder(latent)

# Example usage


# Instantiate the autoencoder; later cells reuse this `model` object for
# training and feature extraction.
model = ConvAutoencoder()

# Smoke test: push one random 3x32x32 sample through the network and check
# that the reconstruction has the same shape as the input.
x = torch.randn(1, 3, 32, 32)
reconstructed_img = model(x)

print(reconstructed_img.shape)  # Should output torch.Size([1, 3, 32, 32])


torch.Size([1, 3, 32, 32])


In [3]:
# Move the model to the GPU. The device string is hard-coded, so this cell
# (and the later cells that send inputs to 'cuda') needs a CUDA-capable device.
model = model.to('cuda')

In [None]:
import time
import torch
import wandb
from torch.utils.data import DataLoader
import numpy as np

# Run identifier. NOTE: wandb is imported above, but no wandb run is started
# in this cell; the name is only used for the checkpoint file.
# The timestamp is formatted explicitly so the file name contains no spaces
# or colons (str(datetime.now()) produces both).
run_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
run_name = f'conv_autoencoder_training_{run_timestamp}'

# Configuration
batch_size = 2048
learning_rate = 1e-3
num_epochs = 200
early_stop_patience = 15  # Number of epochs to wait for improvement
checkpoint_dir = 'training_checkpoints'
run_path = f'{checkpoint_dir}/{run_name}'
# torch.save does not create missing parent directories.
os.makedirs(checkpoint_dir, exist_ok=True)

# Train on whatever device the model already lives on, so inputs and
# parameters can never end up on different devices.
device = next(model.parameters()).device

# DataLoader
trainset = ImageDataset(data_path='train_data')
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=3)

# Loss function and optimizer (the model itself comes from the earlier cells)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Early stopping state. The monitored quantity is the *training* loss; no
# validation split is used in this cell.
best_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    print('========================================')
    print(f'EPOCH: {epoch}')
    time_start = time.perf_counter()
    model.train()

    epoch_loss = 0
    for batch_idx, (inputs, _) in enumerate(trainloader):
        # Labels are discarded: the autoencoder is trained to reproduce its input.
        inputs = inputs.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)  # Reconstruction loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch mean losses.
    avg_loss = epoch_loss / len(trainloader)
    print(f'Epoch {epoch} Average Loss: {avg_loss}')

    if avg_loss < best_loss:
        # New best epoch: checkpoint the weights and restart the patience count.
        best_loss = avg_loss
        torch.save(model.state_dict(), f'{run_path}.pth')
        print(f'Saved new best model with loss {best_loss}')
        patience_counter = 0  # Reset patience counter
    else:
        patience_counter += 1

    if patience_counter >= early_stop_patience:
        print(f'Early stopping at epoch {epoch} with best loss {best_loss}')
        break

    time_epoch = time.perf_counter() - time_start
    print(f'Epoch {epoch} time: {time_epoch/60} minutes')
    print('--------------------------------')

# Load the best model state dict
print(f'Loading model from {run_path}.pth')
model.load_state_dict(torch.load(f'{run_path}.pth', map_location=device))

EPOCH: 0
Epoch 0 Average Loss: 0.021730402668349837
Saved new best model with loss 0.021730402668349837
Epoch 0 time: 0.10827679341665923 minutes
--------------------------------
EPOCH: 1
Epoch 1 Average Loss: 0.006779211135213508
Saved new best model with loss 0.006779211135213508
Epoch 1 time: 0.10665459533332372 minutes
--------------------------------
EPOCH: 2
Epoch 2 Average Loss: 0.004764272828316807
Saved new best model with loss 0.004764272828316807
Epoch 2 time: 0.10788177176665764 minutes
--------------------------------
EPOCH: 3
Epoch 3 Average Loss: 0.003518439017505826
Saved new best model with loss 0.003518439017505826
Epoch 3 time: 0.10754923578333546 minutes
--------------------------------
EPOCH: 4
Epoch 4 Average Loss: 0.0028208050965699122
Saved new best model with loss 0.0028208050965699122
Epoch 4 time: 0.10711738325000321 minutes
--------------------------------
EPOCH: 5
Epoch 5 Average Loss: 0.002488949100217341
Saved new best model with loss 0.002488949100217341

In [None]:
# Restore the best checkpoint written by the training cell, then encode every
# training image with the encoder half of the autoencoder.
model.load_state_dict(torch.load(f'{run_path}.pth'))
features = []  # one (1, 128) latent array per sample
classes = []   # matching one-hot target arrays
paths = []     # not filled in this cell
model.eval()
trainset = ImageDataset(data_path='train_data')
torch.multiprocessing.set_sharing_strategy('file_system')
model = model.to('cuda')
with torch.no_grad():
    # Iterate over *all* samples; the previous upper bound of
    # len(trainset) - 1 silently dropped the last one.
    for idx in range(len(trainset)):
        img, cls = trainset[idx]
        classes.append(cls.cpu().detach().numpy())
        # The encoder expects a batch axis, so add one for the single image.
        feature = model.encoder(img.to('cuda').reshape(1, 3, 32, 32))
        features.append(feature.cpu().detach().numpy())


    


In [None]:
# Collapse each one-hot target to its integer class index
# (position of the largest entry along axis 1).
cls = np.asarray(classes).argmax(axis=1)

In [None]:
# Sanity check: how many latent vectors are currently held in `features`.
len(features)

In [None]:
# Flatten each latent array to 1-D (drops the leading batch axis of size 1).
# reshape(-1) is idempotent, so re-running this cell leaves the vectors
# unchanged, whereas indexing with [0] would reduce them to scalars on a
# second run.
features = [np.asarray(feature).reshape(-1) for feature in features]

In [None]:
# Stack the per-sample feature vectors into a single array (one row per sample)
# for the scaling / dimensionality-reduction cells below.
arr = np.array(features)

In [None]:
# Empty frame; later cells add the t-SNE / PCA / UMAP coordinates and the
# class index as columns.
df = pd.DataFrame()
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap.umap_ as umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
import sklearn

# Import the scaler explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.preprocessing` submodule has been loaded.
from sklearn.preprocessing import StandardScaler

# Fixed seed so the stochastic embeddings are repeatable across runs.
embedding_seed = 42

# Standardise every latent dimension (zero mean, unit variance) before
# computing the embeddings.
scaler = StandardScaler()
arr = scaler.fit_transform(np.array(arr))

# t-SNE
tsne = TSNE(
    n_components=3,
    metric="euclidean",
    n_jobs=-1,
    perplexity=3.0,
    early_exaggeration=30,
    random_state=embedding_seed,
)
tsne_embs = tsne.fit_transform(arr)
df['tsne_x'] = tsne_embs[:, 0]
df['tsne_y'] = tsne_embs[:, 1]
df['tsne_z'] = tsne_embs[:, 2]

# PCA
pca = PCA(n_components=3, random_state=embedding_seed)
pca_embs = pca.fit_transform(arr)
df['pca_x'] = pca_embs[:, 0]
df['pca_y'] = pca_embs[:, 1]
df['pca_z'] = pca_embs[:, 2]

# UMAP
# NOTE(review): left unseeded on purpose — umap-learn reportedly falls back to
# single-threaded execution when random_state is set; confirm before seeding.
umap_reducer = umap.UMAP(n_components=3, metric="euclidean", n_jobs=-1)
umap_embs = umap_reducer.fit_transform(arr)
df['umap_x'] = umap_embs[:, 0]
df['umap_y'] = umap_embs[:, 1]
df['umap_z'] = umap_embs[:, 2]

In [None]:
# Attach the integer class index so it can drive the point colours.
df['class'] = cls

# One wide figure holding three side-by-side 3D scatter plots.
fig = plt.figure(figsize=(18, 6))

# (subplot position, panel title, (x, y, z) column names)
panels_3d = [
    (131, '3D t-SNE', ('tsne_x', 'tsne_y', 'tsne_z')),
    (132, '3D PCA', ('pca_x', 'pca_y', 'pca_z')),
    (133, '3D UMAP', ('umap_x', 'umap_y', 'umap_z')),
]

for position, title, (x_col, y_col, z_col) in panels_3d:
    axis = fig.add_subplot(position, projection='3d')
    sc = axis.scatter(df[x_col], df[y_col], df[z_col], c=df['class'], cmap='tab10', alpha=0.1)
    axis.set_title(title)
    axis.set_xlabel(x_col)
    axis.set_ylabel(y_col)
    axis.set_zlabel(z_col)
    plt.colorbar(sc, ax=axis)

# Tighten spacing between panels, then render.
plt.tight_layout()
plt.show()

In [None]:
# Persist the embedding coordinates and class labels to a CSV file in the
# current working directory.
df.to_csv('res_red.csv')

In [None]:
import matplotlib.pyplot as plt

# One wide figure holding three side-by-side 2D scatter plots.
fig = plt.figure(figsize=(18, 6))

# Each panel shows the 2nd and 3rd embedding components (the *_y and *_z
# columns). Axis labels are taken from the plotted column names so they
# cannot drift from the data; previously the axes were labelled *_x / *_y
# while *_y / *_z were drawn.
# (subplot position, panel title, horizontal column, vertical column)
panels_2d = [
    (131, '2D t-SNE', 'tsne_y', 'tsne_z'),
    (132, '2D PCA', 'pca_y', 'pca_z'),
    (133, '2D UMAP', 'umap_y', 'umap_z'),
]

for position, title, x_col, y_col in panels_2d:
    ax = fig.add_subplot(position)
    sc = ax.scatter(df[x_col], df[y_col], c=df['class'], cmap='tab10', alpha=0.5)
    ax.set_title(title)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    plt.colorbar(sc, ax=ax)

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()
