In [27]:
from torchvision import datasets, transforms
import torch
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
from torch.utils.data import random_split, DataLoader
import matplotlib.pyplot as plt
import numpy as np

from cnn_model import CNNModel
from vit_model import ViTModel

In [28]:
# Side lengths (in pixels) of the square input the ViT is fed; the padding
# configured later in the notebook targets this size.
# These constants are not referenced by the visible cells yet.
TARGET_WIDTH, TARGET_HEIGHT = 224, 224



In [29]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Define a custom sorting key function
def sorting_key(item):
    """Return the integer written before the first '.' of ``item``.

    Used as a ``key=`` function so that names such as ``"10.jpg"`` sort after
    ``"2.jpg"`` (numeric order) instead of before it (lexicographic order).

    Args:
        item: A string such as a file name, e.g. ``"42.png"``.

    Returns:
        The leading part of ``item`` (everything before the first dot)
        converted with ``int``.

    Raises:
        ValueError: If that leading part is not a valid integer literal.
    """
    leading_part, _, _ = item.partition('.')
    return int(leading_part)

class CustomDataset(Dataset):
    """Dataset yielding the images of one folder in numeric file-name order.

    Only files whose names end in ``.jpg``, ``.jpeg`` or ``.png`` are used.
    They are ordered with ``sorting_key``, so every such file name must start
    with an integer followed by a dot (e.g. ``"17.jpg"``).
    """

    def __init__(self, folder_path, transform=None):
        """Index the image files found directly inside ``folder_path``.

        Args:
            folder_path: Directory that contains the image files.
            transform: Optional callable applied to each RGB ``PIL.Image``
                before it is returned by ``__getitem__``.

        Raises:
            ValueError: Propagated from ``sorting_key`` when an image file
                name does not start with an integer.
        """
        self.folder_path = folder_path
        self.transform = transform
        self.image_files = sorted(
            [f for f in os.listdir(folder_path) if f.endswith(('.jpg', '.jpeg', '.png'))],
            key=sorting_key,
        )

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load the ``idx``-th image as RGB and apply the optional transform.

        Returns:
            The transformed image, or the RGB ``PIL.Image`` itself when no
            transform was supplied.
        """
        img_name = os.path.join(self.folder_path, self.image_files[idx])
        # Image.open is lazy and keeps the file handle open; the context
        # manager closes it deterministically once convert() has produced an
        # independent in-memory copy.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert("RGB")  # Convert to RGB if needed

        # Compare against None explicitly so a transform object that happens
        # to be falsy (e.g. an empty container module) is still applied.
        if self.transform is not None:
            image = self.transform(image)

        return image

# Preprocessing pipeline applied to every image before it is fed to the ViT.
#
# torchvision's Pad takes its 4-tuple in (left, top, right, bottom) order.
# With (95, 80, 96, 80) an image that is 33 px wide and 64 px tall comes out
# at 224x224. Because Pad runs after Normalize, the fill value 0 is written in
# normalised space.
#
# Alternative padding kept for reference (the same 33x64 input would come out
# at 256x256):
#transforms.Pad((111, 96, 112, 96), fill=0)
transform = transforms.Compose(
    [
        # Convert the image to a PyTorch tensor
        transforms.ToTensor(),
        # Normalize the image channel-wise
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        # Add padding for ViT (target is 224*224)
        transforms.Pad(padding=(95, 80, 96, 80), fill=0),
    ]
)

# Path to your image folder
#folder_path = "inference_dataset/non-holo/video_0"

# Create dataset and data loader
#dataset = CustomDataset(folder_path, transform=transform)
#dataloader = DataLoader(dataset, batch_size=128, shuffle=False)

# Now you can use this dataloader for inference with your PyTorch Lightning model


In [30]:
# Load the ViT model
CHECKPOINT_PATH = "models/best-ViT.ckpt"
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook can still be executed.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model best.ckpt")
best_model = ViTModel.load_from_checkpoint(
    CHECKPOINT_PATH,
    map_location=DEVICE,  # remap tensors stored in the checkpoint onto DEVICE
    num_classes=1,
    lr=0.0001,
).to(DEVICE)
# load_from_checkpoint leaves the module in training mode; switch to evaluation
# mode so dropout / normalisation layers behave deterministically at inference.
best_model.eval()
best_model

Loading model best.ckpt


  model = create_fn(


ViTModel(
  (pretrained_ViT): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=192, out_features=576, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=192, out_features=192, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=192, out_features=768, bias=True)
          (act): GELU(approximate='non

In [36]:
# How many video folders (video_0 ... video_{NUMBER_OF_VIDEOS - 1}) to process.
NUMBER_OF_VIDEOS = 15

# One folder per video, listed in ascending index order.
video_paths = [
    f'inference_dataset/non-holo/video_{index}'
    for index in range(0, NUMBER_OF_VIDEOS)
]
video_paths

['inference_dataset/non-holo/video_0',
 'inference_dataset/non-holo/video_1',
 'inference_dataset/non-holo/video_2',
 'inference_dataset/non-holo/video_3',
 'inference_dataset/non-holo/video_4',
 'inference_dataset/non-holo/video_5',
 'inference_dataset/non-holo/video_6',
 'inference_dataset/non-holo/video_7',
 'inference_dataset/non-holo/video_8',
 'inference_dataset/non-holo/video_9',
 'inference_dataset/non-holo/video_10',
 'inference_dataset/non-holo/video_11',
 'inference_dataset/non-holo/video_12',
 'inference_dataset/non-holo/video_13',
 'inference_dataset/non-holo/video_14']

In [37]:
import json

import cv2

# Sigmoid scores strictly below this value are counted as "holo"
# (0 is holo and 1 is non-holo for this model).
HOLO_SCORE_THRESHOLD = 0.3
# Minimum inverted score (1 - sigmoid) a segment needs in order to be drawn.
VISUAL_THRESHOLD = 0
# Factor that maps an inverted score in [0, 1] to a grayscale pixel value.
# NOTE(review): 225 is kept from the original code; if the full 8-bit range was
# intended this should presumably be 255 - confirm before changing.
GRAYSCALE_SCALE = 225

# Specify the dimensions of the image
width = 615
height = 432

# Run on whatever device the model already lives on instead of hard-coding it,
# and make sure the model is in evaluation mode (idempotent if already set).
inference_device = next(best_model.parameters()).device
best_model.eval()

percentage_results = []
for video_index, video_path in enumerate(video_paths):
    # Create dataset and data loader
    dataset = CustomDataset(video_path, transform=transform)
    dataloader = DataLoader(dataset, batch_size=128, shuffle=False)

    # Collect the per-batch score arrays, in dataset order
    all_results = []
    with torch.no_grad():  # No need to compute gradients during inference
        for batch in dataloader:
            logits = best_model(batch.to(inference_device))
            # reshape(-1) instead of squeeze(): squeeze() would turn a final
            # batch holding a single image into a 0-d tensor, which
            # np.concatenate rejects. Assumes one logit per image
            # (num_classes=1) - TODO confirm against ViTModel.
            scores = torch.sigmoid(logits.reshape(-1))
            all_results.append(scores.cpu().numpy())

    # Concatenate the results into a single numpy array
    all_results = np.concatenate(all_results)

    # Share of segments whose score falls below the holo threshold
    holo_segment_count = np.sum(all_results < HOLO_SCORE_THRESHOLD)
    percentage = holo_segment_count / len(all_results) * 100
    percentage_results.append(percentage)
    print(f"{percentage}% of the segments were considered as holo")

    ########Generate the visual representation
    # segments[i] is iterated as a list of {'x': ..., 'y': ...} pixel
    # coordinates belonging to the i-th image of the dataset.
    with open(os.path.join(video_path, 'segments.json'), 'r') as f:
        segments = json.load(f)

    # Create a black image
    hologram_reproduction_image = np.zeros((height, width), dtype=np.uint8)

    for index, score in enumerate(all_results):
        holo_confidence = 1 - score  # Reverse because 0 is holo and 1 is non-holo
        if holo_confidence <= VISUAL_THRESHOLD:
            continue
        pixel_value = holo_confidence * GRAYSCALE_SCALE  # Turn it into a grayscale pixel value
        for coordinates in segments[index]:  # Coordinates of the corresponding segment
            y, x = coordinates['y'], coordinates['x']
            # Overlapping segments keep the brightest value seen so far.
            if hologram_reproduction_image[y, x] < pixel_value:
                hologram_reproduction_image[y, x] = pixel_value

    # Write the image to disk
    output_path = f"inference_result_video_{video_index}.jpg"
    # cv2.imwrite reports failure through its return value rather than raising.
    if not cv2.imwrite(output_path, hologram_reproduction_image):
        print(f"WARNING: could not write {output_path}")

percentage_results

22.14% of the segments were considered as holo
1.1900000000000002% of the segments were considered as holo
5.66% of the segments were considered as holo
0.52% of the segments were considered as holo
2.36% of the segments were considered as holo
4.51% of the segments were considered as holo
1.1400000000000001% of the segments were considered as holo
1.83% of the segments were considered as holo
2.1399999999999997% of the segments were considered as holo
0.42% of the segments were considered as holo
0.03% of the segments were considered as holo
3.91% of the segments were considered as holo
0.33% of the segments were considered as holo
0.63% of the segments were considered as holo
2.06% of the segments were considered as holo


[22.14,
 1.1900000000000002,
 5.66,
 0.52,
 2.36,
 4.51,
 1.1400000000000001,
 1.83,
 2.1399999999999997,
 0.42,
 0.03,
 3.91,
 0.33,
 0.63,
 2.06]

In [5]:
# Sanity check: shape of the first batch produced by the most recently created
# `dataloader`.
next(iter(dataloader)).shape

torch.Size([128, 3, 224, 224])

Loading model best.ckpt


  model = create_fn(


ViTModel(
  (pretrained_ViT): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=192, out_features=576, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=192, out_features=192, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=192, out_features=768, bias=True)
          (act): GELU(approximate='non

In [7]:


# Now you have all the inference results in order in `all_results`


  return F.conv2d(input, weight, bias, self.stride,


In [8]:
# Peek at the first 100 entries of `all_results` (the scores of whichever
# video was processed most recently).
all_results[:100]

array([0.41945726, 0.8979322 , 0.11893477, 0.90015894, 0.80030566,
       0.42777866, 0.7068645 , 0.8317218 , 0.6697964 , 0.4364496 ,
       0.9226982 , 0.899286  , 0.9460429 , 0.85258937, 0.87987524,
       0.88957113, 0.44141182, 0.65450335, 0.7830512 , 0.80017775,
       0.5513655 , 0.5558293 , 0.14128192, 0.8991721 , 0.6612003 ,
       0.9226816 , 0.8456048 , 0.6272571 , 0.85766125, 0.8895159 ,
       0.4066031 , 0.87853104, 0.4161484 , 0.8586633 , 0.79706913,
       0.8444689 , 0.9249105 , 0.67386115, 0.91392773, 0.7947407 ,
       0.8600748 , 0.8973306 , 0.6510875 , 0.9263765 , 0.82937837,
       0.8819239 , 0.91117287, 0.88095325, 0.8273515 , 0.7667948 ,
       0.9136046 , 0.7580514 , 0.9349749 , 0.8580112 , 0.831481  ,
       0.68360466, 0.9287972 , 0.810836  , 0.8223137 , 0.90773886,
       0.74768615, 0.91920847, 0.88625157, 0.8715999 , 0.7892127 ,
       0.9169009 , 0.90976095, 0.74834645, 0.7936889 , 0.739724  ,
       0.92919356, 0.88499975, 0.88722014, 0.85915905, 0.74911

3.2% of the segments were considered as holo


# Read the json

5000

True