In [10]:
# import torch
# from transformers import CLIPModel, CLIPProcessor
# import torch.nn as nn
# from torch.optim import Adam

# # Load Teacher CLIP Model
# teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# teacher_model.eval()

# # Define a smaller student model by reducing the layers and hidden dimensions
# class StudentCLIPModel(nn.Module):
#     def __init__(self):
#         super(StudentCLIPModel, self).__init__()
#         # A smaller version of CLIP with fewer transformer layers and smaller hidden size
#         self.text_projection = nn.Linear(256, 128)
#         self.vision_projection = nn.Linear(384, 128)

#     def forward(self, text_features, vision_features):
#         # Forward pass through the projection layers
#         text_proj = self.text_projection(text_features)
#         vision_proj = self.vision_projection(vision_features)
#         return text_proj, vision_proj

# # Initialize Student Model
# student_model = StudentCLIPModel()




In [16]:
import os
import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from PIL import Image


def load_data(data_dir):
    images = []
    labels = []

    for label, folder in enumerate(['no_cell_phones', 'cell_phones']):
        folder_path = os.path.join(data_dir, folder)
        for filename in os.listdir(folder_path):
            if filename.endswith(('jpg', 'jpeg')):
                images.append(os.path.join(folder_path, filename))
                labels.append(label)

    return images, labels

images, labels = load_data('/home/ajeet/codework/datasets/train_clip/testing/')
dataset = {'image': images, 'label': labels}


from torch.utils.data import Dataset

class ImageLabelDataset(Dataset):
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = Image.open(self.images[idx]).convert("RGB")
        label = self.labels[idx]
        # encoding = self.processor(images=image, return_tensors="pt")
        inputs = self.processor(text=["no cell phone", "a cell phone"], images=image, return_tensors="pt", padding=True)
        return inputs, label

custom_dataset = ImageLabelDataset(images, labels)
print(custom_dataset)


model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
model.train()  # Set model to training mode


from torch.utils.data import DataLoader

dataloader = DataLoader(custom_dataset, batch_size=16, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

num_epochs = 5

for epoch in range(num_epochs):
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()

        # Initialize the total loss for the batch
        total_loss = 0.0

        # Process each image in the batch
        for i in range(inputs['pixel_values'].shape[0]):  # Assuming 'pixel_values' holds your images
            # single_image_input = {k: v[i:i+1] for k, v in inputs.items()}  # Take a single image from the batch
            single_image_input = {k: v[i:i+1] if v.ndim == 4 else v[i] for k, v in inputs.items()}  # Ensure the correct shape
            
            label = labels[i:i+1]  # Get the corresponding label
            
            # Forward pass
            outputs = model(**single_image_input)
            logits_per_image = outputs.logits_per_image  # Similarity scores
            
            # Calculate the loss for the single image
            loss = torch.nn.functional.cross_entropy(logits_per_image, label)

            # Accumulate the loss
            total_loss += loss

        # Average the loss over the batch
        total_loss /= inputs['pixel_values'].shape[0]

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss.item():.4f}")



# for epoch in range(num_epochs):
#     for batch in dataloader:
#         inputs, labels = batch
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(**inputs)
#         logits_per_image = outputs.logits_per_image  # Similarity scores
#         loss = torch.nn.functional.cross_entropy(logits_per_image, labels)

#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()

#         print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")



<__main__.ImageLabelDataset object at 0x7f400a6bffd0>
Epoch [1/5], Loss: 0.4878
Epoch [2/5], Loss: 0.0861
Epoch [3/5], Loss: 0.0000
Epoch [4/5], Loss: 0.0002
Epoch [5/5], Loss: 0.0023


In [25]:
# student_model

from PIL import Image
from transformers import CLIPModel, CLIPConfig, AutoProcessor

image = Image.open("/home/ajeet/codework/datasets/train_clip/testing/no_cell_phones/0_2.jpg")
text = ["no cell phone", "a cell phone"]
# text = ["a cell phone", "no cell phone"]

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[1.0000e+00, 1.2831e-10]])


In [50]:
from transformers import CLIPModel, CLIPConfig
import torch

def reduce_clip_layers(teacher_model: CLIPModel, num_text_layers: int, num_vision_layers: int) -> CLIPModel:
    config = teacher_model.config.to_dict()
    config['text_config']['num_hidden_layers'] = num_text_layers
    config['vision_config']['num_hidden_layers'] = num_vision_layers

    config['text_config']['intermediate_size'] = 512
    config['vision_config']['intermediate_size'] = 512

    config['text_config']['num_attention_heads'] = 2
    config['vision_config']['num_attention_heads'] = 2

    config['text_config']['hidden_size'] = 256
    config['vision_config']['hidden_size'] = 256
    
    student_config = CLIPConfig.from_dict(config)
    student_model = CLIPModel(student_config)
    
    return student_model

teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

num_text_layers = 1
num_vision_layers = 1

student_model = reduce_clip_layers(teacher_model, num_text_layers, num_vision_layers)

without_fine_tune = student_model

# torch.save(student_model.state_dict(), "student_clip_model.pth")
# print(f"Reduced student model with {num_text_layers} text layers and {num_vision_layers} vision layers created.")


In [51]:
import os
import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from PIL import Image


def load_data(data_dir):
    images = []
    labels = []

    for label, folder in enumerate(['no_cell_phone', 'cell_phone']):
        folder_path = os.path.join(data_dir, folder)
        for filename in os.listdir(folder_path):
            if filename.endswith(('jpg', 'jpeg')):
                images.append(os.path.join(folder_path, filename))
                labels.append(label)

    return images, labels

images, labels = load_data('/home/ajeet/codework/datasets/train_clip/')
dataset = {'image': images, 'label': labels}

val_images, val_labels = load_data('/home/ajeet/codework/datasets/val_clip/')  # Load validation dataset
val_dataset = {'image': val_images, 'label': val_labels}


from torch.utils.data import Dataset

class ImageLabelDataset(Dataset):
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = Image.open(self.images[idx]).convert("RGB")
        label = self.labels[idx]
        # encoding = self.processor(images=image, return_tensors="pt")
        inputs = self.processor(text=["no cell phone", "a cell phone"], images=image, return_tensors="pt", padding=True)
        return inputs, label

custom_dataset = ImageLabelDataset(images, labels)
print(custom_dataset)


# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
model = student_model
model.train()  # Set model to training mode


from torch.utils.data import DataLoader

dataloader = DataLoader(custom_dataset, batch_size=16, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()

        # Initialize the total loss for the batch
        total_loss = 0.0

        # Process each image in the batch
        for i in range(inputs['pixel_values'].shape[0]):  # Assuming 'pixel_values' holds your images
            # single_image_input = {k: v[i:i+1] for k, v in inputs.items()}  # Take a single image from the batch
            single_image_input = {k: v[i:i+1] if v.ndim == 4 else v[i] for k, v in inputs.items()}  # Ensure the correct shape
            
            label = labels[i:i+1]  # Get the corresponding label
            
            # Forward pass
            outputs = model(**single_image_input)
            logits_per_image = outputs.logits_per_image  # Similarity scores
            
            # Calculate the loss for the single image
            loss = torch.nn.functional.cross_entropy(logits_per_image, label)

            # Accumulate the loss
            total_loss += loss

        # Average the loss over the batch
        total_loss /= inputs['pixel_values'].shape[0]

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss.item():.4f}")

    # Validation phase
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient calculation
        for val_batch in val_dataloader:
            val_inputs, val_labels = val_batch
            val_outputs = model(**val_inputs)
            val_logits_per_image = val_outputs.logits_per_image

            # Get predictions
            _, predicted = torch.max(val_logits_per_image, 1)
            total += val_labels.size(0)  # Number of samples in this batch
            correct += (predicted == val_labels).sum().item()  # Count correct predictions

    accuracy = correct / total * 100  # Calculate accuracy as a percentage
    print(f"Epoch [{epoch + 1}/{num_epochs}], Validation Accuracy: {accuracy:.2f}%")


# for epoch in range(num_epochs):
#     for batch in dataloader:
#         inputs, labels = batch
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(**inputs)
#         logits_per_image = outputs.logits_per_image  # Similarity scores
#         loss = torch.nn.functional.cross_entropy(logits_per_image, labels)

#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()

#         print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")



<__main__.ImageLabelDataset object at 0x7f400a3bc160>
Epoch [1/2], Loss: 0.6746
Epoch [1/2], Loss: 0.6276
Epoch [1/2], Loss: 0.6168
Epoch [1/2], Loss: 0.5853
Epoch [1/2], Loss: 0.5254
Epoch [1/2], Loss: 0.5417
Epoch [1/2], Loss: 0.5847
Epoch [1/2], Loss: 0.4891
Epoch [1/2], Loss: 0.4704
Epoch [1/2], Loss: 0.5068
Epoch [1/2], Loss: 0.5623
Epoch [1/2], Loss: 0.5191
Epoch [1/2], Loss: 0.5949
Epoch [1/2], Loss: 0.4043
Epoch [1/2], Loss: 0.2761
Epoch [1/2], Loss: 0.6601
Epoch [1/2], Loss: 0.6240
Epoch [1/2], Loss: 0.4881
Epoch [1/2], Loss: 0.2672
Epoch [1/2], Loss: 0.3640
Epoch [1/2], Loss: 0.5205
Epoch [1/2], Loss: 0.4776
Epoch [1/2], Loss: 0.4811
Epoch [1/2], Loss: 0.3984
Epoch [1/2], Loss: 0.3199
Epoch [1/2], Loss: 0.3285
Epoch [1/2], Loss: 0.2825
Epoch [1/2], Loss: 0.3992
Epoch [1/2], Loss: 0.4582
Epoch [1/2], Loss: 0.3545
Epoch [1/2], Loss: 0.4635
Epoch [1/2], Loss: 0.4438
Epoch [1/2], Loss: 0.2570
Epoch [1/2], Loss: 0.3242
Epoch [1/2], Loss: 0.5414
Epoch [1/2], Loss: 0.4147
Epoch [1/2

In [59]:
# student_model

from PIL import Image
from transformers import CLIPModel, CLIPConfig, AutoProcessor

image = Image.open("/home/ajeet/codework/datasets/train_clip/val/no_cell_phone/0_2873.jpg")
text = ["no cell phone", "a cell phone"]
# text = ["a cell phone", "no cell phone"]

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[1.2410e-06, 1.0000e+00]])


In [40]:
# student_model

from PIL import Image
from transformers import CLIPModel, CLIPConfig, AutoProcessor

image = Image.open("/home/ajeet/codework/datasets/train_clip/testing/no_cell_phones/0_2.jpg")
text = ["no cell phone", "a cell phone"]
# text = ["a cell phone", "no cell phone"]

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = without_fine_tune(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[0.8286, 0.1714]])


In [55]:
import os
import shutil

def copy_images(source_dir, destination_dir, num_images=10000):
    # Create destination directory if it does not exist
    os.makedirs(destination_dir, exist_ok=True)

    # Get all image files from the source directory
    images = [f for f in os.listdir(source_dir) if f.lower().endswith(('.jpg', '.jpeg'))]

    # Limit the number of images to copy
    images_to_copy = images[:14000]

    for image in images_to_copy:
        # Construct full file path
        src = os.path.join(source_dir, image)
        dst = os.path.join(destination_dir, image)
        
        # Copy the file
        shutil.copy2(src, dst)
        # print(f"Copied: {image}")

    print(f"Copied {len(images_to_copy)} images from {source_dir} to {destination_dir}.")


    for image in images[14000:]:
        # Construct full file path
        src = os.path.join("/home/ajeet/codework/datasets/Cellphone_train/train", image)
        dst = os.path.join("/home/ajeet/codework/datasets/train_clip/val/cell_phone", image)
        
        # Copy the file
        shutil.copy2(src, dst)
        print(f"Copied: {image}")

# Example usage
source_folder = '/home/ajeet/codework/datasets/Cellphone_train/train'  # Replace with your source folder path
destination_folder = '/home/ajeet/codework/datasets/train_clip/train/cell_phone'  # Replace with your destination folder path
copy_images(source_folder, destination_folder)


Copied 14000 images from /home/ajeet/codework/datasets/Cellphone_train/train to /home/ajeet/codework/datasets/train_clip/train/cell_phone.
Copied: 23409_mob_jitter.jpg
Copied: 26635_mob_poison.jpg
Copied: 89441_2020-07-14_12263__mob_poison.jpg
Copied: 71713_2020-06-10_459_.jpg
Copied: 94883_2020-07-17_12329_.jpg
Copied: 740_dec_bright.jpg
Copied: 44334_2020-04-14_9222_.jpg
Copied: 7125_mob_poison.jpg
Copied: 48112_2020-04-22_9235__mob_jitter.jpg
Copied: 26108_mob_poison.jpg
Copied: 24508_mob_poison.jpg
Copied: fti_Claire.Campbell@fti-acuity.com_136586_2020-08-21_14542__mob_poison.jpg
Copied: 25568_mob_jitter.jpg
Copied: 24511.jpg
Copied: 96220_2020-07-20_17404__mob_jitter.jpg
Copied: 96410_2020-07-20_13658__mob_poison.jpg
Copied: 106740_2020-07-29_12830_.jpg
Copied: 8220.jpg
Copied: 26233_mob_jitter.jpg
Copied: concentrix_dhivya.n@concentrix.com_111915_2020-08-03_22030_.jpg
Copied: 26403_mob_poison.jpg
Copied: 42661_2020-04-08_19367_.jpg
Copied: 82198_2020-07-06_16183__mob_jitter.jpg
C

In [12]:
total_params = sum(p.numel() for p in student_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

0.82176
Model size in memory: 0.31 MB


In [None]:
# Define loss function (e.g., MSE loss to match student with teacher)
criterion = nn.MSELoss()

# Define optimizer
optimizer = Adam(student_model.parameters(), lr=1e-4)

# Prepare input data
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = processor(text=["a photo of a cat"], images=["cat.jpg"], return_tensors="pt", padding=True)

# Get teacher model outputs
with torch.no_grad():
    teacher_outputs = teacher_model(**inputs)
    teacher_text_features = teacher_outputs.text_embeds
    teacher_vision_features = teacher_outputs.image_embeds

# Training Loop
for epoch in range(10):
    optimizer.zero_grad()

    # Forward pass through the student model
    student_text_features, student_vision_features = student_model(
        teacher_text_features, teacher_vision_features
    )

    # Compute loss
    loss = criterion(student_text_features, teacher_text_features) + \
           criterion(student_vision_features, teacher_vision_features)

    # Backward pass
    loss.backward()

    # Optimize
    optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss.item()}")

# # Save student model (you can apply quantization after training)
# torch.save(student_model.state_dict(), "student_clip_model.pth")

In [26]:
from transformers import CLIPModel, CLIPConfig
import torch

def reduce_clip_layers(teacher_model: CLIPModel, num_text_layers: int, num_vision_layers: int) -> CLIPModel:
    config = teacher_model.config.to_dict()
    config['text_config']['num_hidden_layers'] = num_text_layers
    config['vision_config']['num_hidden_layers'] = num_vision_layers

    config['text_config']['intermediate_size'] = 512
    config['vision_config']['intermediate_size'] = 512

    config['text_config']['num_attention_heads'] = 2
    config['vision_config']['num_attention_heads'] = 2

    config['text_config']['hidden_size'] = 256
    config['vision_config']['hidden_size'] = 256
    
    student_config = CLIPConfig.from_dict(config)
    student_model = CLIPModel(student_config)
    
    return student_model

teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

num_text_layers = 1
num_vision_layers = 1

student_model = reduce_clip_layers(teacher_model, num_text_layers, num_vision_layers)

# torch.save(student_model.state_dict(), "student_clip_model.pth")
# print(f"Reduced student model with {num_text_layers} text layers and {num_vision_layers} vision layers created.")


In [12]:
from transformers import CLIPModel, CLIPConfig
import torch

def reduce_clip_layers(teacher_model: CLIPModel, num_text_layers: int, num_vision_layers: int) -> CLIPModel:
    config = teacher_model.config.to_dict()
    config['text_config']['num_hidden_layers'] = num_text_layers
    config['vision_config']['num_hidden_layers'] = num_vision_layers

    config['text_config']['intermediate_size'] = 512
    config['vision_config']['intermediate_size'] = 512

    config['text_config']['num_attention_heads'] = 2
    config['vision_config']['num_attention_heads'] = 2

    config['text_config']['hidden_size'] = 256
    config['vision_config']['hidden_size'] = 256
    
    student_config = CLIPConfig.from_dict(config)
    student_model = CLIPModel(student_config)
    
    return student_model

teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

num_text_layers = 1
num_vision_layers = 1

student_model = reduce_clip_layers(teacher_model, num_text_layers, num_vision_layers)

# torch.save(student_model.state_dict(), "student_clip_model.pth")
# print(f"Reduced student model with {num_text_layers} text layers and {num_vision_layers} vision layers created.")


Reduced student model with 1 text layers and 1 vision layers created.


In [6]:
# Define loss function (e.g., MSE loss to match student with teacher)
import torch.nn as nn
from torch.optim import Adam
from transformers import CLIPModel, CLIPProcessor
from PIL import Image


criterion = nn.MSELoss()

# Define optimizer
optimizer = Adam(student_model.parameters(), lr=1e-4)

# Prepare input data
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("/home/ajeet/codework/datasets/video_incidents_ajeet/f044cf1d-8a5b-4703-a05c-3b57c5c14989_merged/0_34.jpg")
inputs = processor(text=["a cell phone"], images=image, return_tensors="pt", padding=True)

# Get teacher model outputs for distillation (used only during training)
with torch.no_grad():
    teacher_outputs = teacher_model(**inputs)
    teacher_text_features = teacher_outputs.text_embeds
    teacher_vision_features = teacher_outputs.image_embeds

# Training Loop (distillation process)
for epoch in range(25):
    optimizer.zero_grad()

    # Forward pass through the student model (on same inputs as teacher)
    student_outputs = student_model(**inputs)
    student_text_features = student_outputs.text_embeds
    student_vision_features = student_outputs.image_embeds

    # Compute loss by comparing student outputs to teacher outputs
    loss = criterion(student_text_features, teacher_text_features) + \
           criterion(student_vision_features, teacher_vision_features)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 0.007755913306027651
Epoch 1, Loss: 0.0072165182791650295
Epoch 2, Loss: 0.006696888245642185
Epoch 3, Loss: 0.006197899580001831
Epoch 4, Loss: 0.005751561839133501
Epoch 5, Loss: 0.005362555384635925
Epoch 6, Loss: 0.005026225931942463
Epoch 7, Loss: 0.004732792265713215
Epoch 8, Loss: 0.004468045197427273
Epoch 9, Loss: 0.004226185847073793
Epoch 10, Loss: 0.004005833528935909
Epoch 11, Loss: 0.0038033686578273773
Epoch 12, Loss: 0.0036125751212239265
Epoch 13, Loss: 0.003429584437981248
Epoch 14, Loss: 0.003254310227930546
Epoch 15, Loss: 0.0030884970910847187
Epoch 16, Loss: 0.0029321727342903614
Epoch 17, Loss: 0.0027829993050545454
Epoch 18, Loss: 0.002639773767441511
Epoch 19, Loss: 0.0025026327930390835
Epoch 20, Loss: 0.00237056496553123
Epoch 21, Loss: 0.002242261776700616
Epoch 22, Loss: 0.002118496224284172
Epoch 23, Loss: 0.0020005260594189167
Epoch 24, Loss: 0.0018875164678320289


In [None]:
class image_title_dataset():
    def __init__(self, list_image_path,list_txt):
        # Initialize image paths and corresponding texts
        self.image_path = list_image_path
        # Tokenize text using CLIP's tokenizer
        self.title  = clip.tokenize(list_txt)

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        # Preprocess image using CLIP's preprocessing function
        image = preprocess(Image.open(self.image_path[idx]))
        title = self.title[idx]
        return image, title

In [11]:
from transformers import CLIPModel, CLIPConfig, CLIPProcessor
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os

# Function to reduce the number of layers in the student CLIP model
def reduce_clip_layers(teacher_model: CLIPModel, num_text_layers: int, num_vision_layers: int) -> CLIPModel:
    # Get the config from the teacher model and modify the layer counts
    config = teacher_model.config.to_dict()
    config['text_config']['num_hidden_layers'] = num_text_layers
    config['vision_config']['num_hidden_layers'] = num_vision_layers

    config['text_config']['intermediate_size'] = 512
    config['vision_config']['intermediate_size'] = 512

    config['text_config']['num_attention_heads'] = 2
    config['vision_config']['num_attention_heads'] = 2

    config['text_config']['hidden_size'] = 256
    config['vision_config']['hidden_size'] = 256
    
    # Create a new student model configuration with fewer layers
    student_config = CLIPConfig.from_dict(config)
    student_model = CLIPModel(student_config)
    
    return student_model

# Custom dataset class for loading image-text pairs
class ImageTextDataset(Dataset):
    def __init__(self, image_folder, text_list, processor):
        self.image_folder = image_folder
        self.text_list = text_list
        self.processor = processor
        self.image_filenames = os.listdir(image_folder)

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        image_name = self.image_filenames[idx]
        image_path = os.path.join(self.image_folder, image_name)
        image = Image.open(image_path).convert("RGB")
        text = self.text_list[idx]
        
        # Preprocess the image and text using the processor
        inputs = self.processor(text=text, images=image, return_tensors="pt", padding=True, truncation=True)
        
        return inputs['pixel_values'][0], inputs['input_ids'][0]

teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
num_text_layers = 1
num_vision_layers = 1
student_model = reduce_clip_layers(teacher_model, num_text_layers, num_vision_layers)

# # Save the smaller student model
# torch.save(student_model.state_dict(), "student_clip_model.pth")

image_folder = "/home/ajeet/codework/datasets/train_clip/testing"  # Replace with your image folder path
text_list = ["a cell phone", "a cell phone"]  # Replace with your text data

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


dataset = ImageTextDataset(image_folder, text_list, processor)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)  # Set batch_size=1 for single image training


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
student_model.to(device)


optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-5)

num_epochs = 10
for epoch in range(num_epochs):
    for images, text_ids in dataloader:
        images = images.to(device)
        text_ids = text_ids.to(device)
        
        images = images.squeeze(1)
        optimizer.zero_grad()

        # Forward pass to get logits
        outputs = student_model(input_ids=text_ids, pixel_values=images)
        
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text
        
        # Create labels for cross-entropy loss
        labels = torch.arange(1).to(device)  # Labels for single image and text pair

        # Compute loss
        loss_i = F.cross_entropy(logits_per_image, labels)
        loss_t = F.cross_entropy(logits_per_text, labels)
        loss = (loss_i + loss_t) / 2
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

print("Training completed.")


Epoch [1/10], Loss: 0.0000
Epoch [1/10], Loss: 0.0000
Epoch [2/10], Loss: 0.0000
Epoch [2/10], Loss: 0.0000
Epoch [3/10], Loss: 0.0000
Epoch [3/10], Loss: 0.0000
Epoch [4/10], Loss: 0.0000
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000
Training completed.


In [34]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Custom dataset class
class CellPhoneDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Load images and labels
        for label, folder in enumerate(os.listdir(root_dir)):
            folder_path = os.path.join(root_dir, folder)
            for img_file in os.listdir(folder_path):
                self.image_paths.append(os.path.join(folder_path, img_file))
                self.labels.append(label)  # 0 for cell phone, 1 for no cell phone

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        label = self.labels[idx]

        # if self.transform:
        #     image = self.transform(image)

        return image, label

# Transformations
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),  # Resize to CLIP input size
#     transforms.ToTensor(),
# ])

# Create dataset and dataloader
dataset = CellPhoneDataset(root_dir='/home/ajeet/codework/datasets/train_clip')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Load the teacher model
teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    for images, labels in dataloader:
        optimizer.zero_grad()

        # Process images and texts with the processor
        inputs = processor(text=["a cell phone", "no cell phone"], images=images, return_tensors="pt", padding=True)

        # Get teacher model outputs
        with torch.no_grad():
            teacher_outputs = teacher_model(**inputs)
            teacher_text_features = teacher_outputs.text_embeds
            teacher_vision_features = teacher_outputs.image_embeds

        # Forward pass through the student model
        student_outputs = student_model(**inputs)
        student_text_features = student_outputs.text_embeds
        student_vision_features = student_outputs.image_embeds

        # Compute loss
        loss = criterion(student_text_features, teacher_text_features) + \
               criterion(student_vision_features, teacher_vision_features)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # optimizer.zero_grad()

        # # Process images and texts with the processor
        # inputs = processor(text=["a cell phone", "no cell phone"], images=images, return_tensors="pt", padding=True)

        # # Get teacher model outputs
        # with torch.no_grad():
        #     teacher_outputs = teacher_model(**inputs)
        #     teacher_text_features = teacher_outputs.text_embeds
        #     teacher_vision_features = teacher_outputs.image_embeds

        # # Forward pass through the student model
        # student_outputs = student_model(**inputs)
        # student_text_features = student_outputs.text_embeds
        # student_vision_features = student_outputs.image_embeds

        # # Compute the similarity scores (dot product or cosine similarity)
        # text_similarity = torch.matmul(student_text_features, teacher_text_features.T)
        # vision_similarity = torch.matmul(student_vision_features, teacher_vision_features.T)

        # # Compute loss using CrossEntropy on the similarities and labels
        # loss_text = criterion(text_similarity, labels)
        # loss_vision = criterion(vision_similarity, labels)
        # loss = loss_text + loss_vision

        # # Backward pass and optimization
        # loss.backward()
        # optimizer.step()


    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

# Save student model
torch.save(student_model.state_dict(), "student_clip_model.pth")


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>

In [7]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Define transformations
transform = transforms.Compose([
    # transforms.Resize((224, 224)),  # Resize to CLIP input size
    transforms.ToTensor(),           # Convert PIL Image to Tensor
])

# Custom dataset class
class CellPhoneDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Load images and labels
        for label, folder in enumerate(os.listdir(root_dir)):
            folder_path = os.path.join(root_dir, folder)
            for img_file in os.listdir(folder_path):
                self.image_paths.append(os.path.join(folder_path, img_file))
                self.labels.append(label)  # 0 for cell phone, 1 for no cell phone

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# Create dataset and dataloader
dataset = CellPhoneDataset(root_dir='/home/ajeet/codework/datasets/train_clip', transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Load the teacher model
teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    for images, labels in dataloader:
        optimizer.zero_grad()

        # Process images and texts with the processor
        inputs = processor(text=["a cell phone", "no cell phone"], images=images, return_tensors="pt", padding=True)

        # Get teacher model outputs
        with torch.no_grad():
            teacher_outputs = teacher_model(**inputs)
            teacher_text_features = teacher_outputs.text_embeds
            teacher_vision_features = teacher_outputs.image_embeds

        # Forward pass through the student model
        student_outputs = student_model(**inputs)
        student_text_features = student_outputs.text_embeds
        student_vision_features = student_outputs.image_embeds

        # Compute loss
        loss = criterion(student_text_features, teacher_text_features) + \
               criterion(student_vision_features, teacher_vision_features)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

# Save student model
torch.save(student_model.state_dict(), "student_clip_model.pth")


RuntimeError: stack expects each tensor to be equal size, but got [3, 180, 320] at entry 0 and [3, 240, 320] at entry 2

In [10]:
# student_model

from PIL import Image
from transformers import CLIPModel, CLIPConfig, AutoProcessor

image = Image.open("/home/ajeet/codework/datasets/video_incidents_ajeet/f044cf1d-8a5b-4703-a05c-3b57c5c14989_merged/0_34.jpg")
text = ["no cell phone", "a cell phone"]

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = student_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[0.5369, 0.4631]])


In [31]:
# student_model

from PIL import Image
from transformers import CLIPModel, CLIPConfig, AutoProcessor

image = Image.open("/home/ajeet/codework/datasets/video_incidents_ajeet/f044cf1d-8a5b-4703-a05c-3b57c5c14989_merged/0_1.jpg")
text = ["a person", "a cell phone"]

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = student_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[0.3611, 0.6389]])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import CLIPModel, CLIPConfig
from torch.utils.data import DataLoader, Dataset

# Custom dataset class (you can modify this based on your use case)
class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

# Knowledge Distillation Loss (KL Divergence)
class DistillationLoss(nn.Module):
    def __init__(self, temperature=1.0):
        super(DistillationLoss, self).__init__()
        self.temperature = temperature
        self.kl_div = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits):
        # Teacher and student logits are passed through softmax with temperature
        teacher_probs = torch.softmax(teacher_logits / self.temperature, dim=-1)
        student_log_probs = torch.log_softmax(student_logits / self.temperature, dim=-1)
        
        # KL divergence between teacher and student
        distillation_loss = self.kl_div(student_log_probs, teacher_probs) * (self.temperature ** 2)
        return distillation_loss

# Define a smaller student model using CLIP's config
def create_student_clip_model():
    student_config = CLIPConfig(
        hidden_size=256,  # Reduce hidden size
        num_hidden_layers=4,  # Reduce transformer layers
        intermediate_size=1024,  # Reduce intermediate layer size
        num_attention_heads=4,  # Reduce attention heads
        projection_dim=256  # Reduce projection dimension
    )
    return CLIPModel(student_config)

# Load pre-trained teacher model (you can choose any pre-trained CLIP model here)
teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Create student model
student_model = create_student_clip_model()

# Set both models to evaluation mode for inference
teacher_model.eval()
student_model.train()

# Optimizer for student model
optimizer = optim.Adam(student_model.parameters(), lr=1e-4)

# Hyperparameters
epochs = 10
temperature = 3.0  # Temperature for distillation
batch_size = 32

# Dataset (dummy data, replace with actual data)
# Assume inputs are tuples of (image, text) for CLIP, and labels are irrelevant for distillation.
dummy_inputs = [(torch.rand(3, 224, 224), torch.randint(0, 49408, (20,))) for _ in range(100)]
dummy_labels = [0 for _ in range(100)]  # Labels are irrelevant for distillation task
dataset = CustomDataset(dummy_inputs, dummy_labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Distillation loss function
distillation_loss_fn = DistillationLoss(temperature=temperature)

# Training loop
for epoch in range(epochs):
    total_loss = 0.0
    for batch in dataloader:
        image_inputs, text_inputs = batch[0]  # Unpack batch inputs
        
        # Forward pass with teacher (detach to avoid backprop through teacher)
        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids=text_inputs, pixel_values=image_inputs)
            teacher_logits = teacher_outputs.logits_per_image  # For CLIP, logits for images

        # Forward pass with student
        student_outputs = student_model(input_ids=text_inputs, pixel_values=image_inputs)
        student_logits = student_outputs.logits_per_image

        # Compute distillation loss
        loss = distillation_loss_fn(student_logits, teacher_logits)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

print("Training complete.")
