**Task 01**

In [None]:
pip install einops



In [None]:
import torch
import requests
import torch.nn as nn
from PIL import Image
import torch.nn.functional as F
from transformers import CLIPProcessor, CLIPModel

class SwinEmbedding(nn.Module):
    """Patch-embedding stage: image batch -> sequence of patch tokens.

    A strided convolution (kernel size == stride == ``patch_size``) projects
    each non-overlapping ``patch_size`` x ``patch_size`` patch of a
    3-channel image to a ``C``-dimensional vector.  The resulting feature
    map is flattened into a token sequence, layer-normalised over the
    channel dimension and passed through a ReLU.

    Args:
        patch_size: Side length of the square patches (also the conv stride).
        C: Embedding dimension of each patch token.
    """

    def __init__(self, patch_size=4, C=96):
        super().__init__()
        self.linear_embedding = nn.Conv2d(
            in_channels=3,
            out_channels=C,
            kernel_size=patch_size,
            stride=patch_size,
        )
        self.layer_norm = nn.LayerNorm(C)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Embed a ``(batch, 3, H, W)`` tensor as ``(batch, num_patches, C)``.

        ``num_patches`` is the number of spatial positions in the conv
        output, i.e. ``(H // patch_size) * (W // patch_size)``.
        """
        feature_map = self.linear_embedding(x)
        batch, channels, _height, _width = feature_map.size()
        # Collapse the spatial grid into one axis, then move channels last so
        # LayerNorm(C) normalises each token's embedding vector.
        tokens = feature_map.view(batch, channels, -1).permute(0, 2, 1).contiguous()
        normalised = self.layer_norm(tokens)
        return self.relu(normalised)

# Patch-embedding module with its default settings (patch_size=4, C=96).
swin_embedding = SwinEmbedding()

# Random stand-in for a batch of 32 three-channel 512x512 images.
input_tensor = torch.rand((32, 3, 512, 512))

# Report the tensor shape on either side of one forward pass.
print("Input shape:", input_tensor.shape)
output_tensor = swin_embedding(input_tensor)
print("Output shape:", output_tensor.shape)


Input shape: torch.Size([32, 3, 512, 512])
Output shape: torch.Size([32, 16384, 96])


**Task 02**

In [None]:
class CLIPPredictor(torch.nn.Module):
    """Score an image against two fixed prompts with a pretrained CLIP model.

    The prompts ("we see a cat" / "we see a dog") are tokenised and embedded
    once at construction time.  Each ``forward`` call downloads one image,
    embeds it, and returns a softmax over the dot products between the image
    embedding and the two cached text embeddings.
    """

    def __init__(self):
        super().__init__()
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.text_inputs = self.processor(text=["we see a cat", "we see a dog"], return_tensors="pt", padding=True)
        # The prompts never change, so embed them once without tracking gradients.
        with torch.no_grad():
            self.text_embeddings = self.model.get_text_features(**self.text_inputs)

    def forward(self, url, timeout=30.0):
        """Download the image at ``url`` and return its prompt probabilities.

        Args:
            url: HTTP(S) address of the image to classify.
            timeout: Seconds to wait for the server before giving up; passed
                straight to ``requests.get``.

        Returns:
            Tensor with one row per image and one column per prompt, in the
            order (cat, dog); each row sums to 1.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the server does not respond within ``timeout``.
        """
        # Use the response as a context manager so the connection is released,
        # and fail on HTTP errors instead of handing an error page to PIL.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            picture = Image.open(response.raw)
            # Force the pixel data to be read while the stream is still open.
            picture.load()
        image_inputs = self.processor(images=picture, return_tensors="pt", padding=True)
        # NOTE(review): gradients are still tracked through the image branch
        # (unlike the text branch above) -- confirm whether that is wanted.
        image_embeddings = self.model.get_image_features(**image_inputs)
        # NOTE(review): this is a raw dot product of the projected features;
        # CLIPModel's own logits_per_image uses L2-normalised features scaled
        # by logit_scale -- confirm the unnormalised score is intended.
        logits_per_image = torch.matmul(image_embeddings, self.text_embeddings.T)
        probabilities = torch.nn.functional.softmax(logits_per_image, dim=-1)
        return probabilities
# Build the predictor; this loads the pretrained CLIP checkpoint and embeds
# the two text prompts once.
clip_pred = CLIPPredictor()
# Sample image from the COCO val2017 set.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
probabilities = clip_pred(url)
# Bare expression so the notebook cell displays the resulting tensor.
probabilities

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tensor([[0.9603, 0.0397]], grad_fn=<ExpBackward0>)