In [1]:
import requests
from pathlib import Path
import io
import zipfile
import os
import pandas as pd
import shutil
from pydantic import BaseModel
from typing import Dict, List, Optional, Any
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Tuple
from PIL import Image
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
from pathlib import Path
from typing import List, Dict
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForSequenceClassification
import accelerate
import transformers
from torchvision.models import resnet18, ResNet18_Weights

from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizerFast
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class ResNetVisionEncoder(nn.Module):
    def __init__(self, embed_dim=256):
        super().__init__()
        base = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

        self.backbone = nn.Sequential(
            base.conv1, base.bn1, base.relu, base.maxpool,
            base.layer1, base.layer2, base.layer3, base.layer4
        )
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.projection = nn.Linear(base.fc.in_features, embed_dim)

    def forward(self, x):
        x = self.backbone(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.projection(x)       # (batch, embed_dim)
        x = F.normalize(x, dim=-1)   # CLIP normalisation L2
        return x


In [3]:
class PositionalEmbedding(nn.Module):
    def __init__(self, sequence_length: int, vocab_size:int, embed_dim:int):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings = nn.Embedding(sequence_length, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len = x.size()
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        return self.token_embeddings(x) + self.position_embeddings(positions)


In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, dropout_rate: float = 0.1) -> None:
        super().__init__()
        self.att = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout_rate, batch_first=True)

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )

        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor, padding_mask: torch.Tensor = None) -> torch.Tensor:
        attn_output, _ = self.att(
            x, x, x,
            key_padding_mask=padding_mask  # <-- correct masking
        )

        x = self.layernorm1(x + self.dropout1(attn_output))
        ffn_output = self.ffn(x)
        out = self.layernorm2(x + self.dropout2(ffn_output))
        return out


In [5]:
class SmallBERT(nn.Module):
    def __init__(self, sequence_length: int, vocab_size: int, embed_dim: int,
                 num_heads: int, ff_dim: int, num_layers: int) -> None:
        super().__init__()
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.pos_embedding: PositionalEmbedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
        self.blocks: nn.ModuleList = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim)
            for i in range(num_layers)
        ])
        self.layernorm: nn.LayerNorm = nn.LayerNorm(embed_dim)
        self.dropout: nn.Dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, seq_len) token ids
        return: (batch, seq_len, embed_dim)
        """
        x = self.pos_embedding(x)
        mask: torch.Tensor = (x == 0)[:, :, 0]  # PAD mask

        for block in self.blocks:
            x = block(x, mask)

        x = self.layernorm(x)
        return self.dropout(x)


In [None]:
class TextEncoder(nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, num_heads, ff_dim, num_layers, out_dim=256):
        super().__init__()

        self.encoder = SmallBERT(sequence_length, vocab_size, embed_dim, num_heads, ff_dim, num_layers)
        self.projection = nn.Linear(embed_dim, out_dim)

    def forward(self, x):
        enc = self.encoder(x)           # (batch, seq_len, embed_dim)
        pooled = enc.mean(dim=1)        # (batch, embed_dim)
        z = self.projection(pooled)     # (batch, out_dim)
        z = F.normalize(z, dim=-1)      # important pour CLIP
        return z


In [6]:
class SmallCLIP(nn.Module):
    def __init__(self, vision_encoder: nn.Module, text_encoder: nn.Module, temperature=0.07):
        super().__init__()
        self.vision = vision_encoder
        self.text = text_encoder
        self.temperature = nn.Parameter(torch.tensor(temperature))

    def forward(self, images, captions):
        img_emb = self.vision(images)     # (batch, d)
        txt_emb = self.text(captions)     # (batch, d)

        logits = img_emb @ txt_emb.T       # similarité cosinus * car embeddings normalisés
        logits = logits / self.temperature

        return logits, img_emb, txt_emb


In [7]:
def clip_loss(logits):
    batch = logits.size(0)
    labels = torch.arange(batch, device=logits.device)

    loss_img = F.cross_entropy(logits, labels)
    loss_txt = F.cross_entropy(logits.T, labels)

    return (loss_img + loss_txt) / 2


In [10]:
class CLIPDataset(Dataset):
    def __init__(self, df, base_dir: Path, tokenizer, image_transform, max_length=32):
        self.img_paths = df["image_path"].tolist()
        self.labels = df["label"].tolist()
        self.captions = df["caption"].tolist()

        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length

        # mapping constant
        self.class_to_idx = {cls: i for i in sorted(set(self.labels))}

        self.base_dir = base_dir

    def __getitem__(self, idx):
        # ----- image -----
        img_path = Path(build_augmented_path(self.img_paths[idx], self.base_dir))
        img = Image.open(img_path).convert("RGB")
        img = self.image_transform(img)

        # ----- text -----
        caption = self.captions[idx]
        enc = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        label = self.class_to_idx[self.labels[idx]]

        return {
            "index": idx,
            "image": img,
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(label)
        }

    def __len__(self):
        return len(self.img_paths)


In [15]:
metadata_path = Path("../data/augmented/metadata.csv")
df = pd.read_csv(metadata_path)
print(df.columns)
print(df.iloc[1])

Index(['image_path', 'label', 'caption'], dtype='object')
image_path                                water_070_spatial.jpg
label                                               Label.WATER
caption       A kayaker wearing a blue wetsuit and black hel...
Name: 1, dtype: object


In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 1000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df["caption"])
actual_vocab_size = len(tokenizer.word_index) + 1
print(f"   taille de vocabulaire: {len(tokenizer.word_index)}")
print(f"   vocab_size: {actual_vocab_size}")

   taille de vocabulaire: 973
   vocab_size: 974


In [17]:
metadata_path = Path("../data/augmented/metadata.csv")
df = pd.read_csv(metadata_path)
print(df.columns)
print(df.iloc[1])

augmented_dir = Path("../data/augmented")

transform_resnet = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

df_train, df_temp = train_test_split(df, test_size=0.3, random_state=11)
df_test, df_val = train_test_split(df_temp, test_size=0.5, random_state=11)


train_dataset = CLIPDataset(df_train, augmented_dir, tokenizer, transform_resnet)
val_dataset   = CLIPDataset(df_val, augmented_dir, tokenizer, transform_resnet)
test_dataset  = CLIPDataset(df_test, augmented_dir, tokenizer, transform_resnet)


Index(['image_path', 'label', 'caption'], dtype='object')
image_path                                water_070_spatial.jpg
label                                               Label.WATER
caption       A kayaker wearing a blue wetsuit and black hel...
Name: 1, dtype: object


NameError: name 'cls' is not defined

In [12]:
epochs = 11
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = SmallCLIP()

TypeError: SmallCLIP.__init__() missing 2 required positional arguments: 'vision_encoder' and 'text_encoder'

In [None]:

for epoch in range(epochs):
    for images, (input_ids, attention_mask), labels_text in dataloader:
        images = images.to(device)
        input_ids = input_ids.to(device)

        logits, img_emb, txt_emb = clip_model(images, input_ids)

        loss = clip_loss(logits)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print("Epoch", epoch+1, "loss", loss.item())


NameError: name 'dataloader' is not defined