In [1]:
import numpy as np
from pathlib import Path
from typing import List, Dict
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForSequenceClassification
import accelerate
import transformers
from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizerFast


  from .autonotebook import tqdm as notebook_tqdm


Chargement du dataset text


In [2]:
metadata_path = Path("../data/augmented/metadata.csv")
df = pd.read_csv(metadata_path)
print(df.columns)
print(df.iloc[1])

Index(['image_path', 'label', 'caption'], dtype='object')
image_path                                water_070_spatial.jpg
label                                               Label.WATER
caption       A kayaker wearing a blue wetsuit and black hel...
Name: 1, dtype: object


In [3]:
def get_label(filename: str):
    return filename.split("_")[0]


def get_uuid(filename: str):
    name = Path(filename).stem          
    parts = name.split("_")
    return "_".join(parts[:2])          


def build_augmented_path(img_path: Path, base_dir: Path):
    img_path = Path(img_path)
    filename = img_path.name
    label = get_label(filename)
    uuid = get_uuid(filename)
    print(uuid)

    return base_dir / label / uuid / filename

get_label(df["image_path"].iloc[1])
get_uuid(df["image_path"].iloc[1])


'water_070'

In [25]:
class TextCLIPDataset(Dataset):
    def __init__(self, imgs, labels, base_dir: Path, captions: str, tokenizer, max_length=32):
        self.img_paths = [Path(build_augmented_path(img, base_dir)) for img in imgs]
        self.labels = list(labels)
        self.captions = list(captions)
        self.classes = sorted(set(self.labels))                  
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}  
        self.tokenizer = tokenizer
        self.max_length = max_length



    def __getitem__(self, idx):
        caption = self.captions[idx]
        label = self.class_to_idx[self.labels[idx]]

        encoded = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        print(encoded["input_ids"])
        print(encoded["attention_mask"])
        return (
            encoded["input_ids"].squeeze(0),      # shape (seq_len,)
            encoded["attention_mask"].squeeze(0), # shape (seq_len,)
            torch.tensor(label, dtype=torch.long) # label index
        )

    def _get_all_infos(self, idx) -> str:
        img = Image.open(self.img_paths[idx]).convert("RGB")
        label_str = self.labels[idx]
        label = self.class_to_idx[label_str]    
        caption = self.captions[idx]
        return idx, img, caption, label
    
    def __len__(self) -> int:
        return len(self.img_paths)
    
    def _get_img_path_from_idx(self, idx: int) -> Path:
        return self.img_paths[idx]
    
    def _get_label_from_idx(self, idx: int) -> str :
        return self.labels[idx]
    
    def _get_caption_from_idx(self, idx: int) -> str:
        return self.captions[idx]

    def _get_tokenized_cpt_from_idx(self, idx: int):
        caption = self.captions[idx]
        encoded = self.tokenizer(
                    caption, 
                    padding = "max_length",
                    truncation = True,
                    max_length = self.max_length,
                    return_tensors = "pt",
                )
    
        return encoded
            
    



#Entr√©e attendue par SmallBERT
{
  'input_ids': Tensor(batch_size, seq_len),
  'attention_mask': Tensor(batch_size, seq_len),
  'labels': Tensor(batch_size)
}


In [26]:

df_train, df_temp = train_test_split(df, test_size=0.3, random_state=11)
df_test, df_val = train_test_split(df_temp, test_size=0.5, random_state=11)

print(len(df_train), len(df_test), len(df_val))
print(df_train["label"].value_counts(normalize=True) * 100)
print(df_val["label"].value_counts(normalize=True) * 100)
print(df_test["label"].value_counts(normalize=True) * 100)

X_train, y_train, caption_train = df_train["image_path"], df_train["label"], df_train["caption"]
X_val, y_val, caption_val     = df_val["image_path"], df_val["label"], df_val["caption"]
X_test, y_test, caption_test   = df_test["image_path"], df_test["label"], df_test["caption"]

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


1260 270 270
label
Label.BALL     25.873016
Label.BIKE     25.476190
Label.WATER    24.682540
Label.DOG      23.968254
Name: proportion, dtype: float64
label
Label.DOG      29.259259
Label.WATER    24.814815
Label.BIKE     23.333333
Label.BALL     22.592593
Name: proportion, dtype: float64
label
Label.WATER    26.666667
Label.DOG      25.555556
Label.BIKE     24.444444
Label.BALL     23.333333
Name: proportion, dtype: float64


In [27]:
model_name = "prajjwal1/bert-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
augmented_dir = Path("../data/augmented")

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [30]:
train_dataset = TextCLIPDataset(X_train, y_train, augmented_dir, caption_train, tokenizer)
val_dataset   = TextCLIPDataset(X_val, y_val, augmented_dir, caption_val, tokenizer)
test_dataset  = TextCLIPDataset(X_test, y_test, augmented_dir, caption_test, tokenizer)


water_139
bike_046
bike_067
dog_014
ball_029
dog_062
bike_007
dog_103
ball_113
ball_020
dog_136
water_064
ball_052
water_093
ball_023
water_008
bike_082
water_014
dog_018
dog_033
water_050
dog_017
bike_014
bike_147
bike_019
water_093
dog_034
ball_019
ball_056
water_056
ball_085
ball_107
ball_099
water_141
dog_087
dog_035
ball_055
bike_096
bike_094
water_012
dog_084
bike_071
water_074
bike_048
water_039
water_045
bike_023
water_029
dog_024
bike_141
dog_013
bike_013
water_092
water_050
water_065
ball_128
dog_077
ball_140
bike_085
water_133
ball_101
ball_043
water_060
dog_050
dog_015
water_075
bike_057
ball_008
dog_004
ball_082
water_028
dog_026
dog_091
bike_115
ball_132
water_072
dog_102
bike_078
bike_074
bike_118
bike_085
water_023
bike_086
water_062
dog_019
dog_033
ball_035
dog_095
bike_139
dog_053
water_084
bike_087
water_081
bike_012
bike_008
ball_030
bike_068
bike_042
bike_084
ball_053
ball_131
water_143
bike_024
bike_063
water_075
dog_122
bike_136
dog_054
dog_049
bike_090
dog_026
d

In [31]:
print(train_dataset.__getitem__(4))
print("\n")
print(train_dataset._get_label_from_idx(4))
print("\n")
print(train_dataset._get_tokenized_cpt_from_idx(4))
print("\n")
print(train_dataset._get_tokenized_cpt_from_idx(4)['input_ids'])


tensor([[ 101, 1037, 2829, 3899, 2003, 8660, 2044, 1037, 5093, 3608, 1012,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
(tensor([ 101, 1037, 2829, 3899, 2003, 8660, 2044, 1037, 5093, 3608, 1012,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), tensor(0))


Label.BALL


{'input_ids': tensor([[ 101, 1037, 2829, 3899, 2003, 8660, 2044, 1037, 5093, 3608, 1012,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,

In [9]:
def check_caption_label_alignment(self):
    issues = []

    for idx, (label, caption) in enumerate(zip(self.labels, self.captions)):
        cls = label.split(".")[-1].lower().strip()
        cap = caption.lower()
        if cls not in cap:
            issues.append((idx, label, caption, ))
    

    print(len(issues), len(self), len(self)/len(issues))
    return issues


In [10]:
check_caption_label_alignment(train_dataset)
check_caption_label_alignment(test_dataset)
check_caption_label_alignment(val_dataset)

413 1260 3.0508474576271185
77 270 3.5064935064935066
92 270 2.9347826086956523


[(6,
  'Label.BIKE',
  'A man on a bicycle jumping off a dirt ramp with one foot on the ground'),
 (7, 'Label.WATER', 'A father and a child are playing in the pool'),
 (10,
  'Label.WATER',
  'A boy is jumping on an inflatable ring and a girl is watching him'),
 (13,
  'Label.BIKE',
  'A cyclist wearing a red helmet is riding on the pavement .'),
 (16, 'Label.BIKE', 'A man racing an orange motorcycle .'),
 (22,
  'Label.WATER',
  'A male and a female are leaning against railing looking off with the ocean in the background'),
 (26, 'Label.WATER', 'A man showering in a home outdoor shower facility .'),
 (30,
  'Label.BIKE',
  'a long bicyclist wearing a white helmet riding down a mountain path'),
 (33, 'Label.BIKE', 'A motocross rider is on a dirt hill .'),
 (36, 'Label.BALL', 'Two big dogs play in the snow under a tree .'),
 (37, 'Label.DOG', 'A black puppy is biting a tree limb .'),
 (38, 'Label.BALL', 'A boy runs in a race while onlookers watch .'),
 (43, 'Label.BIKE', 'A motorcycle r

In [11]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }


In [20]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
model.base_model.config.output_hidden_states = True

print(model)
print(tokenizer.cls_token)      
print(tokenizer.cls_token_id)   



training_args = TrainingArguments(
    output_dir="./bert-small-sst2",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
 
)
collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collator,
    compute_metrics=compute_metrics,  
)



trainer.train()





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-1

Step,Training Loss
50,0.7928
100,0.251
150,0.1298
200,0.08
250,0.0291
300,0.0177
350,0.0119
400,0.0114


TrainOutput(global_step=400, training_loss=0.16544809103012084, metrics={'train_runtime': 15.0916, 'train_samples_per_second': 834.9, 'train_steps_per_second': 26.505, 'total_flos': 24119497049760.0, 'train_loss': 0.16544809103012084, 'epoch': 10.0})

In [None]:
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)


In [None]:
print(val_dataset._get_all_infos(0))

(0, <PIL.Image.Image image mode=RGB size=500x317 at 0x7262746779D0>, 'A black and white dog jumping into a pool .', 2)


In [None]:
predictions = trainer.predict(val_dataset)
logits = predictions.predictions
y_pred = np.argmax(logits, axis=-1)
y_true = predictions.label_ids

print(classification_report(y_true, y_pred, target_names=train_dataset.classes))


              precision    recall  f1-score   support

  Label.BALL       1.00      1.00      1.00        61
  Label.BIKE       1.00      0.97      0.98        63
   Label.DOG       0.94      1.00      0.97        79
 Label.WATER       1.00      0.96      0.98        67

    accuracy                           0.98       270
   macro avg       0.99      0.98      0.98       270
weighted avg       0.98      0.98      0.98       270



In [None]:
faux_predic = np.where(y_pred != y_true)[0]
probas = torch.softmax(torch.tensor(logits), dim=-1)

print(f"Nombre d'erreurs : {len(faux_predic)}")
for idx in faux_predic:
    true_label = val_dataset.labels[idx]
    pred_label = train_dataset.classes[y_pred[idx]]
    caption = val_dataset.captions[idx]
    confidence = probas[idx][y_pred[idx]].item()


    print(f"Index: {idx}")
    print(f"Caption: {caption}")
    print(f"True: {true_label} | Pred: {pred_label} | Confidence: {confidence:.3f}")
    print("---")



Nombre d'erreurs : 5
Index: 64
Caption: black dogs jump in pool to a hand .
True: Label.WATER | Pred: Label.DOG | Confidence: 0.988
---
Index: 70
Caption: a froup of sled dogs pulling a man wearing an orange vest .
True: Label.BIKE | Pred: Label.DOG | Confidence: 0.995
---
Index: 145
Caption: A black dog in the water
True: Label.WATER | Pred: Label.DOG | Confidence: 0.977
---
Index: 151
Caption: black dogs jump in pool to a hand .
True: Label.WATER | Pred: Label.DOG | Confidence: 0.988
---
Index: 201
Caption: a froup of sled dogs pulling a man wearing an orange vest .
True: Label.BIKE | Pred: Label.DOG | Confidence: 0.995
---


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F



class PositionalEmbedding(nn.Module):
    def __init__(self, sequence_length: int, vocab_size:int, embed_dim:int):
        super().__init__()
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_embeddings: nn.Embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings: nn.Embedding = nn.Embedding(sequence_length, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: Tensor shape (batch, seq_len) contenant les IDs des tokens
        return: embeddings shape (batch, seq_len, embed_dim)
        """
        seq_len = x.size(1)
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0)
        tokens = self.token_embeddings(x)
        pos = self.position_embeddings(positions)
        return tokens + pos








In [17]:
class TransformerBlock(nn.Module):
     def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, dropout_rate: float = 0.1) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        self.att: nn.MultiheadAttention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout_rate, batch_first=True
        )

        self.ffn: nn.Sequential = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )

        self.layernorm1: nn.LayerNorm = nn.LayerNorm(embed_dim)
        self.layernorm2: nn.LayerNorm = nn.LayerNorm(embed_dim)
        self.dropout1: nn.Dropout = nn.Dropout(dropout_rate)
        self.dropout2: nn.Dropout = nn.Dropout(dropout_rate)

In [None]:
class SmallBERT(nn.Module):
    def __init__(self, sequence_length: int, vocab_size: int, embed_dim: int,
                 num_heads: int, ff_dim: int, num_layers: int) -> None:
        super().__init__()
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.pos_embedding: PositionalEmbedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
        self.blocks: nn.ModuleList = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim)
            for i in range(num_layers)
        ])
        self.layernorm: nn.LayerNorm = nn.LayerNorm(embed_dim)
        self.dropout: nn.Dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, seq_len) token ids
        return: (batch, seq_len, embed_dim)
        """
        x = self.pos_embedding(x)
        mask: torch.Tensor = (x == 0)[:, :, 0]  # PAD mask

        for block in self.blocks:
            x = block(x, mask)

        x = self.layernorm(x)
        return self.dropout(x)


In [19]:
class SmallBERTPourClassification(nn.Module):
    def __init__(self, sequence_length: int, vocab_size: int, embed_dim: int,
                 num_heads: int, ff_dim: int, num_layers: int,
                 num_classes: int = 4) -> None:
        super().__init__()

        self.encoder: SmallBERT = SmallBERT(sequence_length, vocab_size, embed_dim, num_heads, ff_dim, num_layers)
        self.dropout: nn.Dropout = nn.Dropout(0.3)
        self.classifier: nn.Linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, seq_len) token ids
        return: (batch, num_classes) logits
        """
        enc: torch.Tensor = self.encoder(x)   # (batch, seq_len, embed_dim)
        pooled: torch.Tensor = enc.mean(dim=1)
        pooled = self.dropout(pooled)
        logits: torch.Tensor = self.classifier(pooled)

        return F.softmax(logits, dim=-1)


In [33]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SmallBERTPourClassification(
    sequence_length=train_dataset.max_length,
    vocab_size=,
    embed_dim=128,
    num_heads=4,
    ff_dim=256,
    num_layers=2,
    num_classes=4
).to(device)


AttributeError: 'TextCLIPDataset' object has no attribute 'vocab_size'