In [None]:
# Notebook setup: install the third-party packages into the running environment.
from IPython.display import clear_output
# transformers is installed here, then uninstalled and reinstalled together with
# accelerate a few lines below.
!pip install  transformers --quiet
!pip install rouge_score -q
# NOTE(review): deep-phonemizer and opendatasets are not imported by any cell
# visible here — confirm they are still needed.
!pip install deep-phonemizer -q
!pip install opendatasets -q
!pip install  datasets -q
# Remove and reinstall transformers + accelerate as a pair (presumably to end up
# with mutually compatible versions — TODO confirm). No versions are pinned.
!pip uninstall -y transformers accelerate -q
!pip install transformers accelerate -q
!pip install evaluate -q
# NOTE(review): presumably only needed for the disabled CIDEr/SPICE scoring — confirm.
!pip install pycocoevalcap -q
!pip install huggingface_hub -q
!pip install -U nltk 

# Hide the installer output printed above.
clear_output()

In [None]:
import nltk
import requests

import os
import evaluate
import datasets
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import io, transforms
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import load_dataset

from transformers import Seq2SeqTrainer ,Seq2SeqTrainingArguments
from transformers import VisionEncoderDecoderModel , ViTImageProcessor, ViTFeatureExtractor
from transformers import GPT2Config , default_data_collator
from transformers import GPT2TokenizerFast
from transformers import pipeline,VisionEncoderDecoderConfig

import evaluate

# Pick the compute device once; later cells refer to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Download the WordNet corpus through NLTK (presumably required by the METEOR
# metric loaded further down — TODO confirm).
nltk.download('wordnet')

In [None]:
!unzip   /usr/share/nltk_data/corpora/wordnet.zip -d   /usr/share/nltk_data/corpora/

# Initializing the Encoder-Decoder Model

In [None]:
# Checkpoint names for the two halves of the captioning model.
encoder_model = "microsoft/swin-base-patch4-window7-224-in22k"
decoder_model = "gpt2"

# Assemble the encoder-decoder from the two pretrained checkpoints,
# then move the assembled model onto the selected device.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=encoder_model,
    decoder_pretrained_model_name_or_path=decoder_model,
)
model = model.to(device)

In [None]:
# Text side: tokenizer loaded from the decoder checkpoint.
tokenizer = GPT2TokenizerFast.from_pretrained(
    pretrained_model_name_or_path=decoder_model
)
# Image side: preprocessing configuration loaded from the encoder checkpoint.
image_processor = ViTImageProcessor.from_pretrained(
    pretrained_model_name_or_path=encoder_model
)

In [None]:
# Reuse the tokenizer's end-of-sequence token as its padding token.
# This has to run before `tokenizer.pad_token_id` is read below.
tokenizer.pad_token = tokenizer.eos_token
# Mirror the tokenizer's special-token ids onto the model config.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# The decoder starts from the tokenizer's beginning-of-sequence token.
model.config.decoder_start_token_id = tokenizer.bos_token_id

In [None]:

# Maximum caption length in tokens; captions are padded/truncated to this
# length when they are tokenized in `preprocess`.
max_length = 32

# Percentage of every split to load (taken from the start of the split).
# Kept in one place so all three splits always use the same fraction.
subset_percent = 50

train_ds = load_dataset("HuggingFaceM4/COCO", split=f"train[:{subset_percent}%]")
valid_ds = load_dataset("HuggingFaceM4/COCO", split=f"validation[:{subset_percent}%]")
test_ds = load_dataset("HuggingFaceM4/COCO", split=f"test[:{subset_percent}%]")


In [None]:

def _has_expected_rank(item):
    """Return True when the example's image, as a NumPy array, has 3 or 4 dimensions."""
    return np.array(item["image"]).ndim in [3, 4]


# Apply the same image filter to every split, using two worker processes.
train_ds = train_ds.filter(_has_expected_rank, num_proc=2)
valid_ds = valid_ds.filter(_has_expected_rank, num_proc=2)
test_ds = test_ds.filter(_has_expected_rank, num_proc=2)

In [None]:
def preprocess(items):
  """Convert a batch of raw examples into model inputs.

  Args:
    items: batch dictionary with an "image" entry (list of images) and a
      "sentences" entry (list of records, each holding the caption under "raw").

  Returns:
    Dict with 'pixel_values' (output of the image processor) and 'labels'
    (caption token ids padded/truncated to `max_length`).

  The tensors are deliberately left on the CPU. Device placement is the job
  of the training loop; creating CUDA tensors inside a dataset transform
  keeps GPU memory alive per example and makes the dataset unusable from
  DataLoader worker processes.
  """
  pixel_values = image_processor(items["image"], return_tensors="pt").pixel_values
  captions = [sentence["raw"] for sentence in items["sentences"]]
  targets = tokenizer(captions, max_length=max_length, padding="max_length",
                      truncation=True, return_tensors="pt")
  return {'pixel_values': pixel_values, 'labels': targets["input_ids"]}

# Attach `preprocess` as an on-the-fly transform to each split.
train_dataset, valid_dataset, test_dataset = (
    split.with_transform(preprocess) for split in (train_ds, valid_ds, test_ds)
)

In [None]:
def collate_fn(batch):
    """Stack the per-example 'pixel_values' and 'labels' tensors into batch tensors.

    Args:
        batch: sequence of dicts, each providing 'pixel_values' and 'labels' tensors.

    Returns:
        Dict with the same two keys, each holding the stacked tensor.
    """
    collated = {}
    for key in ('pixel_values', 'labels'):
        collated[key] = torch.stack([example[key] for example in batch])
    return collated

In [None]:
# Load the three scoring modules used by `compute_metrics`, in this order.
rouge, bleu, meteor = (
    evaluate.load(metric_name) for metric_name in ("rouge", "bleu", "meteor")
)

def compute_metrics(eval_pred):
    """Score generated captions against the reference captions.

    Args:
        eval_pred: object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids).

    Returns:
        Dict containing every ROUGE score plus "meteor" and "bleu1".."bleu4",
        all scaled to 0-100 (METEOR rounded to 2 decimals, the rest to 4).
    """
    # Hypotheses come from `predictions`, references from `label_ids`.
    preds = eval_pred.predictions
    labels = eval_pred.label_ids

    # Keep only the token ids if the predictions arrive wrapped in a tuple.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # is swapped for the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE
    rouge_result = rouge.compute(predictions=pred_str, references=labels_str)
    rouge_result = {k: round(v * 100, 4) for k, v in rouge_result.items()}

    # METEOR
    meteor_result = meteor.compute(predictions=pred_str, references=labels_str)

    # BLEU with n-gram orders 1 through 4
    bleu_scores = {}
    for order in range(1, 5):
        bleu_result = bleu.compute(predictions=pred_str, references=labels_str, max_order=order)
        bleu_scores[f"bleu{order}"] = round(bleu_result["bleu"] * 100, 4)

    # Disabled: CIDEr / SPICE scoring (Cider and Spice are not imported in this notebook).
    # ref_captions = {image_id: [caption] for image_id, caption in enumerate(labels_str)}
    # pred_captions = {image_id: [caption] for image_id, caption in enumerate(pred_str)}
    # cider_scorer = Cider()
    # cider_score, cider_scores = cider_scorer.compute_score(ref_captions, pred_captions)
    # spice_scorer = Spice()
    # spice_score, spice_scores = spice_scorer.compute_score(ref_captions, pred_captions)

    return {
        **rouge_result,
        "meteor": round(meteor_result['meteor'] * 100, 2),
        **bleu_scores,
        # "cider": cider_score,
        # "spice": spice_scores,
    }

In [None]:
# Training length (epochs) and per-device batch size, shared by the cells below.
num_epochs, batch_size = 2, 16

In [None]:
 # Log in to the Hugging Face Hub so the model can be pushed to your account.
# You need an access token for this. Comment this cell out if you only want
# to save the model locally.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
 # With `push_to_hub=True` the checkpoints and the final model are sent to the
# Hugging Face account named in `output_dir`; this requires a login token.
#
# eval_steps, logging_steps and save_steps: if you train for more epochs,
# increase these values. A copy of the model is saved at every checkpoint and
# earlier copies are not removed, so a small step count fills the disk quickly
# (setting `save_total_limit` is another way to cap the number kept).
import inspect

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Use whichever the installed version accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    optim="adamw_torch",
    learning_rate=1.5e-6,
    num_train_epochs=num_epochs,
    eval_steps=2000,
    logging_steps=2000,
    save_steps=2000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir="AsmaMassad/swin-gpt2-image-captioning-coco",  # use a local directory here if push_to_hub is disabled
    push_to_hub=True,  # comment this out to save the model locally only
    **{_eval_strategy_arg: "steps"},
)

In [None]:
# Gather the trainer's components by name, then build it in a single call.
trainer_kwargs = dict(
    model=model,
    # The image processor is handed over in the `tokenizer` slot
    # (presumably so it is saved alongside checkpoints — TODO confirm).
    tokenizer=image_processor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collate_fn,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
from torch.utils.data import DataLoader

def get_eval_loader(eval_dataset=None):
  """Build the evaluation DataLoader.

  Args:
    eval_dataset: dataset to evaluate on; when None, `valid_dataset` is used.

  Returns:
    A DataLoader over the chosen dataset using `collate_fn` and `batch_size`.
  """
  # Honour an explicitly supplied dataset instead of silently ignoring it.
  dataset = valid_dataset if eval_dataset is None else eval_dataset
  return DataLoader(dataset, collate_fn=collate_fn, batch_size=batch_size)

def get_test_loader(eval_dataset=None):
  """Build the test DataLoader.

  Args:
    eval_dataset: dataset to predict on; when None, `test_dataset` is used.

  Returns:
    A DataLoader over the chosen dataset using `collate_fn` and `batch_size`.
  """
  # Honour an explicitly supplied dataset instead of silently ignoring it.
  dataset = test_dataset if eval_dataset is None else eval_dataset
  return DataLoader(dataset, collate_fn=collate_fn, batch_size=batch_size)

# Replace the trainer's loader factories with plain DataLoaders built on
# `collate_fn`. The training loader shuffles every epoch: a bare DataLoader
# iterates in dataset order unless `shuffle=True` is passed.
trainer.get_train_dataloader = lambda: DataLoader(
    train_dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=True
)
trainer.get_eval_dataloader = get_eval_loader
trainer.get_test_dataloader = get_test_loader

In [None]:
# Run the training loop with the configured trainer.
trainer.train()

In [None]:
# Save the trained model through the trainer (called without an explicit path).
trainer.save_model()