In [1]:
import os
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from PIL import Image, UnidentifiedImageError
import pandas as pd
import pickle


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Directory used as the default output location for fine-tuning artefacts
FINETUNED_MODEL_DIR = "./exterior-finetuned-model"
# Location of the pickled model bundle inside that directory; load_model() checks
# for this file to decide whether a fine-tuned model is already available
PICKLE_FILE_PATH = os.path.join(FINETUNED_MODEL_DIR, "exterior_finetuned_model.pkl")

def save_pickled_model(model, tokenizer, feature_extractor, output_dir):
    """Pickle the model weights together with the tokenizer and feature extractor.

    Only ``model.state_dict()`` is stored (not the model object itself), under the
    key ``"model"``; the tokenizer and feature extractor are stored as-is under
    ``"tokenizer"`` and ``"feature_extractor"``.

    Args:
        model: Object exposing ``state_dict()`` whose values are tensors.
        tokenizer: Picklable tokenizer object.
        feature_extractor: Picklable feature-extractor object.
        output_dir: Directory that receives ``exterior_finetuned_model.pkl``;
            created (including parents) when it does not exist.

    Returns:
        str: Path of the pickle file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Copy every tensor to the CPU before pickling. Tensors pickled while they
    # live on a GPU generally cannot be unpickled on a machine without one, so
    # a checkpoint trained on CUDA would otherwise be unusable on CPU hosts.
    cpu_state_dict = {
        name: value.detach().cpu() if torch.is_tensor(value) else value
        for name, value in model.state_dict().items()
    }

    # Bundle the three components in a single dictionary
    model_data = {
        "model": cpu_state_dict,
        "tokenizer": tokenizer,
        "feature_extractor": feature_extractor,
    }

    # Write the bundle as one pickle file
    pickle_path = os.path.join(output_dir, "exterior_finetuned_model.pkl")
    with open(pickle_path, "wb") as f:
        pickle.dump(model_data, f)

    return pickle_path

def load_pickled_model(pickle_file, base_model_name="nlpconnect/vit-gpt2-image-captioning"):
    """Rebuild the model, tokenizer and feature extractor from a pickled bundle.

    The pickle is expected to hold a dict with the keys ``"model"`` (a state
    dict), ``"tokenizer"`` and ``"feature_extractor"``. The architecture is
    instantiated from ``base_model_name`` and the stored weights are loaded
    into it.

    Args:
        pickle_file: Path of the pickle file to read.
        base_model_name: Checkpoint passed to
            ``VisionEncoderDecoderModel.from_pretrained`` to create the model
            that receives the stored weights.

    Returns:
        tuple: ``(model, tokenizer, feature_extractor)`` - note that the order
        differs from the tuple returned by ``load_model``.

    Raises:
        TypeError: If the pickle does not contain a dict.
        KeyError: If one of the required keys is missing.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only open pickle files that were produced by a trusted source.
    with open(pickle_file, "rb") as f:
        model_data = pickle.load(f)

    # Validate the payload before instantiating the base model, so a malformed
    # file fails immediately instead of after an expensive model load.
    if not isinstance(model_data, dict):
        raise TypeError(
            f"Expected a dict in {pickle_file!r}, got {type(model_data).__name__}"
        )
    required_keys = ("model", "tokenizer", "feature_extractor")
    missing_keys = [key for key in required_keys if key not in model_data]
    if missing_keys:
        raise KeyError(
            f"Pickle {pickle_file!r} is missing required keys: {', '.join(missing_keys)}"
        )

    # Recreate the architecture, then overwrite its weights with the stored ones
    model = VisionEncoderDecoderModel.from_pretrained(base_model_name)
    model.load_state_dict(model_data["model"])

    tokenizer = model_data["tokenizer"]
    feature_extractor = model_data["feature_extractor"]

    return model, tokenizer, feature_extractor


In [3]:
def load_model():
    """Return ``(model, feature_extractor, tokenizer, device)``.

    When the pickle at ``PICKLE_FILE_PATH`` exists, the components come from
    ``load_pickled_model``; otherwise the pre-trained
    ``nlpconnect/vit-gpt2-image-captioning`` checkpoint is loaded. In both
    cases the model is moved to CUDA when available, else to the CPU.
    """
    checkpoint = "nlpconnect/vit-gpt2-image-captioning"

    if not os.path.exists(PICKLE_FILE_PATH):
        # No fine-tuned bundle on disk yet: start from the pre-trained weights
        model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
        feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    else:
        # load_pickled_model returns the tokenizer before the feature extractor
        model, tokenizer, feature_extractor = load_pickled_model(PICKLE_FILE_PATH)

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    model.to(device)

    return model, feature_extractor, tokenizer, device

# Load the model, feature extractor and tokenizer and pick the compute device;
# all four are bound as notebook-level globals used by the cells below
model, feature_extractor, tokenizer, device = load_model()


  _torch_pytree._register_pytree_node(


In [4]:
class ImageCaptioningDataset(Dataset):
    """Dataset yielding ``pixel_values`` / ``labels`` pairs for image captioning.

    Each row of ``dataframe`` must provide an ``image_path`` column (joined
    onto ``root_dir``) and a ``caption`` column.

    Args:
        dataframe: Table with ``image_path`` and ``caption`` columns.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=[image], return_tensors="pt")`` whose
            result exposes ``pixel_values``.
        tokenizer: Callable invoked on the caption whose result exposes
            ``input_ids``.
        root_dir: Directory the ``image_path`` values are relative to.
        max_target_length: Length captions are truncated / padded to.
    """

    def __init__(self, dataframe, feature_extractor, tokenizer, root_dir, max_target_length=16):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.root_dir = root_dir
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx``, or ``None`` if its image is unusable.

        Returns:
            dict | None: ``{"pixel_values": tensor, "labels": tensor}`` with the
            leading batch dimension removed, or ``None`` when the image file is
            missing, unidentifiable or cannot be decoded (the collate function
            is expected to drop ``None`` entries).
        """
        # Look the row up once instead of once per column
        row = self.dataframe.iloc[idx]
        img_path = os.path.join(self.root_dir, row["image_path"])

        try:
            # Image.open is lazy: decoding happens on first pixel access. Doing
            # the conversion inside the ``with`` forces the decode here, so
            # truncated/corrupt files are caught below, and the file handle is
            # closed as soon as the pixel data has been copied.
            with Image.open(img_path) as img:
                image = img.convert("RGB")
        except (UnidentifiedImageError, FileNotFoundError, OSError) as e:
            # Skip the sample if the image cannot be read
            print(f"Skipping image {img_path}: {e}")
            return None

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse any other dimension that happens to have size 1.
        pixel_values = self.feature_extractor(
            images=[image], return_tensors="pt"
        ).pixel_values.squeeze(0)

        caption = row["caption"]
        # NOTE(review): padded positions keep the tokenizer's padding id in the
        # labels, so they presumably contribute to the training loss - confirm
        # whether that is intended.
        labels = self.tokenizer(
            caption,
            return_tensors="pt",
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length",
        ).input_ids.squeeze(0)

        return {"pixel_values": pixel_values, "labels": labels}


In [5]:
def collate_fn(batch):
    """Stack the usable samples of a batch into ``pixel_values`` / ``labels`` tensors.

    Args:
        batch: Sequence of sample dicts (each with ``"pixel_values"`` and
            ``"labels"`` tensors) or ``None`` placeholders for skipped samples.

    Returns:
        dict: ``{"pixel_values": stacked tensor, "labels": stacked tensor}``
        built from the non-``None`` samples only.

    Raises:
        RuntimeError: If every sample in the batch is ``None``.
    """
    # Drop the placeholders left by samples that could not be loaded
    valid_items = [item for item in batch if item is not None]

    # torch.stack on an empty list fails with a message that does not point at
    # the real cause, so report the empty batch explicitly (same exception type).
    if not valid_items:
        raise RuntimeError(
            "collate_fn received a batch with no loadable samples; "
            "remove unreadable images from the dataset"
        )

    return {
        "pixel_values": torch.stack([item["pixel_values"] for item in valid_items]),
        "labels": torch.stack([item["labels"] for item in valid_items]),
    }


In [6]:
def fine_tune_model(model, train_dataset, output_dir=FINETUNED_MODEL_DIR, epochs=4, batch_size=10,
                    tokenizer=None, feature_extractor=None):
    """Fine-tune ``model`` on ``train_dataset`` and pickle the result.

    Training runs through ``Trainer`` with ``collate_fn`` as the data collator
    (which drops samples the dataset returned as ``None``). After training the
    weights, tokenizer and feature extractor are written with
    ``save_pickled_model``.

    Args:
        model: Model to fine-tune; trained in place.
        train_dataset: Dataset yielding ``pixel_values`` / ``labels`` dicts.
        output_dir: Directory for checkpoints, logs and the final pickle;
            ``None`` falls back to ``FINETUNED_MODEL_DIR``.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        tokenizer: Tokenizer to store with the model. When omitted, the
            ``tokenizer`` attribute of ``train_dataset`` is used, then the
            notebook-level ``tokenizer`` global.
        feature_extractor: Feature extractor to store with the model, resolved
            the same way as ``tokenizer``.

    Returns:
        The fine-tuned model (the same object that was passed in).

    Raises:
        ValueError: If no tokenizer or feature extractor can be resolved.
    """
    if output_dir is None:
        output_dir = FINETUNED_MODEL_DIR

    # Resolve the objects to persist *before* training, so a missing one fails
    # immediately rather than after the whole training run has completed.
    if tokenizer is None:
        tokenizer = getattr(train_dataset, "tokenizer", None)
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if feature_extractor is None:
        feature_extractor = getattr(train_dataset, "feature_extractor", None)
    if feature_extractor is None:
        feature_extractor = globals().get("feature_extractor")
    if tokenizer is None or feature_extractor is None:
        raise ValueError(
            "fine_tune_model needs a tokenizer and a feature_extractor: pass them "
            "explicitly or provide them as attributes of train_dataset"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Evaluation is intentionally not configured: "no" is the TrainingArguments
    # default, and the `evaluation_strategy` keyword is deprecated/renamed in
    # newer transformers releases, so passing it explicitly only adds fragility.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        save_steps=500,
        save_total_limit=2,
        # Keep every dataset key; the collator needs both pixel_values and labels
        remove_unused_columns=False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=10
    )

    # Trainer builds its own DataLoader; collate_fn filters out None samples
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=collate_fn
    )

    trainer.train()

    # Save the fine-tuned model using Pickle
    save_pickled_model(model, tokenizer, feature_extractor, output_dir)

    return model


In [7]:
# Read the caption table; it has to provide 'image_path' and 'caption' columns
df = pd.read_csv("exterior_dataset.csv")
root_dir = "Exterior Images"  # Directory the image_path entries are relative to

# Wrap the table in the Dataset consumed by the trainer
train_dataset = ImageCaptioningDataset(
    dataframe=df,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    root_dir=root_dir,
)

# Run fine-tuning; the returned model is rebound to the notebook-level name
model = fine_tune_model(model=model, train_dataset=train_dataset)

print("Fine-tuning completed and model saved.")


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,4.8194
20,2.9579
30,2.5056
40,2.3846
50,2.3586
60,2.2967
70,2.0
80,2.0516
90,1.9098
100,1.9864


Skipping image Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg: cannot identify image file 'Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg'




Skipping image Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg: cannot identify image file 'Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg'
Skipping image Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg: cannot identify image file 'Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg'
Skipping image Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg: cannot identify image file 'Exterior Images/348-362a5274-97a6-4715-979a-fc51bcafdd4a.jpeg'
Fine-tuning completed and model saved.


In [8]:
# Load the fine-tuned model from the pickle file
# (pickle files can execute code when loaded - only use bundles you created yourself)
model, tokenizer, feature_extractor = load_pickled_model(PICKLE_FILE_PATH)

# load_pickled_model does not place the model on `device`, but the inputs below
# are moved there; without this the two end up on different devices on a GPU host
model.to(device)
model.eval()

# Test caption generation on a sample image.
# The path is relative to the notebook's working directory (the same layout the
# training cell relies on) instead of an absolute path inside one user's home.
sample_image_path = os.path.join("Exterior Images", "2-exterior-damaged-effects.jpg")
# Convert inside the context manager so the file handle is closed afterwards
with Image.open(sample_image_path) as sample_file:
    image = sample_file.convert("RGB")

encoding = feature_extractor(images=[image], return_tensors="pt")
pixel_values = encoding.pixel_values.to(device)

# Beam-search settings for caption generation
gen_kwargs = {
    "max_length": 100,
    "num_beams": 4,
    "length_penalty": 1.0,
    "early_stopping": True
}

output_ids = model.generate(pixel_values, **gen_kwargs)
caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Generated Caption: {caption}")




Generated Caption: Exterior - rear elevation pool mechanical damage not consistent with storm related causes
