In [None]:
# Consolidated imports
import os
import random
import tarfile
import urllib.request
import pandas as pd
from PIL import Image
import torch
from transformers import BlipForConditionalGeneration, BlipProcessor
from tqdm import tqdm
from torchvision import transforms
import openai

In [None]:
# Root folder for the downloaded archives and everything extracted from them
base_path = "./dataset"
os.makedirs(name=base_path, exist_ok=True)

In [None]:
# Archive name -> download URL; every archive sits in the same bucket as "<name>.tar"
download_links = {
    split: f"https://food-x.s3.amazonaws.com/{split}.tar"
    for split in ("train", "annot", "val", "test")
}

In [None]:
# Download (if not already present) and extract each dataset archive
for name, url in download_links.items():
    file_path = os.path.join(base_path, f"{name}.tar")
    if not os.path.exists(file_path):
        print(f"Downloading {name}...")
        # Download to a temporary name and rename only on success, so an
        # interrupted download is never mistaken for a complete archive
        # by the os.path.exists() check on the next run.
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # Only still present if the download or the rename failed
            if os.path.exists(partial_path):
                os.remove(partial_path)
    else:
        print(f"{name}.tar already exists, skipping download.")

    print(f"Extracting {name}...")
    with tarfile.open(file_path, "r") as tar:
        # The archives come from the network: use the "data" extraction filter
        # (blocks members that would be written outside base_path) on Python
        # versions that provide it, and fall back to the old call otherwise.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(base_path, filter="data")
        else:
            tar.extractall(base_path)

Downloading train...
Extracting train...
Downloading annot...
Extracting annot...
Downloading val...
Extracting val...
Downloading test...
Extracting test...


In [None]:
# Paths (all under base_path) of the image folders and annotation files used below
train_set_path, val_set_path, test_set_path = (
    os.path.join(base_path, folder)
    for folder in ("train_set", "val_set", "test_set")
)
train_csv_path, val_csv_path, category_txt_path = (
    os.path.join(base_path, filename)
    for filename in ("train_info.csv", "val_info.csv", "class_list.txt")
)

In [None]:
# Load the pre-trained BLIP captioning model and its processor from the same checkpoint
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)
print(f"Model loaded on {device}")

Model loaded on cuda


In [None]:
# Load the training annotations and the category list.
# Neither file is read with a header row; column names are supplied explicitly.
train_info = pd.read_csv(
    train_csv_path,
    names=["image_name", "category_id"],
    header=None,
)
categories = pd.read_csv(
    category_txt_path,
    names=["id", "name"],
    header=None,
    sep=" ",
)

In [None]:
# Image preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalize each channel with the given mean / std.
# NOTE(review): none of the cells visible in this notebook pass images through
# `transform` (captioning feeds PIL images to `processor`) - confirm it is still needed.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

In [None]:
# Caption generation function
def generate_caption(image_path, max_length=50):
    """Generate a BLIP caption for a single image file.

    Args:
        image_path: Path of the image to caption.
        max_length: Maximum generated sequence length passed to
            ``model.generate``. Defaults to 50, the value that used to be
            hard-coded.

    Returns:
        The decoded caption string, or ``None`` if anything goes wrong. The
        error is printed and swallowed on purpose so that one unreadable file
        does not abort a long captioning run.
    """
    try:
        # The context manager releases the file handle deterministically;
        # convert() returns an independent RGB copy that outlives the `with`.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length)

        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

In [None]:
# Generate captions for training set
print("Generating captions for training set...")

# Build the id -> name lookup once instead of scanning `categories` for every image.
# drop_duplicates keeps the first row per id, which is the row the previous
# per-image `.values[0]` lookup would have picked.
category_names = categories.drop_duplicates(subset="id").set_index("id")["name"].to_dict()

# Fail before the (slow) captioning starts if any label has no category name,
# rather than crashing part-way through the run with an IndexError.
known_mask = train_info["category_id"].isin(list(category_names))
if not known_mask.all():
    missing_ids = train_info.loc[~known_mask, "category_id"].unique().tolist()
    raise ValueError(f"category_id values missing from class list: {missing_ids}")

captions = []
for image_name, category_id in tqdm(
    zip(train_info["image_name"], train_info["category_id"]),
    total=len(train_info),
    desc="Training Set Progress",
):
    caption = generate_caption(os.path.join(train_set_path, image_name))
    # generate_caption returns None on failure; such images are skipped
    if caption:
        captions.append(
            {
                "image_name": image_name,
                "caption": caption,
                "category": category_names[category_id],
            }
        )

In [None]:
# Generate and save captions for a folder of images without category labels
def process_and_save_captions(image_list, set_path, output_path, desc):
    """Caption every image in ``image_list`` and write the results to a CSV.

    Args:
        image_list: Iterable of image file names relative to ``set_path``.
        set_path: Directory containing the images.
        output_path: Destination CSV path.
        desc: Label shown on the tqdm progress bar.

    The CSV has the columns ``image_name`` and ``caption``. Images for which
    ``generate_caption`` returns a falsy value (e.g. ``None`` on error) are
    left out.
    """
    captions = []
    for image_name in tqdm(image_list, desc=desc):
        caption = generate_caption(os.path.join(set_path, image_name))
        if caption:
            captions.append({"image_name": image_name, "caption": caption})
    # Explicit columns: even with zero captions the CSV gets a header row, so a
    # later pd.read_csv() yields an empty frame instead of raising EmptyDataError.
    pd.DataFrame(captions, columns=["image_name", "caption"]).to_csv(output_path, index=False)

In [None]:
# Test set captions
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the output CSV is the same on every run.
test_images = sorted(os.listdir(test_set_path))
process_and_save_captions(
    test_images,
    test_set_path,
    os.path.join(base_path, "image_dataset_with_metadata.csv"),
    "Test Set Progress",
)

print("Caption generation complete.")

In [None]:
# Take the OpenAI API key from the environment; the result is None when the variable is unset
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# The captioning step writes this CSV under base_path, so look there first.
# Fall back to the working directory for setups that placed the file there.
metadata_csv = os.path.join(base_path, 'image_dataset_with_metadata.csv')
if not os.path.exists(metadata_csv):
    metadata_csv = 'image_dataset_with_metadata.csv'
data = pd.read_csv(metadata_csv)

def generate_caption(metadata):
    """Ask the chat model to turn a short caption into a food description.

    NOTE(review): this re-binds the name ``generate_caption`` that an earlier
    cell uses for the BLIP image captioner; after this cell runs, that earlier
    function is no longer reachable under this name.
    NOTE(review): ``openai.ChatCompletion`` appears to be the legacy (pre-1.0)
    SDK interface - confirm the installed ``openai`` version supports it.

    Args:
        metadata: Caption text interpolated into the prompt.

    Returns:
        The stripped text of the first completion choice, or a string
        starting with ``"Error generating caption: "`` if any exception is
        raised while calling the API or reading its response.
    """
    prompt = f"Caption: {metadata}. Create a meaningful food description according to this caption."
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=chat_messages,
            max_tokens=50,
        )
        first_choice = response.choices[0]
        return first_choice.message['content'].strip()
    except Exception as err:
        # Best effort: the error text is returned in place of a caption
        return f"Error generating caption: {err}"


# Rewrite every stored caption via generate_caption and keep the result in a new column
rewritten = data['caption'].apply(generate_caption)
data['meaningful-caption'] = rewritten

# Persist the enriched table (no index column) in the working directory
output_path = 'image_dataset_with_captions.csv'
data.to_csv(path_or_buf=output_path, index=False)

print(f"Captions generated and saved to {output_path}")
