In [None]:
#  Temel importlar
import os
import pandas as pd
import re
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt

# Directory layout
DATA_DIR = "../data/flickr30k"
IMAGE_DIR = os.path.join(DATA_DIR, "images")
CAPTION_FILE = os.path.join(DATA_DIR, "results.csv")  # or txt/json

# Caption mapping (image_id -> [caption1, caption2, ...])
mapping = defaultdict(list)

# Read the pipe-delimited caption table
df = pd.read_csv(CAPTION_FILE, delimiter='|')
print(df.head())

# The header may pad column names with spaces (the comment column is read as
# ' comment'), so resolve columns by their stripped name. `df` itself is left
# untouched so lookups by the raw column names keep working.
columns_by_name = {str(col).strip(): col for col in df.columns}
image_names = df[columns_by_name['image_name']]
comments = df[columns_by_name['comment']]

# Fill the mapping
for image_name, comment in zip(image_names, comments):
    # A malformed line (e.g. a missing '|') leaves NaN in a field; NaN is a
    # float and has no .strip(), so skip such rows instead of crashing.
    if pd.isna(image_name) or pd.isna(comment):
        continue
    img_id = str(image_name).strip().split('.')[0]  # drop the file extension
    caption = str(comment).strip()
    mapping[img_id].append(f"<start> {caption} <end>")

# Show one example entry
for k in list(mapping)[:1]:
    print(f"{k} => {mapping[k]}")


In [None]:
def clean_caption(text):
    """Normalize a caption string.

    Lower-cases the text, deletes every character that is not an ASCII
    letter, a digit or whitespace (so punctuation and angle brackets are
    removed), collapses each whitespace run into a single space and trims
    both ends.

    Args:
        text: The caption to normalize.

    Returns:
        The normalized caption.
    """
    lowered = text.lower()
    # Drop punctuation and any other non-alphanumeric, non-whitespace character.
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    # Squeeze whitespace runs, then trim the ends.
    return re.sub(r"\s+", " ", alnum_only).strip()

# Clean every caption in the mapping and wrap it in sequence markers.
# Captions may already be wrapped in "<start> ... <end>", and clean_caption()
# deletes "<" and ">", so cleaning them as-is would leave stray "start"/"end"
# words inside the caption. Strip any existing markers first; this also makes
# the cell safe to re-run without nesting markers.
for k in mapping:
    mapping[k] = [
        "<start> "
        + clean_caption(re.sub(r"^\s*<start>\s*|\s*<end>\s*$", "", cap))
        + " <end>"
        for cap in mapping[k]
    ]


In [None]:
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer

# Flatten the per-image caption lists into one list of caption strings.
all_captions = [cap for caps in mapping.values() for cap in caps]

# Keras' default `filters` include "<" and ">", which would index the
# "<start>" / "<end>" markers as the plain words "start" / "end". Pass the
# same punctuation set without the angle brackets so the markers stay intact.
tokenizer = Tokenizer(
    oov_token="<unk>",
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(all_captions)

# Save the tokenizer (to reload it later); create the target folder if needed
# so the write does not fail on a fresh checkout.
tokenizer_path = "utils/flickr30k_tokenizer.pkl"
os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
with open(tokenizer_path, "wb") as f:
    pickle.dump(tokenizer, f)

# Maximum caption length in whitespace-separated tokens (0 if no captions).
max_len = max((len(c.split()) for c in all_captions), default=0)
print("Max caption length:", max_len)
