In [2]:
from pathlib import Path
import os
from datasets import load_dataset

In [3]:
# Hugging Face dataset identifier used for the download below.
DATASET_NAME = "nlphuji/flickr30k"

# Cache location: <parent of the working directory>/local_only/data.
# Kept as a plain string, which is what the rest of the notebook receives.
CACHE_DIR = str(Path.cwd().resolve().parents[0].joinpath("local_only", "data"))
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
CACHE_DIR

'/mnt/f/chatbot_ui_v4/local_only/data'

In [4]:
def download_flickr30k_dataset(dataset_name, download_location=None):
    """Load a dataset by name through ``load_dataset``.

    Args:
        dataset_name: Dataset identifier handed to ``load_dataset``.
        download_location: Forwarded as ``cache_dir``; ``None`` (the default)
            is passed through unchanged.

    Returns:
        The object returned by ``load_dataset``.
    """
    return load_dataset(dataset_name, cache_dir=download_location)

In [5]:
# Fetch the dataset, caching its files under CACHE_DIR.
dataset = download_flickr30k_dataset(
    dataset_name=DATASET_NAME,
    download_location=CACHE_DIR,
)

In [7]:
# Display the loaded object's summary (splits, feature names, row counts).
dataset

DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 31014
    })
})

In [8]:
from pymongo import MongoClient, ASCENDING, UpdateOne

def get_mongo_db_client(uri="mongodb://localhost:27017/"):
    """Create a MongoClient and verify the server answers a ``ping``.

    Args:
        uri: MongoDB connection string. Defaults to the local instance the
            notebook has always used, so existing no-argument calls still work.

    Returns:
        The connected ``MongoClient``.

    Raises:
        ConnectionError: If the ping reply does not report ``ok == 1.0``.
        Any exception raised by ``MongoClient`` or by the ``ping`` command
        itself propagates unchanged.
    """
    client = MongoClient(uri)
    # Explicit check instead of `assert`: asserts are removed under `python -O`,
    # which would skip the ping entirely.
    ping_reply = client.admin.command("ping")
    # Only inspect the "ok" field; comparing the whole reply to {'ok': 1.0}
    # would fail if the server includes any additional fields in the reply.
    if ping_reply.get("ok") != 1.0:
        client.close()
        raise ConnectionError(f"MongoDB ping returned an unexpected reply: {ping_reply!r}")
    return client

In [9]:
# Open the shared MongoDB connection used by the rest of the notebook.
mongo_client  = get_mongo_db_client()

In [12]:
# List the databases visible to this connection.
mongo_client.list_database_names()

['admin',
 'agent_evaluation_db',
 'chatbot_ui_v3',
 'chatbot_ui_v4',
 'config',
 'local']

In [14]:
database = mongo_client['chatbot_ui_v4']  # database holding the caption collection
# List its collections to confirm what is available.
database.list_collection_names()

['caption']

In [22]:
# Caption documents; the sample printed further down shows the fields
# sent_id, caption, filename and img_id.
collection = database['caption']
# Total number of documents (empty filter matches everything).
collection.count_documents({})

155070

In [18]:
# Peek at the first 10 documents to check the schema.
docs = collection.find().limit(10)
for doc in docs:
    print(doc)

{'_id': ObjectId('686b569ddc158f44299de34d'), 'sent_id': '0', 'caption': 'Two young guys with shaggy hair look at their hands while hanging out in the yard.', 'filename': '1000092795.jpg', 'img_id': '0'}
{'_id': ObjectId('686b569ddc158f44299de34e'), 'sent_id': '1', 'caption': 'Two young, White males are outside near many bushes.', 'filename': '1000092795.jpg', 'img_id': '0'}
{'_id': ObjectId('686b569ddc158f44299de34f'), 'sent_id': '2', 'caption': 'Two men in green shirts are standing in a yard.', 'filename': '1000092795.jpg', 'img_id': '0'}
{'_id': ObjectId('686b569ddc158f44299de350'), 'sent_id': '3', 'caption': 'A man in a blue shirt standing in a garden.', 'filename': '1000092795.jpg', 'img_id': '0'}
{'_id': ObjectId('686b569ddc158f44299de351'), 'sent_id': '4', 'caption': 'Two friends enjoy time spent together.', 'filename': '1000092795.jpg', 'img_id': '0'}
{'_id': ObjectId('686b569ddc158f44299de352'), 'sent_id': '5', 'caption': 'Several men in hard hats are operating a giant pulley 

In [1]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Embedding model shared by keyword extraction and by the keyword embeddings
# computed later. local_files_only=True — presumably requires the model to be
# present in the local cache already (no download); confirm against the
# sentence-transformers docs.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2", local_files_only=True)
# KeyBERT extractor backed by the same sentence model.
kw_model = KeyBERT(model=sentence_model)

In [20]:
%%time
all_keywords = []

# Timing probe on a sample: first 100 caption documents, top two keywords each.
for record in collection.find().limit(100):
    caption = record['caption']
    keywords = kw_model.extract_keywords(caption, stop_words='english', top_n=2)
    # Each result entry is indexed as kw[0] for the keyword text.
    for kw in keywords:
        all_keywords.append(kw[0])

CPU times: user 1.1 s, sys: 109 ms, total: 1.2 s
Wall time: 1.18 s


In [32]:
# Collapse repeated keywords across captions into a set of distinct keywords.
unique_keywords = set(all_keywords)

In [47]:
keyword_caption_pairs = []  # (keyword, caption) tuples, in extraction order

for record in collection.find().limit(100):
    caption = record['caption']
    keywords = kw_model.extract_keywords(
        caption,
        keyphrase_ngram_range=(1, 2),  # candidate phrases of one or two words
        stop_words='english',
        top_n=2,
        use_maxsum=True,  # optional; intended to diversify the selected keywords
        nr_candidates=20,
    )
    # Keep only the keyword text (kw[0]) and pair it with its source caption.
    keyword_caption_pairs.extend((kw[0], caption) for kw in keywords)


In [48]:
# Inspect the extracted (keyword, caption) pairs.
keyword_caption_pairs

[('look',
  'Two young guys with shaggy hair look at their hands while hanging out in the yard.'),
 ('hanging yard',
  'Two young guys with shaggy hair look at their hands while hanging out in the yard.'),
 ('near', 'Two young, White males are outside near many bushes.'),
 ('males outside', 'Two young, White males are outside near many bushes.'),
 ('yard', 'Two men in green shirts are standing in a yard.'),
 ('shirts standing', 'Two men in green shirts are standing in a yard.'),
 ('shirt standing', 'A man in a blue shirt standing in a garden.'),
 ('garden', 'A man in a blue shirt standing in a garden.'),
 ('spent', 'Two friends enjoy time spent together.'),
 ('friends enjoy', 'Two friends enjoy time spent together.'),
 ('hard', 'Several men in hard hats are operating a giant pulley system.'),
 ('hats operating',
  'Several men in hard hats are operating a giant pulley system.'),
 ('equipment', 'Workers look down from up above on a piece of equipment.'),
 ('workers look', 'Workers look 

In [34]:
# Keyword texts in the same order as keyword_caption_pairs, so that position i
# of the embeddings corresponds to keyword_caption_pairs[i].
keywords = [keyword for keyword, _ in keyword_caption_pairs]

keyword_embeddings = sentence_model.encode(keywords, show_progress_bar=True)


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [37]:
import hdbscan

# Cluster the keyword embeddings; clusters need at least 5 members.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')
# One label per embedding; the grouping step below treats label -1 as noise.
cluster_labels = clusterer.fit_predict(keyword_embeddings)

In [39]:
from collections import defaultdict

# Labels are matched to (keyword, caption) pairs purely by position, so both
# sequences must come from the same run. If keyword_caption_pairs was rebuilt
# after the embeddings/clustering cells ran, the pairing would be silently
# wrong — fail loudly instead.
if len(cluster_labels) != len(keyword_caption_pairs):
    raise ValueError(
        f"cluster_labels has {len(cluster_labels)} entries but keyword_caption_pairs "
        f"has {len(keyword_caption_pairs)}; re-run the embedding and clustering cells."
    )

# cluster label -> list of (keyword, caption) tuples assigned to that cluster
clusters = defaultdict(list)

for label, (keyword, caption) in zip(cluster_labels, keyword_caption_pairs):
    if label != -1:  # Ignore noise
        clusters[label].append((keyword, caption))


In [40]:
from collections import Counter

# One search hint per cluster: the cluster's most frequent keyword plus up to
# three distinct example captions.
search_hints = []

for cluster_id, items in clusters.items():
    keyword_counts = Counter(kw for kw, _ in items)
    # most_common(1) yields [(keyword, count)]; on ties the keyword seen first wins.
    most_common_keyword = keyword_counts.most_common(1)[0][0]
    # The same caption can appear several times in a cluster (once per keyword
    # extracted from it); dict.fromkeys drops repeats while preserving order so
    # the examples are not duplicates of each other.
    distinct_captions = list(dict.fromkeys(caption for _, caption in items))
    search_hints.append({
        "hint": most_common_keyword,
        "examples": distinct_captions[:3],  # Show few example captions
    })


In [41]:
# Inspect the generated hints and their example captions.
search_hints

[{'hint': 'tractor',
  'examples': ['Two young guys with shaggy hair look at their hands while hanging out in the yard.',
   'Two men in green shirts are standing in a yard.',
   'A man in a blue shirt standing in a garden.']},
 {'hint': 'women',
  'examples': ['Two young, White males are outside near many bushes.',
   'Workers look down from up above on a piece of equipment.',
   'Four men on top of a tall structure.']},
 {'hint': 'green',
  'examples': ['Two men in green shirts are standing in a yard.',
   'A man in green holds a guitar while the other man observes his shirt.',
   'A man in a neon green and orange uniform is driving on a green tractor.']},
 {'hint': 'guitar',
  'examples': ['Workers look down from up above on a piece of equipment.',
   'Two people in the photo are playing the guitar and the other is poking at him.',
   'A man in green holds a guitar while the other man observes his shirt.']},
 {'hint': 'stairs',
  'examples': ['A child in a pink dress is climbing up 