## Step 1: Preparing Data

In [None]:
from bs4 import BeautifulSoup
import os
from functions import *

from PIL import Image
from transformers import CLIPProcessor, CLIPModel

from torch import cat, save

In [None]:
## load the data
# Extract text and images from every raw HTML article.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order; a stable order keeps the content lists reproducible between runs.
# Non-file entries (e.g. sub-directories) are skipped since open() cannot
# read them.
filename_list = sorted(
    "raw/" + f for f in os.listdir('raw') if os.path.isfile("raw/" + f)
)

text_content_list = []
image_content_list = []
for filename in filename_list:

    with open(filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # parse_html_content / parse_html_images come from the project-local
    # `functions` module; each is used here as returning a list of dicts.
    text_content_list.extend(parse_html_content(html_content))
    image_content_list.extend(parse_html_images(html_content))

print(len(text_content_list))
print(len(image_content_list))

# Character budget for each text passed to CLIP. This is a cap on characters,
# not tokens: it only keeps inputs short, it does not by itself guarantee
# that the tokenized text fits the model's context window.
MAX_TEXT_CHARS = 256

text_list = []
for content in text_content_list:
    # prefix the paragraph with its section header
    section = content['section'] + ": "
    # Fill the remaining budget with paragraph text. Clamp at zero: a
    # negative slice bound would keep nearly the whole paragraph when the
    # header alone already exceeds the budget.
    remaining = max(0, MAX_TEXT_CHARS - len(section))
    text_list.append(section + content['text'][:remaining])

image_list = []
for content in image_content_list:
    # Image.open() is lazy and keeps the file open; copy the pixel data and
    # close the handle right away so many images do not exhaust descriptors.
    with Image.open(content['image_path']) as img:
        image_list.append(img.copy())

print(len(text_list))
print(len(image_list))

In [None]:
## Compute embeddings using CLIP
# Local import so this cell stays self-contained; only inference is needed.
from torch import no_grad

# import model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")

# import processor (handles text tokenization and image preprocessing)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

# pre-process text and images
# truncation=True clips any text that tokenizes to more than the model's
# maximum text length, instead of failing inside the forward pass.
inputs = processor(
    text=text_list,
    images=image_list,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

# compute embeddings with CLIP
# no_grad(): the embeddings are only stored, never trained on, so skip
# building the autograd graph.
with no_grad():
    outputs = model(**inputs)

# store embeddings in single torch tensors (one row per text / per image)
text_embeddings = outputs.text_embeds
image_embeddings = outputs.image_embeds
print(text_embeddings.shape)
print(image_embeddings.shape)

# Save Data
# make sure the output folder exists before writing into it
os.makedirs('data', exist_ok=True)

# save content list as JSON
save_to_json(text_content_list, output_file='data/text_content.json')
save_to_json(image_content_list, output_file='data/image_content.json')
# save embeddings to file
save(text_embeddings, 'data/text_embeddings.pt')
save(image_embeddings, 'data/image_embeddings.pt')

## Step 2: Multimodal Article Question Answering Assistant

In [None]:
import json
from functions import *
from transformers import CLIPProcessor, CLIPModel
from torch import load, matmul, argsort
from torch.nn.functional import softmax

from IPython.display import Image

import ollama

In [None]:
# Restore the article metadata (text chunks and image records) from JSON.
text_content_list = load_from_json('data/text_content.json')
image_content_list = load_from_json('data/image_content.json')

# Restore the embedding tensors; weights_only=True restricts unpickling
# to plain tensor data.
text_embeddings = load('data/text_embeddings.pt', weights_only=True)
image_embeddings = load('data/image_embeddings.pt', weights_only=True)

# Sanity check: report both tensor shapes, text first.
for embeddings in (text_embeddings, image_embeddings):
    print(embeddings.shape)

# Peek at one text record to confirm the JSON round-trip.
print(text_content_list[4])

In [None]:
# User question to answer. Swap in one of the commented alternatives below
# to try a different query.
query = "What is CLIP's contrastive loss function?"
# query = "What are the three paths described for making LLMs multimodal?"
# query = "What is an intuitive explanation of multimodal embeddings?"

# Embed the query with embed_text (imported from the project-local
# `functions` module). The result is multiplied against the stored
# embeddings in the search cells.
# NOTE(review): presumably uses the same CLIP checkpoint as Step 1 — confirm.
query_embed = embed_text(query)
# Print the shape as a quick sanity check.
print(query_embed.shape)

In [None]:
## Multimodal search
# Search settings: keep at most k results whose softmax score reaches the
# threshold; temp is the softmax temperature.
k = 5
threshold = 0.1
temp = 0.25

# Dot-product similarity between the query and every stored embedding.
text_similarities = query_embed @ text_embeddings.T
image_similarities = query_embed @ image_embeddings.T

# Rescale the similarities into scores with a temperature-scaled softmax.
text_scores = softmax(text_similarities / temp, dim=1)
image_scores = softmax(image_similarities / temp, dim=1)

# Rank the text items from best to worst score (row 0 is the single query).
isorted_scores = argsort(text_scores, descending=True)[0]
sorted_scores = text_scores[0][isorted_scores]

# Keep the indices that clear the threshold, then cap the list at k entries.
ranked_pairs = zip(isorted_scores.tolist(), sorted_scores.tolist())
itop_k_filtered = [
    position for position, value in ranked_pairs if value >= threshold
][:k]
top_k = [text_content_list[position] for position in itop_k_filtered]

# Bare expression so the notebook renders the selected records.
top_k

In [None]:
## text and image search
# similarity_search comes from the project-local `functions` module; it is
# used here as returning a (results, scores) pair for each modality.
text_results, text_scores = similarity_search(query_embed, text_embeddings, text_content_list, k=15, threshold=0.01, temperature=0.25)
image_results, image_scores = similarity_search(query_embed, image_embeddings, image_content_list, k=5, threshold=0.25, temperature=0.5)

# Print the retrieved snippets, numbered from 1. enumerate() replaces the
# manual counter, and no emptiness check is needed inside the loop because
# the body never runs for an empty result list.
for i, text in enumerate(text_results, start=1):
    print(i, "-", text['text'])

# Render each retrieved image inline in the notebook.
for image in image_results:
    display(Image(filename=image['image_path']))

In [None]:
# Prompt Engineering
# Turn the retrieved results into markdown-style context blocks that are
# later pasted into the LLM prompt.

# One block per text result: article title, section, snippet, then a blank
# line. str.join builds each string in a single pass, and an empty result
# list naturally yields an empty string, so no per-item emptiness check is
# needed.
text_context = "".join(
    "".join((
        "**Article title:** ", hit['article_title'], "\n",
        "**Section:**  ", hit['section'], "\n",
        "**Snippet:** ", hit['text'], "\n\n",
    ))
    for hit in text_results
)

# One block per image result: article title, section, image path, caption,
# then a blank line.
image_context = "".join(
    "".join((
        "**Article title:** ", hit['article_title'], "\n",
        "**Section:**  ", hit['section'], "\n",
        "**Image Path:**  ", hit['image_path'], "\n",
        "**Image Caption:** ", hit['caption'], "\n\n",
    ))
    for hit in image_results
)

In [None]:
# prompt construction
# Assemble the prompt line by line: the query, both retrieved context
# blocks, and the instruction. The two trailing empty entries produce the
# blank line that ends the prompt.
prompt_lines = [
    f'Given the query "{query}" and the following relevant snippets:',
    "",
    f"{text_context}",
    f"{image_context}",
    "",
    "Please provide a concise and accurate answer to the query, incorporating relevant information from the provided snippets where available.",
    "",
    "",
]
prompt = "\n".join(prompt_lines)

In [None]:
# Prompt LLM
# Pull the vision model, then send a single user message to it.
llm_name = 'llama3.2-vision'
ollama.pull(llm_name)

# The message carries the text prompt plus the paths of the retrieved images.
user_message = {
    'role': 'user',
    'content': prompt,
    'images': [hit["image_path"] for hit in image_results],
}
response = ollama.chat(model=llm_name, messages=[user_message])

# Show only the text of the model's reply.
answer = response['message']['content']
print(answer)