In [1]:
! pip install langchain "unstructured[all-docs]" pydantic lxml openai chromadb tiktoken





# Data Loading

##### Partition PDF into tables, text, and images using the unstructured package

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/aashish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from typing import Any
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

input_path = os.getcwd()
output_path = os.path.join(os.getcwd(), "figures")

#get elements from pdf
raw_pdf_elements = partition_pdf(
    filename = os.path.join(input_path, "stylesocialevents.pdf"),
    extract_images_in_pdf =  True,
    infer_table_structure = True,
    chunking_strategy = "by_title",
    max_characters = 4000,
    new_after_n_chars = 3800,
    combine_text_under_n_chars = 2000,
    image_output_dir_path = output_path
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import base64

text_elements = []
table_elements = []
image_elements = []

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
    
for element in raw_pdf_elements:
    if 'CompositeElement' in str(type(element)):
        text_elements.append(element)
    elif 'Table' in str(type(element)):
        table_elements.append(element)

table_elements = [i.text for i in table_elements]
text_elements = [i.text for i in text_elements]

#Tables
print(len(table_elements))

#Text
print(len(text_elements))

7
17


In [5]:
for image_file in os.listdir(output_path):
    if image_file.endswith((".jpg", ".jpeg", ".png")):
        image_path = os.path.join(output_path, image_file)
        print(image_path)
        print()
        encoded_image = encode_image(image_path)
        image_elements.append(encoded_image)
print(len(image_elements))

/Users/aashish/Desktop/fashionAI/ragGPT4o/figures/figure-8-5.jpg

/Users/aashish/Desktop/fashionAI/ragGPT4o/figures/figure-5-1.jpg

/Users/aashish/Desktop/fashionAI/ragGPT4o/figures/figure-7-3.jpg

/Users/aashish/Desktop/fashionAI/ragGPT4o/figures/figure-7-4.jpg

/Users/aashish/Desktop/fashionAI/ragGPT4o/figures/figure-6-2.jpg

5


In [6]:
from PIL import Image
import io
import base64

def resize_image(base64_image, max_size=(800, 800)):
    """
    Resizes an image to fit within the specified max_size while maintaining aspect ratio.
    
    Parameters:
        base64_image (str): The base64-encoded image string.
        max_size (tuple): The maximum width and height in pixels.
    
    Returns:
        str: The base64-encoded string of the resized image.
    """
    # Decode the base64 image
    image_data = base64.b64decode(base64_image)
    image = Image.open(io.BytesIO(image_data))
    
    # Resize the image
    image.thumbnail(max_size)
    
    # Re-encode the resized image to JPEG
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    
    # Encode the resized image back to base64
    resized_base64_image = base64.b64encode(buffered.getvalue()).decode()
    
    return resized_base64_image

In [7]:
from langchain_openai import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
import os
import dotenv

dotenv.load_dotenv()


llm = ChatOpenAI(model="gpt-4o", max_tokens=1024, api_key=os.environ['OPENAI_API_KEY'])


#function for image summarization
def summarize_image(encoded_image):
    try:
        prompt = [
            AIMessage(content="You are a bot that is an expert in analyzing images."),
            HumanMessage(content=[
                {"type": "text", "text": "What is this image showing, be as detailed as possible."},
                {
                    "type" : "image_url",
                    "image_url" : {
                        "url" : f"data:image/jpeg;base64,{encoded_image}"
                    },
                }
            ])
        ]
        response = llm.invoke(prompt)
        return response
    except Exception as e:
        print(f"failed to process image: {e}")

#function for text summarization
def summarize_text(text_element):
    prompt = f"Summarize the following text: \n\n{text_element}\n\nSummary:"
    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content

#function for table summarization
def summarize_table(table_element):
    prompt = f"Summarize the following table: \n\n{table_element}\n\nSummary:"
    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content

In [8]:
#processing table elements with feedback and sleep
table_summaries = []
for i, te in enumerate(table_elements):
    summary = summarize_table(te)
    table_summaries.append(summary)
    print(f"{i + 1}th element of tables processed")

1th element of tables processed
2th element of tables processed
3th element of tables processed
4th element of tables processed
5th element of tables processed
6th element of tables processed
7th element of tables processed


In [9]:
#processing text elements with feedback and sleep
text_summaries = []
for i, te in enumerate(text_elements):
    summary = summarize_text(te)
    text_summaries.append(summary)
    print(f"{i + 1}th element of text processed")

1th element of text processed
2th element of text processed
3th element of text processed
4th element of text processed
5th element of text processed
6th element of text processed
7th element of text processed
8th element of text processed
9th element of text processed
10th element of text processed
11th element of text processed
12th element of text processed
13th element of text processed
14th element of text processed
15th element of text processed
16th element of text processed
17th element of text processed


In [10]:
#processing image elements with feedback and sleep
image_summaries = []
for i, te in enumerate(image_elements):
    te = resize_image(te)
    summary = summarize_image(te)
    image_summaries.append(summary)
    print(f"{i + 1}th element of image processed")

1th element of image processed
2th element of image processed
3th element of image processed
failed to process image: Error code: 400 - {'error': {'message': "You uploaded an unsupported image. Please make sure your image is below 20 MB in size and is of one the following formats: ['png', 'jpeg', 'gif', 'webp'].", 'type': 'invalid_request_error', 'param': None, 'code': 'image_parse_error'}}
4th element of image processed
5th element of image processed


# Multi-vector retriever

##### Summaries are used to retrieve raw tables and/or raw chunks of text

In [13]:
import uuid
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

In [14]:
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

In [15]:
store = InMemoryStore()
id_key = "doc_id"

In [16]:
retriever  = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key
)

In [19]:
def add_documents_to_retriever(summaries, original_contents):
    """
    Add documents to the retriever.

    Args:
        summaries (list): List of summaries.
        original_contents (list): List of original contents corresponding to the summaries.
    """
    # Generate unique IDs for each summary
    doc_ids = [str(uuid.uuid4()) for _ in summaries]

    # Create a list of Document objects with the summary content and the corresponding ID
    summary_docs = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(summaries)
    ]

    # Add the documents to the vector store
    retriever.vectorstore.add_documents(summary_docs)

    # Store the original contents in the in-memory store using the generated IDs as keys
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))

In [20]:
add_documents_to_retriever(table_summaries, table_elements)

In [21]:
add_documents_to_retriever(text_summaries, text_elements)

In [22]:
def add_documents_to_retriever_image(image_summaries, img_base64_list):
    """
    Add documents to the retriever for images.

    Args:
        image_summaries (list): List of summaries.
        img_base64_list (list): List of base64 encoded images corresponding to the summaries.
    """
    # Extract the content from each summary
    extracted_summaries = [summary.content if hasattr(summary, 'content') else str(summary) for summary in image_summaries]

    # Validate that all extracted summaries are strings
    assert all(isinstance(summary, str) for summary in extracted_summaries), "All extracted summaries must be strings"

    # Generate unique IDs for each image
    doc_ids = [str(uuid.uuid4()) for _ in img_base64_list]

    # Create a list of Document objects with the summary content and the corresponding ID
    summary_docs = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(extracted_summaries)
    ]

    # Add the documents to the vector store
    retriever.vectorstore.add_documents(summary_docs)

    # Store the original contents in the in-memory store using the generated IDs as keys
    retriever.docstore.mset(list(zip(doc_ids, img_base64_list)))

add_documents_to_retriever_image(image_summaries, image_elements)

In [24]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

template = """Answer the question based only on the following context, which can include text, images, and tables:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(temperature=0, model = "gpt-4o")

chain = (
    {"context" : retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [26]:
chain.invoke(
    "Using the knowledge base, can you recommend types of clothing to wear for a party?"
)

'Based on the provided context, the recommendation system described in the document can suggest suitable garments for a party by focusing on event-based recommendations. The system uses an event classifier to determine the appropriateness of garments for different social events, including parties. While the specific types of clothing are not detailed in the context, the system is designed to propose outfits that are compatible and suitable for the event in question, ensuring variety and style appropriateness.\n\nTherefore, for a party, the system would analyze the input garment and recommend complementary items that fit the party event category, ensuring that the overall outfit is stylish and appropriate for the occasion.'