In [1]:
from typing_extensions import TypedDict, List, Optional, Union, Dict, Annotated, Literal
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, BaseMessage, AIMessage, trim_messages
from langchain_core.documents import Document
from langchain.tools.retriever import create_retriever_tool
from langgraph.graph import END, StateGraph, START, MessagesState
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent
# from team_tools import tavily_search_tool, arxiv_search_tool, web_scraper_tool, repl_tool
from qdrant_cloud_ops import initialize_selfquery_retriever, qdrant_vector_store
from llm_chains import decomposition_chain, requires_decomposition, rephrase_chain, get_plan_chain, assign_chat_topic, memory_decision_chain
from dotenv import load_dotenv
from pprint import pprint
import re
import functools
import operator
from langgraph.checkpoint.memory import MemorySaver

from langgraph.store.base import BaseStore
from langgraph.store.memory import InMemoryStore

from token_counter import tiktoken_counter


from datetime import datetime
import os

# Load variables from a local .env file into the process environment.
# Presumably this is where the Groq / Qdrant credentials come from — confirm.
load_dotenv()

# LangGraph in-memory store and checkpointer. Neither object is referenced again
# in the cells visible here; the later store experiments create their own `store`.
in_memory_store = InMemoryStore()
memory = MemorySaver()


# Shared chat model handed to every chain factory and to the self-query retriever.
# temperature=0.0 is requested for all calls.
# NOTE(review): this is a "preview" model id — confirm it is still served by Groq.
llm = ChatGroq(model='llama-3.2-90b-vision-preview', temperature=0.0)

def _pdf_name_filter(*pdf_names):
    """Return the filter expression matching any of the given PDF file names.

    One name yields ``eq("pdf_name", "<name>")``; several names are wrapped in
    ``or(...)`` with the individual ``eq`` clauses separated by ", ".
    """
    clauses = [f'eq("pdf_name", "{pdf_name}")' for pdf_name in pdf_names]
    if len(clauses) == 1:
        return clauses[0]
    return f'or({", ".join(clauses)})'


# (user question, expected semantic query, PDF names the filter must target)
_EXAMPLE_SPECS = (
    (
        "what is neural_networks.pdf talking about",
        "neural networks",
        ("neural_networks.pdf",),
    ),
    (
        "Can you tell me something about the file cancer_research_study.pdf?",
        "cancer research",
        ("cancer_research_study.pdf",),
    ),
    (
        "I need to know what ethics_in_ai.pdf says on ethical concerns.",
        "ethical concerns",
        ("ethics_in_ai.pdf",),
    ),
    (
        "Find me anything related to quantum_computing_paper.pdf",
        "quantum computing",
        ("quantum_computing_paper.pdf",),
    ),
    (
        "Are there any references to Einstein's theories in physics_papers.pdf?",
        "Einstein's theories",
        ("physics_papers.pdf",),
    ),
    (
        "What's in the climate_change_analysis.pdf about global warming?",
        "global warming",
        ("climate_change_analysis.pdf",),
    ),
    (
        "Show me papers discussing blockchain from blockchain_articles.pdf",
        "blockchain technology",
        ("blockchain_articles.pdf",),
    ),
    (
        "Can you retrieve sections of deep_learning_basics.pdf?",
        "deep learning",
        ("deep_learning_basics.pdf",),
    ),
    (
        "Who authored machine_learning_review.pdf and neural_network_overview.pdf?",
        "authors",
        ("machine_learning_review.pdf", "neural_network_overview.pdf"),
    ),
    (
        "Give me the details from data_analysis_guide.pdf",
        "data analysis",
        ("data_analysis_guide.pdf",),
    ),
)

# Few-shot examples passed to the self-query retriever: each entry pairs a user
# question with the structured query (search text + pdf_name filter) expected for it.
examples = [
    (question, {"query": query, "filter": _pdf_name_filter(*pdf_names)})
    for question, query, pdf_names in _EXAMPLE_SPECS
]

# Both trimmers count tokens with the same counter, so bind it once.
_build_trimmer = functools.partial(trim_messages, token_counter=tiktoken_counter)

# Tail-oriented trimmer (strategy "last") with a 5984-token budget; the system
# message is retained and messages are never cut part-way.
# NOTE(review): the 5984 budget is unexplained here — confirm against the model's context size.
trimmer = _build_trimmer(
    strategy="last",
    max_tokens=5984,
    include_system=True,
    allow_partial=False,
)

# Head-oriented trimmer (strategy "first") with a 1500-token budget; a message may
# be cut part-way. Used by basic_metadata_extraction_chain on the rendered prompt.
trimmer_first = _build_trimmer(
    strategy="first",
    max_tokens=1500,
    allow_partial=True,
)

# Build the self-query retriever over the Qdrant vector store, passing the shared
# LLM and the few-shot `examples` defined above to the project helper.
qdrant_retriever = initialize_selfquery_retriever(llm, qdrant_vector_store=qdrant_vector_store, examples=examples)
# Expose the retriever as a tool; `name` and `description` are the text handed to
# the tool definition.
# NOTE(review): this cell's output shows a warning attributed to this `as_tool(` call —
# presumably LangChain's beta-API warning; confirm and decide whether to keep `as_tool`.
qdrant_retriever_tool = qdrant_retriever.as_tool(
    name="retrieve_research_paper_texts",
    description="Search and return information from the vector database containing texts of several research papers, and scholarly articles. optionally, align the search process based on pdf name (.pdf file) if given.",
)



Started Qdrant client.
Collection 'aireas-cloud' already exists.


  qdrant_retriever_tool = qdrant_retriever.as_tool(


In [2]:
# Instantiate the project chains from llm_chains, all bound to the same shared `llm`.
# What each chain returns is defined in llm_chains and is not visible in this notebook.
# `memory_decision_chain` is imported at the top but not instantiated here.
decomposer_chain = decomposition_chain(llm=llm)
check_query_chain = requires_decomposition(llm=llm)
rephraser_chain = rephrase_chain(llm=llm)
planner_chain = get_plan_chain(llm=llm)
assign_topic_chain = assign_chat_topic(llm=llm)


In [3]:
# k = create_react_agent(llm, tools=[retriever_tool])


# for s in k.stream({'messages': [HumanMessage('title of s.pdf')]}, stream_mode='values'):
#   print(s['messages'][-1])


In [4]:
# Structured-output schema for paper metadata; passed to `llm.with_structured_output`
# in basic_metadata_extraction_chain. The string in each `Annotated[...]` is the
# per-field description.
# Documented with `#` comments rather than a class docstring on purpose:
# a docstring may be picked up as part of the schema sent to the model — TODO confirm.
# NOTE(review): publish_date is Optional[str], but the extraction prompt asks the model
# to write "Publication date not available." when no date is found, so None is
# presumably never produced — confirm which convention downstream code expects.
class PaperMetadata(TypedDict):
    title: Annotated[str, "The title or heading of the research paper."]
    authors: Annotated[List[str], "The authors of the research paper."]
    publish_date: Annotated[Optional[str], "The publication date of the research paper in the format YYYY-MM-DD."]
    description: Annotated[str, "A concise description (2-3 sentences) summarizing the content of the research paper."]

def basic_metadata_extraction_chain(llm):
    """
    Build a runnable that extracts bibliographic metadata from a paper's opening text.

    The pipeline renders a single system prompt from the ``content`` input, passes the
    rendered prompt through the module-level ``trimmer_first``, and sends the result to
    ``llm`` configured for ``PaperMetadata`` structured output.

    Args:
        llm: Chat model exposing ``with_structured_output``.

    Returns:
        Runnable: ``prompt | trimmer_first | structured llm``; invoke it with
        ``{"content": <text>}``.
    """
    # Instructions sent as the system message; ``{content}`` is filled in at invoke time.
    extraction_instructions = '''
    You will be provided with the initial content of a research paper. Your task is to extract the following metadata accurately. 
    If you are uncertain about the answer or cannot find the information, populate the field with a suitable comment explaining why the information is unavailable.

    The details to extract are as follows:
    1. **Title of the paper:** Provide the exact title as it appears in the content. If the title is unclear, state "Title not found in the provided content."
    2. **List of authors:** Extract all authors listed. If no authors are mentioned, state "Authors not mentioned in the provided content."
    3. **Publication date:** Provide the date in the format YYYY-MM-DD. If no date is found, state "Publication date not available."
    4. **Description:** Summarize the paper’s content in 2-3 concise sentences. If the description cannot be inferred, state "Insufficient information to provide a description."

    Use this structure to ensure clarity and completeness. If you need to make assumptions, mention them explicitly in the output.

    Content:
    {content}
    '''

    metadata_prompt = ChatPromptTemplate.from_messages([('system', extraction_instructions)])

    # Constrain the model's reply to the PaperMetadata schema.
    metadata_llm = llm.with_structured_output(PaperMetadata)

    # Trim the rendered prompt before it reaches the model.
    return metadata_prompt | trimmer_first | metadata_llm

In [14]:
# import fitz

# bmc = basic_metadata_extraction_chain(llm=llm)


# def extract_text_until_introduction(text):
#     index = text.lower().find('introduction')
#     if index != -1:
#         return text[:index]
#     else:
#         return text

# for f in os.listdir(os.path.join('test_dir')):
#   print(f, '\n')

#   pdf_document = fitz.open(os.path.join('test_dir', f))

#   text = ""
  
#   for page in pdf_document:
#       text += page.get_text()
#   pdf_document.close()

#   extracted_text = extract_text_until_introduction(text)

#   # print(extracted_text)

#   result = bmc.invoke(extracted_text)
#   print(result)

In [15]:
import os
import fitz

def extract_text_until_introduction(text):
    """
    Return the part of ``text`` that precedes the first 'introduction' (any casing).

    Args:
        text: Full text extracted from a document.

    Returns:
        str: Everything before the first occurrence of 'introduction', or ``text``
        unchanged when the word does not occur.
    """
    index = text.lower().find('introduction')
    return text[:index] if index != -1 else text


def process_files_with_chain(llm, chain, files):
    """
    Process a list of PDF files, extract metadata using the given chain, and handle errors gracefully.

    For each path the PDF is opened with PyMuPDF, the text of all pages is concatenated,
    cut at the first 'introduction', and passed to ``chain.invoke({"content": ...})``.
    A failure on one file is recorded and does not stop the remaining files.

    Args:
        llm: The language model used for metadata extraction. Not read by this function;
            kept so existing callers keep working.
        chain: The chain to invoke for metadata extraction.
        files: A list of file paths to process (from FastAPI or similar sources).

    Returns:
        dict: A dictionary where keys are file names (path basenames) and values are either
        the chain's result or ``{"error": <message>}``. Paths that share a basename
        overwrite each other.
    """
    results = {}

    for file_path in files:
        # Fallback key: guarantees the except-branch never sees an unbound name or the
        # previous iteration's file name if basename() itself fails.
        file_name = str(file_path)
        try:
            file_name = os.path.basename(file_path)
            print(f"Processing file: {file_name}")

            pdf_document = fitz.open(file_path)
            try:
                text = "".join(page.get_text() for page in pdf_document)
            finally:
                # Release the file handle even when text extraction fails.
                pdf_document.close()

            # Only the front matter (before 'Introduction') is sent to the chain.
            extracted_text = extract_text_until_introduction(text)

            results[file_name] = chain.invoke({"content": extracted_text})

        except Exception as e:
            # Best-effort batch: record the failure for this file and continue.
            error_message = f"Error processing {file_path}: {str(e)}"
            print(error_message)
            results[file_name] = {"error": error_message}

    return results


In [16]:
# # from langgraph.store.memory import InMemoryStore
# from langgraph.store.postgres import *
# from langchain_openai import OpenAIEmbeddings
# import langgraph


# help(langgraph.store.postgres)

In [46]:
# Sample extraction results keyed by PDF file name; each value carries the
# PaperMetadata fields (title, authors, publish_date, description).
# Note the two publish_date placeholders differ by a trailing period.
metadata = {
    "s.pdf": {
        "title": "EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation",
        "authors": ["Rang Meng", "Xingyu Zhang", "Yuming Li", "Chenguang Ma"],
        "publish_date": "Publication date not available",
        "description": (
            "This paper proposes EchoMimicV2, a half-body human animation method that simplifies unnecessary conditions and achieves striking animation quality by leveraging a novel Audio-Pose Dynamic Harmonization strategy. "
            "The method surpasses existing methods in both quantitative and qualitative evaluations."
        ),
    },
    "sm.pdf": {
        "title": "SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory",
        "authors": [
            "Cheng-Yen Yang",
            "Hsiang-Wei Huang",
            "Wenhao Chai",
            "Zhongyu Jiang",
            "Jenq-Neng Hwang",
        ],
        "publish_date": "Publication date not available.",
        "description": (
            "This paper introduces SAMURAI, an adaptation of the Segment Anything Model 2 for visual object tracking, which incorporates temporal motion cues to achieve robust and accurate tracking without retraining or fine-tuning. "
            "SAMURAI demonstrates strong zero-shot performance across diverse benchmark datasets."
        ),
    },
}

In [47]:
# Bare expression: displays the sample metadata dict as the cell output.
metadata

{'s.pdf': {'title': 'EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation',
  'authors': ['Rang Meng', 'Xingyu Zhang', 'Yuming Li', 'Chenguang Ma'],
  'publish_date': 'Publication date not available',
  'description': 'This paper proposes EchoMimicV2, a half-body human animation method that simplifies unnecessary conditions and achieves striking animation quality by leveraging a novel Audio-Pose Dynamic Harmonization strategy. The method surpasses existing methods in both quantitative and qualitative evaluations.'},
 'sm.pdf': {'title': 'SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory',
  'authors': ['Cheng-Yen Yang',
   'Hsiang-Wei Huang',
   'Wenhao Chai',
   'Zhongyu Jiang',
   'Jenq-Neng Hwang'],
  'publish_date': 'Publication date not available.',
  'description': 'This paper introduces SAMURAI, an adaptation of the Segment Anything Model 2 for visual object tracking, which incorporates temporal motion cues to ach

In [42]:
from langgraph.store.base import BaseStore
from langgraph.store.memory import InMemoryStore


# Fresh in-memory store for the experiments below; this is a separate object from
# `in_memory_store` created in the first cell.
store = InMemoryStore()

# Placeholder identifiers used as namespace components when writing to / searching the store.
user_id = '1'
conversation_id = '1'



In [48]:
# One store entry per PDF, all under the same (user, conversation) namespace;
# the entry key is the file name prefixed with 'metadata_'.
metadata_namespace = ('conversations_metadata', user_id, conversation_id)

for k, v in metadata.items():
    store.put(metadata_namespace, key=f'metadata_{k}', value=v)

In [49]:
# Fetch every item stored under this user's conversation namespace.
l = store.search(('conversations_metadata', user_id, conversation_id))

# Convert each returned item to a plain dict (value, key, namespace, timestamps, score
# per the output below) and display the list as the cell output.
n = [i.dict() for i in l]
n

[{'value': {'title': 'EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation',
   'authors': ['Rang Meng', 'Xingyu Zhang', 'Yuming Li', 'Chenguang Ma'],
   'publish_date': 'Publication date not available',
   'description': 'This paper proposes EchoMimicV2, a half-body human animation method that simplifies unnecessary conditions and achieves striking animation quality by leveraging a novel Audio-Pose Dynamic Harmonization strategy. The method surpasses existing methods in both quantitative and qualitative evaluations.'},
  'key': 'metadata_s.pdf',
  'namespace': ['conversations_metadata', '1', '1'],
  'created_at': '2024-12-14T10:51:47.560337+00:00',
  'updated_at': '2024-12-14T10:51:47.560340+00:00',
  'score': None},
 {'value': {'title': 'SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory',
   'authors': ['Cheng-Yen Yang',
    'Hsiang-Wei Huang',
    'Wenhao Chai',
    'Zhongyu Jiang',
    'Jenq-Neng Hwang'],
   'publish