In [1]:
import os
import streamlit as st
from dotenv import load_dotenv

from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_chroma import Chroma
import chromadb
from langchain import hub

from arxivsearcher.llm_agent import create_agent

# Load environment variables (python-dotenv)
load_dotenv()

# Runtime configuration, read once from the environment
CHROMADB_HOST = os.environ.get("CHROMADB_HOST")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
LLM_MODEL = os.environ.get("LLM_MODEL")
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
AGENT_PROMPT = os.environ.get("AGENT_PROMPT")

# Streamlit page setup: page config first, then the visible title
st.set_page_config(page_title="arXiv Researcher", page_icon="📚", layout="wide")
st.title("📚 arXiv Researcher")

# Component initialization
@st.cache_resource
def initialize_components():
    """Build the Chroma vector store and the agent executor used by the app.

    Reads the module-level configuration (``EMBEDDING_MODEL``, ``CHROMADB_HOST``,
    ``LLM_MODEL``, ``HUGGINGFACEHUB_API_TOKEN``, ``AGENT_PROMPT``). The Chroma
    server port is taken from the optional ``CHROMADB_PORT`` environment
    variable and defaults to 8000.

    Returns:
        tuple: ``(vectorstore, agent_executor)``.

    Raises:
        ValueError: If ``CHROMADB_PORT`` is set but is not an integer.
    """
    # Embedding model used by the vector store for queries
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # The port used to be hard-coded; keep 8000 as the default when the
    # variable is unset or empty so existing deployments behave the same.
    chroma_port = int(os.getenv("CHROMADB_PORT") or 8000)
    chroma_client = chromadb.HttpClient(host=CHROMADB_HOST, port=chroma_port)
    vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)

    # Hosted LLM endpoint
    llm = HuggingFaceEndpoint(
        repo_id=LLM_MODEL,
        temperature=0.5,
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        task="text-generation"
    )

    # Agent built from the vector store, the LLM and a prompt pulled from the hub
    prompt = hub.pull(AGENT_PROMPT)
    agent_executor = create_agent(vectorstore, llm, prompt)

    return vectorstore, agent_executor

# Build the shared components (the factory is wrapped in st.cache_resource)
vectorstore, agent_executor = initialize_components()
# Similarity retriever returning the 3 closest documents per query
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

2025-04-03 12:05:13.852 
  command:

    streamlit run /home/barilanne076/.pyenv/versions/arxiv_env/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke-test the retriever with a French query ("maladies rares" = "rare diseases")
results = retriever.invoke("maladies rares")

In [3]:
# Display the documents returned by the retriever
results

 Document(id='0709.3056', metadata={'authors': 'John Friedlander and Florian Luca', 'id': '0709.3056', 'title': 'Residue Classes Having Tardy Totients', 'year': '2007'}, page_content='residue classes having tardy totients   we show in an effective way that there exists a sequence of congruence\nclasses akpmod mk such that the minimal solution nnk of the\ncongruence phinequiv akpmod mk exists and satisfies log nklog\nmktoinfty  as ktoinfty here phin is the euler function this\nanswers a question raised in citefs we also show that every congruence\nclass containing an even integer contains infinitely many values of the\ncarmichael function lambdan and the least such n satisfies nll\nm13\n'),
 Document(id='0710.0325', metadata={'authors': 'Takayuki Matsuki, Toshiyuki Morii, and Kazutaka Sudoh', 'id': '0710.0325', 'title': 'Structure of Mass Gap between Two Spin Multiplets', 'year': '2007'}, page_content='structure of mass gap between two spin multiplets   studying our semirelativistic pot

In [7]:
# Authors of the top-ranked document.
# NOTE(review): the output recorded for this cell does not appear to match the
# first document shown by the earlier `results` display — presumably it was run
# against a different `results`; re-run after Restart & Run All to confirm.
results[0].metadata['authors']

'Grenville J. Croll, Raymond J. Butler'

In [None]:

# NOTE(review): `tab2` is not defined in any cell visible in this notebook —
# presumably it comes from an st.tabs(...) call elsewhere; confirm before running.
with tab2:
    st.header("Chat with the agent")

    # Create the chat history the first time this session reaches the tab
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the stored conversation
    for message in st.session_state.messages:
        speaker = st.chat_message(message["role"])
        with speaker:
            st.markdown(message["content"])

    # Input box for the user's next message
    prompt = st.chat_input("Ask your question...")
    if prompt:
        # Record the user's message, then echo it in the conversation
        user_entry = {"role": "user", "content": prompt}
        st.session_state.messages.append(user_entry)
        with st.chat_message("user"):
            st.markdown(prompt)

        # Ask the agent, show its answer and store it in the history
        with st.chat_message("assistant"), st.spinner("The agent is thinking..."):
            response = agent_executor.invoke({"input": prompt})
            st.markdown(response["output"])
            agent_entry = {"role": "assistant", "content": response["output"]}
            st.session_state.messages.append(agent_entry)

In [2]:
import os
import streamlit as st
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_chroma import Chroma
from langchain import hub

import arxivsearcher
from arxivsearcher.load_chroma import download_directory_from_gcs
from arxivsearcher.retrieval import search_articles
from arxivsearcher.llm_agent import create_agent

# Load environment variables (python-dotenv)
load_dotenv()

# Initial configuration
BUCKET_NAME = os.getenv("BUCKET_NAME")
GCS_PERSIST_PATH = os.getenv("GCS_PERSIST_PATH")
LOCAL_PERSIST_PATH = os.getenv("LOCAL_PERSIST_PATH")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
LLM_MODEL = os.getenv("LLM_MODEL")
# This cell originally read HUGGINGFACE_TOKEN while the other Streamlit setup
# cell in this notebook reads HUGGINGFACEHUB_API_TOKEN. Accept either name so
# both cells work with the same environment; the original name wins when set.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
AGENT_PROMPT = os.getenv("AGENT_PROMPT")

# Streamlit application setup
st.set_page_config(
    page_title="arXiv Researcher",
    page_icon="📚",
    layout="wide"
)

st.title("📚 arXiv Researcher")

# Component initialization
# NOTE(review): a function with the same name is defined in another cell of this
# notebook and calls create_agent(vectorstore, llm, prompt); here the call is
# create_agent(llm, tools, prompt). Confirm which signature the package exposes.
@st.cache_resource
def initialize_components():
    """Download the persisted Chroma data, then build the vector store and agent.

    Uses the module-level configuration (``GCS_PERSIST_PATH``,
    ``LOCAL_PERSIST_PATH``, ``BUCKET_NAME``, ``EMBEDDING_MODEL``, ``LLM_MODEL``,
    ``HUGGINGFACEHUB_API_TOKEN``, ``AGENT_PROMPT``).

    Returns:
        tuple: ``(vectorstore, agent_executor)``.
    """
    # Copy the Chroma database files into the local persist directory
    download_directory_from_gcs(GCS_PERSIST_PATH, LOCAL_PERSIST_PATH, BUCKET_NAME)

    # Vector store opened on the local persist directory
    vectorstore = Chroma(
        persist_directory=LOCAL_PERSIST_PATH,
        embedding_function=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL),
    )

    # Hosted LLM endpoint
    endpoint_options = dict(
        repo_id=LLM_MODEL,
        temperature=0.5,
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        task="text-generation",
    )
    llm = HuggingFaceEndpoint(**endpoint_options)

    # Agent: LLM + article-search tool + prompt pulled from the hub
    agent_executor = create_agent(llm, [search_articles], hub.pull(AGENT_PROMPT))
    return vectorstore, agent_executor

# Build the shared components (the factory is wrapped in st.cache_resource)
vectorstore, agent_executor = initialize_components()

2025-04-03 09:34:04.637 
  command:

    streamlit run /home/barilanne076/.pyenv/versions/arxiv_env/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/data_level0.bin to ./local_chromadb/4d8e52f2-6027-41ef-b06b-77f96cb894fa/data_level0.bin
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/header.bin to ./local_chromadb/4d8e52f2-6027-41ef-b06b-77f96cb894fa/header.bin
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/index_metadata.pickle to ./local_chromadb/4d8e52f2-6027-41ef-b06b-77f96cb894fa/index_metadata.pickle
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/length.bin to ./local_chromadb/4d8e52f2-6027-41ef-b06b-77f96cb894fa/length.bin
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/link_lists.bin to ./local_chromadb/4d8e52f2-6027-41ef-b06b-77f96cb894fa/link_lists.bin
Downloaded chroma_db/chroma.sqlite3 to ./local_chromadb/chroma.sqlite3


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import streamlit as st

# Text box for the search query (label reads "Enter your search query:").
# Run outside `streamlit run`, this cell logs a session-state warning.
search_query = st.text_input("Entrez votre requête de recherche:")

2025-04-03 09:35:29.699 Session state does not function when running a script without `streamlit run`


In [5]:
# Sanity check: show the vector store object built by initialize_components()
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x768c21f1cf40>

In [None]:
def search_articles(vectorstore, title, nb_articles=3, year=None):
    """Look for the most relevant articles about a theme, optionally in a given year.

    Example input: "give me some articles on AI published in 2005";
    output: the matching documents (title, authors, abstract, etc.).

    Args:
        vectorstore: Vector store exposing
            ``as_retriever(search_type=..., search_kwargs=...)``.
        title: Free-text query (theme or title) to search for.
        nb_articles: Maximum number of documents to return.
        year: Optional publication year. When given, only documents whose
            ``year`` metadata equals ``str(year)`` are returned.

    Returns:
        The list of documents returned by the retriever.
    """
    search_kwargs = {"k": nb_articles}
    if year is not None:
        # The documents displayed in this notebook store the year as a string
        # (e.g. '2007'), so the filter value is normalised to a string.
        search_kwargs["filter"] = {"year": str(year)}

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
    # `invoke` replaces the deprecated `get_relevant_documents`.
    return retriever.invoke(title)

In [8]:
from arxivsearcher.retrieval import search_articles


# NOTE(review): this call raised
# "AttributeError: 'str' object has no attribute 'parent_run_id'" when run.
# Presumably the packaged `search_articles` is a LangChain tool object rather
# than a plain function, so it cannot be called positionally — confirm in
# arxivsearcher/retrieval.py.
search_articles(vectorstore, "coucou")

AttributeError: 'str' object has no attribute 'parent_run_id'

In [4]:
!pip install -e .

Obtaining file:///home/barilanne076/code/arXiv_researcher
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: arxivsearcher
  Building editable for arxivsearcher (pyproject.toml) ... [?25ldone
[?25h  Created wheel for arxivsearcher: filename=arxivsearcher-0.1.0-py3-none-any.whl size=1281 sha256=de7d234ff54285b0bbd9696177dfd4da693240c869b86f48e099d3fd60302a3d
  Stored in directory: /tmp/pip-ephem-wheel-cache-4py53dtl/wheels/da/53/7e/2ae15bdbf4bee5d7fd5908688e288cf22f8c5f32e3baa40acb
Successfully built arxivsearcher
Installing collected packages: arxivsearcher
Successfully installed arxivsearcher-0.1.0


In [1]:
from arxivsearcher import retrieval