In [18]:
import asyncio
import gc
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

import pickle

import nest_asyncio

nest_asyncio.apply()

In [45]:
EMBEDDING_MODEL = 'nomic-embed-text'
LLM_MODEL = 'phi4'
URL = 'https://python.langchain.com/api_reference/'

In [3]:
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
def is_internal_link(base_url, link):
    """Checks if a given URL is an internal link."""
    base_domain = urlparse(base_url).netloc
    link_domain = urlparse(link).netloc
    return base_domain == link_domain

def scrape_links(url, max_depth):
    """Scrapes all links up to the specified depth from the base URL."""
    
    visited_urls = set()
    urls_to_visit = [(url, 0)]  # Tuple of (URL, current depth)
    all_internal_links = []

    while urls_to_visit:
        current_url, current_depth = urls_to_visit.pop(0)

        if current_url in visited_urls or current_depth > max_depth:
            continue

        print(f"Visiting: {current_url}, Depth: {current_depth}")
        
        try:
            response = requests.get(current_url)
            response.raise_for_status()  # Raise an error for bad responses
            soup = BeautifulSoup(response.text, 'html.parser')
            
            visited_urls.add(current_url)

            for link in soup.find_all('a', href=True):
                href = link['href']

                if not urlparse(href).netloc:
                    full_url = urljoin(current_url, href)
                else:
                    full_url = href

                if is_internal_link(url, full_url) and full_url not in visited_urls:
                    all_internal_links.append(full_url)

                    # Add the new URL to visit with an incremented depth
                    urls_to_visit.append((full_url, current_depth + 1))
        
        except requests.RequestException as e:
            print(f"Error accessing {current_url}: {e}")
            
    cleaned_links = {urlparse(url)._replace(fragment="").geturl() for url in all_internal_links}

    return list(set(cleaned_links))  # Return unique links


In [9]:


links_up_to_depth_2 = scrape_links(URL, max_depth=2)

Visiting: https://python.langchain.com/api_reference/, Depth: 0
Visiting: https://python.langchain.com/api_reference/#main-content, Depth: 1
Visiting: https://python.langchain.com/api_reference/index.html, Depth: 1
Visiting: https://python.langchain.com/, Depth: 1
Visiting: https://python.langchain.com/api_reference/core/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/langchain/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/text_splitters/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/community/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/experimental/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/ai21/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/anthropic/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/astradb/index.html, Depth: 1
Visiting: https://python.langchain.com/api_reference/aws/index.html

In [13]:
# links_up_to_depth_2 = list({urlparse(url)._replace(fragment="").geturl() for url in links_up_to_depth_2})

In [14]:
len(links_up_to_depth_2)

8105

In [15]:


with open('urls.pkl', 'wb') as f:
    pickle.dump(links_up_to_depth_2, f)

In [7]:
# with open('urls.pkl', 'rb') as f:
#     links_up_to_depth_2 = pickle.load(f)

# len(links_up_to_depth_2)

EOFError: Ran out of input

In [16]:
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [33]:
loader = WebBaseLoader(
    # web_path = "https://www.espn.com/"
    # header_template = None,
    # verify_ssl = True,
    # proxies = None,
    # continue_on_failure = False,
    # autoset_encoding = True,
    # encoding = None,
    web_paths = links_up_to_depth_2,
    requests_per_second = 10,
    # default_parser = "html.parser",
    # requests_kwargs = None,
    # raise_for_status = False,
    # bs_get_text_kwargs = None,
    # bs_kwargs = None,
    # session = None,
    # show_progress = True,
    # trust_env = False,
)

In [19]:

async def process_urls_in_batches(urls, batch_size=10000, requests_per_second=10):

    docs = []
    # Diviser les URLs en sous-listes plus petites
    for i in range(0, len(urls), batch_size):
        url_batch = urls[i:i + batch_size]
        # Créer un nouveau loader pour chaque batch d'URLs
        loader = WebBaseLoader(url_batch,
                               requests_per_second = requests_per_second)
        
        # Traiter les documents dans ce batch
        async for doc in loader.alazy_load():
            docs.append(doc)

        # # Libérer de la mémoire (optionnel)
        await asyncio.sleep(1)  # Pause pour éviter une surcharge
        gc.collect()  # Forcer la collecte de la mémoire si nécessaire
    
    return docs


In [20]:


docs = await process_urls_in_batches(links_up_to_depth_2)

Fetching pages: 100%|##########| 8105/8105 [02:01<00:00, 66.85it/s]


In [None]:
import pickle

with open('documents.pkl', 'wb') as f:
    pickle.dump(docs, f)

In [None]:
docs = []
async for doc in loader.alazy_load():
    docs.append(doc)

Fetching pages:  47%|####7     | 36152/76269 [40:03<46:43, 14.31it/s]  Error fetching https://python.langchain.com/api_reference/community/llms/langchain_community.llms.konko.Konko.html#langchain_community.llms.konko.Konko.callback_manager and aborting, use continue_on_failure=True to continue loading urls after encountering an error.
Traceback (most recent call last):
  File "d:\perso\cb-webscrap\.venv\Lib\site-packages\langchain_community\document_loaders\web_base.py", line 268, in _fetch_with_rate_limit
    return await self._fetch(url)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "d:\perso\cb-webscrap\.venv\Lib\site-packages\langchain_community\document_loaders\web_base.py", line 251, in _fetch
    return await response.text()
           ^^^^^^^^^^^^^^^^^^^^^
  File "d:\perso\cb-webscrap\.venv\Lib\site-packages\aiohttp\client_reqrep.py", line 1265, in text
    return self._body.decode(encoding, errors=errors)  # type: ignore[union-attr]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


def split_documents(documents, chunk_size=500, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)

splited_docs = split_documents(docs)
len(splited_docs)

291421

In [39]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore



In [34]:
from langchain_ollama import OllamaEmbeddings



In [35]:
embeddings = OllamaEmbeddings(model="nomic-embed-text")

vector = embeddings.embed_query("test embedding")
print(vector[:5])  # Affiche les 5 premières valeurs de l'embedding

[0.009270579, 0.060888052, -0.16196615, -0.06821778, 0.051177092]


In [28]:
vs = await FAISS.afrom_documents(splited_docs, embeddings)
vs.save_local("faiss_index")

MemoryError: 

In [40]:
# # Définir un batch_size raisonnable (ajuste selon la RAM dispo)
# batch_size = 10000

# # Découper les documents en batchs
# batches = [splited_docs[i : i + batch_size] for i in range(0, len(splited_docs), batch_size)]

# # Créer un index FAISS vide
# faiss_index = None

async def process_batches(docs, batch_size=10000):

    batches = [docs[i : i + batch_size] for i in range(0, len(docs), batch_size)]

    faiss_index = None

    for i, batch in enumerate(batches):
        print(f"Traitement du batch {i+1}/{len(batches)}...")
        
        # Convertir en embeddings
        vs = await FAISS.afrom_documents(batch, embeddings)

        # Fusionner avec l'index existant
        if faiss_index is None:
            faiss_index = vs
        else:
            faiss_index.merge_from(vs)

        # Sauvegarde intermédiaire toutes les 10 batchs
        if (i + 1) % 10 == 0:
            print(f"📌 Sauvegarde intermédiaire de l'index FAISS...")
            faiss_index.save_local("faiss_index_temp")

    print("✅ Traitement terminé, sauvegarde finale...")
    faiss_index.save_local("faiss_index_final")

# Exécuter l'ingestion par batch
await process_batches(splited_docs)

Traitement du batch 1/30...
Traitement du batch 2/30...
Traitement du batch 3/30...
Traitement du batch 4/30...
Traitement du batch 5/30...
Traitement du batch 6/30...
Traitement du batch 7/30...
Traitement du batch 8/30...
Traitement du batch 9/30...
Traitement du batch 10/30...
📌 Sauvegarde intermédiaire de l'index FAISS...
Traitement du batch 11/30...
Traitement du batch 12/30...
Traitement du batch 13/30...
Traitement du batch 14/30...
Traitement du batch 15/30...
Traitement du batch 16/30...
Traitement du batch 17/30...
Traitement du batch 18/30...
Traitement du batch 19/30...
Traitement du batch 20/30...
📌 Sauvegarde intermédiaire de l'index FAISS...
Traitement du batch 21/30...
Traitement du batch 22/30...
Traitement du batch 23/30...
Traitement du batch 24/30...
Traitement du batch 25/30...
Traitement du batch 26/30...
Traitement du batch 27/30...
Traitement du batch 28/30...
Traitement du batch 29/30...
Traitement du batch 30/30...
📌 Sauvegarde intermédiaire de l'index FAISS..

In [57]:
import gzip
with gzip.open('vectorstore.pkl.gz', 'wb') as f:
    pickle.dump(vs, f)

TypeError: cannot pickle '_thread.RLock' object

In [41]:
vs = FAISS.load_local(
    "faiss_index_final", embeddings, allow_dangerous_deserialization=True
)

In [42]:
query = "quel est le loader des pages web ?"
results = vs.similarity_search(query, k=5)  # 'k' est le nombre de résultats que tu souhaites

# Afficher les résultats
for result in results:
    print(result)

page_content='Webpages​
The below document loaders allow you to load webpages.
See this guide for a starting point: How to: load web pages.' metadata={'source': 'https://python.langchain.com/docs/integrations/document_loaders/', 'title': 'Document loaders | 🦜️🔗 LangChain', 'description': 'DocumentLoaders load data into the standard LangChain Document format.', 'language': 'en'}
page_content='How to load web pages | 🦜️🔗 LangChain' metadata={'source': 'https://python.langchain.com/docs/how_to/document_loader_web/', 'title': 'How to load web pages | 🦜️🔗 LangChain', 'description': 'This guide covers how to load web pages into the LangChain Document format that we use downstream. Web pages contain text, images, and other multimedia elements, and are typically represented with HTML. They may include links to other pages or resources.', 'language': 'en'}
page_content='Load AZLyrics webpages.
Initialize loader.' metadata={'source': 'https://python.langchain.com/api_reference/community/document

In [43]:

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama


In [46]:

llm = ChatOllama(
    model = LLM_MODEL,
    # temperature = 0.8,
    # num_predict = 256,
    # other params ...
)

retriever = vs.as_retriever()

system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)


In [48]:
query = "comment faire un chatbot avec langchain et langgraph en utilisant ollama en local ainsi qu'un rag de pdf ?"

In [50]:

response = chain.invoke({"input": query})

In [54]:
response

{'input': "comment faire un chatbot avec langchain et langgraph en utilisant ollama en local ainsi qu'un rag de pdf ?",
 'context': [Document(id='8455f347-a9a1-4d34-9d17-9325d752792c', metadata={'source': 'https://python.langchain.com/v0.1/docs/additional_resources/youtube/', 'title': 'YouTube videos | 🦜️🔗 LangChain', 'description': '⛓ icon marks a new addition [last update 2023-09-21]', 'language': 'en'}, page_content='EbbelaarChat with Multiple PDFs | LangChain App Tutorial in Python (Free LLMs and Embeddings) by Alejandro AO - Software & AiChat with a CSV | LangChain Agents Tutorial (Beginners) by Alejandro AO - Software & AiCreate Your Own ChatGPT with PDF Data in 5 Minutes (LangChain Tutorial) by Liam OttleyBuild a Custom Chatbot with OpenAI: GPT-Index & LangChain | Step-by-Step Tutorial by FabrikodFlowise is an open-source no-code UI visual tool to build 🦜🔗LangChain applications by Cobus'),
  Document(id='398e8f1f-76d2-4ff1-83cc-4361a63c4850', metadata={'source': 'https://python.

In [55]:
print(response["answer"])

Pour créer un chatbot en utilisant LangChain, LlGraph, OLLAMA localement, et intégrer un RAG (Retrieval-Augmented Generation) à partir d'un PDF, voici une approche générale que vous pouvez suivre. Ce processus suppose que vous avez des connaissances de base en Python et que vous êtes familiarisé avec les concepts de chatbots et de traitement du langage naturel.

### Étapes nécessaires :

1. **Configuration locale :**
   - Assurez-vous d'avoir un environnement local configuré pour exécuter Python.
   - Installez OLLAMA, une distribution LLM légèrement modifiée conçue pour fonctionner localement.

2. **Installation des bibliothèques nécessaires :**
   - Installez LangChain et d'autres dépendances en utilisant pip :
     ```bash
     pip install langchain ollama
     ```

3. **Préparation du PDF :**
   - Utilisez une bibliothèque comme PyPDF2 ou pdfplumber pour extraire le texte à partir de votre fichier PDF.
   - Stockez le contenu extrait dans un format facilement accessible, tel qu'un 