# Introduction à Langchain 



### Dépendances et clé d'API

In [21]:
#Décommenter sur Google Colab
#%pip install langchain langchain-openai langchain_mistralai openai python-dotenv -q
#from google.colab import userdata
#api_key=userdata.get('OPENAI_API_KEY')


#Décommenter en local
from dotenv import load_dotenv
from os import getenv
load_dotenv()
api_key= getenv("OPENAI_API_KEY")

questions = [
    "Quelle est la définition d’une donnée personnelle selon le RGPD ?"
    "Quelles sont les six bases légales du traitement des données personnelles ?"
    "Quels sont les droits des personnes concernées par le RGPD ?"

]
doc_path = "./media/GDPRPocketGuide.pdf"

## Indexation

### Load and split

In [22]:
from langchain_community.document_loaders import PyPDFLoader



# LOAD
loader = PyPDFLoader(doc_path)
original_document = loader.load()
print(original_document[0].page_content[:250])
print(original_document[0].metadata)


from langchain.text_splitter import RecursiveCharacterTextSplitter

# SPLIT
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_chunks = text_splitter.split_documents(original_document)

doc_chunks

General Data Protection  
Regulation Guide
{'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.1 (Macintosh)', 'creationdate': '2018-04-17T12:25:30-04:00', 'author': 'Jones Day', 'moddate': '2021-08-28T09:14:19-04:00', 'title': 'GDPR Pocket Guide A5 ENGLISH.indd', 'trapped': '/False', 'source': './media/GDPRPocketGuide.pdf', 'total_pages': 24, 'page': 0, 'page_label': 'i'}


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.1 (Macintosh)', 'creationdate': '2018-04-17T12:25:30-04:00', 'author': 'Jones Day', 'moddate': '2021-08-28T09:14:19-04:00', 'title': 'GDPR Pocket Guide A5 ENGLISH.indd', 'trapped': '/False', 'source': './media/GDPRPocketGuide.pdf', 'total_pages': 24, 'page': 0, 'page_label': 'i'}, page_content='General Data Protection  \nRegulation Guide'),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.1 (Macintosh)', 'creationdate': '2018-04-17T12:25:30-04:00', 'author': 'Jones Day', 'moddate': '2021-08-28T09:14:19-04:00', 'title': 'GDPR Pocket Guide A5 ENGLISH.indd', 'trapped': '/False', 'source': './media/GDPRPocketGuide.pdf', 'total_pages': 24, 'page': 1, 'page_label': 'ii'}, page_content='Disclaimer: Jones Day publications should not be construed as legal advice on any specific facts or \ncircumstances. The contents are intended for general information purposes on

### Embed and store

#### Vector store en mémoire

`Qdrant` propose une version *in memory* du vecteur store. Pratique pour une rapide démo ou debug. Cette implémentation permet également de persister/charger les vecteurs sur le disk (https://python.langchain.com/docs/integrations/vectorstores/qdrant/#on-disk-storage)

In [23]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_openai import OpenAIEmbeddings

# Initialize OpenAI Embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
mem_client = QdrantClient(":memory:")

# Create a collection in the in-memory Qdrant client
# This is a temporary collection that will not persist after the program ends, unless you save it to disk
mem_client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
)

# Create a QdrantVectorStore instance with the in-memory client and add documents
mem_vector_store = QdrantVectorStore(
    client=mem_client,
    collection_name="demo_collection",
    embedding=embeddings,
)

# Add the document chunks to the in-memory vector store
mem_vector_store.add_documents(documents=doc_chunks)

['70f95f87edf642c494e93eaec1496e84',
 '119a2cc52a4e4718b575bb9b57f01e68',
 'af9ea39a74d44ea3951cceb5e1be08f2',
 'bae7d8a90f4b42529a7c1504f3af85f7',
 '2313980d7da7455bacadd7ffc1ab4bee',
 'd4dfc8f9afef4c6fb5cb0ca896bd4950',
 'bee1cc6841a84879aa14064717beb7de',
 '6340dcc59adc495789df003c02d982b5',
 '1c841f31b25c48f1ab11a90cfc1b1e86',
 '8efa3a29138e43ef87c4bdd69335e0a1',
 'b2b5e9df2ab34a54bc95a898e2d702a6',
 'd46cab8ab22f4997b079687b171a78c6',
 '41d795debde7475883c481cf5f57cb5d',
 '00a87ef069e34be2b22f582d2163bc4e',
 'c5d298973e874a3b85f0278e761dfbea',
 '74cb8deca8a148fbaae604332021dae7',
 'effc84c79f204ea89c10afe5d474cd34',
 '3f3d0aca07ff42c1bea8894b62050cff',
 '71e27e0c289443a6aa993b0611861b26',
 '2ce26183ea504dab9b88582f59c8c7da',
 '6e30eff8bbe04185a8716dfb9b4a8e0c',
 '9b891f6a84324204891837c048421f6d',
 'e6954baa34a24f799783616112cf8447',
 'd63bc4868e2447c89b8f0e7873126d9d',
 '5085294b08054950b13b8ae31217fd25',
 'af918ca539414402a7b9537b56684f8d',
 'e40ab02c8c4948239227cc988bf6778e',
 

#### Vector store avec serveur

Dans la plupart des contexte évidemment il est hautement recommandé de passer par un serveur déployé.

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings

# Initialize OpenAI Embedding model
embeddings=OpenAIEmbeddings(model="text-embedding-3-large")

# Create the collection (if it does not exists) and store the chunks with their embedding vectors in Qdrant
url = "http://localhost:6333"
vector_store = QdrantVectorStore.from_documents(
    doc_chunks,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name="demo_collection"
)

In [None]:
print(questions[0])
results = vector_store.similarity_search(
    questions[0],
    k=5 # Nombre de documents similaires à retourner
)

mem_results = mem_vector_store.similarity_search(
    questions[0],
    k=5  # Nombre de documents similaires à retourner
)
print("#####IN MEMORY RESULTS#####")
for id, doc in enumerate(mem_results):  # Affiche le contenu du premier document similaire
    print(f"Document {id}: {doc.page_content[:250]}...\n\n")  # Affiche les 250 premiers caractères du document

print("#####REMOTE RESULTS#####")
for id, doc in enumerate(results):  # Affiche le contenu du premier document similaire
    print(f"Document {id}: {doc.page_content[:250]}...\n\n")  # Affiche les 250 premiers caractères du document

Quelle est la définition d’une donnée personnelle selon le RGPD ?Quelles sont les six bases légales du traitement des données personnelles ?Quels sont les droits des personnes concernées par le RGPD ?
#####IN MEMORY RESULTS#####
Document 0: 20
General Data 
Protection 
Regulation (GDPR)
Regulation 2016/679/EU of 27 April 2016, repealing Directive 
95/46/EC, on the protection of natural persons with regard to 
the processing of personal data and on the free movement of 
such data.
Person...


Document 1: 3
continued on page 4
LEGAL BASES FOR DATA PROCESSING
ARTICLES 6, 7 AND 8
Quick Overview
The legal bases for processing personal data under the GDPR are largely the same as 
those under the Directive. However, the GDPR sets new restrictions for conse...


Document 2: 6
RIGHTS OF INDIVIDUALS 
• Individuals must be given details concerning international disclosures; retention peri-
ods; the rights of rectification, erasure, and restriction of processing; and the rights to 
object to proce

## QA Chain

In [None]:
from langchain.chat_models import init_chat_model
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."),
        ("system", "{context}"),
        ("human", "{question}"),
    ]
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    # Runnable parallèles
    {
        "question": itemgetter("question"),
        "context": itemgetter("question") | vector_store.as_retriever() | format_docs

    }
    # Runnable séquentiels
    | prompt
    | llm
    #| StrOutputParser()
)

question0 ="Quelle est la politique de déplacement de Younup ?"

response1 = rag_chain.invoke(
    {
        "question": questions[0]
    }
)
response1