Configuration du LLM


In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
import os

# Pull the OpenAI key from the environment so it never appears in the notebook;
# the lookup yields None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Build the GPT-4o client, then register it on the global LlamaIndex Settings
# object so it is also available as Settings.llm.
llm = OpenAI(api_key=OPENAI_API_KEY, model="gpt-4o")
Settings.llm = llm

Mise en place du pipeline : modèle d'embedding, index store, vector store, et découpage du texte en tokens avec le `TokenTextSplitter`


In [2]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.storage.docstore.simple_docstore import (
    SimpleDocumentStore,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Chunker: split on single spaces with chunk_size=256 and a 10-token overlap.
text_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=10, separator=" ")

# HuggingFace embedding model used when building the indexes.
embed_model = HuggingFaceEmbedding(model_name="baai/bge-small-en-v1.5")

# Explicit document store; the same instance is later handed to the
# DocumentContextExtractor.
docstore = SimpleDocumentStore()

# Two storage contexts: one backed by the explicit docstore above (for the
# context-enriched index) and one built purely from defaults (for the baseline).
storage_context = StorageContext.from_defaults(docstore=docstore)
storage_context_no_extra_context = StorageContext.from_defaults()


  from .autonotebook import tqdm as notebook_tqdm


Document Context Extractor


In [6]:
# This is the new part!

from llama_index.core.extractors import DocumentContextExtractor

# Configure the extractor that will run as the second transformation of the
# context-enriched index.
context_extractor = DocumentContextExtractor(
    # Required arguments.
    max_context_length=128000,
    docstore=docstore,
    # Optional arguments.
    prompt=DocumentContextExtractor.SUCCINCT_CONTEXT_PROMPT,
    key="context",
    max_output_tokens=100,
    oversized_document_strategy="warn",
    llm=llm,  # when omitted, Settings.llm is used
)

Load the data


In [7]:
# Prerequisite: reading .docx files needs the docx2txt package (pip install docx2txt)

from llama_index.core import SimpleDirectoryReader

# Point the reader at the single Word report, then load it into `documents`.
reader = SimpleDirectoryReader(input_files=["Rapport Projet Ouvert.docx"])
documents = reader.load_data()

In [None]:
import nest_asyncio

# Patch asyncio so nested event loops are allowed inside the notebook kernel.
nest_asyncio.apply()

# The DocumentContextExtractor only works if the documents are added to the
# docstore directly, so register them before building the index.
storage_context.docstore.add_documents(documents)

# Context-enriched index: token splitting followed by context extraction.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter, context_extractor],
    embed_model=embed_model,
    storage_context=storage_context,
)

# Baseline index for comparison: same documents, splitter and embedding model,
# but no context extraction and a default storage context.
index_nocontext = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter],
    embed_model=embed_model,
    storage_context=storage_context_no_extra_context,
)

