# Vector Store creation and topic retrieval


In [1]:
import pandas as pd

In [None]:
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [3]:
%load_ext autoreload
%autoreload 2
import os
from pprint import pprint
from IPython.display import Markdown
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
import pprint
from langchain_core.prompts import ChatPromptTemplate

In [4]:
import re
from langchain_chroma import Chroma
from langchain.chat_models import init_chat_model
from langchain_classic import hub
import json
from langchain_core.documents import Document

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader


In [7]:
from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file

True

In [2]:
# Load the speeches CSV into a DataFrame; the descriptive cells below read from `df`.
df = pd.read_csv("data/speeches-wahlperiode-21.csv")

In [None]:
# Preview the first rows to check that the columns were read as expected.
df.head()

In [None]:
# Per party: number of non-null values in every other column.
df.groupby("party").count()

## Descriptives

In [None]:
# Parse the dates before taking min/max: the column holds DD.MM.YYYY strings, and
# comparing those as text orders them by day-of-month rather than chronologically.
# A value that does not match the format raises instead of being silently mis-sorted.
session_dates = pd.to_datetime(df["date"], format="%d.%m.%Y")
# Double quotes inside the replacement fields keep this valid on Python < 3.12 too.
(
    f"{len(df)} speeches from {session_dates.min():%d.%m.%Y} "
    f"to {session_dates.max():%d.%m.%Y} on {session_dates.nunique()} days"
)

In [None]:
# Speeches divided by the number of distinct sitting dates.
# The column key uses double quotes so the f-string also parses on Python < 3.12
# (reusing the enclosing quote inside a replacement field requires PEP 701).
f'Average amount of {len(df)/len(set(df["date"]))} speeches in a session'

## Check if that makes sense

In [None]:
# Mean speech length in characters, used as a plausibility check on speaking time.
mean_speech_chars = df["text"].str.len().mean()
# NOTE(review): "3,721" is hard-coded in the quoted sentence — presumably the mean
# at the time of writing; confirm it still matches the value computed above.
# Implicit string concatenation replaces the backslash continuation, and the
# f-string no longer nests the enclosing quote (a SyntaxError before Python 3.12).
(
    f"Average length of speech {mean_speech_chars} characters. "
    "ChatGPT says: Speaking 3,721 letters will take about 5–7 minutes, depending on your speaking speed."
)


In [None]:
# NOTE(review): 89 and 6 are hard-coded — presumably the rounded speeches-per-session
# and minutes-per-speech figures from the cells above; confirm. Dividing by 60
# converts the product to hours.
f'Sessions takes about {(89*6)/60} hours'

In [8]:
# Embedding function handed to the Chroma vector store created below.
# Presumably authenticates via an API key loaded by load_dotenv() — confirm.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [9]:
# Chat model later passed as `llm` to answer().
model = init_chat_model("google_genai:gemini-2.5-flash-lite")

In [10]:
# Two-message RAG prompt: a system instruction plus a human turn that carries
# the retrieved context and the user's question.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. "
            "Use the following context to answer the question. "
            "Use maximum 7 sentences. Use specific terms. Highlight important ones.",
        ),
        ("human", "Context: {context}  Question: {question}"),
    ]
)

# Render the template once with placeholder values to inspect the resulting messages.
example_messages = prompt_template.invoke(
    dict(context="(context goes here)", question="(question goes here)")
).to_messages()

In [11]:
# Chroma collection that holds the embedded chunks of both the speeches and the PDFs.
# NOTE(review): no persist_directory is passed, so the collection presumably lives
# only in memory and must be re-populated after a kernel restart — confirm.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
)


In [12]:
# Load the same speeches CSV as documents (the indexing and [-1] below treat `data` as a sequence).
# `date`, `id` and `party` are routed into each document's metadata, which is what
# the date lookup and the party filter further down read from.
loader = CSVLoader(file_path="data/speeches-wahlperiode-21.csv",
                   metadata_columns=['date','id','party'])
data = loader.load()

In [13]:
# data[-1] and data[0] are the last and first loaded rows, so the range reads
# correctly only if the CSV is ordered newest-first — TODO confirm.
# Metadata keys use double quotes so the f-string also parses on Python < 3.12.
f'{len(data)} speeches from {data[-1].metadata["date"]} to {data[0].metadata["date"]}'

'3938 speeches from 25.03.2025 to 27.11.2025'

Write a function to populate the vector store with your own documents.

In [14]:
# Shared text splitter used by both embed_and_store() and embed_and_store_pdf().
from langchain_text_splitters import NLTKTextSplitter

# chunk_size is not a hard cap: the embedding run further down logged chunks of
# 639 and 547 characters against the 500 requested here.
# NOTE(review): no `language` argument is passed; if the splitter defaults to English
# sentence rules, the (presumably German) source texts may be segmented poorly — confirm.
text_splitter = NLTKTextSplitter(
    chunk_size=500,
    chunk_overlap=200
)


In [15]:
def embed_and_store(doc, vector_store, batch_size=200):
    """Split documents into chunks and add them to a vector store in batches.

    Chunking is delegated to the notebook-level ``text_splitter``.

    Args:
        doc: Documents accepted by ``text_splitter.split_documents`` (for
            example the list returned by a document loader's ``load()``).
        vector_store: Object exposing ``add_documents(documents=...)``.
        batch_size: Maximum number of chunks sent per ``add_documents`` call.
            Must be positive.

    Returns:
        str: ``"<n> chunks embedded"``, where ``n`` is the number of chunks
        produced by the splitter.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step makes range() empty, which would skip every chunk while
    # still reporting success — reject it (and zero) up front.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    all_splits = text_splitter.split_documents(doc)

    # Send the chunks in slices so a single request never carries more than
    # batch_size documents.
    for start in range(0, len(all_splits), batch_size):
        vector_store.add_documents(documents=all_splits[start:start + batch_size])

    return f"{len(all_splits)} chunks embedded"

In [16]:
def embed_and_store_pdf(file_path, vector_store, batch_size=200):
    """Load a PDF, split it into chunks, tag each chunk with its party, and store them.

    The party name is read from the path: the text between ``data/`` and the
    next underscore (``data/CDU_25Wahlprogramm.pdf`` -> ``CDU``). It is written
    to every chunk's metadata under the key ``"party_name"``.

    Args:
        file_path: Path to the PDF; must contain ``data/<party>_...``.
        vector_store: Object exposing ``add_documents(documents=...)``.
        batch_size: Maximum number of chunks sent per ``add_documents`` call.
            Must be positive.

    Returns:
        str: ``"<file_path> embedded"``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if no party name can
            be extracted from ``file_path``.
    """
    # A negative step makes range() empty, which would skip every chunk while
    # still reporting success.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Resolve the party name before touching the PDF, so a badly named file
    # fails fast with a clear message instead of an AttributeError on None
    # after the document has already been loaded and split.
    party_match = re.search(r"(?<=data/)[^_]+(?=_)", file_path)
    if party_match is None:
        raise ValueError(
            f"Cannot derive party name from {file_path!r}; "
            "expected a path like 'data/<party>_<name>.pdf'"
        )
    party_name = party_match.group()

    # Load the PDF and split it with the notebook-level text splitter.
    pdf = PyPDFLoader(file_path, mode="single").load()
    all_splits = text_splitter.split_documents(pdf)

    for split in all_splits:
        split.metadata["party_name"] = party_name

    # Send the chunks in slices of at most batch_size documents.
    for start in range(0, len(all_splits), batch_size):
        vector_store.add_documents(documents=all_splits[start:start + batch_size])

    return f"{file_path} embedded"

In [28]:
#embed_and_store_pdf('data/B90G_25Wahlprogramm.pdf', vector_store)
#embed_and_store(data, vector_store)
# embed_and_store_pdf('data/AfD_25Wahlprogramm.pdf', vector_store)
# embed_and_store_pdf('data/BSW_25Wahlprogramm.pdf', vector_store)
# embed_and_store_pdf('data/FDP_25Wahlprogramm.pdf', vector_store)
# embed_and_store_pdf('data/SPD_25Wahlprogramm.pdf', vector_store)
# embed_and_store_pdf('data/DieLinke_25Wahlprogramm.pdf', vector_store)
#embed_and_store_pdf('data/CDU_25Wahlprogramm.pdf', vector_store)

Created a chunk of size 639, which is longer than the specified 500
Created a chunk of size 547, which is longer than the specified 500


'data/CDU_25Wahlprogramm.pdf embedded'

In [21]:
def answer(query, vector_store, llm, file, prompt_template=None, k=50):
    """Answer a query from chunks retrieved out of a single source file.

    Args:
        query: Question text; used both for the similarity search and as the
            ``question`` value of the prompt.
        vector_store: Object exposing ``similarity_search(query, k=..., filter=...)``.
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content``
            attribute.
        file: Value matched against the ``source`` metadata field, restricting
            retrieval to chunks from that file.
        prompt_template: Object exposing ``invoke({"context", "question"})``.
            When ``None``, ``rlm/rag-prompt`` is pulled from the hub.
        k: Number of chunks to retrieve. Defaults to 50.

    Returns:
        The ``content`` attribute of the model's response.
    """
    # Retrieve the most similar chunks, limited to the requested source file.
    retrieved_docs = vector_store.similarity_search(
        query, k=k, filter={"source": file}
    )

    # Concatenate the chunk texts into one context block.
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # Compare against None explicitly: a caller-supplied template that happens
    # to be falsy (e.g. one defining __len__) must not be silently replaced.
    if prompt_template is None:
        prompt_template = hub.pull("rlm/rag-prompt")

    prompt = prompt_template.invoke({"context": docs_content, "question": query})

    # Named `response` so the local does not shadow this function's own name.
    response = llm.invoke(prompt)
    return response.content

👉 Try out your function with a query of your liking:

In [22]:
# Topic query reused by the answer cell and the export cells below.
query = 'What does the party say about climate?'

In [38]:
# Answer the query from chunks whose source is the CDU manifesto PDF and render the reply as Markdown.
Markdown(answer(query, vector_store, model,'data/CDU_25Wahlprogramm.pdf', prompt_template=prompt_template))

The party emphasizes that **climate protection is achievable through market economy principles**, utilizing technology-open approaches and incentives. They consider **emissions trading a central instrument** and align their actions with the **Paris Climate Goals**. The party advocates for a **secure, clean, and affordable energy supply**, aiming to significantly expand renewable energies while acknowledging the current necessity of nuclear power. They view climate protection technologies as an opportunity and strive to be a global role model in this area. Furthermore, they believe in integrating ecology, economy, and social considerations for intergenerational justice.

Export the chunks most similar to the topic query, grouped by source (party manifesto vs. parliamentary speeches)

In [32]:
def _as_records(docs):
    """Return one plain ``{"page_content", "metadata"}`` dict per document."""
    return [{"page_content": d.page_content, "metadata": d.metadata} for d in docs]


# Ten nearest chunks whose source is the CDU manifesto PDF.
serializable_m = _as_records(
    vector_store.similarity_search(
        query, k=10, filter={'source': 'data/CDU_25Wahlprogramm.pdf'}
    )
)

# Ten nearest chunks whose `party` metadata is CDU/CSU.
serializable_s = _as_records(
    vector_store.similarity_search(query, k=10, filter={'party': 'CDU/CSU'})
)

In [36]:
# Write the manifesto hits as JSON. The explicit UTF-8 encoding makes the file
# independent of the platform default, and ensure_ascii=False keeps non-ASCII
# characters (e.g. umlauts) readable instead of \uXXXX escapes.
with open("m_climate_query_CDUCSU.json", "w", encoding="utf-8") as f:
    json.dump(serializable_m, f, indent=4, ensure_ascii=False)

In [37]:
# Write the CDU/CSU-filtered hits as JSON. The explicit UTF-8 encoding makes the
# file independent of the platform default, and ensure_ascii=False keeps non-ASCII
# characters (e.g. umlauts) readable instead of \uXXXX escapes.
with open("s_climate_query_CDUCSU.json", "w", encoding="utf-8") as f:
    json.dump(serializable_s, f, indent=4, ensure_ascii=False)