In [None]:
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Chat model that the agent further down uses to generate answers.
llm = ChatOllama(model="llama3.2", temperature=0.7)

# Embedding model that turns text into vectors for the vector store.
embeddings = OllamaEmbeddings(model="llama3.2")

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from langchain_qdrant import QdrantVectorStore

# Qdrant client opened on the special ":memory:" location.
client = QdrantClient(":memory:")

# Embed a throwaway string once to learn how long the model's vectors are;
# the collection has to be created with exactly that size.
probe_vector = embeddings.embed_query("test")
vector_size = len(probe_vector)

# Create the "documents" collection only when it is not there yet, so the
# cell can be re-run without failing.
collection_missing = not client.collection_exists("documents")
if collection_missing:
    cosine_vectors = VectorParams(size=vector_size, distance=Distance.COSINE)
    client.create_collection(
        collection_name="documents",
        vectors_config=cosine_vectors,
    )


In [None]:
from mypackage import finder, userinput

download_dir = finder.get_download_folder()

# List the PDF files found in the download directory.
file_list = finder.get_file_list(download_dir, extensions=[".pdf"])
if len(file_list) == 0:
    # Fail early with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f"No .pdf files found in {download_dir!r}")
finder.print_enumerated_list(file_list)

# Ask the user which file to process. The answer is treated as a 1-based
# position in file_list (1 is subtracted to obtain the list index).
selected_file_num = userinput.get_user_input("Select a file by number", default="1")
try:
    selected_index = int(selected_file_num) - 1
except (TypeError, ValueError):
    raise ValueError(
        f"Expected a file number, got {selected_file_num!r}"
    ) from None

# Reject 0 and negative numbers explicitly: Python would otherwise accept
# them as "count from the end" and silently pick the wrong file.
if not 0 <= selected_index < len(file_list):
    raise ValueError(
        f"File number must be between 1 and {len(file_list)}, "
        f"got {selected_file_num!r}"
    )
selected_file_path = file_list[selected_index]

# Read the selected PDF; the result is turned into a DataFrame in the next cell.
file_content_list = finder.read_pdf_file(selected_file_path)

In [None]:
import pandas as pd
from mypackage import splitter

# Tabulate the PDF content read above.
# NOTE(review): assumes each entry of file_content_list maps onto
# (page_number, text) - confirm against finder.read_pdf_file.
page_columns = ['page_number', 'text']
df = pd.DataFrame(data=file_content_list, columns=page_columns)

In [None]:
# Sanity check of the page-level frame: info() prints its summary, and
# head() - being the last expression - is rendered as the cell output.
df.info()
df.head()

In [None]:
# Split every page's text into chunks; each row of `page_chunks` holds the
# value returned by the splitter for one page.
page_chunks = df['text'].apply(
    lambda page_text: splitter.get_chunks_from_text(page_text, chunk_size=512)
)

# One row per (page_number, chunk). explode() turns an empty chunk list into
# NaN, so those rows are dropped: a page without text has nothing to index,
# and a NaN would break the lower-casing below.
df_chunks = (
    pd.DataFrame({'page_number': df['page_number'], 'chunks': page_chunks})
    .explode('chunks')
    .dropna(subset=['chunks'])
    .reset_index(drop=True)
)

# Normalize chunk text to lower case.
df_chunks['chunks'] = df_chunks['chunks'].str.lower()

In [None]:
# Sanity check of the chunk-level frame: info() prints its summary, and
# head() - being the last expression - is rendered as the cell output.
df_chunks.info()
df_chunks.head()

In [None]:
from typing import List, Optional
from sqlalchemy import String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    """Declarative base whose ``metadata`` collects the tables defined in this notebook."""

In [None]:
import os
from sqlalchemy import MetaData, Table, Column, Integer, String, create_engine

data_dir = os.path.join(finder.get_git_root(), "data")
db_name = 'documents.db'
db_path = os.path.join(data_dir, db_name)

# SQLite creates the database file on demand but not its parent directory,
# so make sure the directory exists before the first connection is opened.
os.makedirs(data_dir, exist_ok=True)

# "sqlite:///" followed by the path is the documented URL form. An absolute
# POSIX path supplies the fourth slash itself, and a Windows drive path
# ("C:\\...") must follow the three slashes directly; hard-coding four
# slashes in front of the path breaks the latter.
engine = create_engine(f"sqlite:///{db_path}")

# extend_existing=True lets this cell be re-run in the same kernel; without
# it SQLAlchemy refuses to define 'documents' twice on Base.metadata.
documents_table = Table(
    'documents',
    Base.metadata,
    Column('id', Integer, primary_key=True),
    Column('page_number', Integer),
    Column('text', String),
    extend_existing=True,
)
Base.metadata.create_all(engine)

# NOTE(review): if_exists='replace' drops the table created just above and
# lets pandas recreate it from df_chunks (its index plus the 'page_number'
# and 'chunks' columns), so the declared id/text schema does not survive.
# Decide which schema is intended and align the two.
df_chunks.to_sql('documents', con=engine, if_exists='replace')

In [None]:
from sqlalchemy import text

# Read back everything stored in the 'documents' table. The rows have to be
# bound to a name: an expression inside the `with` block is not the cell's
# last expression, so the notebook would display nothing for it.
with engine.connect() as conn:
    all_rows = conn.execute(text("SELECT * FROM documents")).fetchall()

# Last expression of the cell, so it is rendered; only a small sample is
# shown to keep the output short.
all_rows[:5]

In [None]:
from sqlalchemy import select
from sqlalchemy.orm import Session


# Reflect the 'documents' table from the database instead of relying on a
# Python-side declaration, so the columns match what is actually stored.
metadata = MetaData()
table = Table('documents', metadata, autoload_with=engine)

# All rows that belong to page 1.
stmt = select(table).where(table.columns.page_number == 1)

# The context manager closes the connection when the block exits; fetchall()
# materializes the rows first, so `results` stays usable afterwards.
with engine.connect() as connection:
    results = connection.execute(stmt).fetchall()

for result in results:
    print(result)

In [None]:
# Vector store bound to the "documents" collection of the Qdrant client
# created earlier, using the Ollama embeddings to vectorize text.
vector_store = QdrantVectorStore(
    embedding=embeddings, collection_name="documents", client=client
)

In [None]:
# Number of chunks to index.
limit = 5

# head() is positional and end-exclusive, so exactly `limit` rows are taken
# (a label slice with .loc[:limit] would include row `limit` as well).
# Texts and metadata come from the same subset so they line up one-to-one.
chunks_to_index = df_chunks.head(limit)
texts = chunks_to_index['chunks'].tolist()
# tolist() yields plain Python values rather than numpy scalars.
metadatas = [
    {'page_number': pn} for pn in chunks_to_index['page_number'].tolist()
]

vector_store.add_texts(texts, metadatas=metadatas)

In [None]:
from langchain.tools import tool
from langchain.agents import create_agent

# Retrieval tool handed to the agent: looks up the stored chunks that are
# most similar to the query.
@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    # NOTE: the docstring above is kept verbatim - @tool may surface it to
    # the model as the tool description.
    hits = vector_store.similarity_search(query, k=2)

    # One "Source/Content" paragraph per hit, separated by blank lines.
    rendered_hits = [
        f"Source: {hit.metadata}\nContent: {hit.page_content}" for hit in hits
    ]

    # First element is the text content, second the raw documents (artifact).
    return "\n\n".join(rendered_hits), hits

tools = [retrieve_context]

# System prompt for the agent. It names the actual source being indexed
# (the PDF selected earlier in this notebook) so the model is not told it is
# working with a blog post.
prompt = (
    "You have access to a tool that retrieves context from a PDF document. "
    "Use the tool to help answer user queries."
)
agent = create_agent(llm, tools, system_prompt=prompt)

In [None]:
# Ask the user for a question (prompt: "Ask your question!"). The default
# question means "What information does the document contain?"; its umlaut
# is written as a proper "ä" instead of the mis-decoded byte sequence.
user_query = userinput.get_user_input(
    "Stelle deine Frage!",
    default="Welche Informationen enthält das Dokument?",
)

# Stream the agent run and print the newest message of every step.
for step in agent.stream(
    {"messages": [{"role": "user", "content": user_query}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()