In [112]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


llm = Ollama(
    model = "llama3.1", callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
)
answer = llm.invoke("football winner 2022")


  llm = Ollama(
  llm = Ollama(


You're referring to the winners of the major football tournaments in 2022!

Here are some of the notable ones:

1. **FIFA World Cup 2022**: The winner was Argentina, who defeated France 4-2 in a penalty shootout after the match ended 3-3 after extra time.
2. **UEFA Champions League 2022**: The winner was Real Madrid, who defeated Liverpool 1-0 in the final.
3. **Premier League 2021-22**: The winner was Manchester City, who finished top of the table with 93 points from 38 matches.
4. **La Liga 2021-22**: The winner was Real Madrid, who won their 35th La Liga title with 90 points from 38 matches.

Let me know if you're looking for any other specific information or tournament winners!

### Text Extraction

In [31]:
from langchain_community.document_loaders import PyPDFLoader

pdf_kappa = "./pdfs/Kappe.pdf"
loader = PyPDFLoader(pdf_kappa)

# Load all pages and extract text content
pages = [page.page_content for page in loader.lazy_load()]

### Semantic chunk to split the thesis
Semantic chunking considers the relationship within the text. It divides the text into meaningful, semantically complete chunks.

Semantic chunk involves taking the embeddings of every sentence in the document, comparing the similarity of all sentences with each other and then grouping sentences with the most similar embeddings together.

- Emebedding Models:
    - bge-small-en: very light and dedicate for retrieval-augmented language tasks. It's designed to officially handle tasks like passage retrieval and semantic similarity.


- Create langchain docs object from the text

In [46]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings


embedding_model = FastEmbedEmbeddings(model_name="BAAI/bge-small-en")



In [None]:
## chunk the text

semantic_chunker = SemanticChunker(
            #using 'percentile' to split the text, which is based on computing 
        #all differences between sentences and then se if any differences is greater that X percentile
    
    embedding_model, breakpoint_threshold_type='percentile' )


## split the text
docs = semantic_chunker.create_documents(pages)


#### Vector Store

Storing the chunks in Memory for efficient retreivel 

In [70]:
from langchain_core.vectorstores import InMemoryVectorStore


# Prepare texts for storage 
texts = [chunk.page_content for chunk in docs]

# create a vector store
vector_store = InMemoryVectorStore.from_texts(
    texts,
    embedding=embedding_model
)

Vectorstore to retreive documents based. light wrapper around the vector store class to make it conform to the retriever interface

In [144]:
retriever = vector_store.as_retriever(search_type = "similarity", search_kwargs={"score_threshold":0.5,"k":30} )


user_query = "what are the research questions of the thesis"

retrieved_docs = retriever.invoke(user_query)

#### Prompt Creation

In [201]:
from langchain_core.prompts import ChatPromptTemplate


prompt_template = """
You are an assistant helping me  preparing for my thesis defense:
Use the content provide to answer my query:

content:
{retrieved_docs}

query:
{user_query}

Provide a clear, scientific  answer based on given content. 

If I asked about a summary, give a coherent, high-level overview.

Never include document ids or metadata in your response.
"""

# Structure of prompt

structured_prompt = ChatPromptTemplate.from_template(prompt_template)



In [146]:
## chain creation
from langchain_core.output_parsers import StrOutputParser
chain = structured_prompt |llm | StrOutputParser()

In [147]:
## Invoke the chain

response = chain.invoke({
    "retrieved_docs": retrieved_docs,
    "user_query": user_query
})

Based on the provided content, we can infer that the research questions for this thesis are not explicitly stated in the given snippet. However, from Document(id='93a229a6-8d42-4498-bebc-7163a0e5e10c', metadata={}, page_content='24 3. ResearchMethodology\napplyingartifacts( Hevneretal.'), we can see that it is related to the chapter "Research Methodology" which might give us a hint on what research questions are being addressed.

But, from Document(id='baf3aa59-3992-40fa-b4d3-3cf8b3d9b2a8', metadata={}, page_content='. .1\n1.2 ProblemStatement&ResearchQuestions . . . . . . . . . . . . . . . . . . .5\n1.3 ObjectiveandContributions . . . . . . . . . . . . . . . . . . . . . . . . .''), we can see that there is a section titled "Problem Statement & Research Questions" which might hold the answer to our query.

Unfortunately, we don't have enough content to provide a clear and accurate answer. However, from Document(id='5eb17a09-2d7a-444b-839b-7ae59bcb89cc', metadata={}, page_content='. . .

###  Recursive Chunking
 We divides the input text into smaller chunks in a h

In [None]:
## parse the Pdf using Unstructured.io
## from unstructured.partition.auto import partition

elements = partition("./pdfs/Kappe.pdf")

In [175]:
## convert element into strings
raw_texts = [str(el) for el in elements if str(el).strip()]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = True,
    separators=["chapter"]
)

# chunk the text
rec_chunk = text_splitter.create_documents(raw_texts)

In [192]:
from langchain_core.vectorstores import InMemoryVectorStore


# Prepare texts for storage 
Rec_texts = [chunk.page_content for chunk in rec_chunk]

# create a vector store
vector_store_rec = InMemoryVectorStore.from_texts(
    Rec_texts,
    embedding=embedding_model
)

In [202]:
retriever = vector_store.as_retriever(search_type = "similarity", 
                                      search_kwargs={"score_threshold":0.7,"k":30} )


user_query = "What are the main experiments from the thesis?"

retrieved_docs = retriever.invoke(user_query)

In [204]:
## Invoke the chain

response = chain.invoke({
    "retrieved_docs": retrieved_docs,
    "user_query": user_query
})

After analyzing the provided content, it appears that there is no direct mention of "main experiments" in the abstract or introduction sections. However, we can infer some information about the research methodology and experiments conducted.

According to Document #39024c0b-1a8f-4179-84d0-c4e889ace382 (page_content='30 3. Research Methodology'), it is mentioned that the thesis follows a conceptually grounded approach, which involves applying artifacts (Hevner et al.) as part of the research methodology.

Additionally, Document #c7f76522-f74d-41e5-8921-779250ab821e (page_content='40 4. Results and Discussions') contains some references to user experiments (e.g., [29] B. P. Knijnenburg, M. C. Willemsen, Evaluating recommender systems with user experiments). However, these are not explicitly stated as the main experiments of the thesis.

Based on this information, it is difficult to provide a clear and definitive answer about the main experiments of the thesis without more context or spec

## re-Build Using LLamaIndex

Build a data ingestion pipeline into vector database, and then build a retrieval pipeline. Using the following Stack
-  bge-small-en as embedding model
- PostgresSql as the vectorstore 
- Lama3.1 as the LLm

#### Upload the model

- the model is already uploaded using Ollama
- emebedding model with fastembedding

In [73]:
import psycopg2

db_name = "vct_db"
host = "localhost"
password = "ayoub123"
port= "5432"
user = "ayoub"


# connect to postgresdb

conn = psycopg2.connect(
    dbname =db_name,
    host = host,
    password = password,
    port = port,
    user = user
)

conn.autocommit = True

# ## remove if exist data
# with conn.cursor() as c:
#     c.execute(f"DROP DATABASE IF EXISTS {db_name}")
#     c.execute(f"CREATE DATABASE {db_name}")

In [74]:
from sqlalchemy.engine import make_url
from llama_index.vector_stores.postgres import PGVectorStore

vector_store = PGVectorStore.from_params(
    
    database=db_name,
    host = host,
    password=password,
    port=port,
    user = user,
    table_name="kappa_db",
    embed_dim=384
)

### Build Data ingestion pipeline

In [75]:
## Load the data
from langchain_community.document_loaders import PyPDFLoader

pdf_kappa = "./pdf/thesis.pdf"
loader = PyPDFLoader(pdf_kappa)

# Load all pages and extract text content
pages = [page.page_content for page in loader.lazy_load()]

In [76]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()
documents = loader.load(file_path="./pdf/thesis.pdf")

In [77]:
## Text spliter to split document
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(
    chunk_size= 1024,
)

text_chunk = []
doc_idxs = []

for doc_idx, docs in enumerate(documents):
    cur_text_chunks = text_parser.split_text(docs.text)
    text_chunk.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [78]:
## Construct nodes from techchunk

from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunk):
    node = TextNode(
        text = text_chunk,
    )
    str_doc = documents[doc_idxs[idx]]
    node.metadata = str_doc.metadata
    nodes.append(node)

In [79]:
## Generate embedding for each node

for node in nodes:
    node_embeddings = embedding_model.embed_documents(
        node.get_content(metadata_mode="all")
    )
    
    node.embedding = node_embeddings[0]

In [80]:
## Load nodes into vector store
vector_store.add(nodes)

['63441419-acc3-45c0-9377-1acc56b1a05b',
 '44866717-7a2e-423b-93be-5aafd67e607e',
 '60044aa6-2b59-44cb-b1f6-f07950faaddf',
 '6906f8d5-1a3a-4bdb-b2e5-24a915ac5536',
 '6d9d101e-b85d-4cb3-8666-892ae698bae5',
 'aedfd3d7-ab78-440f-9085-396ca28cfec9',
 '25f9db6c-be2c-4999-90d4-166771553a67',
 'ffd71ca7-946b-4a9e-863d-49c4351b4dd4',
 '9f97f38c-114f-4a0c-b2e8-f88ccd505ef1',
 'aef7e0e0-8759-4f5e-8cac-b6a4a9691191',
 '58044267-e3e7-4e4b-aef8-d941b1024674',
 'b808b1e4-ce59-472a-88a3-8d718bc92a20',
 'ffec7f48-18a0-473f-aa78-026956a0b8c7',
 '9d967630-09c9-4147-9837-17872da85057',
 '0a92f4e3-e4a3-41bc-afe9-15d7a542d352',
 'f4d0c0db-35a4-4eb8-8baa-eba7427812b0',
 '90372ef3-ded1-4a48-b336-fd0d50111527',
 '39c69d6c-96ea-4379-93c0-23afe5318908',
 '70933611-00e2-46da-bfc5-7dbffef034ad',
 '59f3981d-b075-4c4f-ae2e-411e80093d92',
 '98772fd3-2efa-4fbc-916b-8f6c9e3790ad',
 '1cf743d7-d1e1-4c4d-a9db-8b22b8b17e74',
 '0f63a143-fe56-449d-928f-11e409bd253d',
 'ea2d55b8-b688-48d7-8105-4b7d7444a47c',
 '9d0a455a-4d1a-

### Build Retrieval pipeline

In [93]:
query_str = "what is the thesis about"

- Generate a query embeddings 

In [94]:
query_embeddings = embedding_model.embed_query(query_str)

- Query the vector database

In [102]:
# construct vector store query
from llama_index.core.vector_stores import VectorStoreQuery


query_mode = "default"

vector_store_query = VectorStoreQuery(
    query_embedding=query_embeddings,
    similarity_top_k=20,
    mode=query_mode   
    
)

# return a vectorstorequeryresults
query_results = vector_store.query(vector_store_query)
print(query_results.nodes[1].get_content())

8
1. Introduction


- Parse results into a set of nodes

In [103]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

nodes_with_scores = []
for index, node in enumerate(query_results.nodes):
    score : Optional[float] = None
    if query_results.similarities is not None:
        score = query_results.similarities[index]
    nodes_with_scores.append(NodeWithScore(node = node, score = score))

- Put into a retriever

In [127]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List


class VectorRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""
    
    def __init__(
        self, 
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k : int = 20
    )-> None:
        """Init parameter """
        self._vector_Store = vector_store,
        self._embed_model = embedding_model,
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()
        
    def _retrieve(self, query_bundel: QueryBundle)->List[NodeWithScore]:
        """Retriever"""
        query_embeddings = embedding_model.embed_query(
            query_bundel.query_str
        )
        
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embeddings,
            similarity_top_k=self._similarity_top_k,
            mode = self._query_mode,
        )
        query_results = vector_store.query(vector_store_query)

        nodes_with_scores = []
        
        for index, node in enumerate(query_results.nodes):
            score: Optional[float] = None
            if query_results.similarities is not None:
                score = query_results.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores


In [None]:
retriever = VectorRetriever(
    vector_store,
    embedding_model,
    query_mode="default",
    similarity_top_k=20
)

- Query Engine to systhesize the response 

In [129]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.ollama import Ollama


llm_ol = Ollama(
    model = "llama3.1",
    request_timeout = 120,
    context_window=8000
    )

query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm_ol)

In [154]:
query_str = "which recommendation approach leads to more healthier food choices?"
response = query_engine.query(query_str)

print(str(response))

2025-09-18 16:30:38,707 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


Constraint-based recommenders tend to perform better in promoting healthier options, especially for users with a low level of health consciousness. This is because they allow users to interact with recipe features, making it easier for them to locate recipes that meet their specific needs. However, the effectiveness of this approach can vary depending on the user's level of experience and familiarity with nutritional information.
