In [None]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


llm = Ollama(
    model = "llama3.1", callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
)
answer = llm.invoke("football winner 2022")


  llm = Ollama(


You're referring to the winners of various football competitions in 2022!

Here are some notable ones:

1. **FIFA World Cup Qatar 2022**:
	* Winners: Argentina (won against France in the final on December 18, 2022)
	* Golden Ball (best player): Lionel Messi
	* Golden Glove (best goalkeeper): Emiliano Martínez
2. **UEFA Champions League 2021-22**:
	* Winners: Real Madrid (beat Liverpool 1-0 in the final on May 28, 2022)
3. **English Premier League 2021-22**:
	* Winners: Manchester City
4. **La Liga 2021-22**:
	* Winners: Barcelona (on points average, as the team finished with the same number of points but fewer goals conceded than Real Madrid)
5. **Bundesliga 2021-22**:
	* Winners: Bayern Munich
6. **Serie A 2021-22**:
	* Winners: AC Milan
7. **UEFA Europa League 2021-22**:
	* Winners: Eintracht Frankfurt (beat Rangers 2-1 in the final on May 18, 2022)
8. **Copa Libertadores 2022**:
	* Winners: Independiente del Valle (from Ecuador)

These are just a few examples of football winners fro

### Text Extraction

In [31]:
from langchain_community.document_loaders import PyPDFLoader

pdf_kappa = "./pdfs/Kappe.pdf"
loader = PyPDFLoader(pdf_kappa)

# Load all pages and extract text content
pages = [page.page_content for page in loader.lazy_load()]

### Semantic chunk to split the thesis
Semantic chunking considers the relationship within the text. It divides the text into meaningful, semantically complete chunks.

Semantic chunk involves taking the embeddings of every sentence in the document, comparing the similarity of all sentences with each other and then grouping sentences with the most similar embeddings together.

- Emebedding Models:
    - bge-small-en: very light and dedicate for retrieval-augmented language tasks. It's designed to officially handle tasks like passage retrieval and semantic similarity.


- Create langchain docs object from the text

In [44]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings


embedding_model = FastEmbedEmbeddings(model_name="BAAI/bge-small-en")



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/133M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

In [None]:
## chunk the text

semantic_chunker = SemanticChunker(
            #using 'percentile' to split the text, which is based on computing 
        #all differences between sentences and then se if any differences is greater that X percentile
    
    embedding_model, breakpoint_threshold_type='percentile' )


## split the text
docs = semantic_chunker.create_documents(pages)


#### Vector Store

Storing the chunks in Memory for efficient retreivel 

In [70]:
from langchain_core.vectorstores import InMemoryVectorStore


# Prepare texts for storage 
texts = [chunk.page_content for chunk in docs]

# create a vector store
vector_store = InMemoryVectorStore.from_texts(
    texts,
    embedding=embedding_model
)

Vectorstore to retreive documents based. light wrapper around the vector store class to make it conform to the retriever interface

In [144]:
retriever = vector_store.as_retriever(search_type = "similarity", search_kwargs={"score_threshold":0.5,"k":30} )


user_query = "what are the research questions of the thesis"

retrieved_docs = retriever.invoke(user_query)

#### Prompt Creation

In [201]:
from langchain_core.prompts import ChatPromptTemplate


prompt_template = """
You are an assistant helping me  preparing for my thesis defense:
Use the content provide to answer my query:

content:
{retrieved_docs}

query:
{user_query}

Provide a clear, scientific  answer based on given content. 

If I asked about a summary, give a coherent, high-level overview.

Never include document ids or metadata in your response.
"""

# Structure of prompt

structured_prompt = ChatPromptTemplate.from_template(prompt_template)



In [146]:
## chain creation
from langchain_core.output_parsers import StrOutputParser
chain = structured_prompt |llm | StrOutputParser()

In [147]:
## Invoke the chain

response = chain.invoke({
    "retrieved_docs": retrieved_docs,
    "user_query": user_query
})

Based on the provided content, we can infer that the research questions for this thesis are not explicitly stated in the given snippet. However, from Document(id='93a229a6-8d42-4498-bebc-7163a0e5e10c', metadata={}, page_content='24 3. ResearchMethodology\napplyingartifacts( Hevneretal.'), we can see that it is related to the chapter "Research Methodology" which might give us a hint on what research questions are being addressed.

But, from Document(id='baf3aa59-3992-40fa-b4d3-3cf8b3d9b2a8', metadata={}, page_content='. .1\n1.2 ProblemStatement&ResearchQuestions . . . . . . . . . . . . . . . . . . .5\n1.3 ObjectiveandContributions . . . . . . . . . . . . . . . . . . . . . . . . .''), we can see that there is a section titled "Problem Statement & Research Questions" which might hold the answer to our query.

Unfortunately, we don't have enough content to provide a clear and accurate answer. However, from Document(id='5eb17a09-2d7a-444b-839b-7ae59bcb89cc', metadata={}, page_content='. . .

###  Recursive Chunking
 We divides the input text into smaller chunks in a h

In [None]:
## parse the Pdf using Unstructured.io
## from unstructured.partition.auto import partition

elements = partition("./pdfs/Kappe.pdf")

In [175]:
## convert element into strings
raw_texts = [str(el) for el in elements if str(el).strip()]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = True,
    separators=["chapter"]
)

# chunk the text
rec_chunk = text_splitter.create_documents(raw_texts)

In [192]:
from langchain_core.vectorstores import InMemoryVectorStore


# Prepare texts for storage 
Rec_texts = [chunk.page_content for chunk in rec_chunk]

# create a vector store
vector_store_rec = InMemoryVectorStore.from_texts(
    Rec_texts,
    embedding=embedding_model
)

In [202]:
retriever = vector_store.as_retriever(search_type = "similarity", 
                                      search_kwargs={"score_threshold":0.7,"k":30} )


user_query = "What are the main experiments from the thesis?"

retrieved_docs = retriever.invoke(user_query)

In [204]:
## Invoke the chain

response = chain.invoke({
    "retrieved_docs": retrieved_docs,
    "user_query": user_query
})

After analyzing the provided content, it appears that there is no direct mention of "main experiments" in the abstract or introduction sections. However, we can infer some information about the research methodology and experiments conducted.

According to Document #39024c0b-1a8f-4179-84d0-c4e889ace382 (page_content='30 3. Research Methodology'), it is mentioned that the thesis follows a conceptually grounded approach, which involves applying artifacts (Hevner et al.) as part of the research methodology.

Additionally, Document #c7f76522-f74d-41e5-8921-779250ab821e (page_content='40 4. Results and Discussions') contains some references to user experiments (e.g., [29] B. P. Knijnenburg, M. C. Willemsen, Evaluating recommender systems with user experiments). However, these are not explicitly stated as the main experiments of the thesis.

Based on this information, it is difficult to provide a clear and definitive answer about the main experiments of the thesis without more context or spec

## re-Build Using LLamaIndex

Build a data ingestion pipeline into vector database, and then build a retrieval pipeline. Using the following Stack
-  bge-small-en as embedding model
- PostgresSql as the vectorstore 
- Lama3.1 as the LLm

In [None]:
%pip install llama-index-readers-file pymupdf
%pip install llama-index-vector-stores-postgres
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-llama-cpp

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-index-readers-file
  Downloading llama_index_readers_file-0.5.4-py3-none-any.whl.metadata (5.7 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-macosx_10_9_x86_64.whl.metadata (3.4 kB)
Collecting defusedxml>=0.7.1 (from llama-index-readers-file)
  Using cached defusedxml-0.7.1-py2.py3-none-any.whl.metadata (32 kB)
Collecting llama-index-core<0.15,>=0.13.0 (from llama-index-readers-file)
  Downloading llama_index_core-0.14.2-py3-none-any.whl.metadata (2.5 kB)
Collecting pandas<2.3.0 (from llama-index-readers-file)
  Downloading pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl.metadata (89 kB)
Collecting striprtf<0.0.27,>=0.0.26 (from llama-index-readers-file)
  Downloading striprtf-0.0.26-py3-none-any.whl.metadata (2.1 kB)
Collecting aiosqlite (from llama-index-core<0.15,>=0.13.0->llama-index-readers-file)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting banks<3,>=2.2.0 (from llama-index-core<0.15,>=0.13.0->llama-index-readers-fil

In [208]:
pip install llama-index-llms-llama-cpp


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-index-llms-llama-cpp
  Downloading llama_index_llms_llama_cpp-0.5.1-py3-none-any.whl.metadata (4.2 kB)
Collecting llama-cpp-python<0.4,>=0.3.0 (from llama-index-llms-llama-cpp)
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting diskcache>=5.6.1 (from llama-cpp-python<0.4,>=0.3.0->llama-index-llms-llama-cpp)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading llama_index_llms_llama_cpp-0.5.1-py3-none-any.whl (8.4 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.

#### Upload the model

- the model is already uploaded using Ollama


In [209]:
#### Initialize Postgres
!pip install psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [211]:
import psycopg2

db_name = "vct_db"
host = "localhost"
password = "ayoub123"
port= "5432"
user = "ayoub"


# connect to postgresdb

conn = psycopg2.connect(
    dbname =db_name,
    host = host,
    password = password,
    port = port,
    user = user
)

conn.autocommit = True