In [1]:
import os
from dotenv import load_dotenv
import json
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
import re


In [2]:
load_dotenv()

True

In [3]:
def clean_text(text):
    # Replace multiple newlines with a single space
    cleaned_text = text.replace('\n', ' ')
    # Optionally, you can use regex to handle extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

In [4]:
loader = PDFPlumberLoader("./pdfs/cv.pdf")
docs = loader.load()

# Check the number of pages
print("Number of pages in the PDF:",len(docs))

# Load the random page content
docs[0].page_content

Number of pages in the PDF: 1


"KHALSS YASSINE\nyassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO\ngithub.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/\nPROFILE\nI'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a\nmaster's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my\ncourses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to\napply and further develop these skills in a stimulating professional environment.\nEXPERIENCES\nInternship 04/2023 – 05/2023\nNational Electricity and Drinking Water Board (ONEE)\nAs part of my final year project,I did a two-month internship at ONEE, where I\ndesigned and developed a web application for the company's fuel management\nsystem.\nEDUCATION\nMaster in Distributed Systems and ArtificialIntelligence (SDIA) 09/2023 – 06/2025\nEcole normale supérieure de l'enseignement technique(ENSET) Mohammedia

In [5]:
text_splitter = SemanticChunker(HuggingFaceEmbeddings())
documents = text_splitter.split_documents(docs)

  from tqdm.autonotebook import tqdm, trange


In [6]:
documents

[Document(page_content="KHALSS YASSINE\nyassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO\ngithub.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/\nPROFILE\nI'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a\nmaster's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my\ncourses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to\napply and further develop these skills in a stimulating professional environment. EXPERIENCES\nInternship 04/2023 – 05/2023\nNational Electricity and Drinking Water Board (ONEE)\nAs part of my final year project,I did a two-month internship at ONEE, where I\ndesigned and developed a web application for the company's fuel management\nsystem. EDUCATION\nMaster in Distributed Systems and ArtificialIntelligence (SDIA) 09/2023 – 06/2025\nEcole normale supérieure de l'enseignement techni

In [7]:
import re

def clean_text(text):
    cleaned_text = text.replace('\n', ' ')
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


In [8]:
print("Number of chunks created: ", len(documents))

Number of chunks created:  2


In [9]:
type(documents[0])

langchain_core.documents.base.Document

In [10]:
# Assuming 'documents' is a list of Document objects
metadata_list = []
documents_list = []

for document in documents:
    metadata_list.append(document.metadata)  # Extract metadata
    documents_list.append(document.page_content)  # Extract document content (text)

# Now 'metadata_list' contains all the metadata and 'documents_list' contains all the document strings


In [11]:
clean_text(documents_list[0])

"KHALSS YASSINE yassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO github.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/ PROFILE I'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a master's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my courses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to apply and further develop these skills in a stimulating professional environment. EXPERIENCES Internship 04/2023 – 05/2023 National Electricity and Drinking Water Board (ONEE) As part of my final year project,I did a two-month internship at ONEE, where I designed and developed a web application for the company's fuel management system. EDUCATION Master in Distributed Systems and ArtificialIntelligence (SDIA) 09/2023 – 06/2025 Ecole normale supérieure de l'enseignement technique(ENSET) Mohammedia, MOROCCO Bachel

In [12]:
documents_list[0]

"KHALSS YASSINE\nyassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO\ngithub.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/\nPROFILE\nI'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a\nmaster's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my\ncourses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to\napply and further develop these skills in a stimulating professional environment. EXPERIENCES\nInternship 04/2023 – 05/2023\nNational Electricity and Drinking Water Board (ONEE)\nAs part of my final year project,I did a two-month internship at ONEE, where I\ndesigned and developed a web application for the company's fuel management\nsystem. EDUCATION\nMaster in Distributed Systems and ArtificialIntelligence (SDIA) 09/2023 – 06/2025\nEcole normale supérieure de l'enseignement technique(ENSET) Mohammedia,\

In [13]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
embedder = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


In [14]:
embeddings = embedder.embed_documents(documents_list)

In [15]:
from langchain.vectorstores.pgvector import DistanceStrategy
from langchain.vectorstores.pgvector import PGVector

CONNECTION_STRING = f"postgresql+psycopg2://{os.getenv("DB_USER")}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"

embeddings = HuggingFaceEmbeddings()

db = PGVector.from_documents(
    documents= docs,
    embedding = embeddings,
    collection_name= "embeddings",
    distance_strategy = DistanceStrategy.COSINE,
    connection_string=CONNECTION_STRING
    )

  warn_deprecated(


In [16]:
from langchain.schema import Document

# Query for which we want to find semantically similar documents
query = "Education?"

#Fetch the k=3 most similar documents
docs =  db.similarity_search(query, k=3)


In [17]:
docs

[Document(page_content="KHALSS YASSINE\nyassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO\ngithub.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/\nPROFILE\nI'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a\nmaster's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my\ncourses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to\napply and further develop these skills in a stimulating professional environment.\nEXPERIENCES\nInternship 04/2023 – 05/2023\nNational Electricity and Drinking Water Board (ONEE)\nAs part of my final year project,I did a two-month internship at ONEE, where I\ndesigned and developed a web application for the company's fuel management\nsystem.\nEDUCATION\nMaster in Distributed Systems and ArtificialIntelligence (SDIA) 09/2023 – 06/2025\nEcole normale supérieure de l'enseignement tech

In [18]:
# Interact with a document returned from the similarity search on pgvector
doc = docs[0]

# Access the document's content
doc_content = doc.page_content
# Access the document's metadata object
doc_metadata = doc.metadata

print("Content snippet:" + doc_content[:500])

Content snippet:KHALSS YASSINE
yassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO
github.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/
PROFILE
I'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a
master's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my
courses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to
apply and further


In [19]:
# Create retriever from database
# We specify the number of results we want to retrieve (k=3)
retriever = db.as_retriever(
    search_kwargs={"k": 3}
    )

In [20]:
from langchain_community.llms import Ollama

# Define llm
llm = Ollama(model="mistral")

In [21]:
from langchain.chains import RetrievalQA
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever,
    verbose=False,
)

In [22]:
query =  "Tell me the name of the student"

response = qa_stuff.run(query)

from IPython.display import Markdown, display
display(Markdown(response))

  warn_deprecated(


 Khalss Yassine

In [121]:
embeddings

[[0.047174956649541855,
  0.08028335124254227,
  -0.041871327906847,
  -0.0649743378162384,
  0.02570127323269844,
  -0.01008762139827013,
  0.03802723065018654,
  -0.02126866765320301,
  0.017418203875422478,
  -0.018774287775158882,
  0.050203774124383926,
  0.03305479884147644,
  -0.0089908791705966,
  0.10124576836824417,
  0.04185578227043152,
  -0.03320211544632912,
  0.04020291939377785,
  -0.0009326149011030793,
  -0.0400216244161129,
  -0.008918588049709797,
  0.009225307032465935,
  0.013649932108819485,
  0.00015006170724518597,
  0.027524560689926147,
  0.02699390985071659,
  -0.011728819459676743,
  -0.016842318698763847,
  0.01022918801754713,
  0.021133434027433395,
  0.020060865208506584,
  0.03158115595579147,
  -0.027380309998989105,
  -0.0026255701668560505,
  0.006643562112003565,
  2.3671557300986024e-06,
  -0.03102751635015011,
  -0.03651072084903717,
  -0.017882758751511574,
  0.02704613283276558,
  -0.050988584756851196,
  0.035466019064188004,
  0.0058768442831

In [104]:
import psycopg2
import os

try:
    conn = psycopg2.connect(
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        database=os.getenv("DB_NAME"),
    )

    cur = conn.cursor()

    create_extension_query = "CREATE EXTENSION IF NOT EXISTS vector;"

    cur.execute(create_extension_query)
    print("Extension created successfully")

    
    vector_embedding_size = len(embeddings[0])
    table_query = f"CREATE TABLE IF NOT EXISTS embeddings (id bigserial PRIMARY KEY, text TEXT NOT NULL, metadata jsonb, embedding vector({vector_embedding_size}));"
    
    cur.execute(table_query)
    print("Table Created Successfully")

    conn.commit()


except psycopg2.DatabaseError as error:
    print(f"Error executing query: {error}")

finally:
    # Close cursor and connection
    if cur:
        cur.close()
    if conn:
        conn.close()


Extension created successfully
Table Created Successfully


In [105]:
import psycopg2
import json
import os

def insert_data(metadata_list, documents_list, embeddings):
    try:
        # Establish connection to PostgreSQL
        conn = psycopg2.connect(
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASSWORD"),
            host=os.getenv("DB_HOST"),
            port=os.getenv("DB_PORT"),
            database=os.getenv("DB_NAME"),
        )

        # Create a cursor object
        cur = conn.cursor()

        # Iterate over metadata, documents, and embeddings
        for metadata, text, embedding in zip(metadata_list, documents_list, embeddings):
            print(str(text))
            # Convert metadata (dict) to JSON string
            metadata_json = json.dumps(metadata)

            # Convert the embedding list to the PostgreSQL array format
            embedding_str = '[' + ','.join(map(str, embedding)) + ']'

            # Create the SQL query with placeholders
            insert_query = """
            INSERT INTO embeddings (text, metadata, embedding)
            VALUES (%s, %s, %s);
            """

            # Execute the query with parameters
            cur.execute(insert_query, (clean_text(text), metadata_json, embedding_str))

        # Commit the transaction
        conn.commit()

        print("Data inserted successfully")

    except psycopg2.DatabaseError as error:
        print(f"Error inserting data: {error}")

    finally:
        # Close cursor and connection
        if cur:
            cur.close()
        if conn:
            conn.close()


insert_data(metadata_list, documents_list, embeddings)


KHALSS YASSINE
yassinekh007007@gmail.com +212 6 27 68 92 95 Casablanca, MOROCCO
github.com/YASSINEKS007 https://khalss-yassine-portfolio-website.vercel.app/
PROFILE
I'm a student with a passion for artificial intelligence and machine learning, currently in my first year of a
master's degree in distributed systems and artificial intelligence at ENSET Mohammedia. Thanks to my
courses and academic projects, I have acquired solid skills in programming and data analysis. I'm keen to
apply and further develop these skills in a stimulating professional environment. EXPERIENCES
Internship 04/2023 – 05/2023
National Electricity and Drinking Water Board (ONEE)
As part of my final year project,I did a two-month internship at ONEE, where I
designed and developed a web application for the company's fuel management
system. EDUCATION
Master in Distributed Systems and ArtificialIntelligence (SDIA) 09/2023 – 06/2025
Ecole normale supérieure de l'enseignement technique(ENSET) Mohammedia,
MOROCCO
Bachelo

In [106]:
import psycopg2
import os

def get_similar_data(query: str, limit: int):
    try:
        # Connect to the PostgreSQL database
        conn = psycopg2.connect(
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASSWORD"),
            host=os.getenv("DB_HOST"),
            port=os.getenv("DB_PORT"),
            database=os.getenv("DB_NAME"),
        )
        
        cur = conn.cursor()
        
        # Get the embedded query (vector representation of the query)
        embedded_query = embedder.embed_query(query)
        
        # Convert the list to a format PostgreSQL expects (array format)
        embedded_query_str = '[' + ','.join(map(str, embedded_query)) + ']'
        
        # Parameterize the query to avoid SQL injection
        search_query = """
            SELECT text
            FROM embeddings 
            ORDER BY embedding <=> %s 
            LIMIT %s;
        """
        
        # Execute the query with the embedded_query vector and limit as parameters
        cur.execute(search_query, (embedded_query_str, limit))
        
        # Fetch the results of the query
        data = cur.fetchall()
        
        return data
                
    except psycopg2.DatabaseError as error:
        print(f"Error fetching data: {error}")
    
    finally:
        # Close the cursor and connection
        if cur:
            cur.close()
        if conn:
            conn.close()

# Example usage
# embedded_query = [0.02889273501932621, 0.005871934350579977, -0.014360247179865837, 0.015201905742287636, 0.017568767070770264]
# get_similar_data("example query", 5)


In [110]:
res = get_similar_data(query="profil", limit=3)

In [114]:
type(res[0][0])

str

In [108]:
from langchain_community.llms import Ollama

# Define llm
llm = Ollama(model="mistral")

In [109]:
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

prompt = """
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.\n
3. Keep the answer crisp and limited to 3,4 sentences.

Context: {context}

Question: {question}

Helpful Answer:"""


QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt) 

llm_chain = LLMChain(
                  llm=llm, 
                  prompt=QA_CHAIN_PROMPT, 
                  callbacks=None, 
                  verbose=False)

document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

combine_documents_chain = StuffDocumentsChain(
                  llm_chain=llm_chain,
                  document_variable_name="context",
                  document_prompt=document_prompt,
                  callbacks=None,
                  verbose=False
              )

qa = RetrievalQA(
                  combine_documents_chain=combine_documents_chain,
                  verbose=False,
                  retriever=res,
                  return_source_documents=False,
              )

ValidationError: 1 validation error for RetrievalQA
retriever
  value is not a valid dict (type=type_error.dict)

In [31]:
question = "What are the frameworks this candidat knows"
response = qa({"query": question})
answer = response.get('result', 'No answer found')
print("Answer:", answer)


Answer:  The candidate is familiar with Backend Frameworks such as Django, Spring Boot, and Express.js, and Frontend Frameworks including React.js and Angular.
