### Imports

In [30]:
import os
import pdfplumber
import openai
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
from qdrant_client import QdrantClient
from qdrant_client.http import models
from dotenv import load_dotenv

# Load variables from the local .env file into the process environment.
# The Qdrant URL and API key are read from the environment further down
# (via os.getenv); presumably the OpenAI API key is supplied the same way —
# confirm against the .env file.
load_dotenv('./.env')

True

### Parse document

In [31]:
# Location of the CV to index. The CV_PDF_PATH environment variable overrides
# the default so the notebook can run on machines other than the author's.
pdf_path = os.getenv(
    "CV_PDF_PATH",
    "/Users/alex/Documents/MBIT/MBIT_PROYECTOS/CV-Insights-Project/data/99433371.pdf",
)

# Extract the text of every page. Collecting the pieces in a list and joining
# once avoids repeated string concatenation.
page_texts = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # extract_text() may yield None/"" for pages without a text layer
        # (e.g. scanned images); normalise to "" so the join never fails.
        page_texts.append(page.extract_text() or "")

# Separate pages with a newline so the last word of one page is not glued to
# the first word of the next.
fulltext = "\n".join(page_texts)

In [32]:
def split_into_chunks(text, max_chars=500):
    """Split ``text`` into pieces of at most ``max_chars`` characters.

    Each cut is made just after the last period found within the current
    ``max_chars`` window, so chunks end on a sentence boundary where possible
    and keep their closing period. When the window contains no period, the
    text is cut at exactly ``max_chars`` characters. No character is dropped:
    concatenating the pieces (before blank ones are filtered out) reproduces
    ``text``.

    Args:
        text: The string to split.
        max_chars: Maximum length of a chunk; must be at least 1.

    Returns:
        A list of non-blank chunks in document order. Whitespace-only pieces
        are omitted because they carry no content to embed.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    pieces = []
    remaining = text
    while len(remaining) > max_chars:
        # Index just past the last period in the window; rfind returns -1
        # when there is none, which makes ``cut`` equal to 0.
        cut = remaining.rfind('.', 0, max_chars) + 1
        if cut == 0:
            # No sentence boundary available: hard cut at the size limit.
            cut = max_chars
        pieces.append(remaining[:cut])
        remaining = remaining[cut:]
    pieces.append(remaining)

    return [piece for piece in pieces if piece.strip()]


chunks = split_into_chunks(fulltext)

In [33]:
# Peek at the second chunk to sanity-check the splitting done in the previous cell.
chunks[1]

'\nConstruction Jul 2005 to Jan 2006\nCompany Name ï¼\u200b City , State\nExtensive remodeling project.\nNov 2004\nCompany Name ï¼\u200b City , State\ninternship supporting interior design/project teams, researching materials, and organizing the materials resource library.\nAccounts Payable Assistant Jan 1999 to Jan 2000\nCompany Name ï¼\u200b City , State\nHandling petty cash, data entry, payroll distribution, and other administrative duties'

### Qdrant connection

In [34]:
# Connection settings: the endpoint and API key come from the environment,
# the port is fixed in the notebook.
url = os.getenv("QDRANT_URL")
api_key = os.getenv("QDRANT_API_KEY")
port = 6333

qdrant_client = QdrantClient(url=url, port=port, api_key=api_key)

# Vector layout of the "demo" collection: 1536-dimensional vectors compared
# with cosine distance. The size is presumably chosen to match the output of
# the embedding model used later in the notebook — confirm if the model changes.
demo_vectors = VectorParams(size=1536, distance=Distance.COSINE)

# (Re)create the collection on every run; the call's result is the cell output.
qdrant_client.recreate_collection(collection_name="demo", vectors_config=demo_vectors)

True

### Generate embeddings

In [35]:
# Number of chunks sent to the embeddings endpoint per request. Batching
# replaces one network round trip per chunk with one per batch.
EMBEDDING_BATCH_SIZE = 100

# The embeddings endpoint rejects empty input, so drop blank chunks up front.
embeddable_chunks = [chunk for chunk in chunks if chunk.strip()]

points = []
for batch_start in range(0, len(embeddable_chunks), EMBEDDING_BATCH_SIZE):
    batch = embeddable_chunks[batch_start:batch_start + EMBEDDING_BATCH_SIZE]

    response = openai.embeddings.create(
        input=batch,
        model="text-embedding-3-small"
    )

    # Each returned item carries the index of the input it belongs to; use it
    # rather than relying on the response order to pair vectors with chunks.
    for item in sorted(response.data, key=lambda entry: entry.index):
        points.append(
            PointStruct(
                # Sequential 1-based ids across all batches.
                id=batch_start + item.index + 1,
                vector=item.embedding,
                payload={"text": batch[item.index]},
            )
        )

### Index the embeddings

In [36]:
# Write all prepared points into the "demo" collection; wait=True is passed so
# the call is asked to wait for the operation before returning its status.
operation_info = qdrant_client.upsert(collection_name="demo", points=points, wait=True)

### Query index

In [37]:
def _build_messages(query, context_chunks):
    """Assemble the chat messages: instructions as system, context + question as user."""
    # Built from adjacent literals so no source-code indentation leaks into
    # the text sent to the model.
    instructions = (
        "You are a helpful HR assistant who answers questions in brief "
        "based on the provided context. All pdfs are CVs."
    )

    lines = ["Context:"]
    for chunk_text in context_chunks:
        lines.append(chunk_text)
        lines.append("---")
    lines.append("Question: " + query)
    lines.append("---")
    lines.append("Answer:")

    return [
        {"role": "system", "content": instructions},
        {"role": "user", "content": "\n".join(lines)},
    ]


def create_answer_with_context(query, limit=3, model="gpt-4-0125-preview",
                               collection_name="demo"):
    """Answer a question using the most similar indexed chunks as context.

    The query is embedded with the same model used for the indexed chunks,
    the closest chunks are retrieved from Qdrant, and a chat completion is
    requested with those chunks as context.

    Args:
        query: The question to answer; must be a non-blank string.
        limit: Maximum number of chunks to retrieve as context.
        model: Chat model used to generate the answer.
        collection_name: Qdrant collection to search.

    Returns:
        The text content of the first completion choice.

    Raises:
        ValueError: If ``query`` is not a string or is blank.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")

    query_vector = openai.embeddings.create(
        input=query,
        model="text-embedding-3-small"
    ).data[0].embedding

    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit
    )

    # Tolerate hits whose payload is missing or has no "text" entry instead of
    # failing with KeyError/TypeError.
    context_chunks = []
    for hit in hits:
        chunk_text = (hit.payload or {}).get("text")
        if chunk_text:
            context_chunks.append(chunk_text)

    completion = openai.chat.completions.create(
        model=model,
        messages=_build_messages(query, context_chunks)
    )

    return completion.choices[0].message.content

In [38]:
# Named `question` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
question = "Where did this candidate go to school?"
answer = create_answer_with_context(question)
print(answer)

The candidate went to school at the University of Oregon and the University of Washington.


In [39]:
# Named `question` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
question = "When did Daniel shift to Machine Learning?"
answer = create_answer_with_context(question)
print(answer)

There is no information about Daniel shifting to Machine Learning in the provided context.
