# Qdrant search engine
We use Qdrant Cloud to store the degree data and run fast similarity searches over it. Reranking is done on the Python side.

## Setup
Install the packages this notebook imports:

```bash
pip install "qdrant-client[fastembed]" python-dotenv openai
```

In [1]:
import os
import json
import numpy as np
from dotenv import load_dotenv
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull the settings from the local .env file into the process environment,
# then read the Qdrant API key from there (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("QDRANT_API_KEY")

### Step 1: connect to Qdrant

In [3]:
# Endpoint of the Qdrant cluster this notebook talks to; the API key comes from the environment.
qdrant_url = "https://1497c57a-fec5-4169-8998-262cd4f287dc.us-west-1-0.aws.cloud.qdrant.io:6333"
qdrant_client = QdrantClient(url=qdrant_url, api_key=api_key)

### Step 2: set up embedding model
We are selecting the small Jina AI model because it is compact (it produces 512-dimensional embeddings), it is English-only, and we are only working with short text that contains no specialised jargon.

The long descriptions tend to be under 512 tokens, well within the model's 8192-token input limit, so there is no need to chunk.

In [4]:
# Model identifier passed to every models.Document below (documents and queries alike).
model_handle = "jinaai/jina-embeddings-v2-small-en"
# Vector size used for both named vectors when the collection is created.
embedding_dim = 512

### Step 3: create a collection
A collection is a named set of points (vectors with optional payloads); it is the container that the vector search runs against.

In [19]:
# Define the collection name
collection_name = "degree-information"

# Create the collection only when it is missing, so that re-running this cell
# (or the whole notebook) does not fail because the collection already exists.
# Each point stores two named vectors of the same size, compared by cosine distance.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config={
            "description_vector": models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
            "career_vector": models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
        },
    )

# Display whether the collection is available now.
qdrant_client.collection_exists(collection_name)

True

### Step 4: create, embed & insert points into the collection

In [20]:
# Load the degree records from the JSON dataset.
# The encoding is given explicitly: the descriptions contain non-ASCII characters
# and the platform's default text encoding is not guaranteed to be UTF-8.
degree_data_filename = "../datasets/degree_data_all_wCareers.json"
with open(degree_data_filename, "r", encoding="utf-8") as f:
    documents = json.load(f)

In [21]:
# Build one point per degree record: two texts to embed plus the payload
# fields that are read back from the search results later on.
points = []

# enumerate() provides the sequential ids (starting at 0) without a manual
# counter and without shadowing the builtin `id`.
for point_id, doc in enumerate(documents):
    # Concatenate text to embed
    description_text = f"{doc['degreeTitle']} {doc['shortDescription']} {doc['longDescription']}"
    careers_text = ", ".join(doc['careers'])

    points.append(
        models.PointStruct(
            id=point_id,
            vector={
                # both texts are embedded locally through FastEmbed with the model named by model_handle
                "description_vector": models.Document(text=description_text, model=model_handle),
                "career_vector": models.Document(text=careers_text, model=model_handle),
            },
            # save all needed metadata fields
            payload={
                "degreeTitle": doc['degreeTitle'],
                "shortDescription": doc['shortDescription'],
                "careers": careers_text,
            },
        )
    )

In [22]:
# Write every prepared point to the collection in a single request;
# the returned update result is shown as the cell output.
qdrant_client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## OPTIONAL STEP: add data to the collection

In [5]:
collection_name = "degree-information"

# Load the degree records that also carry a "url" field.
# The encoding is given explicitly because the platform's default text
# encoding is not guaranteed to be UTF-8.
degree_data_filename = "../datasets/degree_data_all_wCareers_urls.json"
with open(degree_data_filename, "r", encoding="utf-8") as f:
    documents = json.load(f)

In [14]:
# Attach each record's URL to the stored point that has the same id.
# Point ids were assigned sequentially from 0 in Step 4, so this relies on this
# file listing the degrees in the same order as the file used there.
for point_id, record in enumerate(documents):
    qdrant_client.set_payload(
        collection_name=collection_name,
        points=[point_id],
        payload={"url": record["url"]},
    )

In [None]:
# update points into collection
# NOTE(review): `points` is the list built in Step 4; its payloads have no "url" field,
# and the name only exists if the Step 4 cells were run in this kernel session.
# Upsert is expected to replace stored points wholesale, which would drop the "url"
# values written by set_payload in the previous cell -- confirm before running this cell.
qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)

### Step 5: running a similarity search

In [None]:
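# Earlier single-vector version of search() (career_vector only), kept for reference.
# It is superseded by the two-vector search() defined in the next cell.
# def search(query, limit=1):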
#     """ This function is for searching amongst all the careers listed per degree"""
#     results = qdrant_client.query_points(
#         collection_name=collection_name,
#         query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
#             text=query,
#             model=model_handle 
#         ),
#         limit=limit, # top closest matches
#         with_payload=True, #to get metadata in the results
#         using="career_vector"

#     )

#     return results

In [57]:
def search(selected_career, user_profile, limit=1, prefetch_limit=20):
    """Find degrees that match both a chosen career and a user profile.

    Two prefetch queries are run against the collection named by the
    module-level ``collection_name``: ``selected_career`` is matched against
    ``career_vector`` and ``user_profile`` against ``description_vector``.
    The two candidate lists are then merged with Reciprocal Rank Fusion (RRF).

    Args:
        selected_career: Text describing the career the user picked.
        user_profile: Text describing the user (e.g. their quiz answers).
        limit: Number of fused results to return.
        prefetch_limit: Number of candidates fetched per vector before fusion.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        The response of ``qdrant_client.query_points``; the matched points,
        including their payloads, are available under ``.points``.
    """
    results = qdrant_client.query_points(
        collection_name=collection_name,
        prefetch=[
            # candidates whose listed careers are close to the selected career
            models.Prefetch(
                query=models.Document(text=selected_career, model=model_handle),
                using="career_vector",
                limit=prefetch_limit,
            ),
            # candidates whose description is close to the user profile
            models.Prefetch(
                query=models.Document(text=user_profile, model=model_handle),
                using="description_vector",
                limit=prefetch_limit,
            ),
        ],
        # merge both candidate lists into one ranking
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        with_payload=True,
    )

    return results

In [62]:
# make a search
# career_selected is matched against the career vector inside search()
career_selected = "data scientist"
# user_profile holds question/answer pairs separated by ";" and is matched against the description vector
user_profile = """Where are you in your professional development? early-career professional; What are your main interests or passions? Technology, Science and Research; What skills do you feel most confident in? Technical Skills, Analytical Skills; What type of work environment do you prefer? Remote work; What are your career goals or motivations for pursuing a new degree? Advancing in my current field; What industries are you most interested in working in? Information Technology; What type of job roles are you most interested in pursuing? Software Development, Data Analysis; What aspects of a job do you find most rewarding? Solving complex problems, Working with cutting-edge technology; What type of side gigs or freelance work are you interested in exploring? Freelance coding or development, Consulting; What level of education are you aiming to achieve with your new degree? Master's Degree
"""
# ask for five fused results; they are read from hits.points further down
hits = search(career_selected,user_profile,limit=5)


### Step 6: Ask ChatGPT to recommend the top 3 choices

In [67]:
# Reduce every hit to the three payload fields that are handed to ChatGPT.
recommended_degrees_data = [
    {
        "degree_title": hit.payload['degreeTitle'],
        "careers": hit.payload['careers'],
        "degree_description": hit.payload['shortDescription'],
    }
    for hit in hits.points
]


In [69]:
# Serialise the shortlisted degrees as indented JSON text for the prompt, then preview it.
recommended_degrees = json.dumps(recommended_degrees_data,indent=2)
print(recommended_degrees)

[
  {
    "degree_title": "Online Master of Science in Information Technology (IT)",
    "careers": "Data analyst, Database architect, Data engineer, DevOps engineer, Full stack developer, Network/cloud engineer, Network forensics engineer, Security engineer, Software engineer, Solutions architect",
    "degree_description": "Earning a Master of Science in information technology online will equip you to become a technology leader who drives business strategy, defines investment priorities and leads successful teams. \u00a0As a graduate, you\u2019ll be skilled in topics ranging from data science to cybersecurity, preparing you to advance your information technology career."
  },
  {
    "degree_title": "Online Master of Science in Program Evaluation and Data Analytics",
    "careers": "Data ambassador, Data analyst, Data fellow, Data programmer, Data science engineer, Database-driven website consultant, Program performance and evaluation manager, Programmer analyst, Senior analyst in pe

In [70]:
# System prompt for the recommendation step.
# Both JSON examples inside it must themselves be valid JSON: the model tends to
# copy them, and the reply is parsed with json.loads() later in the notebook.
# (The "NONE" example used to end with a trailing comma, which json.loads rejects.)
SYSTEM_PROMPT = """ 
You are a career counselor with access to a list called "degrees_data" that contains 5 degree options. These degrees were selected based on a user's answers to a "career_quiz" and their chosen "career_selection".

Your task is to recommend the top 3 degrees from "degrees_data" that best match the user's profile and career interest.

Return your answer in the following JSON format — do not include any extra text, explanations, markdown, or code blocks:

[
    {
        "top_choice_1": "<Degree title>",
        "reasoning": "<Concise explanation of why this degree fits the user's selected career>"
    },
    {
        "top_choice_2": "<Degree title>",
        "reasoning": "<Concise explanation of why this degree fits the user's selected career>"
    },
    {
        "top_choice_3": "<Degree title>",
        "reasoning": "<Concise explanation of why this degree fits the user's selected career>"
    }
]


If you think that none of the degrees are a good fit for the user, then return the following:

[
    {
        "top_choice_1": "NONE",
        "reasoning": "<Concise explanation of why you think none of the degrees were a good fit.>"
    }
]

The output must be valid JSON and fully parsable using `json.loads()` in Python.

When wording your reasonings for fit/no fit, talk as if you were talking to the user in a friendly and professional manner.
"""


In [71]:
# User-prompt template with three str.format placeholders
# (career_selection, career_quiz, degrees_data), each wrapped in a tag of the same name.
USER_PROMPT = """ 

<career_selection>
{career_selection}
</career_selection>

<career_quiz>
{career_quiz}
</career_quiz>

<degrees_data>
{degrees_data}
</degrees_data>
"""

In [72]:
# Fill the template with the chosen career, the quiz answers and the retrieved degrees.
selections_user_prompt = USER_PROMPT.format(
    career_selection=career_selected,
    career_quiz=user_profile,
    degrees_data=recommended_degrees,
)

In [73]:
from openai import OpenAI
from dotenv import load_dotenv

In [74]:
# Load environment variables from .env file
load_dotenv()
# Create the OpenAI client. No key is passed here, so it has to come from the
# environment (presumably OPENAI_API_KEY provided through the .env file -- confirm).
client = OpenAI()

In [75]:
# set up fcn to call openai
def llm(user_prompt,system_prompt="you are a helpful assistant",model="gpt-4o-mini",temperature=0.5):
    ''' this function calls the openAI api and feeds it user/system prompts'''
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature,
        )
    return response.choices[0].message.content

In [76]:
#actually call the LLM
# Send the filled-in user prompt together with the recommendation system prompt
# (model and temperature repeat the defaults declared on llm()).
response = llm(user_prompt = selections_user_prompt,system_prompt=SYSTEM_PROMPT,model="gpt-4o-mini",temperature=0.5)

In [77]:
# Show the raw reply text before it is parsed as JSON.
print(response)

[
    {
        "top_choice_1": "Online Master of Computer Science – Big Data Systems",
        "reasoning": "This degree is a perfect fit for your interest in becoming a data scientist as it focuses on big data systems and analytical expertise, essential skills for interpreting complex data sets."
    },
    {
        "top_choice_2": "Online Master of Science in Program Evaluation and Data Analytics",
        "reasoning": "This program aligns well with your passion for technology and data analysis, providing you with the skills to drive performance improvements through data insights, which is vital for a data scientist."
    },
    {
        "top_choice_3": "Online Master of Science in Information Technology (IT)",
        "reasoning": "This degree equips you with a broad range of IT skills, including data science, which will help you advance in your current field and prepare you for various roles in data analysis and software development."
    }
]


In [78]:
# extract bot recommendations
# json.loads raises json.JSONDecodeError when the reply is not pure JSON
# (for example when it is wrapped in a markdown code fence), so this cell
# relies on the model following the format requested in SYSTEM_PROMPT.
recommendations_bot = json.loads(response)
print(recommendations_bot)

[{'top_choice_1': 'Online Master of Computer Science – Big Data Systems', 'reasoning': 'This degree is a perfect fit for your interest in becoming a data scientist as it focuses on big data systems and analytical expertise, essential skills for interpreting complex data sets.'}, {'top_choice_2': 'Online Master of Science in Program Evaluation and Data Analytics', 'reasoning': 'This program aligns well with your passion for technology and data analysis, providing you with the skills to drive performance improvements through data insights, which is vital for a data scientist.'}, {'top_choice_3': 'Online Master of Science in Information Technology (IT)', 'reasoning': 'This degree equips you with a broad range of IT skills, including data science, which will help you advance in your current field and prepare you for various roles in data analysis and software development.'}]
