In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import openai
import pandas as pd

In [27]:

# Connect to the Qdrant instance expected on localhost:6333.
qdrant_client = QdrantClient(url="http://localhost:6333")

# Creating a collection whose name is already taken fails, which made this cell
# break on every re-run. Only create the collection when it does not exist yet.
if not qdrant_client.collection_exists(collection_name="job-postings-collection-02"):
    qdrant_client.create_collection(
        collection_name="job-postings-collection-02",
        # The vector size must equal the length of the embeddings upserted into
        # this collection (1536 here), compared with cosine distance.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

True

In [5]:
# Load the job-postings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/fraud/fake_job_postings.csv")

In [6]:
def preprocess_data(row, missing="Not specified"):
    """Flatten one job-posting record into the text chunk that gets embedded.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``title``, ``company_profile``, ``requirements``,
            ``benefits``, ``required_education`` and ``salary_range``.
        missing: Text substituted for any field whose value is null
            (``None`` / ``NaN``).

    Returns:
        str: The labelled, multi-line description of the posting.
    """

    def field(key):
        value = row[key]
        # Empty CSV cells are loaded as NaN; interpolating them directly would
        # put the literal word "nan" into the text that is embedded and later
        # shown to the LLM.
        return missing if pd.isna(value) else value

    # The whitespace (trailing spaces, 4-space indents, closing indent) mirrors
    # the original triple-quoted template, so fully populated rows render
    # exactly as before.
    return (
        f"{field('title')} Company: {field('company_profile')} \n"
        f"    Requirements: {field('requirements')}\n"
        f"    Benefits: {field('benefits')} \n"
        f"    Education: {field('required_education')} \n"
        f"    Salary: {field('salary_range')} \n"
        "    "
    )
    
# Build the text chunk for every posting (row by row) and store it in a "text" column.
df["text"] = df.apply(func=preprocess_data, axis="columns")

In [22]:
# Draw 50 postings; the fixed seed makes the sample reproducible across runs.
df_sample = df.sample(random_state=124, n=50)

In [8]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Args:
        text: The text to embed; it is sent as a one-element batch.
        model: Name of the embedding model passed to the OpenAI client.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = openai.embeddings.create(model=model, input=[text])
    # Exactly one input was sent, so the first data item is the one we want.
    first_item = result.data[0]
    return first_item.embedding

In [23]:

# Texts to embed, in sample order; a chunk's position in this list is used as its id.
data_to_embed = df_sample["text"].tolist()

# One point per chunk: id = list position, vector = its embedding, payload = the original text.
pointstructs = [
    PointStruct(
        id=position,
        vector=get_embedding(chunk_text),
        payload={"text": chunk_text},
    )
    for position, chunk_text in enumerate(data_to_embed)
]
     

In [28]:
# Send every point in a single upsert request; the call's result is the cell output.
qdrant_client.upsert(
    points=pointstructs,
    collection_name="job-postings-collection-02",
    wait=True,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [24]:
import json

# JSON schema describing the reply we ask the model for: an array of
# question records, each pointing at the chunk ids that ground the answer.
output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "Suggested question.",
            },
            "chunk_ids": {
                "type": "array",
                "items": {
                    "type": "integer",
                    "description": "Index of the chunk that could be used to answer the question.",
                },
            },
            "answer_example": {
                "type": "string",
                "description": "Suggested answer grounded in the context.",
            },
            "reasoning": {
                "type": "string",
                "description": "Reasoning why the question could be answered with the chunks.",
            },
        },
    },
}


# The chunk count is derived from the data instead of being hard-coded, so the
# prompt stays truthful if the sample size changes. The final instruction asks
# for bare JSON because the model otherwise wraps the reply in a markdown fence
# and may add `//` comments, neither of which `json.loads` accepts.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(data_to_embed)} chunks of text.
The RAG application will act as a job network assistant that can examine whether a job posting is fake but can also answer questions about the job posting itself.
I will provide all of the available postings to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that cannot be answered with the available chunks.


{json.dumps(output_schema, indent=2)}


I need to be able to parse the json output.
Return only the raw JSON array: no markdown code fences and no comments inside the JSON.
"""

# Every chunk is passed with the same positional id that was used when building the points.
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{[{"id": i, "text": data} for i, data in enumerate(data_to_embed)]}
"""

In [25]:
# Ask the chat model for the evaluation questions: the instructions and schema go in
# the system message, the chunks in the user message.
response = openai.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=USER_PROMPT),
    ],
    model="gpt-4.1",
)

# Show the raw reply text before it is parsed.
print(response.choices[0].message.content)

```json
[
  {
    "question": "What are the primary requirements to be hired as a Data Scientist at Zaius?",
    "chunk_ids": [1],
    "answer_example": "The primary requirements for a Data Scientist at Zaius include a Master's Degree or PhD in a technical field such as computer science, machine learning, or applied mathematics, 3+ years of experience in commercial or government settings, and practical knowledge of data mining techniques and the use of R, Matlab, or python.",
    "reasoning": "Chunk 1 provides a detailed job description, including educational, professional, and technical requirements."
  },
  {
    "question": "Which job postings offer a 401K pension plan as part of their benefits?",
    "chunk_ids": [0, 1, 18, 33],
    "answer_example": "The Sales Representative, Data Scientist at Zaius, Title Account Executive at Network Closing Services, and Systems Engineer III at E Squared C offer a 401K plan as part of their benefits.",
    "reasoning": "Chunks 0, 1, 18, and 33 m

In [26]:
import json

def _parse_json_reply(raw_reply):
    """Parse the model's reply text into Python objects.

    Tolerates the decorations the model adds around the JSON: markdown code
    fences and ``//`` comment lines.

    Args:
        raw_reply: The reply text returned by the chat model.

    Returns:
        The decoded JSON value (a list of question records for this prompt).

    Raises:
        json.JSONDecodeError: If the cleaned text is still not valid JSON.
    """
    cleaned = raw_reply.replace("```json", "").replace("```", "")
    # Kept from the original cell: this exact comment was emitted by the model
    # and may sit at the end of a line rather than on a line of its own.
    cleaned = cleaned.replace("// Five unanswerable questions below", "")
    # JSON strings cannot contain raw newlines, so a line that starts with `//`
    # can only be a comment and is safe to drop. Split on "\n" only:
    # str.splitlines() would also split on characters that are legal inside
    # JSON strings.
    kept_lines = [
        line for line in cleaned.split("\n") if not line.lstrip().startswith("//")
    ]
    return json.loads("\n".join(kept_lines))


json_output = _parse_json_reply(response.choices[0].message.content)

In [29]:

from langsmith import Client
import os

# The LangSmith API key is read from the environment; a missing variable raises KeyError.
client = Client(api_key=os.environ["LANGSMITH_API_KEY"])

# Create the dataset that will hold the generated question/answer examples.
dataset_name = "rag-evaluation-job-posting-dataset"
dataset = client.create_dataset(
    description="Dataset for evaluating RAG pipeline",
    dataset_name=dataset_name,
)

In [30]:

# Register one dataset example per generated question.
for item in json_output:
    client.create_example(
        dataset_id=dataset.id,
        inputs=dict(question=item["question"]),
        outputs=dict(
            ground_truth=item["answer_example"],
            context_ids=item["chunk_ids"],
            # Look each referenced chunk up in Qdrant (one request per id) and
            # keep the text stored in its payload, in the order the ids were given.
            contexts=[
                qdrant_client.retrieve(
                    collection_name="job-postings-collection-02",
                    ids=[chunk_id],
                    with_payload=True,
                )[0].payload["text"]
                for chunk_id in item["chunk_ids"]
            ],
        ),
    )

In [31]:
# Tabulate the parsed question records, one row per record.
evals_df = pd.DataFrame(data=json_output)

In [32]:
# Preview the first ten generated questions.
evals_df.head(n=10)

Unnamed: 0,question,chunk_ids,answer_example,reasoning
0,What are the primary requirements to be hired ...,[1],The primary requirements for a Data Scientist ...,"Chunk 1 provides a detailed job description, i..."
1,Which job postings offer a 401K pension plan a...,"[0, 1, 18, 33]","The Sales Representative, Data Scientist at Za...","Chunks 0, 1, 18, and 33 mention 401K plans in ..."
2,What is the salary range for the Data-Network ...,[9],The salary for the Data-Network Engineer role ...,Chunk 9 includes the salary range in the job d...
3,Which postings offer remote or telecommute wor...,[5],The Director of Growth position at InVision of...,Chunk 5 specifies that the Director of Growth ...
4,Are there any positions that require proficien...,"[0, 3, 12, 30]","The Sales Representative, Operations Paid Inte...",These chunks explicitly mention Microsoft Offi...
5,Which jobs mention opportunities for relocatio...,"[7, 33]",The Ruby Web Application Developer at Eagle me...,"Chunk 7 describes willingness to relocate, and..."
6,How do the company cultures differ at Aviary a...,"[4, 29]","Aviary emphasizes a creative, team-driven star...",Both chunks give detailed descriptions of thei...
7,Which postings indicate award-winning companies?,"[12, 21, 29]",Le Meridien Kuala Lumpur has won the 2013 Best...,Award mentions are found in the provided compa...
8,What are the educational requirements for the ...,[27],A Bachelor’s degree from a four-year college/u...,Chunk 27 lists the education/experience qualif...
9,Which job postings mention equity or stock opt...,"[1, 14]",The Data Scientist position at Zaius and the A...,Equity and Employee Stock Option Plan are spec...


In [33]:
# Persist the evaluation set next to the notebook, without the row index.
evals_df.to_csv(path_or_buf="evals.csv", index=False)