In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import openai
import pandas as pd

In [8]:

# Connect to the local Qdrant instance that stores the job-posting embeddings.
qdrant_client = QdrantClient(url="http://localhost:6333")

# Creating a collection that already exists raises an error, so only create it
# when it is missing; this keeps the cell safe to re-run (Restart & Run All).
# size=1536 must equal the length of the vectors upserted later; it is the
# default dimensionality of OpenAI's text-embedding-3-small model.
if not qdrant_client.collection_exists(collection_name="job-postings-collection-02"):
    qdrant_client.create_collection(
        collection_name="job-postings-collection-02",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

True

In [9]:
# Load the job-postings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/fraud/fake_job_postings.csv")

In [10]:
def preprocess_data(row, missing="Not specified"):
    """Flatten one job posting into the single text string that gets embedded.

    Args:
        row: Mapping-like posting (e.g. a DataFrame row) providing the keys
            'title', 'company_profile', 'requirements', 'benefits',
            'required_education' and 'salary_range'.
        missing: Placeholder written in place of a missing value (NaN/None/NA).

    Returns:
        A multi-line string with the title, company, requirements, benefits,
        education and salary of the posting.

    Raises:
        KeyError: If ``row`` lacks one of the expected keys.
    """

    def _field(name):
        # Missing CSV cells arrive as NaN; without this guard they would be
        # written into the text as the literal word "nan".
        value = row[name]
        return missing if pd.isna(value) else value

    # The whitespace layout (trailing spaces, 4-space continuation indent) is
    # kept exactly as before so complete rows produce the same text.
    return (
        f"{_field('title')} Company: {_field('company_profile')} \n"
        f"    Requirements: {_field('requirements')}\n"
        f"    Benefits: {_field('benefits')} \n"
        f"    Education: {_field('required_education')} \n"
        f"    Salary: {_field('salary_range')} \n"
        "    "
    )
    
# Row-wise apply: build one combined text string per posting for embedding.
df["text"] = df.apply(preprocess_data, axis="columns")

In [11]:
# Work on a 50-posting sample; random_state pins the selection so reruns pick the same rows.
df_sample = df.sample(n=50, random_state=25)

In [7]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single string with the OpenAI embeddings endpoint.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first entry in the response's ``data``.
    """
    # The input is sent as a one-element list, so its embedding sits at index 0.
    result = openai.embeddings.create(model=model, input=[text])
    first_entry = result.data[0]
    return first_entry.embedding

In [14]:

# Embed every sampled posting. Each point's id is the posting's position in
# data_to_embed, and the raw text is stored next to the vector as payload.
data_to_embed = df_sample["text"].tolist()
pointstructs = [
    PointStruct(id=position, vector=get_embedding(chunk), payload={"text": chunk})
    for position, chunk in enumerate(data_to_embed)
]
     

In [15]:
# Send all points to the collection in a single request.
# NOTE(review): wait=True is expected to block until the server has applied the update -- confirm against the client docs.
qdrant_client.upsert(
    collection_name="job-postings-collection-02",
    wait=True,
    points=pointstructs,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
import json

# JSON-schema-style description of the reply expected from the model: an array
# of objects, one per generated question. Key order is significant because the
# schema is serialised into the system prompt.
output_schema = dict(
    type="array",
    items=dict(
        type="object",
        properties=dict(
            question=dict(
                type="string",
                description="Suggested question.",
            ),
            chunk_ids=dict(
                type="array",
                items=dict(
                    type="integer",
                    description="Index of the chunk that could be used to answer the question.",
                ),
            ),
            answer_example=dict(
                type="string",
                description="Suggested answer grounded in the context.",
            ),
            reasoning=dict(
                type="string",
                description="Reasoning why the question could be answered with the chunks.",
            ),
        ),
    ),
)


# System prompt for generating the synthetic evaluation set.
# The chunk count is read from data_to_embed so the prompt cannot drift from the
# real sample size, and the reply format is spelled out so the answer can be
# passed to json.loads without stripping markdown fences or comments.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(data_to_embed)} chunks of text.
The RAG application will act as a job network assistant that can answer questions about the job postings we have available and if they are fake or real.
I will provide all of the available postings to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use a single chunk.
Also, include 5 questions that can't be answered with the available chunks.


{json.dumps(output_schema, indent=2)}


Return only the JSON array itself, with no markdown code fences and no comments, because I need to be able to parse the json output.
"""

# User prompt carrying every chunk with its index. The chunks are serialised
# with json.dumps (not Python's repr) so quoting and escaping are uniform;
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaped.
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{json.dumps([{"id": i, "text": data} for i, data in enumerate(data_to_embed)], ensure_ascii=False)}
"""

In [17]:
# Ask the chat model for the evaluation questions: the system message carries
# the task and output schema, the user message carries the indexed chunks.
response = openai.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=USER_PROMPT),
    ],
    model="gpt-4.1",
)

# Show the raw reply text of the first choice.
print(response.choices[0].message.content)

```json
[
  {
    "question": "What are the main responsibilities and benefits for the Sales Leader - Healthcare Veterans role at SpotSource Solutions LLC?",
    "chunk_ids": [0],
    "answer_example": "The Sales Leader - Healthcare Veterans at SpotSource Solutions LLC is responsible for contacting and networking with Veterans and healthcare referral sources, educating clients about pension benefits and the VetAssist program, assisting with government benefit applications, conducting in-services, attending meetings, managing sales contacts via CRM, and completing administrative duties. Benefits include gas reimbursement for travel.",
    "reasoning": "All job-specific responsibilities, benefits, and some background information are present in chunk 0."
  },
  {
    "question": "What qualifications are required to apply for the Collections Supervisor position at Tidewater Finance Co.?",
    "chunk_ids": [2],
    "answer_example": "The position requires five years of collection experience

In [19]:
import json
import re


def _extract_json_text(raw):
    """Return the JSON payload of a chat reply, without fences or // comments.

    Args:
        raw: Full text of the model reply.

    Returns:
        The reply text with any surrounding markdown code fence removed and
        every ``//`` comment that lies outside a JSON string literal dropped.
    """
    # Use the contents of a ```json ... ``` (or bare ```) fence when one is
    # present; a missing closing fence is tolerated by matching to end of text.
    fenced = re.search(r"```(?:json)?\s*(.*?)(?:```|\Z)", raw, flags=re.DOTALL)
    body = fenced.group(1) if fenced else raw

    # Drop // comments, but only outside string literals so that values such
    # as "http://..." are left intact.
    cleaned = []
    in_string = False
    escaped = False
    pos = 0
    while pos < len(body):
        char = body[pos]
        if in_string:
            cleaned.append(char)
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
            cleaned.append(char)
        elif body.startswith("//", pos):
            # Skip to the end of the line; the newline itself is kept.
            newline = body.find("\n", pos)
            pos = len(body) if newline == -1 else newline
            continue
        else:
            cleaned.append(char)
        pos += 1
    return "".join(cleaned)


# Parsed list of question dicts (question, chunk_ids, answer_example, reasoning).
json_output = json.loads(_extract_json_text(response.choices[0].message.content))

In [20]:

from langsmith import Client
import os

# The API key is read from the environment; a missing LANGSMITH_API_KEY raises KeyError here.
client = Client(api_key=os.environ["LANGSMITH_API_KEY"])

# Dataset that will receive the generated evaluation examples.
dataset_name = "rag-evaluation-job-posting-dataset"
dataset = client.create_dataset(
    description="Dataset for evaluating RAG pipeline",
    dataset_name=dataset_name,
)

In [21]:

# Upload one LangSmith example per generated question, attaching the text of
# every chunk the question is grounded in.
for item in json_output:
    # Questions that cannot be answered may come back with an empty, null or
    # absent chunk list; treat all three as "no supporting chunks".
    chunk_ids = item.get("chunk_ids") or []

    # One batched lookup per question instead of one request per chunk id.
    # Results are keyed by point id so the contexts can be emitted in the same
    # order as chunk_ids regardless of the order the records come back in.
    records = (
        qdrant_client.retrieve(
            collection_name="job-postings-collection-02",
            ids=chunk_ids,
            with_payload=True,
        )
        if chunk_ids
        else []
    )
    text_by_id = {record.id: record.payload["text"] for record in records}

    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": item["question"]},
        outputs={
            "ground_truth": item["answer_example"],
            "context_ids": chunk_ids,
            # A chunk id that is not in the collection raises KeyError here.
            "contexts": [text_by_id[chunk_id] for chunk_id in chunk_ids],
        },
    )

In [22]:
# Tabular view of the generated questions: one row per dict in json_output.
evals_df = pd.DataFrame(json_output)

In [23]:
# Preview the first rows to sanity-check the parsed questions.
evals_df.head()

Unnamed: 0,question,chunk_ids,answer_example,reasoning
0,What are the main responsibilities and benefit...,[0],The Sales Leader - Healthcare Veterans at Spot...,"All job-specific responsibilities, benefits, a..."
1,What qualifications are required to apply for ...,[2],The position requires five years of collection...,All requirements are explicitly listed in the ...
2,What benefits are offered for the Collections ...,[2],Benefits include vacation hours (40 after 6 mo...,Benefits are directly described in chunk 2.
3,Is a degree required to teach English abroad w...,"[3, 11]","Yes, a university degree (Bachelor's) is requi...",Both chunks mention the university degree as a...
4,Which job postings mention background checks a...,"[7, 21, 37]",The Handyman & Independent Contractors (Home P...,All referenced chunks contain mention of backg...


In [25]:
# Persist the evaluation set next to the notebook, without the DataFrame index column.
# NOTE(review): the list-valued chunk_ids column is presumably written as its string form (e.g. "[3, 11]") -- parse it when reading back.
evals_df.to_csv("evals.csv",index=False)