In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import openai
import pandas as pd

In [85]:

qdrant_client = QdrantClient(url="http://localhost:6333")

qdrant_client.create_collection(
    collection_name="job-postings-collection-02",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [76]:
df = pd.read_csv("../data/fraud/fake_job_postings.csv")

In [36]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [77]:
def preprocess_data(row):
    label_map = {0: "No", 1: "Yes"}
    
    # Safely get and truncate company_profile (handle NaN and non-string)
    company_profile = row.get('company_profile', '')
    if not isinstance(company_profile, str):
        company_profile = '' if pd.isna(company_profile) else str(company_profile)
    company_profile = company_profile[:300]
    
    # Safely get other fields
    title = row.get('title', '')
    description = row.get('description', '')
    requirements = row.get('requirements', '')
    location = row.get('location', '')
    
    # Handle telecommuting and fraudulent fields
    telecommuting = label_map.get(row.get('telecommuting', 0), 'No')
    
    text = f"""{title} Company: {company_profile}
Description: {description}
Requirements: {requirements}
Location: {location}
Telecommuting: {telecommuting}
"""
    return text

df['text'] = df.apply(preprocess_data, axis=1)


In [78]:
df_sample = df.sample(n=50, random_state=124)

In [79]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [80]:

data_to_embed = df_sample["text"].tolist()
pointstructs = []
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data)
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={"text": data},
        )
    )
     

In [86]:
qdrant_client.upsert(
    collection_name="job-postings-collection-02",
    wait=True,
    points=pointstructs,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [81]:
import json

output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "Suggested question.",
            },
            "chunk_ids": {
                "type": "array",
                "items": {
                    "type": "integer",
                    "description": "Index of the chunk that could be used to answer the question.",
                },
            },
            "answer_example": {
                "type": "string",
                "description": "Suggested answer grounded in the context.",
            },
            "reasoning": {
                "type": "string",
                "description": "Reasoning why the question could be answered with the chunks.",
            },
        },
    },
}


SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of 50 chunks of text.
The RAG application will act as a job network assistant that can examine whether a job posting is fake but can also answer questions about the job posting itself.
I will provide all of the available postings to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that cannot be answered with the available chunks.


{json.dumps(output_schema, indent=2)}


I need to be able to parse the json output.
"""

USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{[{"id": i, "text": data} for i, data in enumerate(data_to_embed)]}
"""

In [82]:
response = openai.chat.completions.create(
    model="gpt-4.1",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT}
    ]
)

print(response.choices[0].message.content)

```json
[
  {
    "question": "What are the key duties and benefits offered for the Sales Representative position at Westview Financial Services?",
    "chunk_ids": [0],
    "answer_example": "The key duties for the Sales Representative position include selling loans and insurance products, reviewing credit applications, closing and servicing loans, collecting delinquent accounts, handling administrative tasks, and more. Benefits include a competitive salary, comprehensive benefits package, paid leave, health and life insurance, and a 401K plan with company match.",
    "reasoning": "Chunk 0 contains a detailed list of duties and benefits for this specific position."
  },
  {
    "question": "What qualifications and technical skills are required for the Data Scientist position at Zaius?",
    "chunk_ids": [1],
    "answer_example": "The Data Scientist at Zaius requires a Master's degree or PhD in a technical field, 3+ years of experience, experience in algorithm design for scale or lat

In [68]:
import json

json_output = response.choices[0].message.content
json_output = json_output.replace("```json", "")
json_output = json_output.replace("```", "")
json_output = json_output.replace("// Five unanswerable questions below", "")
json_output = json.loads(json_output)

In [83]:

from langsmith import Client
import os

client = Client(api_key=os.environ["LANGSMITH_API_KEY"])

dataset_name = "rag-evaluation-job-posting-dataset"
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Dataset for evaluating RAG pipeline"
)

In [87]:

for item in json_output:
    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": item["question"]},
        outputs={
            "ground_truth": item["answer_example"],
            "context_ids": item["chunk_ids"],
            "contexts": [qdrant_client.retrieve(collection_name="job-postings-collection-02", ids=[id], with_payload=True)[0].payload["text"] for id in item["chunk_ids"]]
        }
    )

In [73]:
evals_df = pd.DataFrame(json_output)

In [74]:
evals_df.head(10)

Unnamed: 0,question,chunk_ids,answer_example,reasoning
0,What are some typical qualifications required ...,[1],A Data Scientist at Zaius is expected to have ...,"Chunk 1 describes the role, company, and quali..."
1,Are there any job postings that offer paid vac...,[0],Westview Financial Services offers 40 vacation...,Chunk 0 lists the benefits package including v...
2,What makes a restaurant management job potenti...,[12],The posting for Restaurant Manager at Le Merid...,"Chunk 12 has 'Fraudulent: Yes', directly ident..."
3,Which jobs explicitly state that telecommuting...,"[27, 37]",The Civil Construction Coordinator at Landmark...,Chunks 27 and 37 are among the few listings wi...
4,What types of experience are required for a se...,[6],A Senior Ruby on Rails Developer should have e...,Chunk 6 is specific about the technology stack...
5,Which job posting is for a role supporting a r...,[29],The Internal Recruiter role at The Beans Group...,Chunk 29 describes the Beans Group's growth an...
6,Are there any positions that require a Series ...,[28],The Financial Advisor position at Covestor req...,Chunk 28 lists these licenses as requirements.
7,Which sales roles provide a base salary plus c...,"[8, 22]",Sales Agents at the digital marketing agency (...,Both chunks elaborate on compensation structur...
8,Are there job postings that mention candidate ...,[9],Middle East Recruitment for Data-Network Engin...,Chunk 9 directly describes travel cost reimbur...
9,What are the main duties for an Office Assista...,[15],Main duties include supporting instructional s...,Chunk 15 specifies these responsibilities in d...


In [75]:
evals_df.to_csv("evals.csv",index=False)