# Imports

In [22]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct

import pandas as pd
import openai

# Load Dataset

In [23]:
# Read the job-postings CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("../data/fraud/fake_job_postings.csv")
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [24]:
def format_data(df):
    """Return a formatted copy of the job-postings frame.

    The 0/1 flag columns ``telecommuting``, ``has_company_logo``,
    ``has_questions`` and ``fraudulent`` are mapped to ``False``/``True`` and
    every remaining missing value is replaced by the string
    ``'Not Available'``.

    Args:
        df: DataFrame containing at least the four flag columns named above.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    mapping = {0: False, 1: True}

    # Work on a copy: assigning the mapped columns directly would otherwise
    # overwrite the caller's frame before fillna() produces a new object.
    df = df.copy()
    for flag_column in ('telecommuting', 'has_company_logo', 'has_questions'):
        df[flag_column] = df[flag_column].map(mapping)

    df = df.fillna('Not Available')

    # 'fraudulent' is mapped after the fill, exactly as before, so values that
    # are not 0/1 at this point (including a filled placeholder) become NaN.
    df['fraudulent'] = df['fraudulent'].map(mapping)
    return df


def preprocess_data(row):
    """Build the text blob for one job posting.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) exposing ``get``.
            The keys read are ``title``, ``company_profile``, ``job_id``,
            ``description``, ``requirements``, ``location`` and
            ``telecommuting``.

    Returns:
        A multi-line string combining those fields. ``company_profile`` is
        truncated to 300 characters. Missing values (NaN/None) in any text
        field are rendered as an empty string rather than the literal 'nan'.
    """
    label_map = {0: "No", 1: "Yes"}

    def _as_text(value):
        # Strings pass through unchanged; missing values become ''; anything
        # else is stringified the same way the f-string below would do it.
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    # Only the company profile is length-capped.
    company_profile = _as_text(row.get('company_profile', ''))[:300]

    # Every other field gets the same missing-value handling.
    title = _as_text(row.get('title', ''))
    job_id = _as_text(row.get('job_id', ''))
    description = _as_text(row.get('description', ''))
    requirements = _as_text(row.get('requirements', ''))
    location = _as_text(row.get('location', ''))

    # True/False compare and hash equal to 1/0, so boolean flags resolve
    # through the same table; anything else falls back to 'No'.
    telecommuting = label_map.get(row.get('telecommuting', 0), 'No')

    # The continuation lines of this template carry four leading spaces into
    # the resulting text; that layout is kept as-is.
    text = f"""{title} Company: {company_profile}
    Job Id: {job_id}
    Description: {description}
    Requirements: {requirements}
    Location: {location}
    Telecommuting: {telecommuting}
    """
    return text

# Normalise flags / missing values, then attach the per-row text blob as a
# new "text" column. NOTE(review): `df` is rebound to its formatted version,
# so running this cell a second time feeds already-formatted data back into
# format_data.
df = format_data(df)
df = df.assign(text=df.apply(preprocess_data, axis=1))
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
0,1,Marketing Intern,"US, NY, New York",Marketing,Not Available,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Not Available,False,True,False,Other,Internship,Not Available,Not Available,Marketing,False,"Marketing Intern Company: We're Food52, and we..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Not Available,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,False,True,False,Full-time,Not Applicable,Not Available,Marketing and Advertising,Customer Service,False,Customer Service - Cloud Video Production Comp...
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Not Available,Not Available,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Not Available,False,True,False,Not Available,Not Available,Not Available,Not Available,Not Available,False,Commissioning Machinery Assistant (CMA) Compan...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,Not Available,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,False,True,False,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,False,Account Executive - Washington DC Company: Our...
4,5,Bill Review Manager,"US, FL, Fort Worth",Not Available,Not Available,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,False,True,True,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,False,Bill Review Manager Company: SpotSource Soluti...


# Create new collection for hybrid search

In [9]:
# Client bound to the Qdrant endpoint at http://localhost:6333; the cells
# below reach Qdrant through this name.
qdrant_client = QdrantClient(url="http://localhost:6333")

In [25]:
# Create the collection only when it is missing, so that re-running the
# notebook against a Qdrant instance that already holds it does not fail.
# The vector size (1536) has to equal the length of the vectors returned by
# get_embedding -- presumably the default size of the embedding model used
# there; confirm if the model is changed.
if not qdrant_client.collection_exists("job-postings-collection-hybrid-search"):
    qdrant_client.create_collection(
        collection_name="job-postings-collection-hybrid-search",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )

# Last expression of the cell: shows whether the collection is now available.
qdrant_client.collection_exists("job-postings-collection-hybrid-search")

True

In [26]:
# Create a full-text (PayloadSchemaType.TEXT) index on the payload field "text".
# retrieve_data filters on this same field with MatchText.

qdrant_client.create_payload_index(
    collection_name="job-postings-collection-hybrid-search",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [27]:
# Draw a 100-row sample with a fixed random_state so the same rows are picked
# on every run. Embedding and payload construction happen in the cells below.

df_sample = df.sample(n=100, random_state=123)
df_sample.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
11391,11392,House and Office Cleaners / Housekeepers FT/PT,"US, GA, Atlanta",Not Available,Not Available,Hedge your bets - work with the best domestic ...,Do more of the work you love and earn more wit...,Requirements:* Must have own supplies and reli...,Pay is $15/hr (add tips and you make even more...,False,True,True,Other,Not Applicable,Not Available,Facilities Services,Not Available,False,House and Office Cleaners / Housekeepers FT/PT...
10396,10397,Director of Software Engineering,"US, CA, San Mateo",Engineering,Not Available,#URL_ddb080358fa5eecf5a67c649cfb4ffc343c484389...,As Director of Software Engineering's newly fo...,Requirements: At least 10+ years in software ...,Our core values drive our culture. This is wha...,False,True,True,Full-time,Director,Master's Degree,Marketing and Advertising,Engineering,True,Director of Software Engineering Company: #URL...
5484,5485,Accounting Manager,"US, , Portland",Not Available,65000-80000,Human capital is usually the biggest asset and...,Who are client is…An innovator in solar techno...,"What you need to have….High integrity, ethics,...",Not Available,False,True,True,Full-time,Mid-Senior level,Bachelor's Degree,Accounting,Accounting/Auditing,False,Accounting Manager Company: Human capital is u...
16446,16447,Community Support Manager (CSM),"US, SC, Fort Mill",Not Available,Not Available,We Help Create Communities that Withstand the ...,Job Title: Community Support ManagerGe...,Job RequirementsAbility to listenGood written ...,Not Available,False,True,False,Full-time,Entry level,High School or equivalent,Not Available,Customer Service,False,Community Support Manager (CSM) Company: We He...
4144,4145,Sr. Systems Developer,"US, NY, New York",Not Available,Not Available,"We design, deploy, and maintain advanced techn...",We are looking for highly skilled developer to...,Practical Knowledge &amp; Experience:Strong re...,"At Fivesky, our employees are our greatest ass...",False,True,True,Not Available,Not Available,Not Available,Not Available,Not Available,False,"Sr. Systems Developer Company: We design, depl..."


In [28]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single string through the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a one-element batch.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first (and only) item in the
        response's ``data`` list.
    """
    embedding_response = openai.embeddings.create(model=model, input=[text])
    first_item = embedding_response.data[0]
    return first_item.embedding

In [29]:
# Turn the sampled rows into a list of plain dicts (one per posting) holding
# the text to embed plus the metadata columns. "has_company_logo" is selected
# here but is not copied into the payloads built in the visible cells below.
data_to_embed = df_sample[["text", "job_id", "department","function", "has_company_logo", "employment_type", "salary_range", "fraudulent"]].to_dict(orient="records")
data_to_embed[:2]

[{'text': 'House and Office Cleaners / Housekeepers FT/PT Company: Hedge your bets - work with the best domestic and commercial cleaning provider in Atlanta.\n    Job Id: 11392\n    Description: Do more of the work you love and earn more with us. Best Bets Atlanta is looking for dedicated independent cleaning professionals who:&gt; Must be reliable and able to work independently&gt; Have some existing cleaning experience, although we can train if needed&gt; Take pride in and are dedicated to their work\xa0&gt; Have high standards of cleanlinessNo resume required to apply. Apply with your name, email, and phone number, and a member of our team will get in touch to complete the application process with you.Work in any (or all) metro Atlanta counties we cover, including Cherokee, Gwinnett, N. Fulton, Coweta, Cobb, Hall, Fulton (including City of Atlanta), Clayton, Douglas, DeKalb, Forsyth, Fayette, Paulding, Rockdale, Henry, S. Fulton.About Best Bets:Hedge Your Bets - work with the best d

In [30]:
# One point per sampled record: the whole text is embedded in a single call
# and the point id is the record's position in data_to_embed.
payload_keys = (
    "text",
    "job_id",
    "department",
    "function",
    "employment_type",
    "salary_range",
    "fraudulent",
)

pointstructs = []
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data["text"])
    # Copy the selected fields, in the order listed above, into the payload.
    payload = {key: data[key] for key in payload_keys}
    pointstructs.append(PointStruct(id=i, vector=embedding, payload=payload))

In [None]:
def chunk_text(text, max_tokens=800):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    This is a naive whitespace splitter: "tokens" here are whitespace-separated
    words, not model tokens (replace with tiktoken or your preferred tokenizer
    if exact token budgets matter).

    Args:
        text: The string to split. Runs of whitespace are collapsed.
        max_tokens: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in original order; empty when ``text`` holds
        no words.

    Raises:
        ValueError: If ``max_tokens`` is not positive. Without this check a
            non-positive value would never consume any words and loop forever.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    words = text.split()
    # Step through the word list once instead of repeatedly re-slicing the tail.
    return [
        ' '.join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

# Embed every record chunk by chunk and append one point per chunk to the
# existing `pointstructs` list. Re-running this cell appends the chunks again.
for i, data in enumerate(data_to_embed):
    text_chunks = chunk_text(data["text"], max_tokens=800)  # Adjust max_tokens as needed
    for j, chunk in enumerate(text_chunks):
        embedding = get_embedding(chunk)
        pointstructs.append(
            PointStruct(
                # Qdrant point ids must be unsigned integers or UUIDs, so an
                # "i_j" string is not a valid id. The next free list position
                # is used instead; it is unique as long as the ids already in
                # `pointstructs` are 0..len-1 (true for the cell that creates
                # the list).
                id=len(pointstructs),
                vector=embedding,
                payload={
                    "text": chunk,
                    "job_id": data["job_id"],
                    "department": data["department"],
                    "function": data["function"],
                    "employment_type": data["employment_type"],
                    "salary_range": data["salary_range"],
                    "fraudulent": data["fraudulent"],
                    "parent_id": i,  # Optional: link back to original record
                    "chunk_index": j  # Position of this chunk within its record
                }
            )
        )

In [31]:
# Write the embedded points (vectors plus payloads) to the Qdrant collection.
# wait=True is passed so the call returns the final status of the operation.

qdrant_client.upsert(
    collection_name="job-postings-collection-hybrid-search",
    wait=True,
    points=pointstructs
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

# Hybrid search - Rank Fusion

In [10]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5, prefetch_limit=20):
    """Run a hybrid (vector + text-filter) search fused with RRF.

    Two candidate lists are prefetched from the
    "job-postings-collection-hybrid-search" collection and merged with
    reciprocal rank fusion:

    1. nearest neighbours of the query embedding;
    2. points whose "text" payload passes a ``MatchText`` filter on the raw
       query string.

    NOTE(review): depending on how MatchText treats multi-word input, a long
    natural-language question may match few or no points in the second
    branch -- confirm against the Qdrant filtering docs.

    Args:
        query: The user's search string.
        k: Number of fused results to return.
        prefetch_limit: Candidates fetched per branch before fusion. It is
            raised to ``k`` when smaller, so a large ``k`` is not starved by
            the previously hard-coded limit of 20.

    Returns:
        The object returned by ``qdrant_client.query_points``; its ``points``
        carry ``id``, ``payload`` and ``score``.
    """
    query_embedding = get_embedding(query)

    # Never prefetch fewer candidates per branch than the caller asked for.
    candidate_limit = max(prefetch_limit, k)

    results = qdrant_client.query_points(
        collection_name="job-postings-collection-hybrid-search",
        prefetch=[
            # Dense branch: similarity search on the query embedding.
            Prefetch(
                query=query_embedding,
                limit=candidate_limit
            ),
            # Text branch: no query vector, only a payload filter on "text".
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(text=query)
                        )
                    ]
                ),
                limit=candidate_limit
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

# Sandbox

In [33]:
import instructor
from pydantic import BaseModel
from openai import OpenAI
from typing import List

In [34]:
# Structured-output schema passed as response_model below: a single string
# field holding the answer. (Documented with a comment rather than a class
# docstring so the model definition itself stays unchanged.)
class RAGGenerationResponse(BaseModel):
    answer: str

# OpenAI client wrapped by instructor; OpenAI() is built without explicit
# arguments -- presumably credentials come from the environment; confirm.
client = instructor.from_openai(OpenAI())

# Smoke-test prompt for the sandbox.
prompt = """
You are a helpful fraud analyst.
Return an answer to the question.
Question: What is your name?
"""

# create_with_completion yields a pair, unpacked here as the parsed
# RAGGenerationResponse and the raw completion object.
response, raw_response = client.chat.completions.create_with_completion(
    model="gpt-4.1",
    response_model=RAGGenerationResponse,
    messages=[{"role": "user", "content": prompt}],
    temperature=0.5,
)

In [35]:
# Display the parsed structured response.
response

RAGGenerationResponse(answer='I am an AI fraud analyst assistant and do not have a personal name. You can refer to me as your fraud analysis assistant.')

In [1]:
import instructor
from pydantic import BaseModel
from openai import OpenAI 

In [None]:
# NOTE(review): this cell repeats the earlier sandbox cell -- it redefines
# RAGGenerationResponse, client, prompt, response and raw_response, differing
# only in the single-line prompt wording. Whichever cell runs last wins.
class RAGGenerationResponse(BaseModel):
    answer: str

client = instructor.from_openai(OpenAI())

prompt ="""You are a helpful fraud analyst. Return answer to the question. Question: What is your name?"""

# Returns the parsed model together with the raw completion.
response, raw_response = client.chat.completions.create_with_completion(
    model="gpt-4.1",
    response_model=RAGGenerationResponse,
    messages=[{"role":"user", "content":prompt}],
    temperature=0.5
)

In [None]:
# Raw completion returned alongside the parsed model; its repr includes
# metadata such as the tool call, model name and token usage.
raw_response

ChatCompletion(id='chatcmpl-Bsq8sIoMTholTH9qIbjuvrrTRPEY6', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_ZuNgyfCKndgzCDb9t4qMGq3b', function=Function(arguments='{"answer":"I am an AI fraud analyst and do not have a personal name."}', name='RAGGenerationResponse'), type='function')]))], created=1752409846, model='gpt-4.1-2025-04-14', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=18, prompt_tokens=91, total_tokens=109, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [20]:
# Run the hybrid retrieval for a sample question (default k=5 results).
results = retrieve_data("Show me post for House and Office Cleaners?")

In [21]:
# Unpack the returned points into three parallel lists: point ids, the stored
# "text" payloads and the fused scores.
retrieved_context_ids = [point.id for point in results.points]
retrieved_context = [point.payload['text'] for point in results.points]
similarity_scores = [point.score for point in results.points]

# Last expression of the cell: show everything that was retrieved.
{
    "retrieved_context_ids": retrieved_context_ids,
    "retrieved_context": retrieved_context,
    "similarity_scores": similarity_scores
}

{'retrieved_context_ids': [0, 38, 55, 77, 92],
 'retrieved_context': ['House and Office Cleaners / Housekeepers FT/PT Company: Hedge your bets - work with the best domestic and commercial cleaning provider in Atlanta.\n    Description: Do more of the work you love and earn more with us. Best Bets Atlanta is looking for dedicated independent cleaning professionals who:&gt; Must be reliable and able to work independently&gt; Have some existing cleaning experience, although we can train if needed&gt; Take pride in and are dedicated to their work\xa0&gt; Have high standards of cleanlinessNo resume required to apply. Apply with your name, email, and phone number, and a member of our team will get in touch to complete the application process with you.Work in any (or all) metro Atlanta counties we cover, including Cherokee, Gwinnett, N. Fulton, Coweta, Cobb, Hall, Fulton (including City of Atlanta), Clayton, Douglas, DeKalb, Forsyth, Fayette, Paulding, Rockdale, Henry, S. Fulton.About Best Be