# RAG Preprocessing

In [1]:
import pandas as pd
import openai
import os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

In [2]:
# 1. Set up OpenAI and Qdrant clients
openai.api_key = os.getenv("OPENAI_API_KEY")
qdrant = QdrantClient("http://localhost:6333")  # Adjust if needed

In [40]:
# 2. Load the dataset
df = pd.read_csv("../data/fraud/fake_job_postings.csv")
# df.rename(columns={'EVENT_ID':'job_id', 'EVENT_LABEL': 'fraudulent'}, inplace=True)
df = df.fillna('')
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [41]:
# 3. Prepare the text to embed (e.g., title + description)
# df['text'] = df['title'].fillna('') + ' ' + df['description'].fillna('')

# df_sample = df.sample(n=100, random_state=123)

def preprocess_data(row):
    text = f"""{row['title']} Company: {row['company_profile']} 
    Requirements: {row['requirements']}
    Benefits: {row['benefits']} 
    Education: {row['required_education']} 
    Salary: {row['salary_range']} 
    """
    return text
    
df['text'] = df.apply(preprocess_data,axis=1)

In [42]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"Marketing Intern Company: We're Food52, and we..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,Customer Service - Cloud Video Production Comp...
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,Commissioning Machinery Assistant (CMA) Compan...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,Account Executive - Washington DC Company: Our...
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,Bill Review Manager Company: SpotSource Soluti...


In [43]:
# 4. Generate OpenAI embeddings in batches
def get_embeddings(texts, model="text-embedding-3-small"):
    # OpenAI API can take batches up to 2048 inputs, but smaller batches are safer
    batch_size = 500
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        # Each response.data[i].embedding is a list of floats
        batch_embeddings = [item.embedding for item in response.data]
        embeddings.extend(batch_embeddings)
    return embeddings

embeddings = get_embeddings(df['text'].tolist(), model="text-embedding-3-small")

In [44]:

# 5. Create the Qdrant collection (embedding size for text-embedding-3-small is 1536)
collection_name = "job_posting_collection_01"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=1536,
        distance=Distance.COSINE,
    ),
)


  qdrant.recreate_collection(


True

In [46]:


# 6. Prepare metadata (payload)
payloads = df[['job_id', 'title', 'company_profile', 'location','fraudulent']].to_dict(orient='records')

# 7. Upsert to Qdrant in batches
batch_size = 1000
for i in range(0, len(df), batch_size):
    batch_ids = list(range(i, min(i + batch_size, len(df))))
    batch_vectors = embeddings[i:i+batch_size]
    batch_payloads = payloads[i:i+batch_size]
    points = [
        PointStruct(
            id=idx,
            vector=vector,
            payload=payload
        )
        for idx, vector, payload in zip(batch_ids, batch_vectors, batch_payloads)
    ]
    qdrant.upsert(collection_name=collection_name, points=points)

In [33]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [34]:
def retrieve_data(query):
    query_embedding = get_embedding(query)
    results = qdrant.query_points(
        collection_name="job_postings",
        query=query_embedding,
        limit=10,
    )
    return results

In [None]:
retrieve_data("What are the requirements of a Data Analyst?").points

[ScoredPoint(id=4802, version=4, score=0.7047929, payload={'job_id': 1168, 'title': 'Data Analyst', 'company_profile': '', 'location': 'CA, ON, Toronto', 'fraudulent': 0}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11658, version=11, score=0.66488993, payload={'job_id': 9255, 'title': 'Business Analyst', 'company_profile': "The only thing we love more than\xa0our data\xa0is our team.We're a group of developers, designers, mathematicians, data scientists, researchers and marketeers that work relentlessly to measure online behavior worldwide and to generate marketing insights.Together, we are shaping the future of web measurement and competitive intelligence.", 'location': 'IL, TA, Tel Aviv', 'fraudulent': 0}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=13714, version=13, score=0.64069873, payload={'job_id': 9010, 'title': 'Data Analyst (Career Development)', 'company_profile': '', 'location': 'US, MA, Waltham', 'fraudulent': 0}, vector=None, shar

In [37]:
retrieve_data("What are some technology job postings?").points

[ScoredPoint(id=5410, version=5, score=0.58260953, payload={'job_id': 8321, 'title': 'Director of Technology', 'company_profile': 'Orange County Tech Recruiters is based in Orange County and focuses on finding full time technical talent for clients in California with particular emphasis on OC, LA, SF and San Diego. \xa0We are a small focused team and each of us have 20+ years in technology and technology recruiting.\xa0 Please see our LinkedIn profiles and recommendations below:\xa0Robert Ardell - #URL_b6d48964c0b9b0074359c7a207623fd2360c853a418da3bfa1c6c18f79a1a6f6#Jake Villarreal - #URL_8bfb8c2cfcf91aeb1253da2a322b5617a4f92bd358eb0badeefafa67450370af#', 'location': 'US, CA, Irvine', 'fraudulent': 0}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11778, version=11, score=0.5547285, payload={'job_id': 6032, 'title': 'Software Engineer - C#.Net HTML CSS Java XML PHP C#- NV', 'company_profile': 'We Provide Full Time Permanent Positions for many medium to large US compan

In [38]:
retrieve_data("Examples of a fake job posting?").points

[ScoredPoint(id=1101, version=1, score=0.5198959, payload={'job_id': 12701, 'title': 'Awesome Fake Job', 'company_profile': 'Focus Lab, LLC is a boutique brand and interactive design company headquartered in the heart of Savannah, Ga. Our clients—ranging from innovative West Coast startups to established global firms—come to us for clean, intentional design work, customized content management builds and consulting, and a great overall customer experience. We have an integrated, talented team of designers and developers with a penchant for web nerdery, Superman t-shirts, and old school Gameboy. More at #URL_8cf0af10d3b6d6b96ca3cab7993e43c3b759467d96c24660e54ecdf60f92cc74#', 'location': 'US, GA, Savannah', 'fraudulent': 0}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11977, version=11, score=0.45144224, payload={'job_id': 12339, 'title': 'Open Job Application', 'company_profile': 'Massive Media\xa0is the social media company behind the successful digital brands\xa0#UR