# RAG Preprocessing

In [47]:
import pandas as pd
import openai
import os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

In [67]:
# 1. Set up OpenAI and Qdrant clients
openai.api_key = os.getenv("OPENAI_API_KEY")
qdrant = QdrantClient("http://0.0.0.0:6333")  # Adjust if needed

In [49]:
# 2. Load the dataset
df = pd.read_csv("../data/fraud/fake_job_postings.csv")
# df.rename(columns={'EVENT_ID':'job_id', 'EVENT_LABEL': 'fraudulent'}, inplace=True)
df = df.fillna('')
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [50]:
# 3. Prepare the text to embed (e.g., title + description)
# df['text'] = df['title'].fillna('') + ' ' + df['description'].fillna('')

# df_sample = df.sample(n=100, random_state=123)

def preprocess_data(row):
    text = f"""{row['title']} Company: {row['company_profile']} 
    Requirements: {row['requirements']}
    Benefits: {row['benefits']} 
    Education: {row['required_education']} 
    Salary: {row['salary_range']} 
    """
    return text
    
df['text'] = df.apply(preprocess_data,axis=1)

In [42]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"Marketing Intern Company: We're Food52, and we..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,Customer Service - Cloud Video Production Comp...
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,Commissioning Machinery Assistant (CMA) Compan...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,Account Executive - Washington DC Company: Our...
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,Bill Review Manager Company: SpotSource Soluti...


In [51]:
# 4. Generate OpenAI embeddings in batches
def get_embeddings(texts, model="text-embedding-3-small"):
    # OpenAI API can take batches up to 2048 inputs, but smaller batches are safer
    batch_size = 500
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        # Each response.data[i].embedding is a list of floats
        batch_embeddings = [item.embedding for item in response.data]
        embeddings.extend(batch_embeddings)
    return embeddings

embeddings = get_embeddings(df['text'].tolist(), model="text-embedding-3-small")

In [68]:

# 5. Create the Qdrant collection (embedding size for text-embedding-3-small is 1536)
collection_name = "job-postings-collection-01"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=1536,
        distance=Distance.COSINE,
    ),
)


  qdrant.recreate_collection(


True

In [60]:
# 6. Prepare metadata (payload)
payloads = df[['text']].to_dict(orient='records')
payloads[0:10]

[{'text': "Marketing Intern Company: We're Food52, and we've created a groundbreaking and award-winning cooking site. We support, connect, and celebrate home cooks, and give them everything they need in one place.We have a top editorial, business, and engineering team. We're focused on using technology to find new and better ways to connect people around their specific food interests, and to offer them superb, highly curated information about food and cooking. We attract the most talented home cooks and contributors in the country; we also publish well-known professionals like Mario Batali, Gwyneth Paltrow, and Danny Meyer. And we have partnerships with Whole Foods Market and Random House.Food52 has been named the best food website by the James Beard Foundation and IACP, and has been featured in the New York Times, NPR, Pando Daily, TechCrunch, and on the Today Show.We're located in Chelsea, in New York City. \n    Requirements: Experience with content management systems a major plus (

In [71]:

# 7. Upsert to Qdrant in batches
batch_size = 100
for i in range(0, len(df), batch_size):
    batch_ids = list(range(i, min(i + batch_size, len(df))))
    batch_vectors = embeddings[i:i+batch_size]
    batch_payloads = payloads[i:i+batch_size]
    points = [
        PointStruct(
            id=idx,
            vector=vector,
            payload=payload
        )
        for idx, vector, payload in zip(batch_ids, batch_vectors, batch_payloads)
    ]
    qdrant.upsert(collection_name=collection_name, points=points)

In [33]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [75]:
def retrieve_data(query):
    query_embedding = get_embedding(query)
    results = qdrant.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=10,
    )
    return results

In [76]:
retrieve_data("What are the requirements of a Data Analyst?").points

[ScoredPoint(id=1167, version=11, score=0.7039708, payload={'text': 'Data Analyst Company:  \n    Requirements: \n    Benefits:  \n    Education:  \n    Salary:  \n    '}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9621, version=96, score=0.6721088, payload={'text': 'Data Analyst Company: Maxnet offers Staff Augmentation Solutions for Big Data Analytics in Retail, Healthcare and Regulatory &amp; Compliance.We offer Staff Augmentation solutions for Business Intelligence, Data Analyst, Business Analyst, Quality Analyst, .NET Application Developers.We serve you with Reliability, Honesty &amp; Integrity.Our Recruiters are trained to suggest “best fits” for our client needs.Our solutions carry in-depth understanding of Client Needs with regards to Skills, Knowledge, Experience, Corporate Attitude.Our Consultant Database is strong and built with the right insights to serve quick and accurate solutions. \n    Requirements: MUST HAVE SKILLS:Know how to map DataPerform Gap 

In [77]:
retrieve_data("What are some technology job postings?").points

[ScoredPoint(id=8320, version=83, score=0.58725035, payload={'text': "Director of Technology Company: Orange County Tech Recruiters is based in Orange County and focuses on finding full time technical talent for clients in California with particular emphasis on OC, LA, SF and San Diego. \xa0We are a small focused team and each of us have 20+ years in technology and technology recruiting.\xa0 Please see our LinkedIn profiles and recommendations below:\xa0Robert Ardell - #URL_b6d48964c0b9b0074359c7a207623fd2360c853a418da3bfa1c6c18f79a1a6f6#Jake Villarreal - #URL_8bfb8c2cfcf91aeb1253da2a322b5617a4f92bd358eb0badeefafa67450370af# \n    Requirements: Qualification requirements include:Familiarity in the Following Areas:C/C++ or Java Software DevelopmentPython, Perl, and PHP DevelopmentPersistent Client/Server and Database-Driven ApplicationsCritical Software Development8+ years Technical Leadership/Management Experience5+ years Hands-On Software Development Experience5+ years Hands-On System

In [78]:
retrieve_data("Examples of a fake job posting?").points

[ScoredPoint(id=12700, version=127, score=0.5275068, payload={'text': 'Awesome Fake Job Company: Focus Lab, LLC is a boutique brand and interactive design company headquartered in the heart of Savannah, Ga. Our clients—ranging from innovative West Coast startups to established global firms—come to us for clean, intentional design work, customized content management builds and consulting, and a great overall customer experience. We have an integrated, talented team of designers and developers with a penchant for web nerdery, Superman t-shirts, and old school Gameboy. More at #URL_8cf0af10d3b6d6b96ca3cab7993e43c3b759467d96c24660e54ecdf60f92cc74# \n    Requirements: Must communicate well with othersMust appreciate\xa0fine cinema such as Monty Python and the Holy GrailFluent in English and\xa0ParseltongueExperience with computer science, dentistry, and\xa0fountain pen repair is a plus\n    Benefits: Full medical, dental, and vision401k plus company matchProfit sharingOnly work 1 hour per w