In [5]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, MatchAny, FieldCondition, Filter, Prefetch, FusionQuery

import pandas as pd
import openai
import json
import tiktoken
import re
import string
from nltk.stem import PorterStemmer

In [2]:
df = pd.read_csv("../data/fraud/fake_job_postings.csv")

df_sample = df.sample(n=50, random_state=123)

## Define function for preprocess job descriptions

In [3]:

def preprocess_data(row):
    text = f"""{row['title']} Company: {row['company_profile']} 
    Requirements: {row['requirements']}
    Benefits: {row['benefits']} 
    Education: {row['required_education']} 
    Salary: {row['salary_range']} 
    """
    return text
    
df_sample['text'] = df_sample.apply(preprocess_data,axis=1)
df_sample.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
11391,11392,House and Office Cleaners / Housekeepers FT/PT,"US, GA, Atlanta",,,Hedge your bets - work with the best domestic ...,Do more of the work you love and earn more wit...,Requirements:* Must have own supplies and reli...,Pay is $15/hr (add tips and you make even more...,0,1,1,Other,Not Applicable,,Facilities Services,,0,House and Office Cleaners / Housekeepers FT/PT...
10396,10397,Director of Software Engineering,"US, CA, San Mateo",Engineering,,#URL_ddb080358fa5eecf5a67c649cfb4ffc343c484389...,As Director of Software Engineering's newly fo...,Requirements: At least 10+ years in software ...,Our core values drive our culture. This is wha...,0,1,1,Full-time,Director,Master's Degree,Marketing and Advertising,Engineering,1,Director of Software Engineering Company: #URL...
5484,5485,Accounting Manager,"US, , Portland",,65000-80000,Human capital is usually the biggest asset and...,Who are client is…An innovator in solar techno...,"What you need to have….High integrity, ethics,...",,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Accounting,Accounting/Auditing,0,Accounting Manager Company: Human capital is u...
16446,16447,Community Support Manager (CSM),"US, SC, Fort Mill",,,We Help Create Communities that Withstand the ...,Job Title: Community Support ManagerGe...,Job RequirementsAbility to listenGood written ...,,0,1,0,Full-time,Entry level,High School or equivalent,,Customer Service,0,Community Support Manager (CSM) Company: We He...
4144,4145,Sr. Systems Developer,"US, NY, New York",,,"We design, deploy, and maintain advanced techn...",We are looking for highly skilled developer to...,Practical Knowledge &amp; Experience:Strong re...,"At Fivesky, our employees are our greatest ass...",0,1,1,,,,,,0,"Sr. Systems Developer Company: We design, depl..."


## Define function for ML feature preprocessing

In [6]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',str(text))

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', str(text))

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',str(text))

def remove_punctuation(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def final_preprocess(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    text = ' '.join(e for e in text.split() if e.lower() not in stopwords)
    text = text.lower()
    ps = PorterStemmer()
    text = ps.stem(text)
    return text

def feature_preprocessing(text):
    text = remove_URL(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punctuation(text)
    text = final_preprocess(text)
    return text

# Get embeddings

In [7]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [8]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]
    
    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1
    
    return all_embeddings

In [9]:
qdrant_client = QdrantClient(url="http://localhost:6333")

# Hybrid Search

In [10]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

In [28]:


def retrieve_data(query, k=5):

    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="job-postings-collection-hybrid-search",
        prefetch=[
            Prefetch(
                query=query_embedding,
                limit=20
            ),
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(text=query)
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

In [29]:
result = retrieve_data("data scientist")

In [30]:
result.points

[ScoredPoint(id=37, version=2, score=0.75, payload={'text': 'Data Scientist Engineer Company: Mashape is the largest, most trafficked API marketplace/hub in the world.Mashape is a revenue generating startup powering thousands of APIs (both private and public) and Applications - it’s used in almost 100 countries by thousands of developers and adopted in every major industry including finance,\n    Job Id: 11054\n    Description: Mashape is seeking a talented and proven analytics expert to help investigate our data,\xa0do research and analysis to help the company make data-driven decisions in moving the product forward at the speed of light.\xa0You will work cross-functionally,\xa0closely with engineers, growth hacker and our designer to help devise appropriate measurements and metrics, design randomized controlled experiments, and provide data-driven feedback to support rapid product iterations. You will also use our large distributed datasets to conduct analysis, working on projects su

In [15]:
texts = []

for data in result.points:
    texts.append(data.payload["text"])

texts

["Data Scientist Company: Zaius is a digital intuition platform enabling companies to optimize consumer experiences across channels. \xa0We're poised to make a huge dent in a $20B industry. \xa0Zaius is led by veterans from successful software companies including Endeca, Netezza, and LogMeIn and backed by top-tier venture capital \nDescription: Zaius is a startup building a SaaS product to give consumer companies and brands the digital intuition they need to better understand and engage their customers across any channel - to the benefit of the consumer.At Zaius we work on hard technical problems: world wide event collection, real-time processing, machine learning, big data analytics and data visualization, all at scale. We do this to build a product that our customers love and that benefits real people. We differentiate not just on our offering, but on the strength of our technology.Simply put, we solve interesting problems using the best technology alongside great people. Zaius is le

# Model Inference Function

In [25]:
import pickle
import spacy
import nltk
from scipy.sparse import hstack
from minio import Minio

In [17]:
minio_client = Minio(
    endpoint = "localhost:9000",
    access_key= "minioadmin",
    secret_key= "minioadmin",
    secure=False
)

response = minio_client.get_object('fraud-classifier', 'count_vectorizer.pkl')
obj_bytes = response.read()

COUNT_VECTORIZER = pickle.loads(obj_bytes)

response = minio_client.get_object('fraud-classifier', 'model.pkl')
obj_bytes = response.read()

MODEL = pickle.loads(obj_bytes)

In [21]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [23]:
def get_prediction(text:str):
    """Classifies if a given job posting is real or fake.
    Uses the top retrieved context from get_formatted_context as input
    and return classification_result.

    Args:
        text: Input string of the job posting from retrieved_context from get_formatted_context
    
    Returns:
        A dictionary of the classification label and prediction probability score.
    """
    print("[DEBUG] get_prediction received job description:", text)

    if not text:
        return {"classification_result": {"error": "Missing job description."}}

    import joblib

    # clf_model = joblib.load("fraud_clf_decision_tree.pkl")
    # vectorizer = joblib.load("countvectorizer.pkl")


    text = feature_preprocessing(text)

    doc = nlp(text)

    X_text, X_pos = ' '.join([token.lemma_ for token in doc]), ' '.join([token.pos_ for token in doc])

    X_text_transform = COUNT_VECTORIZER.transform([X_text])

    X_pos_transform = COUNT_VECTORIZER.transform([X_pos])

    X_in = hstack([X_text_transform, X_pos_transform])

    proba = MODEL.predict_proba(X_in)[0][1]
    is_fraud = proba > 0.5

    return {
        "classification_result": {
            "is_fraud": is_fraud,
            "probability": round(proba, 3),
        }
    }

In [26]:
pred = get_prediction(texts[0])

[DEBUG] get_prediction received job description: Data Scientist Company: Zaius is a digital intuition platform enabling companies to optimize consumer experiences across channels.  We're poised to make a huge dent in a $20B industry.  Zaius is led by veterans from successful software companies including Endeca, Netezza, and LogMeIn and backed by top-tier venture capital 
Description: Zaius is a startup building a SaaS product to give consumer companies and brands the digital intuition they need to better understand and engage their customers across any channel - to the benefit of the consumer.At Zaius we work on hard technical problems: world wide event collection, real-time processing, machine learning, big data analytics and data visualization, all at scale. We do this to build a product that our customers love and that benefits real people. We differentiate not just on our offering, but on the strength of our technology.Simply put, we solve interesting problems using the best techno

In [27]:
pred

{'classification_result': {'is_fraud': np.False_,
  'probability': np.float64(0.124)}}