In [None]:
import json
import re
import string

import nltk
import openai
import pandas as pd
import tiktoken
from nltk.stem import PorterStemmer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, MatchAny, FieldCondition, Filter, Prefetch, FusionQuery

In [3]:
# Tunable inputs for this notebook, kept together so a reader can change the
# dataset location, sample size, or seed without hunting through the code.
DATA_PATH = "../data/fraud/fake_job_postings.csv"
SAMPLE_SIZE = 50
RANDOM_SEED = 123

# Full job-postings dataset.
df = pd.read_csv(DATA_PATH)

# Fixed seed so the same rows are drawn on every Restart & Run All.
df_sample = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

## Define a function to preprocess job descriptions

In [4]:

def preprocess_data(row):
    """Combine selected fields of one job posting into a single text blob.

    Args:
        row: Any object indexable by the string keys ``'title'``,
            ``'company_profile'``, ``'requirements'``, ``'benefits'``,
            ``'required_education'`` and ``'salary_range'`` (the notebook
            passes DataFrame rows via ``apply(..., axis=1)``).

    Returns:
        str: A multi-line string with one labelled field per line. Values are
        interpolated with default ``str`` formatting, so a missing value
        (NaN) appears as the literal text ``nan``.
    """
    # The first line carries the title and company profile; the remaining
    # lines keep the four-space indentation and trailing spaces of the
    # original template so the produced text is unchanged.
    first_line = f"{row['title']} Company: {row['company_profile']} "
    remaining_lines = [
        f"    Requirements: {row['requirements']}",
        f"    Benefits: {row['benefits']} ",
        f"    Education: {row['required_education']} ",
        f"    Salary: {row['salary_range']} ",
        "    ",
    ]
    return "\n".join([first_line, *remaining_lines])
    
# Build the combined text for every sampled posting (one call per row),
# then store it as a new 'text' column on the sample.
row_texts = df_sample.apply(preprocess_data, axis=1)
df_sample['text'] = row_texts

# Preview the first rows, including the new column.
df_sample.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
11391,11392,House and Office Cleaners / Housekeepers FT/PT,"US, GA, Atlanta",,,Hedge your bets - work with the best domestic ...,Do more of the work you love and earn more wit...,Requirements:* Must have own supplies and reli...,Pay is $15/hr (add tips and you make even more...,0,1,1,Other,Not Applicable,,Facilities Services,,0,House and Office Cleaners / Housekeepers FT/PT...
10396,10397,Director of Software Engineering,"US, CA, San Mateo",Engineering,,#URL_ddb080358fa5eecf5a67c649cfb4ffc343c484389...,As Director of Software Engineering's newly fo...,Requirements: At least 10+ years in software ...,Our core values drive our culture. This is wha...,0,1,1,Full-time,Director,Master's Degree,Marketing and Advertising,Engineering,1,Director of Software Engineering Company: #URL...
5484,5485,Accounting Manager,"US, , Portland",,65000-80000,Human capital is usually the biggest asset and...,Who are client is…An innovator in solar techno...,"What you need to have….High integrity, ethics,...",,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Accounting,Accounting/Auditing,0,Accounting Manager Company: Human capital is u...
16446,16447,Community Support Manager (CSM),"US, SC, Fort Mill",,,We Help Create Communities that Withstand the ...,Job Title: Community Support ManagerGe...,Job RequirementsAbility to listenGood written ...,,0,1,0,Full-time,Entry level,High School or equivalent,,Customer Service,0,Community Support Manager (CSM) Company: We He...
4144,4145,Sr. Systems Developer,"US, NY, New York",,,"We design, deploy, and maintain advanced techn...",We are looking for highly skilled developer to...,Practical Knowledge &amp; Experience:Strong re...,"At Fivesky, our employees are our greatest ass...",0,1,1,,,,,,0,"Sr. Systems Developer Company: We design, depl..."


## Define functions for ML feature preprocessing

In [None]:
def remove_URL(text):
    """Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.

    Args:
        text: Input value; it is converted with ``str()`` before matching.

    Returns:
        str: The text with every matched URL removed.
    """
    as_string = str(text)
    return re.sub(r'https?://\S+|www\.\S+', r'', as_string)

def remove_emoji(text):
    """Delete emoji and pictographic symbols from ``text``.

    The character class lists explicit symbol/emoji blocks. It deliberately
    does NOT use one wide ``U+24C2``-``U+1F251`` range: that span covers
    most of the Basic Multilingual Plane above U+24C2 and would also delete
    CJK ideographs, kana, Hangul and other ordinary text.

    Args:
        text: Input value; it is converted with ``str()`` before matching.

    Returns:
        str: The text with all matched characters removed.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
        "]+"
    )
    return emoji_pattern.sub(r'', str(text))

def remove_html(text):
    """Delete HTML-style tags from ``text``.

    A tag is the shortest run from ``<`` to the next ``>`` on the same line
    (``.`` does not match newlines here).

    Args:
        text: Input value; it is converted with ``str()`` before matching.

    Returns:
        str: The text with every matched tag removed.
    """
    as_string = str(text)
    return re.sub(r'<.*?>', r'', as_string)

def remove_punctuation(text):
    """Delete every ASCII punctuation character from ``text``.

    The characters removed are exactly those in ``string.punctuation``.

    Args:
        text: Input value; it is converted with ``str()`` first, matching
            the other ``remove_*`` helpers, so non-string values do not raise.

    Returns:
        str: The text with punctuation characters removed.
    """
    # Third argument of maketrans lists characters to delete.
    table = str.maketrans('', '', string.punctuation)
    return str(text).translate(table)

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Substitutions are applied one after another in the order listed, so the
    two irregular forms are handled before the general suffix rules.

    Args:
        phrase (str): Text that may contain contractions written with a
            plain ``'`` apostrophe.

    Returns:
        str: The text with the listed contractions expanded.
    """
    substitutions = (
        # irregular forms first
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general suffix rules
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

# Stop-word sets keyed by language, filled on first use so the NLTK corpus
# file is read once instead of on every call.
_STOPWORDS_BY_LANGUAGE = {}


def _load_stopwords(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(
            nltk.corpus.stopwords.words(language)
        )
    return _STOPWORDS_BY_LANGUAGE[language]


def final_preprocess(text):
    """Normalize text into lower-case, stop-word-free, stemmed tokens.

    Steps, in order:
      1. Replace the literal two-character sequences backslash-r,
         backslash-quote and backslash-n with a space.
      2. Collapse every run of non-alphanumeric characters into one space.
      3. Lower-case and drop English stop words.
      4. Porter-stem each remaining token individually.

    Args:
        text: Input value; it is converted with ``str()`` first.

    Returns:
        str: The processed tokens joined by single spaces.

    Note:
        Requires the NLTK ``stopwords`` corpus to be available locally
        (e.g. installed via ``nltk.download('stopwords')``).
    """
    text = str(text)
    # These are escaped sequences stored as text, not real control characters.
    for escaped in ('\\r', '\\"', '\\n'):
        text = text.replace(escaped, ' ')
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

    stop_words = _load_stopwords('english')
    tokens = [tok for tok in text.lower().split() if tok not in stop_words]

    # Stem token by token: the stemmer works on single words, so passing the
    # whole sentence would treat it as one long word.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(tok) for tok in tokens)

def feature_preprocessing(text):
    """Run the full cleaning pipeline on ``text``.

    Applies, in this order: URL removal, emoji removal, HTML tag removal,
    punctuation removal, and the final normalization step.

    Args:
        text: Raw input text.

    Returns:
        The value returned by the last step, ``final_preprocess``.
    """
    steps = (
        remove_URL,
        remove_emoji,
        remove_html,
        remove_punctuation,
        final_preprocess,
    )
    for step in steps:
        text = step(text)
    return text