In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct

import pandas as pd
import openai
import language_tool_python
from textblob import TextBlob
import spacy

import instructor
from pydantic import BaseModel
from openai import OpenAI 

from collections import defaultdict
import re

In [3]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

In [4]:
qdrant_client = QdrantClient(url="http://localhost:6333")

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [6]:
SUSPICIOUS_KEYWORDS = [
    "urgent",
    "guaranteed",
    "no experience",
    "quick hire",
    "work from home",
    "100% remote",
    "easy money",
    "immediate start",
    "customer service",
    "home",
    "high school",
    "telecommuting: yes",
    "not available",
    "data entry"
]

In [7]:
ENTITY_LABELS = {
    "ORG",      # Company names
    "GPE",      # Locations
    "LOC",      # Additional location info
    # "PERSON",   # Recruiter/contact name
    "DATE",     # Posting dealine etc.
    "MONEY",    # Salary info
    "PRODUCT",  # Tools, software products
    "LANGUAGE", # Programming/spoken language
    "NORP",     # Nationalities, religios, political groups
    "WORK_OF_ART"  # May hold job titles or certifications
}

In [8]:
CUSTOM_PATTERNS = [
    # Emails (simple pattern)
    {"label": "EMAIL", "pattern": [{"LIKE_EMAIL": True}]},

    # URLs (simple regex match using pattern of tokens starting with https?:// or www)
    {"label": "URL", "pattern": [{"TEXT": {"REGEX": r"https?://[^\s]+"}}]},
    {"label": "URL", "pattern": [{"TEXT": {"REGEX": r"www\.[^\s]+"}}]},

    # Phone numbers (simple patterns for digits and separators)
    {"label": "PHONE_NUMBER", "pattern": [
        {"TEXT": {"REGEX": r"\+?\d{1,3}"}}, {"TEXT": {"REGEX": r"[-.\s]?"}}, 
        {"TEXT": {"REGEX": r"\(?\d{1,4}\)?"}}, {"TEXT": {"REGEX": r"[-.\s]?"}}, 
        {"TEXT": {"REGEX": r"\d{1,4}"}}, {"TEXT": {"REGEX": r"[-.\s]?"}}, 
        {"TEXT": {"REGEX": r"\d{1,9}"}}
    ]},

    # Skills (examples, add or extend this list)
    {"label": "SKILL", "pattern": "python"},
    {"label": "SKILL", "pattern": "excel"},
    {"label": "SKILL", "pattern": "project management"},
    {"label": "SKILL", "pattern": "c++"},
    {"label": "SKILL", "pattern": "javascript"},

    # Certifications (examples)
    {"label": "CERTIFICATION", "pattern": "PMP"},
    {"label": "CERTIFICATION", "pattern": "AWS Certified"},
    {"label": "CERTIFICATION", "pattern": "Cisco Certified Network Associate"},

    # Job Titles (examples)
    {"label": "JOB_TITLE", "pattern": "Data Analyst"},
    {"label": "JOB_TITLE", "pattern": "Sales Manager"},
    {"label": "JOB_TITLE", "pattern": "Software Engineer"},
    {"label": "JOB_TITLE", "pattern": "Project Manager"}
]

In [9]:
# Add EntityRuler after the built-in NER pipe
ruler = nlp.add_pipe("entity_ruler", after="ner")
ruler.add_patterns(CUSTOM_PATTERNS)

In [14]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_context(query, top_k=5):
    query_embedding = get_embedding(query)

    qdrant_client = QdrantClient(url="http://localhost:6333")

    results = qdrant_client.query_points(
        collection_name="job-postings-collection-hybrid-search",
        prefetch=[
            Prefetch(
                query=query_embedding,
                limit=10
            ),
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(text=query)
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=top_k
    )

    retrieved_context_ids = []
    retrieved_context = []
    similarity_scores = []

    for result in results.points:
        retrieved_context_ids.append(result.payload['job_id'])
        retrieved_context.append(result.payload['text'])
        similarity_scores.append(result.score)

    top_job_text = retrieved_context[0]
    # print("==== RETRIEVED JOB POSTING=====")
    # print(top_job_text)

    return {
        "retrieved_context_ids": retrieved_context_ids,
        "retrieved_context": retrieved_context,
        "similarity_scores": similarity_scores,
        "retrieved_job_posting": top_job_text
    }


def detect_suspicious_phrases(text: str) -> str:
    text_lower = text.lower()
    flagged = [kw for kw in SUSPICIOUS_KEYWORDS if kw in text_lower]
    # return ", ".join(flagged)
    return flagged


def check_misspellings(text: str) -> dict:
    # tool = language_tool_python.LanguageTool('en-US')
    # matches = tool.check(text)

    # # Count number of grammar/spelling issues
    # num_errors = len(matches)
    # # Optionally categorize types of issues
    # error_types = set(match.ruleId for match in matches)

    # Spelling mistakes using TextBlob
    blob = TextBlob(text)
    misspelled = [word for word in blob.words if not word.correct() == word]
    return misspelled

def process_suspicious_phrases(context):
    """
    Detect suspicious keywords or phrases commonly found in fake job postings
    """

    formatted_context = ""

    for id, chunk in zip(context["retrieved_context_ids"], context["retrieved_context"]):
        processed_chunk = chunk.lower()
        flagged = [kw for kw in SUSPICIOUS_KEYWORDS if kw in processed_chunk]
        flagged_str = ", ".join(flagged)
        formatted_context += f"- {id}: {flagged_str}\n"
    return formatted_context





def format_job_postings_analysis(context):
    md = ""

    for id, chunk in zip(context["retrieved_context_ids"], context["retrieved_context"]):
        chunk_misspellings = check_misspellings(chunk)
        chunk_suspicious_keywords = detect_suspicious_phrases(chunk)

        md += f" - {id} \n"
        md += f"""Misspelled words: {chunk_misspellings}
            """ 
        md += "---"
    return md




def extract_entities_job_posting(text: str) -> dict:
    doc = nlp(text.lower())
    group_entities = defaultdict(set)

    # include both spacy stanard relevant entities and custom ones
    for ent in doc.ents:
        label = ent.label_
        if label in ENTITY_LABELS or label in {p['label'] for p in CUSTOM_PATTERNS}:
            group_entities[label].add(ent.text)
        
    # Convert sets to sorted lists for json serialization
    grouped_entities = {label: sorted(list(ents)) for label, ents in group_entities.items()}
    total_entities = sum(len(ents) for ents in grouped_entities.values())

    return {
        "total_entities_detected": total_entities,
        "entities_by_label": grouped_entities
    }


def format_context_entities(context):
    md = ""

    for id, chunk in zip(context["retrieved_context_ids"], context["retrieved_context"]):
        chunk_entities = extract_entities_job_posting(chunk)

        md += f" - {id} \n"
        md += f"total entities: {chunk_entities["total_entities_detected"]} \n"
        md += f"entities by label: \n"
        for label, entities in chunk_entities["entities_by_label"].items():
            md += f"- {label}: {", ".join(entities)}\n"
        md += "--- \n"
    return md




# def get_suspicious_keywords(query: str, top_k: int = 5) -> str:

#     """Get the top k context, each representing an job posting for a given query.
    
#     Args:
#         query: The query to get the top k context for
#         top_k: The number of context chunks to retrieve, works best with 5 or more
    
#     Returns:
#         A string with IDs and its list of suspicious keywords or phrases commonly found in fake job postings
#     """

#     context = retrieve_context(query, top_k)
#     # print("[DEBUG] retrieved job posting: ", context["retrieved_job_posting"])
#     # formatted_context = process_suspicious_phrases(context)
#     formatted_context = format_context_entities(context)

#     print("[DEBUG] Formatted context output:", context)

#     return formatted_context
#     # return context


def get_context_entities(query: str, top_k: int = 5) -> str:

    """Get the top k context, each representing an job posting for a given query.
    
    Args:
        query: The query to get the top k context for
        top_k: The number of context chunks to retrieve, works best with 5 or more
    
    Returns:
        A string with IDs and its list of suspicious keywords or phrases commonly found in fake job postings
    """

    context = retrieve_context(query, top_k)
    # print("[DEBUG] retrieved job posting: ", context["retrieved_job_posting"])
    # formatted_context = process_suspicious_phrases(context)
    formatted_entities = format_context_entities(context)

    print("[DEBUG] Formatted context output:", context)

    return formatted_entities
    # return context

In [15]:
results = get_context_entities("job_id 17552", 5)

[DEBUG] Formatted context output: {'retrieved_context_ids': [13277, 17552, 16439, 8236, 8478], 'retrieved_context': ['Linux System Administrator Company: Not Available\n    Job Id: 13277\n    Description: UnoTelly is a DNS &amp; VPN service that allows users to access geo-restricted content no matter where they are in the world!We are seeking a dynamic network administrator to join our growing team in our new downtown Toronto office location. Do you derive satisfaction from seeing your work go live and help real customers from around the world? We ship and ship often! We are looking for a motivated network administrator who enjoys working on the many aspects of our expanding software products. Your responsibility will be to manage our server infrastructure to ensure 99.999% uptime. You will also dabble between playing around with the Unix/Linux distros, optimizing TCP/IP stack and some fun coding.We offer a challenging yet nurturing work environment where everyone is given the opportun

In [16]:
import pprint as pp

pp.pprint(results)

(' - 13277 \n'
 'total entities: 10 \n'
 'entities by label: \n'
 '- DATE: 13277, minimum 5 years\n'
 '- GPE: ntp, toronto\n'
 '- ORG: 2 &amp, acls, administrator?2, experienceipv4/ipv6, skills &amp\n'
 '- SKILL: python\n'
 '--- \n'
 ' - 17552 \n'
 'total entities: 1 \n'
 'entities by label: \n'
 '- DATE: 17552\n'
 '--- \n'
 ' - 16439 \n'
 'total entities: 3 \n'
 'entities by label: \n'
 '- GPE: detroit\n'
 '- ORG: mia\xa0sr.\n'
 '- DATE: 5+ years\n'
 '--- \n'
 ' - 8236 \n'
 'total entities: 10 \n'
 'entities by label: \n'
 '- GPE: canada, emea, india, usa\n'
 '- DATE: 1+ year, 2+ years, 30 days, 5-8 years\n'
 '- ORG: 3rd party systems, integration &amp\n'
 '--- \n'
 ' - 8478 \n'
 'total entities: 10 \n'
 'entities by label: \n'
 '- MONEY: '
 '#url_ddb080358fa5eecf5a67c649cfb4ffc343c484389f1bbaf2a1cb071e3f2b6e7e#\n'
 '- ORG: qa automationquality\n'
 '- GPE: san mateo, technologies•\n'
 '- DATE: 3 years, 3-5 years, 5 years\n'
 '- SKILL: javascript, python\n'
 '- PRODUCT: qualifications: