# Import Necessary Libraries and Load Dataset

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [None]:
import torch
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import batch_to_device, cos_sim
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from tqdm.auto import tqdm
import pandas as pd
import random

In [None]:
# Download the 'punkt_tab' resource — presumably the tokenizer data that
# nltk.word_tokenize (used by denoise_text) needs at runtime.
import nltk
nltk.download('punkt_tab')

# Load JobBERT-v2 Model

In [None]:
# Load the pretrained JobBERT-v2 checkpoint by its Hugging Face Hub id.
# The resulting `model` is what the encode helpers and the evaluator in later
# cells receive.
model = SentenceTransformer("TechWolf/JobBERT-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

In [None]:
def encode_batch(jobbert_model, texts):
    """Embed one batch of strings and return the result as a NumPy array.

    The texts are tokenized by the model, moved to the model's device, tagged
    with ``text_keys = ["anchor"]`` and passed through ``forward`` with
    gradient tracking disabled. The ``"sentence_embedding"`` entry of the
    output is then copied to the CPU and converted to NumPy.
    """
    batch = batch_to_device(jobbert_model.tokenize(texts), jobbert_model.device)
    # NOTE(review): presumably selects the "anchor" input branch of the model —
    # confirm against the JobBERT-v2 model card.
    batch["text_keys"] = ["anchor"]
    with torch.no_grad():
        sentence_vectors = jobbert_model.forward(batch)["sentence_embedding"]
    return sentence_vectors.cpu().numpy()

In [None]:
def encode(jobbert_model, texts, batch_size: int = 8):
    """Embed ``texts`` in length-sorted batches and return rows in input order.

    Texts are visited from shortest to longest (by character count) so that
    each batch groups strings of similar length. Every batch is embedded with
    ``encode_batch``; the stacked result is then scattered back so that row
    ``i`` of the returned array belongs to ``texts[i]``.
    """
    # Permutation that visits the texts from shortest to longest.
    length_order = np.argsort([len(entry) for entry in texts])

    chunks = []
    for start in tqdm(range(0, len(length_order), batch_size)):
        batch_texts = [texts[pos] for pos in length_order[start:start + batch_size]]
        chunks.append(encode_batch(jobbert_model, batch_texts))

    stacked = np.concatenate(chunks)

    # Undo the sort: the k-th stacked row came from position length_order[k].
    restored = np.empty_like(stacked)
    restored[length_order] = stacked
    return restored

# Job Corpus

In [None]:
# Load the job-postings corpus from a hardcoded Colab/Drive path.
# NOTE(review): the trailing space in the "TF-IDF " folder name is part of the
# path string — confirm it matches the real folder name.
job_titles = pd.read_csv("/content/drive/MyDrive/(Priority) Job Recommendation References/TF-IDF /combined_jobs_2000.csv")
job_titles

Unnamed: 0,Job.ID,text,Title
0,303691,dishwasher act retirement life communities abi...,Dishwasher @ ACTS Retirement- Life Communities
1,289262,prep cook bob evans corporate canfield part ti...,Prep Cook @ Bob Evans Corporate
2,289585,part time full time automotive customer servic...,Part Time & Full-Time Automotive Customer Serv...
3,266468,dietary cook villages rehabilitation nurse cen...,Dietary Cook @ The Villages Rehabilitation and...
4,309743,restaurant shift leader baker hourly crew memb...,Restaurant Shift Leader - Baker - Hourly Crew ...
...,...,...,...
1995,462,sushi chef chin sushi bar restaurant san franc...,Sushi Chef @ Chin's Sushi Bar & Restaurant
1996,307322,sales associate kitchen collection allen part ...,Sales Associate @ Kitchen Collection
1997,266915,baker bakery mgm resort international las vega...,Baker - Bakery (PT) @ MGM Resorts International
1998,313646,cook american golf corporation mirada part tim...,Cook @ American Golf Corporation


# Denoising Text

In [None]:
def denoise_text(text, method='a', del_ratio=0.6, word_freq_dict=None, freq_threshold=100):
    """Return a randomly corrupted copy of ``text``.

    The text is split with ``word_tokenize`` and re-joined with
    ``TreebankWordDetokenizer``. If tokenization yields no tokens, the input
    is returned unchanged.

    Methods:
        'a': drop each token independently with probability ``del_ratio``
             (at least one token is always kept).
        'b': among tokens whose lower-cased form has a count above
             ``freq_threshold`` in ``word_freq_dict``, remove a random
             ``del_ratio`` share (rounded down).
        'c': same as 'b', then shuffle the surviving tokens.

    Raises:
        ValueError: if ``method`` is not 'a', 'b' or 'c', or if 'b'/'c' is
            requested without ``word_freq_dict``.
    """
    tokens = word_tokenize(text)
    token_count = len(tokens)
    if token_count == 0:
        return text

    if method == 'a':
        keep_mask = np.random.rand(token_count) > del_ratio
        if not keep_mask.any():
            # Everything was dropped: rescue one random token.
            keep_mask[np.random.choice(token_count)] = True
        survivors = [tok for tok, keep in zip(tokens, keep_mask) if keep]

    elif method in ('b', 'c'):
        if word_freq_dict is None:
            raise ValueError("word_freq_dict is required for method 'b' or 'c'")
        frequent_positions = [
            pos for pos, tok in enumerate(tokens)
            if word_freq_dict.get(tok.lower(), 0) > freq_threshold
        ]
        if frequent_positions:
            drop_count = int(del_ratio * len(frequent_positions))
            dropped = set(random.sample(frequent_positions, drop_count))
        else:
            dropped = set()
        survivors = [tok for pos, tok in enumerate(tokens) if pos not in dropped]
        if method == 'c':
            # Plain shuffle of what is left; no pair-aware ordering.
            random.shuffle(survivors)

    else:
        raise ValueError("Unknown denoising method. Use 'a', 'b', or 'c'.")

    return TreebankWordDetokenizer().detokenize(survivors)


In [None]:
# Create a noisy version of each job description.
# denoise_text draws from both numpy's global RNG and the stdlib `random`
# module, so seed both here; otherwise the noisy texts (and every embedding and
# cluster derived from them) change on each Restart & Run All.
NOISE_SEED = 42
np.random.seed(NOISE_SEED)
random.seed(NOISE_SEED)
job_titles['noisy_text'] = job_titles['text'].fillna("").apply(denoise_text)

In [None]:
# === Step 2: Collect the clean and noisy texts as plain Python lists ===
# (The actual encoding with the pretrained JobBERT model happens in a later cell.)
clean_texts = job_titles['text'].fillna("").tolist()
noisy_texts = job_titles['noisy_text'].tolist()

# AutoEncoders (Embeddings)

In [None]:
# Use existing `encode` function already defined in your code
clean_embeddings = encode(model, clean_texts)
noisy_embeddings = encode(model, noisy_texts)

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
# === Step 3: Combine embeddings by averaging ===
# The element-wise mean of the clean and noisy embeddings is used as the job vector.
# NOTE(review): this is not TSDAE training — none of the visible cells define a
# decoder or a reconstruction loss, and encode_batch runs under torch.no_grad(),
# so no weights are updated. The "tsdae" name is kept because later cells use it.
tsdae_embeddings = (clean_embeddings + noisy_embeddings) / 2.0

In [None]:
# === Step 4: Store the TSDAE-style embeddings ===
job_titles['jobbert_tsdae_embedding'] = tsdae_embeddings.tolist()

In [None]:
job_titles

Unnamed: 0,Job.ID,text,Title,noisy_text,jobbert_tsdae_embedding
0,303691,dishwasher act retirement life communities abi...,Dishwasher @ ACTS Retirement- Life Communities,dishwasher retirement life abington join team ...,"[0.04412940889596939, 0.02191561833024025, -0...."
1,289262,prep cook bob evans corporate canfield part ti...,Prep Cook @ Bob Evans Corporate,prep bob corporate canfield part evans hire co...,"[0.05274323746562004, -0.1667470782995224, 0.0..."
2,289585,part time full time automotive customer servic...,Part Time & Full-Time Automotive Customer Serv...,time full automotive customer service internet...,"[0.009837589226663113, -0.05399450659751892, 0..."
3,266468,dietary cook villages rehabilitation nurse cen...,Dietary Cook @ The Villages Rehabilitation and...,dietary rehabilitation nurse lady part time ca...,"[-0.02399255521595478, -0.12295804917812347, -..."
4,309743,restaurant shift leader baker hourly crew memb...,Restaurant Shift Leader - Baker - Hourly Crew ...,leader member service full run dunkin career r...,"[0.014206374995410442, -0.004099982790648937, ..."
...,...,...,...,...,...
1995,462,sushi chef chin sushi bar restaurant san franc...,Sushi Chef @ Chin's Sushi Bar & Restaurant,sushi sushi francisco time inner sushi look su...,"[-0.05698367953300476, -0.026045650243759155, ..."
1996,307322,sales associate kitchen collection allen part ...,Sales Associate @ Kitchen Collection,associate collection provide consumer entertai...,"[-0.016873717308044434, -0.06819058954715729, ..."
1997,266915,baker bakery mgm resort international las vega...,Baker - Bakery (PT) @ MGM Resorts International,resort las vegas time properly place maintain ...,"[-0.04671752452850342, -0.034556955099105835, ..."
1998,313646,cook american golf corporation mirada part tim...,Cook @ American Golf Corporation,cook part time prepare specifications maintain...,"[0.04764906316995621, -0.03198384493589401, -0..."


In [None]:
# Calculate cosine similarity matrix
similarities = cos_sim(tsdae_embeddings, tsdae_embeddings)
print(similarities)

tensor([[1.0000, 0.3902, 0.0804,  ..., 0.4676, 0.4896, 0.3968],
        [0.3902, 1.0000, 0.2515,  ..., 0.6315, 0.6197, 0.7236],
        [0.0804, 0.2515, 1.0000,  ..., 0.0940, 0.1614, 0.2948],
        ...,
        [0.4676, 0.6315, 0.0940,  ..., 1.0000, 0.6453, 0.4876],
        [0.4896, 0.6197, 0.1614,  ..., 0.6453, 1.0000, 0.4877],
        [0.3968, 0.7236, 0.2948,  ..., 0.4876, 0.4877, 1.0000]])


# Clustering TSDAE Embeddings (Job2Vec)

In [None]:
# === KMeans Clustering ===
num_clusters = 20
embedding_matrix = np.vstack(job_titles['jobbert_tsdae_embedding'].values)

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(embedding_matrix)


In [None]:
cluster_labels = kmeans.labels_

In [None]:
# === Create clustered DataFrame ===
# One row per job posting, in the same row order as job_titles (and therefore as
# embedding_matrix, which was stacked from a job_titles column), so cluster_labels
# lines up row by row.
# 'Job.ID' falls back to 0..n-1 and 'Title' to None when job_titles lacks those
# columns; 'text' has no fallback and must exist.
# 'original_index' stores job_titles' index labels; a later cell uses them as row
# positions into embedding_matrix (assumes the default 0..n-1 index — verify).
df_clustered_jobbert = pd.DataFrame({
    'Job.ID': job_titles['Job.ID'].values if 'Job.ID' in job_titles.columns else range(len(job_titles)),
    'Title': job_titles['Title'].values if 'Title' in job_titles.columns else [None]*len(job_titles),
    'text': job_titles['text'].values,
    'cluster': cluster_labels,
    'original_index': job_titles.index
})

In [None]:
# === View count per cluster ===
print(df_clustered_jobbert['cluster'].value_counts())

cluster
9     241
10    174
18    155
8     152
17    120
2     118
16    116
19    103
7     102
5      97
4      81
6      78
3      76
13     74
14     72
1      60
11     59
15     57
12     48
0      17
Name: count, dtype: int64


In [None]:
user_corpus = pd.read_csv("/content/drive/MyDrive/(Priority) Job Recommendation References/user_applicant_jobs.csv")
user_corpus

Unnamed: 0,text
0,researcher human technology evolution nan data...
1,technology associate
2,java developer
3,data scientist japanese translator barista bar...
4,business development innovation consultant tec...
...,...
193,report developer saicon consultants inc atlanta
194,report developer saicon consultants inc atlanta
195,software engineer senior software development ...
196,nan developer


In [None]:
# === Encode the user/applicant texts ===
texts_user = user_corpus["text"].fillna("").tolist()
embeddings_user = encode(model, texts_user) # model is TechWolf/JobBERT; texts are encoded without noise, i.e. they do not go through the TSDAE-style averaging step

  0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
# Calculate cosine similarity matrix
similarities_user = cos_sim(embeddings_user, embeddings_user)
print(similarities_user)

tensor([[1.0000, 0.3377, 0.2515,  ..., 0.2265, 0.5163, 0.1762],
        [0.3377, 1.0000, 0.4377,  ..., 0.2740, 0.2128, 0.1892],
        [0.2515, 0.4377, 1.0000,  ..., 0.5718, 0.3725, 0.2837],
        ...,
        [0.2265, 0.2740, 0.5718,  ..., 1.0000, 0.3450, 0.2491],
        [0.5163, 0.2128, 0.3725,  ..., 0.3450, 1.0000, 0.1636],
        [0.1762, 0.1892, 0.2837,  ..., 0.2491, 0.1636, 1.0000]])


In [None]:
# === Add embedding to the dataframe ===
user_corpus['jobbert_embedding'] = embeddings_user.tolist()

In [None]:
user_corpus

Unnamed: 0,text,jobbert_embedding
0,researcher human technology evolution nan data...,"[0.13510161638259888, -0.025429964065551758, 0..."
1,technology associate,"[0.10252457857131958, -0.02839391492307186, 0...."
2,java developer,"[0.0219131987541914, -0.052545856684446335, 0...."
3,data scientist japanese translator barista bar...,"[-0.05992536246776581, -0.09382637590169907, -..."
4,business development innovation consultant tec...,"[0.05810216814279556, -0.05741456151008606, -0..."
...,...,...
193,report developer saicon consultants inc atlanta,"[-0.004331176169216633, -0.013917769305408001,..."
194,report developer saicon consultants inc atlanta,"[-0.00433118361979723, -0.013917817734181881, ..."
195,software engineer senior software development ...,"[-0.027365751564502716, -0.025424636900424957,..."
196,nan developer,"[-0.046097349375486374, -0.018703164532780647,..."


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

'''
# === Step 1: Stack job embeddings ===
embedding_matrix = np.vstack(job_titles['jobbert_embedding'])
'''

# === Step 2: Compute cluster centroids ===
# For each cluster, average the rows of embedding_matrix selected by the
# 'original_index' values of that cluster's jobs, then stack the per-cluster
# means into one 2-D array (one row per cluster).
# NOTE(review): nothing else in this cell reads cluster_centroids —
# get_top_n_local_search below uses kmeans.cluster_centers_ instead.
cluster_centroids = (
    df_clustered_jobbert
    .groupby("cluster")["original_index"]
    .apply(lambda idxs: np.mean(embedding_matrix[list(idxs)], axis=0))
    .tolist()
)
cluster_centroids = np.vstack(cluster_centroids)

def calculate_relative_threshold(similarities, percentile=75):
    """
    Return the value below which ``percentile`` percent of ``similarities`` fall.

    The result is used as a data-dependent relevance cut-off; the default is
    the 75th percentile.
    """
    scores = np.asanyarray(similarities)
    cutoff = np.percentile(scores, percentile)
    return cutoff

def evaluate_with_relative_threshold(similarities, top_n_indices, df_clustered_jobbert, threshold):
    """
    Collect the Job.IDs whose similarity reaches the threshold.

    ``top_n_indices`` and ``similarities`` are walked in parallel. Each index
    is used as a row *position* (``iloc``) in ``df_clustered_jobbert``, and the
    row's "Job.ID" is kept when the paired similarity is >= ``threshold``.
    Returns a set, so repeated IDs collapse.
    """
    return {
        df_clustered_jobbert.iloc[position]["Job.ID"]
        for position, score in zip(top_n_indices, similarities)
        if score >= threshold
    }

def get_top_n_local_search(embeddings_user, df_clustered_jobbert, embedding_matrix,
                           top_n_list=(3, 5, 10, 20), cluster_centers=None):
    """Recommend jobs for one user by searching only the nearest cluster.

    Args:
        embeddings_user: Embedding of a single user query, shape ``(1, dim)``
            (a 1-D vector of length ``dim`` is also accepted).
        df_clustered_jobbert: Job table with at least the columns ``Job.ID``,
            ``Title``, ``text`` and ``cluster``. Row position *i* must describe
            the same job as ``embedding_matrix[i]``.
        embedding_matrix: 2-D array of job embeddings, row-aligned with
            ``df_clustered_jobbert``.
        top_n_list: Cut-offs ``n`` for which a result frame is produced.
        cluster_centers: Optional ``(n_clusters, dim)`` array whose row number
            is the cluster id. Defaults to the notebook-level
            ``kmeans.cluster_centers_``.

    Returns:
        dict mapping ``"top_<n>"`` to a DataFrame of the ``n`` most similar
        jobs in the selected cluster (fewer if the cluster is smaller), sorted
        by descending cosine similarity, with columns ``Job.ID``, ``Title``,
        ``text``, ``cluster``, ``similarity`` and ``relevance_label``. The
        cut-off used for ``relevance_label`` (the default percentile of the
        similarities within the cluster) is stored in
        ``frame.attrs["relevance_threshold"]``.

    Raises:
        ValueError: if ``embeddings_user`` does not hold exactly one embedding,
            or if the selected cluster has no rows in ``df_clustered_jobbert``.
    """
    if cluster_centers is None:
        cluster_centers = kmeans.cluster_centers_

    query = np.asarray(embeddings_user)
    if query.ndim == 1:
        query = query.reshape(1, -1)
    if query.ndim != 2 or query.shape[0] != 1:
        # argmax over several queries at once would silently pick a flattened
        # index that is not a cluster id, so refuse instead.
        raise ValueError(
            f"embeddings_user must hold exactly one embedding, got shape {query.shape}"
        )

    # === Step 1: Find the closest cluster center to the user embedding ===
    cluster_similarities = cosine_similarity(query, cluster_centers)
    best_cluster_id = int(np.argmax(cluster_similarities[0]))

    # === Step 2: Filter jobs in the best cluster ===
    # Use row positions (not index labels) so the lookup into embedding_matrix
    # stays correct even if the frame does not carry a default 0..n-1 index.
    in_cluster = (df_clustered_jobbert['cluster'] == best_cluster_id).to_numpy()
    cluster_positions = np.flatnonzero(in_cluster)
    if cluster_positions.size == 0:
        raise ValueError(f"Cluster {best_cluster_id} contains no jobs.")
    cluster_embeddings = embedding_matrix[cluster_positions]

    # === Step 3: Compute similarity between user and job postings in the cluster ===
    similarities = cosine_similarity(query, cluster_embeddings).ravel()
    ranking = np.argsort(similarities)[::-1]  # cluster-local ranks, best first

    # === Step 4: Calculate relative similarity threshold ===
    threshold = calculate_relative_threshold(similarities)

    # === Step 5: Extract top-N matches and label them against the threshold ===
    output_columns = ['Job.ID', 'Title', 'text', 'cluster', 'similarity', 'relevance_label']
    results = {}
    for n in top_n_list:
        top_local = ranking[:n]
        # Map cluster-local ranks back to global row positions before selecting.
        top_n_df = df_clustered_jobbert.iloc[cluster_positions[top_local]].copy()
        top_n_df["similarity"] = similarities[top_local]
        top_n_df["relevance_label"] = np.where(
            top_n_df["similarity"] >= threshold, "relevant", "not relevant"
        )

        top_n_view = top_n_df[output_columns]
        # Expose the cut-off that produced the labels so callers can report it.
        top_n_view.attrs["relevance_threshold"] = float(threshold)
        results[f"top_{n}"] = top_n_view

    return results

In [None]:
    # === Step 4: Calculate relative similarity threshold ===
    # NOTE(review): when the notebook is run top to bottom, `similarities` at this
    # level is the job-to-job matrix from the earlier
    # cos_sim(tsdae_embeddings, tsdae_embeddings) cell, not the per-query scores that
    # get_top_n_local_search thresholds internally. The value printed here can
    # therefore differ from the cut-off behind the 'relevance_label' column.
    threshold = calculate_relative_threshold(similarities)
    print(f"\n[Threshold] Relative Similarity Threshold (75th percentile): {threshold:.4f}")


[Threshold] Relative Similarity Threshold (75th percentile): 0.5640


In [None]:
# === Step 1: Find user query text ===
query_text = "java developer"

# Find the matching row in user_corpus
user_q_row = user_corpus[user_corpus['text'].str.lower() == query_text.lower()]

# Safety check
if user_q_row.empty:
    raise ValueError(f"Text '{query_text}' not found in user_corpus.")

# === Step 2: Extract existing embedding from user_corpus ===
embeddings_user = np.array(user_q_row.iloc[0]['jobbert_embedding']).reshape(1, -1)

In [None]:
# Get Top-N recommendations for the current query using the local (nearest-cluster) search.
recommendations = get_top_n_local_search(embeddings_user, df_clustered_jobbert, embedding_matrix)

print(f"\nQuery: '{query_text}'")
for k, df in recommendations.items():
    # Report the threshold stored on the frame itself
    # (DataFrame.attrs['relevance_threshold']) if present; otherwise fall back to
    # the notebook-level `threshold`.
    shown_threshold = df.attrs.get("relevance_threshold", threshold)
    print(f"\n=== {k.upper()} (Threshold: {shown_threshold:.4f}) ===")
    print(df[['Job.ID', 'Title', 'cluster', 'similarity', 'relevance_label']])



Query: 'java developer'

=== TOP_3 (Threshold: 0.5640) ===
      Job.ID                                        Title  cluster  \
1544  305264  Sr. Java Developer @ Paladin Consulting Inc       19   
401   303112                   Java Developer @ TransHire       19   
589   294684                    Java Developer @ Kavaliro       19   

      similarity relevance_label  
1544    0.687320        relevant  
401     0.687112        relevant  
589     0.678584        relevant  

=== TOP_5 (Threshold: 0.5640) ===
      Job.ID                                        Title  cluster  \
1544  305264  Sr. Java Developer @ Paladin Consulting Inc       19   
401   303112                   Java Developer @ TransHire       19   
589   294684                    Java Developer @ Kavaliro       19   
904   146640  Jr. Java Developer @ Paladin Consulting Inc       19   
1414  309649             Senior C# Developer @ ConsultNet       19   

      similarity relevance_label  
1544    0.687320        rele

In [None]:
query_text

'java developer'

In [None]:
df

Unnamed: 0,Job.ID,Title,text,cluster,similarity,relevance_label
1544,305264,Sr. Java Developer @ Paladin Consulting Inc,java developer paladin consult inc dallas full...,19,0.68732,relevant
401,303112,Java Developer @ TransHire,java developer transhire fort lauderdale seaso...,19,0.687112,relevant
589,294684,Java Developer @ Kavaliro,java developer kavaliro bethesda seasonal temp...,19,0.678584,relevant
904,146640,Jr. Java Developer @ Paladin Consulting Inc,java developer paladin consult inc saint louis...,19,0.674415,relevant
1414,309649,Senior C# Developer @ ConsultNet,senior developer consultnet salt lake city ful...,19,0.588904,relevant
1200,243777,Java Developer @ Mindteck,java developer mindteck philadelphia full time...,19,0.578035,relevant
1630,269922,Entry Level Java Developer / Jr. Java Develope...,entry level java developer java developer cont...,19,0.57236,relevant
1673,311279,Senior Java Developer - Remote @ MR-MRI of Fre...,senior java developer remote mri fremont washi...,19,0.562658,relevant
1876,275893,Software Developer @ Lyons HR,software developer lyons huntsville seasonal t...,19,0.555562,relevant
1806,251696,Java Developer @ ConsultNet,java developer consultnet salt lake city full ...,19,0.552852,relevant


In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

# === Step 1: Extract top 10 recommendations ===
top_matches_10 = recommendations['top_10'].copy()
# Index labels of the recommended rows; used below as row positions into
# embedding_matrix (assumes the job frame has a default 0..n-1 index — verify).
top_indices = top_matches_10.index

# === Step 2: Stack user and job embeddings (JobBERT) ===
job_embeddings_top_10 = embedding_matrix[top_indices]
combined_vectors = np.vstack([embeddings_user, job_embeddings_top_10])  # (1 + 10, dim)

# === Step 3: Reduce to 3D using PCA ===
# PCA is fitted only on the query plus the recommended jobs, so the axes describe
# variation within this small set rather than the whole corpus.
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(combined_vectors)

# === Step 4: Create DataFrame for 3D plotting ===
# Row 0 is the query; the remaining rows follow the order of top_matches_10.
plot_df = pd.DataFrame(reduced_embeddings, columns=['x', 'y', 'z'])
plot_df['type'] = ['QUERY'] + ['JOB'] * len(top_matches_10)
plot_df['cluster'] = ['Query'] + top_matches_10['cluster'].astype(str).tolist()
plot_df['similarity'] = [1.0] + top_matches_10['similarity'].tolist()
plot_df['text'] = ['User Query'] + top_matches_10['text'].tolist()
plot_df['title'] = ['User Query'] + top_matches_10['Title'].tolist()

# Assign a distinct color to each point (loop colors if not enough)
# NOTE(review): this per-point 'color' column is not passed to px.scatter_3d below,
# which colors by 'type'; it is only read by the HTML legend later in this cell, so
# legend swatches need not match the marker colors in the figure.
color_palette = px.colors.qualitative.Plotly
plot_df['color'] = [color_palette[i % len(color_palette)] for i in range(len(plot_df))]

# === Step 5: Create base scatter plot with Plotly ===
fig = px.scatter_3d(
    plot_df,
    x='x', y='y', z='z',
    color='type',
    size='similarity',
    symbol='type',
    hover_name='title',
    hover_data={'text': False, 'similarity': True, 'cluster': True},
    title='3D Visualization of Top 10 JobBERT Recommendations + User Query',
    width=800, height=700
)

# NOTE(review): a fixed marker size is applied to all traces here, after
# size='similarity' was requested above — presumably this overrides the
# similarity-based sizing; confirm which one is intended.
fig.update_traces(marker=dict(size=7))
fig.update_layout(
    scene=dict(
        xaxis_title='PCA 1',
        yaxis_title='PCA 2',
        zaxis_title='PCA 3'
    ),
    legend_title='Type'
)

# === Step 6: Add distance lines from query to each job ===
query_coords = plot_df.iloc[0][['x', 'y', 'z']].values
job_coords = plot_df.iloc[1:][['x', 'y', 'z']].values
job_texts = plot_df.iloc[1:]['text'].tolist()

# Compute Euclidean distances in 3D space
# (i.e. in the PCA projection, not in the original embedding space).
distances = pairwise_distances([query_coords], job_coords).flatten()
# Only job rows receive a distance; the query row (label 0) is left as NaN.
plot_df.loc[1:, 'distance_from_query'] = distances

# One dotted gray segment per job, drawn from the query point to that job's point.
for i, (job_coord, dist, job_label) in enumerate(zip(job_coords, distances, job_texts)):
    fig.add_trace(go.Scatter3d(
        x=[query_coords[0], job_coord[0]],
        y=[query_coords[1], job_coord[1]],
        z=[query_coords[2], job_coord[2]],
        mode='lines',
        line=dict(color='gray', width=2, dash='dot'),
        hoverinfo='text',
        text=[f"Distance: {dist:.4f} to '{job_label[:30]}...'"],
        showlegend=False
    ))

fig.show()

# === Step 7: Color-coded External Label List ===
# Titles, types and cluster labels come from the data files, so escape them before
# putting them into markup (job titles contain characters such as '&').
from html import escape as html_escape

html_labels = "<h3 style='margin-top:10px;'>Legend: Matching Jobs and Query</h3><ul style='list-style:none;'>"
for _idx, row in plot_df.iterrows():
    dot_color = html_escape(str(row['color']), quote=True)
    title = html_escape(str(row['title']))
    cluster = html_escape(str(row['cluster']))
    label_type = html_escape(str(row['type']))
    similarity = row['similarity']
    distance = row.get('distance_from_query')

    html_labels += f"""
    <li style="margin-bottom:8px;">
        <span style="display:inline-block;width:15px;height:15px;background:{dot_color};margin-right:10px;border-radius:50%;"></span>
        <strong>{label_type}</strong> — <em>{title}</em><br>
        <small>Cluster: {cluster} | Similarity: {similarity:.4f}"""
    # The query row has no distance (it is NaN, not None), so test with pd.notna
    # to avoid rendering "Distance: nan".
    if pd.notna(distance):
        html_labels += f" | Distance: {distance:.4f}"
    html_labels += "</small></li>"

html_labels += "</ul>"

# Display HTML in Colab
display(HTML(html_labels))


In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# === Evaluation Prep ===
# NOTE(review): this cell reads `recommendations`, `query_text` and `threshold`
# from earlier cells, so its result depends on which query cell ran last.

# Step 1: Define query and its ID
query_id = "q1"
queries = {query_id: query_text}  # from earlier: query_text = "java developer"

# Step 2: Define corpus (top-10 jobs) and map as ID => text
# The corpus is only the 10 retrieved jobs, keyed d0..d9 in retrieval order.
top_matches_10 = recommendations["top_10"]
corpus = {f"d{i}": row["text"] for i, (_, row) in enumerate(top_matches_10.iterrows())}

# Step 3: Define relevant documents dynamically based on similarity threshold
# NOTE(review): "relevant" here means the retrieval similarity reached the
# notebook-level `threshold`; the labels come from the retrieval scores themselves,
# not from independent relevance judgments, and this threshold is not necessarily
# the one behind the 'relevance_label' column of the same frame.
relevant_docs_set = set()
for i, (_, row) in enumerate(top_matches_10.iterrows()):
    if row["similarity"] >= threshold:
        relevant_docs_set.add(f"d{i}")

# Step 4: Prepare relevance dictionary
relevant_docs = {query_id: relevant_docs_set}

# Step 5: Initialize and run evaluator (model already loaded as JobBERT)
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="JobBERT-Top10-Eval",
    show_progress_bar=False
)

# Calling the evaluator with the model returns a mapping of metric name -> score.
results = ir_evaluator(model)

# === Print Results in Required Format ===
print(f"\nScore-Function: cosine")
for metric, score in results.items():
    print(f"{metric}: {score * 100:.2f}%")
print(f"\nPrimary Metric: {ir_evaluator.primary_metric}")
print(f"Primary Metric Score: {results[ir_evaluator.primary_metric]:.4f}")



Score-Function: cosine
JobBERT-Top10-Eval_cosine_accuracy@1: 100.00%
JobBERT-Top10-Eval_cosine_accuracy@3: 100.00%
JobBERT-Top10-Eval_cosine_accuracy@5: 100.00%
JobBERT-Top10-Eval_cosine_accuracy@10: 100.00%
JobBERT-Top10-Eval_cosine_precision@1: 100.00%
JobBERT-Top10-Eval_cosine_precision@3: 100.00%
JobBERT-Top10-Eval_cosine_precision@5: 80.00%
JobBERT-Top10-Eval_cosine_precision@10: 70.00%
JobBERT-Top10-Eval_cosine_recall@1: 14.29%
JobBERT-Top10-Eval_cosine_recall@3: 42.86%
JobBERT-Top10-Eval_cosine_recall@5: 57.14%
JobBERT-Top10-Eval_cosine_recall@10: 100.00%
JobBERT-Top10-Eval_cosine_ndcg@10: 96.82%
JobBERT-Top10-Eval_cosine_mrr@10: 100.00%
JobBERT-Top10-Eval_cosine_map@100: 89.76%

Primary Metric: JobBERT-Top10-Eval_cosine_ndcg@10
Primary Metric Score: 0.9682


In [None]:
# === Step 1: Find user query text ===
query_text2 = "web developer"

# Find the matching row in user_corpus.
# The filter must use query_text2; filtering on the earlier `query_text` would
# silently fetch the previous query's embedding under the new label.
user_q_row = user_corpus[user_corpus['text'].str.lower() == query_text2.lower()]

# Safety check
if user_q_row.empty:
    raise ValueError(f"Text '{query_text2}' not found in user_corpus.")

# === Step 2: Extract existing embedding from user_corpus ===
# Take the first match and reshape it to (1, dim), the shape the search function expects.
embeddings_user = np.array(user_q_row.iloc[0]['jobbert_embedding']).reshape(1, -1)



In [None]:
recommendations = get_top_n_local_search(embeddings_user, df_clustered_jobbert, embedding_matrix)

print(f"\nQuery: '{query_text2}'")
for k, df in recommendations.items():
    print(f"\n=== {k.upper()} ===")
    print(df[['Job.ID', 'Title', 'cluster', 'similarity']])


Query: 'web developer'

=== TOP_3 ===
      Job.ID                                          Title  cluster  \
398   310575            Sr. Web Developer @ Creative Circle        3   
999   277893  Senior Web Developer @ Paladin Consulting Inc        3   
1485  314771                Web Developer @ Creative Circle        3   

      similarity  
398     0.648812  
999     0.627929  
1485    0.624127  

=== TOP_5 ===
      Job.ID                                              Title  cluster  \
398   310575                Sr. Web Developer @ Creative Circle        3   
999   277893      Senior Web Developer @ Paladin Consulting Inc        3   
1485  314771                    Web Developer @ Creative Circle        3   
1847  262192                    Web Developer @ Creative Circle        3   
1289  250354  Web Developer  (Long Term Freelance) @ Creativ...        3   

      similarity  
398     0.648812  
999     0.627929  
1485    0.624127  
1847    0.619602  
1289    0.618049  

=== TOP_1

In [None]:
query_text2

'web developer'

In [None]:
df

Unnamed: 0,Job.ID,Title,text,cluster,similarity
398,310575,Sr. Web Developer @ Creative Circle,web developer creative circle portland full ti...,3,0.648812
999,277893,Senior Web Developer @ Paladin Consulting Inc,senior web developer paladin consult inc south...,3,0.627929
1485,314771,Web Developer @ Creative Circle,web developer creative circle phoenix part tim...,3,0.624127
1847,262192,Web Developer @ Creative Circle,web developer creative circle milton per diem ...,3,0.619602
1289,250354,Web Developer (Long Term Freelance) @ Creativ...,web developer long term freelance creative cir...,3,0.618049
1964,307724,Web Developer @ Creative Circle,web developer creative circle keller seasonal ...,3,0.615297
984,293683,Web Developer @ The BOSS Group,web developer boss group kenilworth seasonal t...,3,0.613785
1452,267494,Web Developer @ ConsultNet,web developer consultnet commerce full time pa...,3,0.582293
638,287907,Front End Web Developer - Drupal 7 @ Creative ...,front end web developer drupal creative circle...,3,0.579659
1331,254804,UI/UX Web Developer @ ConsultNet,web developer consultnet sandy full time part ...,3,0.576223


# JobAds Corpus