In [None]:
!pip install sentence-transformers scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

## Load User Embeddings and Job JSON

In [1]:
import pandas as pd
import json

# Per-user sentence embeddings produced upstream.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
df_merged = pd.read_pickle("/content/mission_df_sentence_embeddings.pkl")

# JSON is UTF-8 by specification; state it explicitly so the read does not
# depend on the platform's locale default.
with open('/content/Data_Engineer_features.json', 'r', encoding='utf-8') as f:
    job_data = json.load(f)

# Responsibility sentences of the job description; empty list if the key is absent.
responsibilities = job_data.get('Responsibilities', [])


## Preprocessing and Embedding

In [2]:
import re
import unicodedata
from sentence_transformers import SentenceTransformer
import numpy as np

def preprocess_for_embedding(text):
    """Normalise a piece of text before it is passed to the embedding model.

    Steps: Unicode NFKC normalisation, removal of control characters,
    collapsing of runs of spaces/tabs into one space, and trimming of
    leading/trailing whitespace. Newlines are kept.

    Args:
        text: The raw text. Any non-``str`` value is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    # NFKC also folds compatibility characters such as the no-break space
    # (U+00A0) into their plain equivalents, so no separate replace is needed.
    text = unicodedata.normalize('NFKC', text)
    # Strip control characters but keep TAB (\x09) and LF (\x0A). Deleting the
    # tab here would glue tab-separated words together ("a\tb" -> "ab") before
    # the whitespace collapse below gets a chance to turn it into a space.
    text = re.sub(r'[\x00-\x08\x0B-\x1F\x7F]', '', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

# Clean every responsibility and drop entries that end up empty (non-string
# values or whitespace-only text); embedding '' would add a meaningless vector
# that skews the per-user mean similarity.
responsibilities_clean = [
    cleaned
    for cleaned in (preprocess_for_embedding(r) for r in responsibilities)
    if cleaned
]
if not responsibilities_clean:
    # Fail here with a clear message instead of an obscure shape error later.
    raise ValueError(
        "No non-empty 'Responsibilities' entries found in the job description JSON."
    )

model = SentenceTransformer('all-MiniLM-L6-v2')
# normalize_embeddings=True makes encode() return unit-length rows, replacing
# the manual division by the L2 norm.
responsibility_embeddings = model.encode(
    responsibilities_clean,
    convert_to_numpy=True,
    normalize_embeddings=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Compute Average Cosine Similarity per User

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def average_similarity(user_sentence_embeddings, job_sentence_embeddings):
    """Mean pairwise cosine similarity between a user's sentences and the job's.

    Args:
        user_sentence_embeddings: The user's sentence embeddings, either a
            sequence of 1-D vectors or a 2-D array with one row per sentence.
            ``None``, a scalar placeholder (e.g. NaN from a missing cell) or an
            empty container all mean "no sentences".
        job_sentence_embeddings: 2-D array with one row per job sentence.

    Returns:
        The mean of the full user-by-job cosine similarity matrix as a float,
        or ``0.0`` when the user has no sentence embeddings.
    """
    # Missing values: None, or a scalar such as float('nan').
    if user_sentence_embeddings is None or np.isscalar(user_sentence_embeddings):
        return 0.0
    # Use len() rather than truthiness: `not ndarray` raises for arrays with
    # more than one element.
    if len(user_sentence_embeddings) == 0:
        return 0.0

    if (isinstance(user_sentence_embeddings, np.ndarray)
            and user_sentence_embeddings.dtype != object):
        # Already numeric: a single 1-D vector becomes one row, 2-D is kept.
        user_embs = np.atleast_2d(user_sentence_embeddings)
    else:
        # List (or object array) of per-sentence vectors -> one row each.
        user_embs = np.vstack(list(user_sentence_embeddings))

    sim_matrix = cosine_similarity(user_embs, job_sentence_embeddings)
    return float(sim_matrix.mean())

# Score every user against the job's responsibility embeddings, then attach
# the scores to a fresh frame that carries only the user identifier.
raw_scores = df_merged['EMBEDDING'].apply(
    lambda user_embs: average_similarity(user_embs, responsibility_embeddings)
)
df_result = df_merged[['USER_ID']].assign(similarity_score_raw=raw_scores)


## Scale and Calculate Ranking

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale the raw mean similarities across all users.
scaler = MinMaxScaler()
raw_column = df_result[['similarity_score_raw']]
df_result['similarity_score'] = scaler.fit_transform(raw_column)

# Rank 1 is the highest score; method='first' breaks ties by row order.
score_ranks = df_result['similarity_score'].rank(method='first', ascending=False)
df_result['user_rank'] = score_ranks.astype(int)


## Save Final Results or Create new table

In [5]:
# Export only the columns meant for consumers (the raw score is left out).
output_columns = ['USER_ID', 'similarity_score', 'user_rank']
ranking_table = df_result[output_columns]
ranking_table.to_csv("user_similarity_ranking.csv", index=False)
