# Recommender using Embeddings

Data taken from https://www.kaggle.com/datasets/mexwell/us-software-engineer-jobs

In [1]:
import pandas as pd
from pathlib import Path
file = 'us-software-engineer-jobs-zenrows.csv'

df = pd.read_csv(Path('data',file))
df.shape

(58433, 29)

In [2]:
df.describe()

Unnamed: 0,rating,review_count,ad_id,source_id,job_location_postal
count,58433.0,58433.0,23626.0,58433.0,26121.0
mean,2.56504,1604.46905,337699100.0,6605754.0,52579.873627
std,1.832927,5894.934474,83079660.0,7333885.0,33371.51903
min,0.0,0.0,98269.0,17.0,603.0
25%,0.0,0.0,361697100.0,15710.0,20877.0
50%,3.5,14.0,369429600.0,3370807.0,53122.0
75%,4.0,624.0,371688700.0,13074500.0,87124.0
max,5.0,223345.0,372569600.0,20773080.0,99901.0


In [3]:
columns = ['title', 'company', 'types', 'location', 'snippet']
df = df[columns]
df.shape

(58433, 5)

In [4]:
df[df['snippet'].isnull()].dropna(inplace=True, axis='rows')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['snippet'].isnull()].dropna(inplace=True, axis='rows')


In [5]:
# Filter out rows 
mask = df[['title', 'snippet']].isnull().any(axis='columns')
df[mask].index

Index([928, 2796, 24639, 36774, 38167, 39357, 44512, 57079], dtype='int64')

In [6]:
df.drop(index=df[mask].index, inplace=True)
df.shape

(58425, 5)

In [7]:
df.sample(5)

Unnamed: 0,title,company,types,location,snippet
40628,Software Engineer - Crypto Products,TradeStation,,Remote,Work and collaborate with DevOps engineers to ...
56524,Principal Software Engineer,Liberty Mutual Insurance,,Remote,7+ years of software engineering experience.\n...
32993,C# Senior Software Developer (Contractor) - Fr...,DataAxxis,"Full-time, Contract","New York, NY",The Front Office Senior Software Developer wil...
3546,Site Reliability Engineer,SHIELD AI,,"San Diego, CA",Collaborate with a diverse group of supportive...
37609,c# Engineer,Optimum Technologies,Full-time,"San Francisco Bay Area, CA",Versed in software engineering best practices ...


In [22]:
pd.set_option('display.max_colwidth', None)

documents = list(df['snippet'][:5])

In [10]:
from fastembed.embedding import FlagEmbedding as Embedding
import numpy as np

Embedding.list_supported_models()

[{'model': 'BAAI/bge-small-en',
  'dim': 384,
  'description': 'Fast English model',
  'size_in_GB': 0.2},
 {'model': 'BAAI/bge-small-en-v1.5',
  'dim': 384,
  'description': 'Fast and Default English model',
  'size_in_GB': 0.13},
 {'model': 'BAAI/bge-small-zh-v1.5',
  'dim': 512,
  'description': 'Fast and recommended Chinese model',
  'size_in_GB': 0.1},
 {'model': 'BAAI/bge-base-en',
  'dim': 768,
  'description': 'Base English model',
  'size_in_GB': 0.5},
 {'model': 'BAAI/bge-base-en-v1.5',
  'dim': 768,
  'description': 'Base English model, v1.5',
  'size_in_GB': 0.44},
 {'model': 'sentence-transformers/all-MiniLM-L6-v2',
  'dim': 384,
  'description': 'Sentence Transformer model, MiniLM-L6-v2',
  'size_in_GB': 0.09},
 {'model': 'intfloat/multilingual-e5-large',
  'dim': 1024,
  'description': 'Multilingual model, e5-large. Recommend using this model for non-English languages',
  'size_in_GB': 2.24}]

In [23]:
embedding_model = Embedding(model_name="BAAI/bge-small-en-v1.5", max_length=512) 
embeddings: [np.ndarray] = list(embedding_model.embed(documents)) # Note the list() call - this is a generator 

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 76.7M/76.7M [00:06<00:00, 11.6MiB/s]


In [35]:
def print_top_k(query, embeddings, documents, top_k=5):
    query_embedding = list(embedding_model.query_embed(query))[0]
    
    # Calculate cosine similarity.
    scores = np.dot(embeddings, query_embedding)
    
    sorted_scores = np.argsort(scores)[::-1]
    for i in range(top_k):
        print(f"Rank {i+1}: {documents[sorted_scores[i]]}")

In [36]:
print_top_k("I am a backend engineer looking for golang job", embeddings, documents)

Rank 1: This person will be a senior member of the team and will be responsible for architecting, building complex features and providing technical guidance to other…
Rank 2: Proficiency in Agile software development principles is required.
 Advanced knowledge of industry software development methodologies, standards and architecture…
Rank 3: The ideal candidate will have a skill for tough puzzles, a talent for communicating complex ideas simply, and a drive to meet high expectations with great…
Rank 4: Reports to* DIRECTOR OF MARKETING.
 PHP - Equivalent of 3 years education and/or experience.
 Java script - Equivalent of 3 years education and/or experience.
Rank 5: Throughout the day, you will collaborate with your teammates and interact with our clients.
 Benefits! Shockoe offers a comprehensive and competitive benefits…


In [37]:
print_top_k("I am a backend engineer looking for php job", embeddings, documents)

Rank 1: Reports to* DIRECTOR OF MARKETING.
 PHP - Equivalent of 3 years education and/or experience.
 Java script - Equivalent of 3 years education and/or experience.
Rank 2: This person will be a senior member of the team and will be responsible for architecting, building complex features and providing technical guidance to other…
Rank 3: The ideal candidate will have a skill for tough puzzles, a talent for communicating complex ideas simply, and a drive to meet high expectations with great…
Rank 4: Throughout the day, you will collaborate with your teammates and interact with our clients.
 Benefits! Shockoe offers a comprehensive and competitive benefits…
Rank 5: Proficiency in Agile software development principles is required.
 Advanced knowledge of industry software development methodologies, standards and architecture…
