In [1]:
!pip install -q kagglehub sentence-transformers

In [2]:
import kagglehub
import os
# Download the LinkedIn job-postings dataset through kagglehub; the returned
# value is the local directory that holds the dataset files.
path = kagglehub.dataset_download("arshkon/linkedin-job-postings")

# Report where the files are and what the directory contains.
print(f"Path to dataset files: {path}")
print(f"Files: {os.listdir(path)}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/arshkon/linkedin-job-postings?dataset_version_number=13...


100%|██████████| 159M/159M [00:08<00:00, 19.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/arshkon/linkedin-job-postings/versions/13
Files: ['companies', 'mappings', 'postings.csv', 'jobs']


In [3]:
import pandas as pd
# postings.csv sits directly inside the downloaded dataset directory.
data_path = os.path.join(path, "postings.csv")
df = pd.read_csv(data_path)

# Preview the first five rows to confirm the file parsed as expected.
df.head(5)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


In [4]:
# Columns kept for the rest of the notebook, in the order they are retained.
required_cols = [
    # identifier
    "job_id",
    # text fields that are later combined into the searchable job text
    "title", "description", "skills_desc",
    # posting metadata
    "company_name", "location",
    "formatted_experience_level", "formatted_work_type",
]

# Discard every other column of the raw postings table.
df = df[required_cols]

In [5]:
# Keep only postings that have a title, a description and a skills text.
text_columns = ["title", "description", "skills_desc"]
df = df.dropna(subset=text_columns)

# Remove rows whose description or skills text is empty after stripping
# surrounding whitespace.
description_is_blank = df["description"].str.strip() == ""
skills_is_blank = df["skills_desc"].str.strip() == ""
df = df[~(description_is_blank | skills_is_blank)]

In [6]:
from bs4 import BeautifulSoup
import re

def clean_text(text):
    """Return ``text`` as lower-cased plain text with normalised whitespace.

    HTML markup is removed with BeautifulSoup's ``html.parser``. Text
    fragments are joined with a single space so that words from adjacent
    elements (for example ``<p>foo</p><p>bar</p>``) do not fuse into one
    token; any surplus spaces are removed by the whitespace collapse below.

    Args:
        text: Raw string, possibly containing HTML.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    text = text.lower()
    # Collapse every run of whitespace (including newlines) to one space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize_skills(skills):
    """Normalise a skills string to lower-case, comma-separated text.

    ``|`` and ``;`` are treated as alternative separators and replaced by
    commas. Runs of whitespace (including embedded newlines) are collapsed to
    a single space and surrounding whitespace is removed, matching the
    whitespace handling of ``clean_text``.

    Args:
        skills: Raw skills string.

    Returns:
        The normalised skills string.
    """
    skills = skills.lower()
    skills = skills.replace("|", ",").replace(";", ",")
    # Embedded newlines/tabs would otherwise end up verbatim in the job text.
    skills = re.sub(r"\s+", " ", skills)
    return skills.strip()

# Clean each source column once and reuse the result below.
title_text = df["title"].apply(clean_text)
skills_text = df["skills_desc"].apply(normalize_skills)
description_text = df["description"].apply(clean_text)

# The combined text contains the title twice and the skills three times,
# followed by the description (presumably to give title and skills terms
# more weight than description terms).
df["job_text"] = (
    title_text + " " + title_text + " "
    + skills_text + " " + skills_text + " " + skills_text + " "
    + description_text
)

# Persist the postings together with their combined text.
df.to_csv("jobs_with_text.csv", index=False)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams, limited to 30,000 features; terms must occur in
# at least 5 postings and English stop words are removed.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=30_000,
    min_df=5,
    stop_words="english"
)

# One row per posting, in the same order as `df`.
tfidf_matrix = vectorizer.fit_transform(df["job_text"])

import pickle

# Persist the fitted vectorizer and the document-term matrix for reuse.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)

# Kept as the final expression of the cell so the notebook actually renders
# it; a bare expression in the middle of a cell produces no output.
tfidf_matrix.shape


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_recommend(query, top_k=10):
    """Return the ``top_k`` postings most similar to ``query`` under TF-IDF.

    The query is vectorised with the fitted ``vectorizer`` and compared to
    every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Free-text search string.
        top_k: Number of postings to return.

    Returns:
        Rows of ``df`` (title, company name, skills text) ordered from the
        highest to the lowest similarity score.
    """
    query_vector = vectorizer.transform([query])
    similarity = cosine_similarity(query_vector, tfidf_matrix)[0]

    # argsort is ascending, so reverse it before taking the best matches.
    ranked_positions = similarity.argsort()[::-1]
    best_positions = ranked_positions[:top_k]

    shown_columns = ["title", "company_name", "skills_desc"]
    return df.iloc[best_positions][shown_columns]


tfidf_recommend("python data engineer spark", top_k=10)

Unnamed: 0,title,company_name,skills_desc
6896,Palantir Developer,Tata Consultancy Services,"Data Modelling , Big Data"
74986,Data Engineer,Collabera,"Data Engineer,AWS,Redshift,Python"
32389,Lead Data Engineer,OtterBase,"Lead, Data, Engineer, Python, Databricks, Snow..."
81530,Data Science & Visualization Engineer,Tata Consultancy Services,"Python , Tableau"
30129,Python API Developer (Python Binding for C++Li...,Tata Consultancy Services,Python
43542,Developer,Tata Consultancy Services,"Scala , Hadoop , Spark"
4767,Developer,Tata Consultancy Services,"Python , Data Modeling , Machine Learning"
91445,Python Developer,Collabera,"Python,Flask,FastApi,Django,SQL,Deployement,De..."
14626,Volunteer: Data Engineer,VolunteerMatch,This position requires the following skills: B...
43734,Python Developer with AI/ML Skills,Tata Consultancy Services,Python


In [10]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for the semantic part of the recommender.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the combined text of every posting; the embeddings follow the row
# order of `df`.
job_texts = df["job_text"].tolist()
job_embeddings = model.encode(job_texts, batch_size=64, show_progress_bar=True)

import numpy as np

# Store the embeddings on disk so they can be reloaded later.
np.save("job_embeddings.npy", job_embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [11]:
def recommend(query_text, top_k=10, alpha=0.6):
    """Rank postings by a blend of TF-IDF and embedding cosine similarity.

    Args:
        query_text: Free-text search string.
        top_k: Number of postings to return.
        alpha: Weight of the TF-IDF score; the embedding score is weighted
            with ``1 - alpha``.

    Returns:
        Rows of ``df`` (title, company name, location, skills text) ordered
        from the highest to the lowest blended score.
    """
    queries = [query_text]

    # Lexical similarity against the TF-IDF matrix.
    lexical_scores = cosine_similarity(
        vectorizer.transform(queries), tfidf_matrix
    )[0]

    # Semantic similarity against the sentence embeddings.
    semantic_scores = cosine_similarity(
        model.encode(queries), job_embeddings
    )[0]

    # Weighted blend of both signals.
    blended_scores = alpha * lexical_scores + (1 - alpha) * semantic_scores

    # argsort is ascending, so reverse it before taking the best matches.
    best_positions = blended_scores.argsort()[::-1][:top_k]

    shown_columns = ["title", "company_name", "location", "skills_desc"]
    return df.iloc[best_positions][shown_columns]


recommend("python spark airflow data engineer", top_k=10)

Unnamed: 0,title,company_name,location,skills_desc
6896,Palantir Developer,Tata Consultancy Services,"Cleveland, OH","Data Modelling , Big Data"
81530,Data Science & Visualization Engineer,Tata Consultancy Services,"Cupertino, CA","Python , Tableau"
74986,Data Engineer,Collabera,"Bellevue, WA","Data Engineer,AWS,Redshift,Python"
32389,Lead Data Engineer,OtterBase,"Nashville, TN","Lead, Data, Engineer, Python, Databricks, Snow..."
68823,Developer,Tata Consultancy Services,"Boston, MA","Python , Java , AWS"
68821,Developer,Tata Consultancy Services,"Boston, MA","Oracle , Python , Java"
68894,Developer,Tata Consultancy Services,"Boston, MA","Oracle , Python , Java"
69046,Analyst,Tata Consultancy Services,"Boston, MA","Python , AWS , Data Modelling"
43542,Developer,Tata Consultancy Services,"Plano, TX","Scala , Hadoop , Spark"
30129,Python API Developer (Python Binding for C++Li...,Tata Consultancy Services,"Cupertino, CA",Python


In [12]:
# Export the posting fields listed below to jobs.csv, without the index.
export_columns = ["job_id", "title", "company_name", "location", "skills_desc"]
jobs_export = df[export_columns]
jobs_export.to_csv("jobs.csv", index=False)

In [13]:
# Only the last expression of a cell is rendered automatically, so the first
# two results are passed to display() (available in notebook sessions)
# explicitly; as bare expressions they would be computed and then discarded.
display(recommend(""))
display(recommend("python, sql, spark"))
recommend("Looking for a senior backend role building large-scale ETL pipelines")

Unnamed: 0,title,company_name,location,skills_desc
66272,Senior ETL Developer,Collabera,"Cambridge, MA","ETL,SSIS,SSRS,Datawarehouse,Maintenance ,Data ..."
105491,Ab Initio ETL Senior Developer,UST,"Aliso Viejo, CA","Abinitio,Unix,SQL"
68085,BIE/Business Intelligence Engineer/Business In...,Collabera,"Seattle, WA","SQL,ETL,Tableau"
36438,Senior ETL Tester,UST,"Aliso Viejo, CA","Abinitio,Unix,SQL"
43735,ETL Developer,Tata Consultancy Services,"Pleasanton, CA","DB2 DBA , ETL Testing , SQL , Informatica"
43773,ETL Developer,Tata Consultancy Services,"Pleasanton, CA","Oracle , ETL Testing , Informatica"
32389,Lead Data Engineer,OtterBase,"Nashville, TN","Lead, Data, Engineer, Python, Databricks, Snow..."
43432,Developer,Tata Consultancy Services,"McLean, VA","Control M , Mongo DB , Informatica"
43485,Engineer,Tata Consultancy Services,"New York, NY",AWS
30267,Full Stack Swift with MYSQL,Tata Consultancy Services,"Cupertino, CA","MySql\n\n, Swift"
