In [2]:
import pandas as pd
import numpy as np
import re
import random
from transformers import pipeline
from transformers import AutoTokenizer, BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the postings once; keep an untouched copy next to the working frame.
original_data = pd.read_csv("job_data.csv")
data = original_data.copy()
print(data.shape)

(998, 8)


In [3]:
# Number of missing values in each column.
data.isnull().sum()

job_id              0
job_title           0
company_name        3
location            0
work_details        0
skills             28
qualifications    839
about               1
dtype: int64

In [4]:
# Remove exact duplicate rows from the raw copy and from the working frame.
original_data = original_data.drop_duplicates()
data = data.drop_duplicates()
data.shape

(989, 8)

In [5]:
# Coerce the free-text columns to plain strings.
# Missing values are replaced with '' first: astype(str) on a NaN entry
# produces the literal text 'nan', which the later lower-casing, matching and
# embedding steps would treat as real content.
for text_column in ('skills', 'about'):
    data[text_column] = data[text_column].fillna('').astype(str)

# 'qualifications' keeps its missing entries (later cells test them with
# pd.notna); only the populated rows are converted.
has_qualifications = data['qualifications'].notna()
data.loc[has_qualifications, 'qualifications'] = data.loc[has_qualifications, 'qualifications'].astype(str)

In [6]:
# Lower-case every text column that is compared later on.
for text_column in ('job_title', 'skills', 'about'):
    data[text_column] = data[text_column].map(str.lower)

# Collapse every run of non-alphanumeric characters into a single space.
non_alnum = re.compile('[^A-Za-z0-9]+')
data['about'] = data['about'].map(lambda text: non_alnum.sub(' ', text))

# Only the populated qualifications rows are rewritten; missing ones stay missing.
has_qualifications = data['qualifications'].notna()
data.loc[has_qualifications, 'qualifications'] = (
    data.loc[has_qualifications, 'qualifications']
    .map(lambda text: non_alnum.sub(' ', text.lower()))
)

In [7]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# Two token matchers sharing the pipeline's vocabulary: one for
# "years of experience" phrases, one for degree keywords.
yoe_matcher = Matcher(nlp.vocab)

degree_matcher = Matcher(nlp.vocab)

yoe_patterns = [
    # 2 years experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": {"IN": ["experience", "exp"]}}],
    # 2 years of experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": {"IN": ["experience", "exp"]}}],
    # 2 years work experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": {"IN": ["work", "working", "professional"]}}, {"LOWER": "experience"}],
    # 2 years of work experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": {"IN": ["work", "working", "professional"]}}, {"LOWER": "experience"}],
    # experience of 3 years / experience 3 years
    # "of" is optional via OP "?". The previous {"IN": ["of", ""]} could never
    # match an empty token, so the form without "of" was silently missed.
    [{"LOWER": {"IN": ["experience", "exp"]}}, {"LOWER": "of", "OP": "?"}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}],
    # 2 years as a developer / 2 years in a similar role
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": {"IN": ["as", "in"]}}],
    # 1-2 years experience (the hyphen has already been replaced by a space upstream)
    [{"LIKE_NUM": True}, {"IS_SPACE": True, "OP": "*"}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}],
    # 2 years of relevant
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": "relevant"}],
    # 2 years relevant
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "relevant"}],
    # 2 years of hands on experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "of"}, {"LOWER": "hands"}, {"LOWER": "on"}, {"LOWER": "experience"}],
    # 2 years hands on experience
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["year", "years"]}}, {"LOWER": "hands"}, {"LOWER": "on"}, {"LOWER": "experience"}],
]

degree_patterns = [
    [{"LOWER": {"IN": ["bachelor", "bachelors", "undergraduate"]}}],
    # "master's" / "master s" (apostrophe stripped upstream) as two tokens ...
    [{"LOWER": "master"}, {"LOWER": {"IN": ["'s", "s"]}}],
    # ... and the one-token spelling, mirroring "bachelors" above.
    [{"LOWER": "masters"}],
    [{"LOWER": "mba"}],
    [{"LOWER": {"IN": ["phd", "doctorate"]}}],
]

# Matcher.add takes the whole list of patterns for a key in one call.
yoe_matcher.add("YEARS_EXPERIENCE", yoe_patterns)
degree_matcher.add("DEGREE", degree_patterns)

In [8]:
def get_degree(text):
    """Return the degree keywords found in ``text`` as one comma-joined string.

    Args:
        text: The string to scan.

    Returns:
        The text of every span reported by ``degree_matcher``, joined with
        ',' in the order the matcher reports them, or None when there is no
        match.
    """
    # Tokenize only. The degree patterns test the LOWER attribute alone, so
    # running the full nlp(text) pipeline on every row adds cost without
    # changing what the matcher can see.
    doc = nlp.make_doc(text)
    found = [doc[start:end].text for _match_id, start, end in degree_matcher(doc)]
    return ','.join(found) if found else None

def get_experience(text):
    """Return the "years of experience" phrases found in ``text``.

    Args:
        text: The string to scan.

    Returns:
        The text of every span reported by ``yoe_matcher``, joined with ','
        in the order the matcher reports them, or None when there is no match.
    """
    # Tokenize only. The experience patterns use LIKE_NUM, LOWER and IS_SPACE,
    # all of which are available on a tokenized doc, so the rest of the
    # pipeline is skipped.
    doc = nlp.make_doc(text)
    found = [doc[start:end].text for _match_id, start, end in yoe_matcher(doc)]
    return ','.join(found) if found else None

In [9]:
# First pass: look for a degree in the qualifications text where it exists.
data['degree'] = data['qualifications'].apply(
    lambda quals: None if pd.isna(quals) else get_degree(quals)
)
# Second pass: rows still without a degree fall back to the 'about' text.
data['degree'] = data.apply(
    lambda row: row['degree'] if pd.notna(row['degree']) else get_degree(row['about']),
    axis=1,
)

KeyboardInterrupt: 

In [None]:
# First pass: look for experience phrases in the qualifications text where it exists.
data['experience'] = data['qualifications'].apply(
    lambda quals: None if pd.isna(quals) else get_experience(quals)
)
# Second pass: rows still without a hit fall back to the 'about' text.
data['experience'] = data.apply(
    lambda row: row['experience'] if pd.notna(row['experience']) else get_experience(row['about']),
    axis=1,
)

In [None]:
# Spot-check five random rows of the extracted columns.
data.loc[:, ['job_id', 'degree', 'experience']].sample(n=5)

Unnamed: 0,job_id,degree,experience
166,3971681988,bachelor,
199,3977519288,,
663,3978192327,,"3 years of work experience,3 years of work exp..."
621,3966860922,,
894,3897064595,"bachelor,master s",5 years of working experience


In [None]:
# Put the extracted fields next to the job metadata, long text last.
new_column_order = [
    'job_id', 'job_title', 'company_name', 'work_details', 'location',
    'degree', 'experience', 'skills', 'about', 'qualifications',
]
data = data.loc[:, new_column_order]
data.head()

Unnamed: 0,job_id,job_title,company_name,work_details,location,degree,experience,skills,about,qualifications
0,3935168115,devops engineer,PT. BANK NEGARA INDONESIA (Persero) Tbk.,"On-site,Contract,Entry level",Jakarta Metropolitan Area,bachelor,"4 years of work experience,4 years of work exp...","agile application development,confluence,conti...",requirements memiliki pengalaman minimal 4 tah...,bachelor s degree 4 years of work experience w...
1,3951984858,"software engineer, backend - business platform",Grab,"Hybrid,Full-time,Associate","Jakarta, Indonesia",,,"back-end web development,code review,communica...",company description about grab and our workpla...,
2,3954206104,sr. officer-rebuy & cvm (circle jakarta raya),PT. Indosat Tbk,"On-site,Full-time,Associate","Jakarta, Indonesia",bachelor,5 years of work experience,"project management,analytical skills,customer ...",develop and implement product strategies aimed...,bachelor s degree 5 years of work experience w...
3,3966866576,product innovation,ParagonCorp,"On-site,Full-time,Mid-Senior level","Jakarta, Indonesia",bachelor,"2 years of work experience,2 years of work exp...","collaboration tools,concept development,creati...",requirements a graduate of bachelor master s d...,bachelor s degree can start immediately 2 year...
4,3965410324,account executive,detikcom,"On-site,Full-time,Entry level","Jakarta, Indonesia",bachelor,"1 years of work experience,1 years of experien...","communication,microsoft office,sales",responsibilities build and maintain client rel...,bachelor s degree 1 years of work experience w...


In [None]:
def get_degree_value(degree):
    """Map a comma-separated degree string to a numeric rank.

    Ranks: 1 = bachelor / undergraduate, 2 = master / MBA, 3 = PhD / doctorate.
    When several degrees are listed, the lowest rank is returned.

    Args:
        degree: Comma-separated degree names (case-insensitive), or a missing
            value (None / NaN).

    Returns:
        The lowest rank found, or None when ``degree`` is missing or contains
        no recognised degree keyword.
    """
    if pd.isna(degree):
        return None
    ranks = []
    for name in degree.lower().split(","):
        if 'bachelor' in name or 'undergraduate' in name:
            ranks.append(1)
        elif 'master' in name or 'mba' in name:
            ranks.append(2)
        elif 'phd' in name or 'doctorate' in name:
            ranks.append(3)
    # An unrecognised or empty string used to fall through to the sentinel 4,
    # i.e. a rank above a PhD, so a user with degree '' satisfied every
    # requirement. "Nothing recognised" is now reported as None.
    return min(ranks) if ranks else None

# Numeric rank of the lowest degree each posting mentions.
data['degree_value'] = data['degree'].apply(get_degree_value)
data.loc[:, ['degree', 'degree_value']].sample(n=5)

Unnamed: 0,degree,degree_value
92,bachelor,1.0
94,"undergraduate,undergraduate",1.0
435,,
312,bachelor,1.0
842,,


In [None]:
def get_experience(experience):
    """Return the smallest number of years mentioned in an experience string.

    NOTE: this reuses the name of the matcher-based ``get_experience(text)``
    defined earlier in the notebook and replaces it from this cell onwards;
    re-running the earlier extraction cell afterwards would call this
    function instead.

    Args:
        experience: Comma-separated experience phrases such as
            "2 years of working experience,3 years in", or a missing value.

    Returns:
        The smallest leading integer found across the phrases, or None when
        ``experience`` is missing or no phrase contains digits.
    """
    if pd.isna(experience):
        return None
    years = []
    for phrase in experience.split(","):
        match = re.search(r'\d+', phrase)
        if match:
            years.append(int(match.group()))
    # Phrases with spelled-out numbers ("five years") contain no digits.
    # Returning float('inf') for them made the requirement impossible to
    # meet; report "unknown" instead.
    return min(years) if years else None

# Smallest year count mentioned in each posting's experience phrases.
data['min_experience'] = data['experience'].apply(get_experience)
data.loc[:, ['job_id', 'experience', 'min_experience']].sample(n=5)

Unnamed: 0,job_id,experience,min_experience
194,3964083010,2 years of work experience,2.0
732,3967831767,5 7 years,5.0
765,3942431063,,
966,3940830426,"2 years of working experience,3 years in",2.0
367,3979952404,,


In [None]:
# Column -> frame it is read from: extracted fields come from the processed
# frame, the remaining text columns from the untouched copy.
migration_sources = [
    ('job_id', data),
    ('job_title', original_data),
    ('company_name', original_data),
    ('location', original_data),
    ('work_details', original_data),
    ('degree', data),
    ('min_experience', data),
    ('skills', original_data),
    ('about', original_data),
    ('qualifications', original_data),
]
# Values are combined positionally (as plain lists), not by index label.
for_migration = pd.DataFrame(
    {column: frame[column].to_list() for column, frame in migration_sources}
)

In [None]:
# Write the processed frame to disk without the index column.
data.to_csv(path_or_buf="job_preprocessed.csv", index=False)

In [15]:
# Ten hand-written user profiles used to exercise the matching functions.
# The parallel lists below are aligned by position: entry i of each list
# belongs to the same profile.

# Desired job titles; a single entry may hold several comma-separated titles.
user_title = ['Software Engineer,Fullstack Developer,Backend Engineer', 'Sales Manager', 'FullStack Developer', "Backend Engineer", "Data Scientist", "HRD","Product Manager", "Data Analyst", "Business Analyst", "Campaign Management"]
# Comma-separated skill names per profile.
user_skills = [
        "Back-End Development,Software Development,Business Requirements,Communication,Computer Engineering,Computer Science,Critical Thinking,Technical Requirements,UI Testing,Unit Testing", 
        "Business-to-Business (B2B),Communication,Cross-Selling,Invitation to Tender,Negotiation,Portfolio Analysis,Turnover", 
        "Java,Cascading Style Sheets (CSS),Express.js,Full-Stack Development,JavaScript,Node.js,React.js,Software Development,MongoDB,TypeScript", 
        "Back-End Web Development,Computer Science,Java,Object-Oriented Programming (OOP),Python (Programming Language),Software Development,Angular",
        "Data Science,Programming Languages,Python (Programming Language),Artificial Intelligence,Attention to Detail,Machine Learning,Deep Learning",
        "Human Resources (HR),Recruitment,Training,Employee Relations,Performance Management,Compensation and Benefits,Organizational Development,Employee Engagement",
        "Product Management,Product Development,Product Strategy,Product Marketing,Product Launch,Product Lifecycle Management,Product Roadmap,Product Design",
        "Data Analysis,Data Management,Data Visualization,Data Warehousing,Data Mining,Data Modeling,Data Quality,Data Science",
        "Business Analysis,Business Process Improvement,Business Process Reengineering,Business Process Management,Business Process Mapping,Business Process Design,Business Process Automation,Business Process Development",
        "Campaign Management,Marketing,Marketing Strategy,Marketing Communications,Marketing Management,Marketing Research,Marketing Automation,Marketing Analytics"
    ]
user_years_of_experience = [2, 5, 3, 3, 2, 5, 4, 3, 2, 3]
# Degree per profile; '' and None both appear for profiles without one.
user_degree = ['Bachelor', '', 'Bachelor', 'Bachelor', 'Master', None, 'Master', 'Bachelor', None, 'Bachelor']

# Free-text experience bullets per profile.
# NOTE: this list is defined here but is not added to the user_input frame below.
user_experience = [
    ["2 years as senior Software Engineer at Maybank Indonesia", "Worked with backend technologies such as Node.js stack, Django and Flask", "Experience in developing web applications", "Ran Unit tests and CI/CD", "Created and maintained RESTful APIs for personal projects"],
    ["5 years as Sales Manager at PT. ABC Indonesia", "Managed a team of 10 sales representatives", "Increased sales by 30% in 2020", "Negotiated with clients to close deals", "Created a sales strategy for the company"],
    ["3 years as FullStack Developer at PT. XYZ Indonesia", "Developed web applications using MERN stack", "Worked on various projects using Node.js, React.js and Express.js", "Used MongoDB for database management", "Created RESTful APIs for client projects"],
    ["3 years as Backend Engineer at PT. ABC Indonesia", "Worked on backend technologies using Java and Python", "Developed web applications using Django and Flask", "Used OOP principles for software development", "Created RESTful APIs for client projects"],
    ["2 years as Data Scientist at PT. XYZ Indonesia", "Worked on various projects using Python", "Developed machine learning models using scikit-learn", "Used deep learning for image classification", "Created data pipelines for data processing"],
    ["5 years as HRD at PT. ABC Indonesia", "Managed recruitment process for the company", "Conducted training for new employees", "Managed employee relations and performance management", "Developed compensation and benefits for employees"],
    ["4 years as Product Manager at PT. XYZ Indonesia", "Managed product development for the company", "Developed product strategy for new products", "Created product marketing campaigns for new products", "Managed product lifecycle for existing products"],
    ["3 years as Data Analyst at PT. ABC Indonesia", "Managed data analysis for the company", "Developed data management systems for data processing", "Created data visualization for company reports", "Used data mining for data analysis"],
    ["2 years as Business Analyst at PT. XYZ Indonesia", "Managed business process improvement for the company", "Developed business process reengineering for company operations", "Created business process management for company departments", "Managed business process mapping for company operations"],
    ["3 years as Campaign Management at PT. ABC Indonesia", "Managed marketing campaigns for the company", "Developed marketing strategy for new products", "Created marketing communications for company products", "Managed marketing research for company products"]
]

# 1-based ids, one per profile.
user_id = [i for i in range(1, len(user_title)+1)]

user_input = pd.DataFrame({
    'user_id': user_id,
    'job_title': user_title,
    'skills': user_skills,
    'years_of_experience': user_years_of_experience,
    'degree': user_degree,
})

# Keep only the fifth profile (user_id 5); later cells read it with .iloc[0].
user_input = user_input[4:5]

In [16]:
# Serialize (at most) the first five selected profiles as a JSON array of records.
json_data = user_input.head(5).to_json(orient="records")

In [17]:
# Display the JSON string built from the selected user profile.
json_data

'[{"user_id":5,"job_title":"Data Scientist","skills":"Data Science,Programming Languages,Python (Programming Language),Artificial Intelligence,Attention to Detail,Machine Learning,Deep Learning","years_of_experience":2,"degree":"Master"}]'

In [None]:
def calculate_title_similarity(job_title, user_titles):
    """Score how closely a job title matches any of the user's titles.

    Args:
        job_title: Title of the job posting.
        user_titles: Comma-separated titles supplied by the user.

    Returns:
        The highest cosine similarity between the job title embedding and
        any user title embedding, multiplied by 100. Returns 0 when
        ``user_titles`` contains no non-blank title.
    """
    job_title = job_title.lower().strip()
    # Strip each title so "a, b" and "a,b" are encoded identically.
    candidates = [title.strip() for title in user_titles.lower().split(",") if title.strip()]
    if not candidates:
        return 0
    # Use the notebook-level `model` (the same "all-mpnet-base-v2" checkpoint)
    # instead of constructing a new SentenceTransformer on every call; the
    # scoring loop calls this function once per job.
    embeddings = model.encode([job_title] + candidates)
    similarities = cosine_similarity([embeddings[0]], embeddings[1:])[0]
    return max(similarities) * 100

In [None]:
# Notebook-level sentence encoder; calculate_skill_similarity reads this global `model`.
model = SentenceTransformer("all-mpnet-base-v2")

In [93]:
def calculate_skill_similarity(job_skills, user_skills):
    """Score how well the user's skills cover a job's required skills.

    Each job skill contributes 1 when the user lists exactly the same skill,
    otherwise the cosine similarity to the closest user skill when that
    similarity is at least 0.5, otherwise nothing. The total is averaged over
    the job skills and scaled by 100.

    Args:
        job_skills: Comma-separated skills required by the job.
        user_skills: Comma-separated skills of the user.

    Returns:
        A ``(score, matches)`` tuple. ``matches`` is a list of dicts with the
        keys "matched_skill", "user_skill" and "similarity" (0-100), or None
        when nothing matched. Returns ``(0.0, None)`` when either side has no
        non-blank skill.
    """
    # Strip each entry so "a, b" and "a,b" compare equal; drop blank entries.
    job_skills = [skill.strip() for skill in job_skills.lower().split(",") if skill.strip()]
    user_skills = [skill.strip() for skill in user_skills.lower().split(",") if skill.strip()]
    if not job_skills or not user_skills:
        return 0.0, None

    known_user_skills = set(user_skills)
    embeddings = model.encode(job_skills + user_skills)
    job_embeddings = embeddings[:len(job_skills)]
    my_embeddings = embeddings[len(job_skills):]

    score = 0
    matches = []
    for i, job_skill in enumerate(job_skills):
        if job_skill in known_user_skills:
            score += 1
            # An exact match pairs the skill with itself. The previous code
            # read `max_index` here before it had been assigned for this
            # skill (UnboundLocalError on the first job skill, a stale index
            # afterwards).
            matches.append({"matched_skill": job_skill, "user_skill": job_skill, "similarity": 100})
            continue
        similarities = cosine_similarity([job_embeddings[i]], my_embeddings)[0]
        max_index = int(np.argmax(similarities))
        max_similarity = float(similarities[max_index])
        if max_similarity >= 0.5:
            matches.append({"matched_skill": job_skill, "user_skill": user_skills[max_index], "similarity": max_similarity * 100})
            score += max_similarity
    score = score / len(job_skills) * 100
    return score, (matches or None)

In [94]:
def calculate_description_similarity(job_description, user_description, model=None):
    """Score how well the user's experience entries match a job description.

    NOTE(review): both arguments are assumed to be lists of strings (the
    original code concatenated them with ``+`` and sliced by ``len``) —
    confirm against the caller.

    Args:
        job_description: Text entries describing the job.
        user_description: Text entries describing the user's experience.
        model: Optional encoder exposing ``encode(list_of_str)``. When
            omitted, ``SentenceTransformer("all-roberta-large-v1")`` is
            loaded; pass a pre-loaded instance to avoid reloading per call.

    Returns:
        A ``(score, matches)`` tuple. ``matches`` has one dict per user entry
        with the keys "Experience" and "Similarity" (0-100), where the
        similarity is the best cosine similarity against any job entry.
        ``score`` is the mean of those similarities. When either input is
        empty the result is ``(0, [{"Experience": "No experience",
        "Similarity": 0}])``.
    """
    if not job_description or not user_description:
        return 0, [{"Experience": "No experience", "Similarity": 0}]
    if model is None:
        model = SentenceTransformer("all-roberta-large-v1")

    embeddings = model.encode(list(job_description) + list(user_description))
    job_embeddings = embeddings[:len(job_description)]
    my_embeddings = embeddings[len(job_description):]

    # One row per user entry, one column per job entry.
    pairwise = cosine_similarity(my_embeddings, job_embeddings)

    score = 0
    matches = []
    # Iterate over the user's entries. The previous loop ran over the job
    # entries while indexing the user embeddings, and reported the entry via
    # the undefined name `my_experience`.
    for experience, row in zip(user_description, pairwise):
        best = float(max(row))
        score += best
        matches.append({"Experience": experience, "Similarity": best * 100})
    return score / len(user_description) * 100, matches

In [95]:
def calculate_degree_similarity(job_degree, user_degree):
    """Compare the user's degree with the rank a job requires.

    Args:
        job_degree: Numeric degree rank required by the job, or a missing value.
        user_degree: The user's degree as text; converted with get_degree_value.

    Returns:
        -1 when the job states no degree, 1 when the user's rank is at least
        the job's rank, 0 otherwise (including when the user's rank is missing).
    """
    user_rank = get_degree_value(user_degree)
    if pd.isna(job_degree):
        return -1
    qualifies = (not pd.isna(user_rank)) and user_rank >= job_degree
    return 1 if qualifies else 0

In [96]:
def calculate_experience_similarity(job_experience, user_experience):
    """Compare the user's years of experience with what a job requires.

    Args:
        job_experience: Years required by the job, or a missing value.
        user_experience: Years the user has, or a missing value.

    Returns:
        -1 when the job states no requirement, 1 when the user has at least
        the required years, 0 otherwise (including when the user's value is
        missing).
    """
    if pd.isna(job_experience):
        return -1
    meets_requirement = (not pd.isna(user_experience)) and job_experience <= user_experience
    return 1 if meets_requirement else 0

In [107]:
def calculate_overall_score(title_score, skill_score, degree_score, experience_score):
    """Average the individual scores into one overall score.

    A degree or experience score of -1 means the criterion does not apply and
    is left out of the average; otherwise it contributes ``score * 50``. The
    skill and title scores are always included as given.

    Returns:
        The arithmetic mean of the included components.
    """
    # Optional components first, then skill and title, matching the original
    # summation order. The list always holds at least the last two entries.
    components = [flag * 50 for flag in (degree_score, experience_score) if flag != -1]
    components += [skill_score, title_score]
    return sum(components) / len(components)

In [11]:
# Score a fixed slice of postings (positions 600-619) against the selected
# user profile and collect one record per posting.
# The user's fields do not change inside the loop, so they are read once.
selected_titles = user_input['job_title'].iloc[0]
selected_skills = user_input['skills'].iloc[0]
selected_degree = user_input['degree'].iloc[0]
selected_years = user_input['years_of_experience'].iloc[0]

result_columns = [
    'job_id', 'job_title', 'title_score', 'skills', 'skill_score',
    'skill_matches', 'degree', 'degree_score', 'years_of_experience',
    'experience_score', 'overall_score',
]

# One dict per job keeps all values of a row together, instead of appending
# to many parallel lists that must stay the same length.
records = []
for i in range(600, 620):
    job = data.iloc[i]
    title_score = calculate_title_similarity(job['job_title'], selected_titles)
    skill_score, skill_match = calculate_skill_similarity(job['skills'], selected_skills)
    degree_score = calculate_degree_similarity(job['degree_value'], selected_degree)
    experience_score = calculate_experience_similarity(job['min_experience'], selected_years)
    overall_score = calculate_overall_score(title_score, skill_score, degree_score, experience_score)
    records.append({
        'job_id': job['job_id'],
        'job_title': job['job_title'],
        'title_score': title_score,
        'skills': job['skills'],
        'skill_score': skill_score,
        'skill_matches': skill_match,
        'degree': job['degree'],
        'degree_score': degree_score,
        'years_of_experience': job['min_experience'],
        'experience_score': experience_score,
        'overall_score': overall_score,
    })

results = pd.DataFrame(records, columns=result_columns)

NameError: name 'data' is not defined

In [10]:
# Best overall match first, then export as a JSON array of records.
results = results.sort_values(by='overall_score', ascending=False)
results.to_json(orient='records')

NameError: name 'results' is not defined