In [1]:
# Install the third-party packages this notebook imports.
# NOTE(review): versions are not pinned; the recorded run below resolved
# sentence-transformers 2.2.2 and transformers 4.28.1 — pin if results must be reproducible.
!pip install pandas sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1

In [2]:
import numpy as np
import pandas as pd
import re
import json
from scipy import spatial
from sentence_transformers import SentenceTransformer

In [3]:
# Source tables, fetched from the O*NET download area (db_27_2_text release path).
_onet_db_root = "https://www.onetcenter.org/dl_files/database/db_27_2_text/"
occupation_data_file = _onet_db_root + "Occupation%20Data.txt"
task_statements_file = _onet_db_root + "Task%20Statements.txt"

In [4]:
# Name of the key column shared by the occupation and task tables.
index_col = "O*NET-SOC Code"

# Read the tab-separated occupation table and key it by the O*NET-SOC code.
occupation_data = pd.read_csv(occupation_data_file, sep='\t').set_index(index_col)

occupation_data.head()

Unnamed: 0_level_0,Title,Description
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1
11-1011.00,Chief Executives,Determine and formulate policies and provide o...
11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [5]:
# Read the tab-separated task statements and key them by the same O*NET-SOC code
# used for the occupation table.
task_data = pd.read_csv(task_statements_file, sep='\t').set_index(index_col)

task_data.head()

Unnamed: 0_level_0,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11-1011.00,8823,Direct or coordinate an organization's financi...,Core,87.0,07/2014,Incumbent
11-1011.00,8831,Appoint department heads or managers and assig...,Core,87.0,07/2014,Incumbent
11-1011.00,8825,Analyze operations to evaluate performance of ...,Core,87.0,07/2014,Incumbent
11-1011.00,8826,"Direct, plan, or implement policies, objective...",Core,87.0,07/2014,Incumbent
11-1011.00,8827,"Prepare budgets for approval, including those ...",Core,87.0,07/2014,Incumbent


In [6]:
# Collapse every occupation's task statements into a single tab-separated string,
# then turn the group key back into an ordinary column.
tasks_by_code = task_data.groupby(["O*NET-SOC Code"])['Task']
group_task_df = tasks_by_code.apply(lambda tasks: '\t'.join(tasks)).reset_index()

group_task_df.head()

Unnamed: 0,O*NET-SOC Code,Task
0,11-1011.00,Direct or coordinate an organization's financi...
1,11-1011.03,Monitor and evaluate effectiveness of sustaina...
2,11-1021.00,"Review financial statements, sales or activity..."
3,11-1031.00,Analyze and understand the local and national ...
4,11-2011.00,Plan and prepare advertising and promotional m...


In [7]:
# Attach the aggregated task text to each occupation (inner join on the SOC code),
# re-key the result by that code, and build the combined text that gets embedded:
# the occupation description, a tab, then the tab-joined task list.
merge_df = (
    occupation_data
    .merge(group_task_df, how='inner', on="O*NET-SOC Code", suffixes=("_df1", "_df2"))
    .set_index(index_col)
    .assign(description_task=lambda frame: frame["Description"] + "\t" + frame["Task"])
)

merge_df.head()

Unnamed: 0_level_0,Title,Description,Task,description_task
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Direct or coordinate an organization's financi...,Determine and formulate policies and provide o...
11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",Monitor and evaluate effectiveness of sustaina...,"Communicate and coordinate with management, sh..."
11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","Review financial statements, sales or activity...","Plan, direct, or coordinate the operations of ..."
11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes...",Analyze and understand the local and national ...,"Develop, introduce, or enact laws and statutes..."
11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",Plan and prepare advertising and promotional m...,"Plan, direct, or coordinate advertising polici..."


In [8]:
# Summary of the merged text columns (count / unique / top / freq) as a quick
# check that every occupation row has distinct text.
merge_df.describe()

Unnamed: 0,Title,Description,Task,description_task
count,923,923,923,923
unique,923,923,923,923
top,Chief Executives,Determine and formulate policies and provide o...,Direct or coordinate an organization's financi...,Determine and formulate policies and provide o...
freq,1,1,1,1


Text Processing

In [9]:
# Load the pretrained all-mpnet-base-v2 sentence-embedding model; the first run
# downloads its files (see the progress log below).
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [10]:
# Embed every occupation's combined description + task text in one call.
# A plain list is passed instead of the Series: encode() looks its inputs up by
# integer position, and on a Series keyed by SOC-code strings that only works
# through pandas' deprecated "integer key means position" fallback.
# Row i of the result corresponds to row i of merge_df.
task_embeddings = model.encode(merge_df["description_task"].tolist())

In [11]:
# Sanity check: one embedding row per row of merge_df, one column per embedding dimension.
task_embeddings.shape

(923, 768)

In [12]:
# Peek at the raw embedding matrix returned by model.encode().
task_embeddings

array([[ 0.05364482,  0.01825294,  0.00836201, ..., -0.00818568,
         0.03841463, -0.0037288 ],
       [ 0.05310498,  0.06315918,  0.00045989, ..., -0.00501814,
         0.02057329, -0.00680427],
       [ 0.03038887,  0.00030764, -0.00270253, ...,  0.00073381,
         0.01919656, -0.01241072],
       ...,
       [-0.00132103, -0.00754632,  0.00295144, ...,  0.01046922,
        -0.01817417, -0.01741959],
       [ 0.04147436,  0.0771044 ,  0.00296297, ..., -0.03907046,
         0.01063881,  0.01055883],
       [-0.02435871,  0.01742864, -0.01280583, ..., -0.00957355,
        -0.02452322, -0.02681528]], dtype=float32)

In [13]:
# Keep each occupation's vector next to its text: one Python list of floats per
# row, stored in a single "embeddings" column.
merge_df["embeddings"] = task_embeddings.tolist()
merge_df.head()

Unnamed: 0_level_0,Title,Description,Task,description_task,embeddings
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Direct or coordinate an organization's financi...,Determine and formulate policies and provide o...,"[0.0536448210477829, 0.018252937123179436, 0.0..."
11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",Monitor and evaluate effectiveness of sustaina...,"Communicate and coordinate with management, sh...","[0.053104981780052185, 0.06315918266773224, 0...."
11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","Review financial statements, sales or activity...","Plan, direct, or coordinate the operations of ...","[0.03038887307047844, 0.00030763985705561936, ..."
11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes...",Analyze and understand the local and national ...,"Develop, introduce, or enact laws and statutes...","[0.02162821590900421, 0.05060312896966934, 0.0..."
11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",Plan and prepare advertising and promotional m...,"Plan, direct, or coordinate advertising polici...","[0.07993657886981964, 0.03238765150308609, 0.0..."


In [14]:
# Persist the merged table (text columns plus embeddings) next to the notebook.
# NOTE(review): the list-valued "embeddings" column is written to CSV as text, so
# presumably it has to be parsed back into numbers when the file is reloaded.
merge_df.to_csv("jobs_tasks.csv")

Text Similarity via Cosine Similarity

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Snapshot of the occupation table that calc_similarity scores against.
df = merge_df.copy()


def calc_similarity(input):
    """Rank every occupation in ``df`` by cosine similarity to ``input``.

    ``input`` is the text to compare (a single string in this notebook's usage);
    it is embedded with the module-level ``model``. The parameter name shadows
    the builtin ``input`` but is kept so existing keyword callers keep working.

    Returns a copy of ``df`` with an extra ``similarity`` column
    (1 - cosine distance between the stored embedding and the query embedding),
    sorted from most to least similar. ``df`` itself is not modified.
    """
    query_vector = model.encode(input)

    def _similarity_to_query(embedding):
        # scipy returns a cosine *distance*; flip it into a similarity.
        return 1 - spatial.distance.cosine(embedding, query_vector)

    scored = df.copy()
    scored = scored.assign(similarity=scored['embeddings'].apply(_similarity_to_query))
    return scored.sort_values('similarity', ascending=False)

In [17]:
# Example free-text query used to try out the similarity search.
sample_input_text = "I like math and into tasting different cuisines"

In [18]:
# Score all occupations against the sample text and show the 20 closest matches.
results = calc_similarity(sample_input_text)
results.head(20)

Unnamed: 0_level_0,Title,Description,Task,description_task,embeddings,similarity
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
35-2013.00,"Cooks, Private Household",Prepare meals in private homes. Includes perso...,Plan menus according to employers' needs and d...,Prepare meals in private homes. Includes perso...,"[0.005730997305363417, 0.022486571222543716, -...",0.306955
19-1012.00,Food Scientists and Technologists,"Use chemistry, microbiology, engineering, and ...",Inspect food processing areas to ensure compli...,"Use chemistry, microbiology, engineering, and ...","[0.054701369255781174, 0.0664542019367218, -0....",0.300826
35-1011.00,Chefs and Head Cooks,"Direct and may participate in the preparation,...",Monitor sanitation practices to ensure that em...,"Direct and may participate in the preparation,...","[0.026355532929301262, 0.032496266067028046, -...",0.26708
19-4013.00,Food Science Technicians,Work with food scientists or technologists to ...,Record or compile test results or prepare grap...,Work with food scientists or technologists to ...,"[0.03496571257710457, 0.004672635812312365, -0...",0.265807
15-2021.00,Mathematicians,Conduct research in fundamental mathematics or...,"Address the relationships of quantities, magni...",Conduct research in fundamental mathematics or...,"[-0.0152164651080966, 0.0656314417719841, 4.13...",0.265395
35-2015.00,"Cooks, Short Order",Prepare and cook to order a variety of foods t...,"Clean food preparation equipment, work areas, ...",Prepare and cook to order a variety of foods t...,"[0.006050362251698971, -0.0005959015106782317,...",0.262124
51-3092.00,Food Batchmakers,Set up and operate equipment that mixes or ble...,Record production and test data for each food ...,Set up and operate equipment that mixes or ble...,"[-0.016138162463903427, -0.030569136142730713,...",0.261882
35-2011.00,"Cooks, Fast Food",Prepare and cook food in a fast food restauran...,Order and take delivery of supplies.\tCook the...,Prepare and cook food in a fast food restauran...,"[0.012925593182444572, 0.0021542359609156847, ...",0.259068
35-2014.00,"Cooks, Restaurant","Prepare, season, and cook dishes such as soups...","Inspect and clean food preparation areas, such...","Prepare, season, and cook dishes such as soups...","[0.023434465751051903, 0.01888412982225418, -0...",0.254589
35-2021.00,Food Preparation Workers,Perform a variety of food preparation duties o...,"Clean and sanitize work areas, equipment, uten...",Perform a variety of food preparation duties o...,"[0.023771002888679504, 0.01435038074851036, -0...",0.248258


In [19]:
# Slim export table: just the occupation code (renamed to "id") and its embedding.
vecs = (
    merge_df[['embeddings']]
    .reset_index()
    .rename(columns={index_col: "id"})
)
vecs.head()

Unnamed: 0,id,embeddings
0,11-1011.00,"[0.0536448210477829, 0.018252937123179436, 0.0..."
1,11-1011.03,"[0.053104981780052185, 0.06315918266773224, 0...."
2,11-1021.00,"[0.03038887307047844, 0.00030763985705561936, ..."
3,11-1031.00,"[0.02162821590900421, 0.05060312896966934, 0.0..."
4,11-2011.00,"[0.07993657886981964, 0.03238765150308609, 0.0..."


In [20]:
# Persist the id/embedding table as a pickle in the working directory.
vecs.to_pickle("./task_embeddings.pkl")  

In [21]:
# Round-trip check: reload the pickle that was just written and preview it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# this notebook (or another trusted source) produced.
emb_df = pd.read_pickle("./task_embeddings.pkl")

emb_df.head()

Unnamed: 0,id,embeddings
0,11-1011.00,"[0.0536448210477829, 0.018252937123179436, 0.0..."
1,11-1011.03,"[0.053104981780052185, 0.06315918266773224, 0...."
2,11-1021.00,"[0.03038887307047844, 0.00030763985705561936, ..."
3,11-1031.00,"[0.02162821590900421, 0.05060312896966934, 0.0..."
4,11-2011.00,"[0.07993657886981964, 0.03238765150308609, 0.0..."


In [23]:
# Rebuild the embedding matrix from the vectors already stored on merge_df
# instead of pushing every occupation through the transformer a second time.
# Rows stay in merge_df order; float32 matches the dtype model.encode() returned
# for the original matrix, so the values are unchanged.
task_embeddings = np.asarray(merge_df["embeddings"].tolist(), dtype=np.float32)

In [24]:
# Wide view of the embeddings: one row per occupation, one column per dimension.
embeddings_df = pd.DataFrame(task_embeddings)

# Label the rows with merge_df's own index. The embeddings were produced from
# merge_df["description_task"], so row i belongs to merge_df.index[i]. Pairing
# them positionally with group_task_df's codes (as before) only gives the right
# labels while both tables happen to share the same row order and row set, and
# would silently mislabel vectors otherwise.
merge_embeddings_df = pd.DataFrame(task_embeddings, index=merge_df.index)

merge_embeddings_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11-1011.00,0.053645,0.018253,0.008362,0.02031,-0.026004,0.045552,-0.042385,-0.018059,0.019062,-0.013607,...,0.024042,-0.031567,-0.05986,0.035682,-0.049713,-0.066396,-0.006808,-0.008186,0.038415,-0.003729
11-1011.03,0.053105,0.063159,0.00046,0.009874,-0.005874,0.025072,-0.031806,-0.005352,-0.052735,-0.021316,...,0.001337,-0.032829,-0.072195,0.001321,-0.015946,-0.054686,0.01046,-0.005018,0.020573,-0.006804
11-1021.00,0.030389,0.000308,-0.002703,-0.004188,-0.01992,0.033918,0.028961,-0.015089,-0.026181,-0.011414,...,0.001564,-0.036881,-0.080305,0.008594,-0.052148,-0.066299,-0.007575,0.000734,0.019197,-0.012411
11-1031.00,0.021628,0.050603,0.0115,0.020515,-0.083787,0.025022,-0.031809,-0.000512,0.007156,-0.015567,...,0.04832,-0.038064,-0.040545,0.034387,-0.043746,-0.035286,-0.03091,-0.009832,0.04827,-0.013265
11-2011.00,0.079937,0.032388,0.005315,-0.046845,-0.004993,0.007684,-0.016015,-0.032601,-0.011916,-0.04192,...,0.02145,-0.027315,-0.048459,-0.004725,-0.057103,-0.027643,-0.035003,0.011107,-0.001153,-0.008362


In [25]:
# Calculate item-to-item cosine similarity: cs[i, j] compares row i with row j of
# merge_embeddings_df, giving a square matrix with (approximately) 1.0 on the diagonal.
cs = cosine_similarity(merge_embeddings_df)
print(cs)

[[0.99999994 0.59117174 0.8439366  ... 0.5616852  0.41994438 0.42809284]
 [0.59117174 1.0000002  0.6130444  ... 0.39549705 0.46762455 0.36703587]
 [0.8439366  0.6130444  0.99999964 ... 0.53880125 0.55187166 0.560198  ]
 ...
 [0.5616852  0.39549705 0.53880125 ... 1.0000005  0.5649844  0.70752424]
 [0.41994438 0.46762455 0.55187166 ... 0.5649844  1.0000002  0.70315635]
 [0.42809284 0.36703587 0.560198   ... 0.70752424 0.70315635 1.0000005 ]]


In [26]:
# Label both axes of the similarity matrix with the occupation codes and save it.
occupation_codes = merge_embeddings_df.index
cs_df = pd.DataFrame(cs, index=occupation_codes, columns=occupation_codes)
cs_df.to_csv('./similarity.csv')