In [19]:
import numpy as np
import pandas as pd
import json

In [20]:
# Import results from `jobs_merge.csv` file
id_col = 'O*NET-SOC Code'

jobs_merge = pd.read_csv("jobs_merge.csv", sep=',').set_index(id_col)
jobs_tasks =  pd.read_csv("jobs_tasks.csv", sep=',').set_index(id_col)

jobs_merge.head()

new_merge_df = jobs_merge.merge(
  jobs_tasks[['Task']], on="O*NET-SOC Code", suffixes=("_df1", "_df2"), how='inner'
)
# # Remove unnamed columns, reset indexes
new_merge_df = new_merge_df.loc[:, ~new_merge_df.columns.str.contains('^Unnamed')].reset_index().reset_index()

In [21]:
new_merge_df.head()

Unnamed: 0,index,O*NET-SOC Code,Title,Description,Job Zone,Date,Domain Source,required_level_of_education_1,required_level_of_education_2,required_level_of_education_3,...,context_duration_of_typical_work_week,career_cluster,career_pathway,education_median,education_mode,education_diff,experience_median,experience_mode,experience_diff,Task
0,0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,5.0,07/2014,Analyst,0.0,0.0,0.0,...,2.89,Business Management & Administration,General Management,8.0,8.0,0.0,9.0,11.0,2.0,Direct or coordinate an organization's financi...
1,1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",5.0,08/2021,Analyst,0.0,0.0,0.0,...,2.74,Business Management & Administration,General Management,8.0,8.0,0.0,8.0,8.0,0.0,Monitor and evaluate effectiveness of sustaina...
2,2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",4.0,07/2015,Analyst,0.0,15.2,9.0,...,2.86,Business Management & Administration,General Management,5.0,6.0,1.0,8.0,8.0,0.0,"Review financial statements, sales or activity..."
3,3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes...",4.0,06/2008,Analyst,,,,...,,Government & Public Administration,Governance,,,,,,,Analyze and understand the local and national ...
4,4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",4.0,08/2018,Analyst,6.16,9.82,0.0,...,2.48,Marketing,Marketing Management,6.0,6.0,0.0,8.0,8.0,0.0,Plan and prepare advertising and promotional m...


In [22]:
# Output data with selected columns
selected_df = new_merge_df[[
    'O*NET-SOC Code',
    'index',
    'Title',
    'Description',
    'Task',
    'Job Zone',

    # Education
    'education_mode',
    'education_median',
    # Experience
    'experience_mode',
    'experience_median',

    # Style
    'style_stress_tolerance',
    'style_achievement_effort',
    'style_social_orientation',
    'style_independence',
    'style_innovation',
    # Context
    'context_duration_of_typical_work_week',
    'context_outdoors_exposed_to_weather',
    'context_deal_with_unpleasant_or_angry_people',
    'context_exposed_to_hazardous_conditions'
]]

selected_df = selected_df.rename(columns={
    'O*NET-SOC Code': 'id',
    'Title': 'title',
    'Description': 'description',
    'Task': 'task',
    'Job Zone': 'job_zone'
  })

In [23]:
selected_df = selected_df.set_index('id')
selected_df.head()

Unnamed: 0_level_0,index,title,description,task,job_zone,education_mode,education_median,experience_mode,experience_median,style_stress_tolerance,style_achievement_effort,style_social_orientation,style_independence,style_innovation,context_duration_of_typical_work_week,context_outdoors_exposed_to_weather,context_deal_with_unpleasant_or_angry_people,context_exposed_to_hazardous_conditions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
11-1011.00,0,Chief Executives,Determine and formulate policies and provide o...,Direct or coordinate an organization's financi...,5.0,8.0,8.0,11.0,9.0,4.75,4.55,3.67,4.54,4.27,2.89,2.21,3.57,1.65
11-1011.03,1,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",Monitor and evaluate effectiveness of sustaina...,5.0,8.0,8.0,8.0,8.0,3.96,4.19,3.78,4.26,4.48,2.74,2.23,2.41,1.37
11-1021.00,2,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","Review financial statements, sales or activity...",4.0,6.0,5.0,8.0,8.0,4.33,4.18,3.57,3.99,3.65,2.86,1.96,3.44,1.64
11-1031.00,3,Legislators,"Develop, introduce, or enact laws and statutes...",Analyze and understand the local and national ...,4.0,,,,,,,,,,,,,
11-2011.00,4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",Plan and prepare advertising and promotional m...,4.0,6.0,6.0,8.0,8.0,4.35,4.1,3.7,4.04,3.99,2.48,2.46,2.92,1.06


In [24]:
selected_df_json = selected_df.to_json(orient="index")

print(selected_df_json)

Output hidden; open in https://colab.research.google.com to view.

In [25]:
with open("occupations.json", "w") as outfile:
    outfile.write(selected_df_json)

**Career Pathways**

In [26]:
cosim_df = pd.read_csv("similarity.csv", sep=',')

In [27]:
# Required columns to decide path direction
required_cols = ['Title', 'Job Zone', 'education_median', 'education_mode', 'experience_median',	'experience_mode', 'career_cluster', 'career_pathway']

In [28]:
# Insert required columns for knowledge graph
sim_final_df = cosim_df.copy()

# sim_final_df = sim_final_df.set_index(id_col)

for index in range(len(required_cols)):
  insert_index = index + 1
  sim_final_df.insert(insert_index, required_cols[index], new_merge_df[required_cols[index]])

sim_final_df.head()

required_cols_len = len(required_cols)

In [29]:
sim_final_df = sim_final_df.set_index(id_col)

In [30]:
sim_final_df.head()

Unnamed: 0_level_0,Title,Job Zone,education_median,education_mode,experience_median,experience_mode,career_cluster,career_pathway,11-1011.00,11-1011.03,...,53-7062.00,53-7062.04,53-7063.00,53-7064.00,53-7065.00,53-7071.00,53-7072.00,53-7073.00,53-7081.00,53-7121.00
O*NET-SOC Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11-1011.00,Chief Executives,5.0,8.0,8.0,9.0,11.0,Business Management & Administration,General Management,1.0,0.591172,...,0.403639,0.404391,0.516145,0.361697,0.599789,0.555423,0.510832,0.561685,0.419944,0.428093
11-1011.03,Chief Sustainability Officers,5.0,8.0,8.0,8.0,8.0,Business Management & Administration,General Management,0.591172,1.0,...,0.339362,0.530367,0.408795,0.326051,0.443778,0.437448,0.395737,0.395497,0.467625,0.367036
11-1021.00,General and Operations Managers,4.0,5.0,6.0,8.0,8.0,Business Management & Administration,General Management,0.843937,0.613044,...,0.556488,0.523341,0.608168,0.53618,0.761739,0.602178,0.587606,0.538801,0.551872,0.560198
11-1031.00,Legislators,4.0,,,,,Government & Public Administration,Governance,0.735055,0.45488,...,0.411206,0.391405,0.454076,0.348042,0.464771,0.475191,0.440169,0.471908,0.390212,0.387859
11-2011.00,Advertising and Promotions Managers,4.0,6.0,6.0,8.0,8.0,Marketing,Marketing Management,0.675352,0.507233,...,0.350519,0.362442,0.409167,0.356931,0.533211,0.427089,0.411167,0.427121,0.414391,0.367862


**Knowledge Graph**

In [31]:
def filter_valid_path(row):
  row_id = row.name
  evaluation_cols = row.iloc[required_cols_len:]
  matches = evaluation_cols.gt(0.6)
  matches = matches.index[matches].tolist()
  matches.remove(row_id)

  comparison_col = 'experience_median'
  current_row_value = row[comparison_col]
  category_col = 'career_pathway'
  current_row_category_value = row[category_col]

  filtered = sim_final_df.loc[matches]
  filtered = filtered[(filtered[comparison_col] > current_row_value) & (filtered[category_col] == current_row_category_value)]
  return filtered.index.tolist()
  
promotions = sim_final_df.apply(filter_valid_path, axis=1)

promotions.head()

O*NET-SOC Code
11-1011.00                            [11-3031.03]
11-1011.03                                      []
11-1021.00    [11-1011.00, 11-3031.03, 11-3071.04]
11-1031.00                                      []
11-2011.00                            [11-2021.00]
dtype: object

In [32]:
promotions_dict = {}
comparison_col = 'experience_median'

for source_id, row in promotions.iteritems():
  if row:
    for target_id in row:
      source_item = selected_df.loc[source_id]
      target_item = selected_df.loc[target_id]
      relation_id = f"{source_id}__{target_id}"
      promotions_dict[relation_id] = {
          "id": relation_id,
          "source_id": source_id,
          "target_id": target_id,
          "difference": target_item[comparison_col] - source_item[comparison_col]
      }
      print(f"({source_id}: {source_item['title']} [{source_item[comparison_col]}]) -[upskill]-> ({target_id}: {target_item['title']} [{target_item[comparison_col]}])")

promotions_json = json.dumps(promotions_dict)

with open("relations.json", "w") as outfile:
    outfile.write(promotions_json)

  for source_id, row in promotions.iteritems():


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(21-1021.00: Child, Family, and School Social Workers [6.0]) -[upskill]-> (21-1094.00: Community Health Workers [7.0])
(21-1022.00: Healthcare Social Workers [6.0]) -[upskill]-> (11-9151.00: Social and Community Service Managers [8.0])
(21-1022.00: Healthcare Social Workers [6.0]) -[upskill]-> (21-1094.00: Community Health Workers [7.0])
(21-1023.00: Mental Health and Substance Abuse Social Workers [7.0]) -[upskill]-> (21-1011.00: Substance Abuse and Behavioral Disorder Counselors [8.0])
(21-1091.00: Health Education Specialists [6.0]) -[upskill]-> (11-9151.00: Social and Community Service Managers [8.0])
(21-1091.00: Health Education Specialists [6.0]) -[upskill]-> (21-1094.00: Community Health Workers [7.0])
(21-1092.00: Probation Officers and Correctional Treatment Specialists [6.0]) -[upskill]-> (33-1011.00: First-Line Supervisors of Correctional Officers [7.0])
(21-1093.00: Social and Human Service Assistants [5.0]) 

In [33]:
# Print relationship
def print_relation(source_id, target_id):
  source_item = sim_final_df.loc[source_id]
  target_item = sim_final_df.loc[target_id]
  print(f"[{source_item['career_pathway']}] ({source_id}: {source_item['Title']} [{source_item[comparison_col]}]) -[upskill]-> ({target_id}: {target_item['Title']} [{target_item[comparison_col]}])")

def print_targets(id, type='source'):
  id_key = (id + "__") if type == 'source' else ("__" + id)

  def check_key(key):
    return key.startswith(id_key) if type == 'source' else key.endswith(id_key)

  results = [(key, value) for key, value in promotions_dict.items() if check_key(key)]
  for key, value in results:
    if type == 'source':
      print_relation(id, value['target_id'])
    else:
      print_relation(value['source_id'], id)

In [34]:
# Example
print("Source")
print_targets("51-4052.00")

print("Target")
print_targets("51-4052.00", type='target')

Source
[Production] (51-4052.00: Pourers and Casters, Metal [3.0]) -[upskill]-> (51-2011.00: Aircraft Structure, Surfaces, Rigging, and Systems Assemblers [6.0])
[Production] (51-4052.00: Pourers and Casters, Metal [3.0]) -[upskill]-> (51-2023.00: Electromechanical Equipment Assemblers [6.0])
[Production] (51-4052.00: Pourers and Casters, Metal [3.0]) -[upskill]-> (51-2031.00: Engine and Other Machine Assemblers [5.0])
[Production] (51-4052.00: Pourers and Casters, Metal [3.0]) -[upskill]-> (51-4022.00: Forging Machine Setters, Operators, and Tenders, Metal and Plastic [4.0])
[Production] (51-4052.00: Pourers and Casters, Metal [3.0]) -[upskill]-> (51-4031.00: Cutting, Punching, and Press Machine Setters, Operators, and Tenders, Metal and Plastic [6.0])
[Production] (51-4052.00: Pourers and Casters, Metal [3.0]) -[upskill]-> (51-4032.00: Drilling and Boring Machine Tool Setters, Operators, and Tenders, Metal and Plastic [6.0])
[Production] (51-4052.00: Pourers and Casters, Metal [3.0])

In [61]:
!pip install networkx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [71]:
import networkx as nx
import matplotlib.pyplot as plt

def build_graph_data(dictionary):
  items = list(dictionary.items())
  n = len(items)
  source = [None] * n
  target = [None] * n
  edge = ["upskill"] * n
  for i in range(n):
    key, item = items[i]
    source_id = item['source_id']
    target_id = item['target_id']
    source_item = sim_final_df.loc[source_id]
    target_item = sim_final_df.loc[target_id]
    source[i] = f"{source_id}: {source_item['Title']}"
    target[i] = f"{target_id}: {target_item['Title']}"

  return pd.DataFrame({
      "source": source,
      "target": target,
      "edge": edge
  })

# Build Graph data
def build_graph(dictionary):
  data = build_graph_data(dictionary)
  # create a directed-graph from a dataframe
  graph = nx.from_pandas_edgelist(data, "source", "target", 
    edge_attr=True, create_using=nx.MultiDiGraph())
  return graph

# Print network graph
def plot_graph(graph):
  plt.figure(figsize=(60,60))
  pos = nx.spring_layout(graph)
  nx.draw(graph, with_labels=True, node_color='red', edge_cmap=plt.cm.Blues, pos = pos)
  plt.show()


In [69]:
graph = build_graph(promotions_dict)
print(graph)

MultiDiGraph with 793 nodes and 5399 edges


In [72]:
plot_graph(graph)

Output hidden; open in https://colab.research.google.com to view.