In [20]:
import pandas as pd
import numpy as np
import os, re, ast
from fuzzywuzzy import process

In [64]:
def convert_to_list(df,columns):

    df.fillna('', inplace=True)
    
    for col in columns:
        if isinstance(df[col][0], str):
            df[col] = [ast.literal_eval(s) for s in df[col]]
        
    return df

In [65]:
def find_edu_skills(job_df,skills_df,education):

    education_jobs = job_df.loc[job_df['education_groups'] == education]
    merged_df = pd.merge(left=education_jobs, 
                         right=skills_df, 
                         how='left', 
                         left_on='noc', 
                         right_on='noc')
    
    merged_df_unique = merged_df.drop_duplicates(subset=['link'])
    
    skills_expanded = merged_df_unique[['noc',
                                        'edu_level',
                                        'education_groups',
                                        'top_jobs',
                                        'job_percent',
                                        'skills']].explode('skills')
    
    skills_dict = skills_expanded['skills'].value_counts().to_dict()
    
    # Remove skills from the list where the count is less than 2 as they're probably not strongly associated with the educaiton
    skill_values = {key:val for key, val in skills_dict.items() if val > 2}
    
    return skill_values

In [98]:
def find_job_skills(job_lookup,skills_df,job):
    
    job_link = job_lookup.loc[job_lookup['job_group'] == job.lower()]
    
    merged_df = pd.merge(left=job_link, 
                         right=skills_df, 
                         how='left', 
                         left_on='noc', 
                         right_on='noc')
    
    
    merged_df_unique = merged_df.drop_duplicates(subset=['noc'])
    
    skills_expanded = merged_df_unique[['noc',
                                        'job_group',
                                        'skills']].explode('skills')
    
    job_skill_values = skills_expanded['skills'].value_counts().to_dict()
    
    return job_skill_values

In [92]:
data_path = '/Users/amanda/Documents/Projects/insight/data'

# Load education and occupation details

overview = pd.read_csv(os.path.join(data_path,'processed','noc-overview.csv'))
description = pd.read_csv(os.path.join(data_path,'processed','job-description.csv'))
regulation = pd.read_csv(os.path.join(data_path,'processed','job-regulation.csv'))
skills = pd.read_csv(os.path.join(data_path,'processed','job-skills.csv'))

job_name_df = pd.read_csv(os.path.join(data_path,'processed','education-to-job.csv'))


In [93]:
# Convert dataframe columns back to lists where necessary
overview_df = overview
description_df = convert_to_list(description,['duties','titles'])
regulation_df = regulation
skills_df = convert_to_list(skills,['expertise','skills','knowledge'])

In [119]:
job_lookup = overview_df[['job_group','noc']].drop_duplicates()
job_lookup['job_group'] = job_lookup['job_group'].str.lower().str.strip()

In [128]:
[print(x) for x in skills_df.noc]

NOC 7205
NOC 7304
NOC 7305
NOC 7232
NOC 7243
NOC 7244
NOC 9214
NOC 9221
NOC 9224
NOC 9227
NOC 9411
NOC 7294
NOC 7312
NOC 7313
NOC 7314
NOC 7315
NOC 7316
NOC 7322
NOC 7331
NOC 7332
NOC 7333
NOC 6343
NOC 7361
NOC 7362
NOC 7371
NOC 5131
NOC 5132
NOC 5134
NOC 5136
NOC 5211
NOC 7534
NOC 7441
NOC 7442
NOC 7451
NOC 7611
NOC 7621
NOC 7622
NOC 8211
NOC 8231
NOC 8255
NOC 8261
NOC 8411
NOC 8421
NOC 8422
NOC 8431
NOC 8441
NOC 8611
NOC 8614
NOC 9211
NOC 9212
NOC 9213
NOC 9521
NOC 9522
NOC 9525
NOC 9526
NOC 9412
NOC 9413
NOC 9414
NOC 7372
NOC 7381
NOC 9423
NOC 9431
NOC 9432
NOC 9434
NOC 9445
NOC 9461
NOC 9527
NOC 9531
NOC 9462
NOC 9471
NOC 9474
NOC 412
NOC 511
NOC 632
NOC 1112
NOC 1213
NOC 1221
NOC 1223
NOC 1243
NOC 1411
NOC 9532
NOC 9534
NOC 9535
NOC 9415
NOC 9422
NOC 9613
NOC 9614
NOC 9615
NOC 9616
NOC 9617
NOC 9619
NOC 6561
NOC 1431
NOC 6563
NOC 6622
NOC 6731
NOC 6732
NOC 6733
NOC 7203
NOC 7204
NOC 113
NOC 121
NOC 132
NOC 7311
NOC 7321
NOC 7334
NOC 6345
NOC 7373
NOC 7521
NOC 7444
NOC 7452
NOC 151

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [125]:
job = 'Statistical officers and related research support occupations'
job_link = job_lookup.loc[job_lookup['job_group'] == job.lower().strip()]

job_link

Unnamed: 0,job_group,noc
74,statistical officers and related research supp...,NOC 1254


In [126]:
# Get skills associated with the given education and compare to new job

degree = 'Civil engineering and related studies'
alt_job = 'Statistical officers and related research support occupations'

edu_skill_dict = find_edu_skills(job_name_df,skills_df,degree)
job_skill_dict = find_job_skills(job_lookup,skills_df,alt_job)

edu_skill_sort = sorted(list(edu_skill_dict.keys()))
job_skill_sort = sorted(list(job_skill_dict.keys()))

matching_skills = [key for key in edu_skill_dict.keys() & job_skill_dict.keys()]
new_skills = set(job_skill_dict.keys()) - set(edu_skill_dict.keys())

new_skills

set()

In [127]:
job_skill_sort

[]

In [108]:
def get_description(job_lookup,df,job):

    job_link = job_lookup['noc'].loc[job_lookup['job_group'] == job.lower()]
    
    print(job_link)

    merged_df = pd.merge(left=job_link, 
                         right=df, 
                         how='left', 
                         left_on='noc', 
                         right_on='noc')
    
    
    description = merged_df['description'][0]
    
    return description

In [109]:
get_description(job_lookup,description_df,'Administrative officers')

Series([], Name: noc, dtype: object)


IndexError: index out of bounds

In [32]:
job_name_df['education_groups']

0       Accounting
1       Accounting
2       Accounting
3       Accounting
4       Accounting
           ...    
1940     Sociology
1941     Sociology
1942     Sociology
1943     Sociology
1944     Sociology
Name: education_groups, Length: 1945, dtype: object

In [33]:
q = 'soc'

job_names = list(job_name_df['education_groups'].unique())

print(type(job_names))
query = [s for s in job_names if q.lower() in s.lower()]
query

<class 'list'>


['Other social sciences',
 'Social sciences',
 'Mental and Social Health Services and Allied Professions',
 'Public administration and other social service professions',
 'Social work',
 'Sociology']

In [40]:
def find_closest_match(usr_input,name_list):

    highest = process.extract(usr_input,name_list)[0:3]
    top3matches = [x[0] for x in highest]

    return top3matches

In [42]:
edu_names = list(job_name_df['education_groups'].unique())
close_match = find_closest_match('enginering',edu_names)
close_match

['Mechanical Engineering Related Technologies/Technicians',
 'Other engineering related fields',
 'Electrical and Electronic Engineering Technologies/Technicians']