In [1]:
import pandas as pd
import numpy as np
import os, re, ast
from fuzzywuzzy import process



In [2]:
def convert_to_list(df,columns):

    df.fillna('', inplace=True)
    
    for col in columns:
        if isinstance(df[col][0], str):
            df[col] = [ast.literal_eval(s) for s in df[col]]
        
    return df

In [3]:
def find_edu_skills(job_df,skills_df,education):

    education_jobs = job_df.loc[job_df['education_groups'] == education]
    merged_df = pd.merge(left=education_jobs, 
                         right=skills_df, 
                         how='left', 
                         left_on='link', 
                         right_on='link')
    
    merged_df_unique = merged_df.drop_duplicates(subset=['link'])
    
    skills_expanded = merged_df_unique[['link',
                                        'degree',
                                        'education_groups',
                                        'top_jobs',
                                        'job_percent',
                                        'skills']].explode('skills')
    
    skills_dict = skills_expanded['skills'].value_counts().to_dict()
    
    # Remove skills from the list where the count is less than 2 as they're probably not strongly associated with the educaiton
    skill_values = {key:val for key, val in skills_dict.items() if val > 2}
    
    return skill_values

In [4]:
def find_job_skills(job_lookup,skills_df,job):
    
    job_link = job_lookup.loc[job_lookup['top_jobs'] == job.lower()]
    
    merged_df = pd.merge(left=job_link, 
                         right=skills_df, 
                         how='left', 
                         left_on='link', 
                         right_on='link')
    
    
    merged_df_unique = merged_df.drop_duplicates(subset=['link'])
    
    skills_expanded = merged_df_unique[['link',
                                        'top_jobs',
                                        'skills']].explode('skills')
    
    job_skill_values = skills_expanded['skills'].value_counts().to_dict()
    
    return job_skill_values

In [14]:
data_path = '/Users/amanda/Documents/Projects/insight/data'

# Load education and occupation details

overview = pd.read_csv(os.path.join(data_path,'processed','noc-overview.csv'))
description = pd.read_csv(os.path.join(data_path,'processed','job-description.csv'))
regulation = pd.read_csv(os.path.join(data_path,'processed','job-regulation.csv'))
skills = pd.read_csv(os.path.join(data_path,'processed','job-skills.csv'))

job_name_df = pd.read_csv(os.path.join(data_path,'processed','education-to-job.csv'))


In [15]:
# Convert dataframe columns back to lists where necessary
overview_df = overview
description_df = convert_to_list(description,['duties','titles'])
regulation_df = regulation
skills_df = convert_to_list(skills,['expertise','skills','knowledge'])

In [16]:
overview_df

Unnamed: 0,noc_link,noc_code,job_group,description,alt_titles
0,/Structure/NocProfile/d86ebb9a0ef646f6ad0ef3d5...,11,Legislators,Legislators participate in the activities of a...,"['Aboriginal band chief', 'Aboriginal band cou..."
1,/Structure/NocProfile/4196c76219ed4408a4defee3...,12,Senior government managers and officials,"Senior government managers and officials plan,...","['Ambassador', 'Assistant chief statistician -..."
2,/Structure/NocProfile/b835b460af1c4473ae874d47...,13,"Senior managers - financial, communications ...","Senior managers in financial, communications a...","['Advertising agency president', 'Assistant ex..."
3,/Structure/NocProfile/89394570529f4d049825a3b8...,14,"Senior managers - health, education, social ...","Senior managers in health, education, social a...","['Academic association president', 'Academic b..."
4,/Structure/NocProfile/82496d82a4ce49569191a014...,15,"Senior managers - trade, broadcasting and ot...","Senior managers in trade, broadcasting and oth...","['Broadcasting corporation president', 'Casino..."
...,...,...,...,...,...
495,/Structure/NocProfile/40ee46039438487abc199210...,9615,Labourers in rubber and plastic products man...,Labourers in rubber and plastic products manuf...,['Airbag extractor - rubber products manufactu...
496,/Structure/NocProfile/40deef98f71a419baff819d4...,9616,Labourers in textile processing,Labourers in textile processing perform a vari...,"['Bale checker - textile processing', 'Battery..."
497,/Structure/NocProfile/6ca6bc6727814e549571f23c...,9617,Labourers in food and beverage processing,Labourers in food and beverage processing perf...,"['Animal feed sack filler', 'Bagger - food and..."
498,/Structure/NocProfile/607616f9d24a4a0e993763b5...,9618,Labourers in fish and seafood processing,Labourers in fish and seafood processing perfo...,"['Cannery labourer', 'Cannery labourer - fish ..."


In [7]:
# Create a simple key for job title to link
merged_df = pd.merge(left=job_link, 
                     right=skills_df, 
                     how='left', 
                     left_on='link', 
                     right_on='link')

top_job_lookup = job_name_df[['top_jobs','noc']].drop_duplicates()
top_job_lookup['top_jobs'] = job_lookup['top_jobs'].str.lower()

KeyError: "['noc'] not in index"

In [28]:
# Get skills associated with the given education and compare to new job

degree = 'Civil engineering and related studies'
alt_job = 'Administrative officers'

edu_skill_dict = find_edu_skills(job_name_df,skills_df,degree)
job_skill_dict = find_job_skills(job_lookup,skills_df,alt_job)

edu_skill_sort = sorted(list(edu_skill_dict.keys()))
job_skill_sort = sorted(list(job_skill_dict.keys()))

matching_skills = [key for key in edu_skill_dict.keys() & job_skill_dict.keys()]
new_skills = set(job_skill_dict.keys()) - set(edu_skill_dict.keys())

new_skills

{'Information Handling : Managing Information',
 'Management : Allocating and Controlling Resources'}

In [29]:
job_skill_sort

['Analysis : Analyzing Information',
 'Analysis : Planning',
 'Analysis : Projecting Outcomes',
 'Information Handling : Managing Information',
 'Management : Allocating and Controlling Resources',
 'Management : Co-ordinating and Organizing',
 'Management : Supervising']

In [30]:
def get_description(job_lookup,df,job):

    job_link = job_lookup['link'].loc[job_lookup['top_jobs'] == job.lower()]

    merged_df = pd.merge(left=job_link, 
                         right=df, 
                         how='left', 
                         left_on='link', 
                         right_on='link')
    
    
    description = merged_df['description'][0]
    
    return description

In [31]:
get_description(job_lookup, description_df,'Administrative officers')

'Administrative officers oversee and implement administrative procedures, establish work priorities, conduct analyses of administrative operations and co-ordinate acquisition of administrative services such as office space, supplies and security services. They are employed throughout the public and private sectors. Administrative officers who are supervisors are included in this unit group.'

In [32]:
job_name_df['education_groups']

0       Accounting
1       Accounting
2       Accounting
3       Accounting
4       Accounting
           ...    
1940     Sociology
1941     Sociology
1942     Sociology
1943     Sociology
1944     Sociology
Name: education_groups, Length: 1945, dtype: object

In [33]:
q = 'soc'

job_names = list(job_name_df['education_groups'].unique())

print(type(job_names))
query = [s for s in job_names if q.lower() in s.lower()]
query

<class 'list'>


['Other social sciences',
 'Social sciences',
 'Mental and Social Health Services and Allied Professions',
 'Public administration and other social service professions',
 'Social work',
 'Sociology']

In [40]:
def find_closest_match(usr_input,name_list):

    highest = process.extract(usr_input,name_list)[0:3]
    top3matches = [x[0] for x in highest]

    return top3matches

In [42]:
edu_names = list(job_name_df['education_groups'].unique())
close_match = find_closest_match('enginering',edu_names)
close_match

['Mechanical Engineering Related Technologies/Technicians',
 'Other engineering related fields',
 'Electrical and Electronic Engineering Technologies/Technicians']