In [1]:
import pandas as pd
import numpy as np
import os, re, ast

In [2]:
def convert_to_list(df,columns):
    """Turn stringified Python literals in the given columns back into objects.

    Missing values anywhere in ``df`` are first replaced with ``''`` (done in
    place, so the caller's frame is modified). Then every string cell in each
    requested column is parsed with ``ast.literal_eval``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    columns : iterable of str
        Names of the columns whose string cells should be parsed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string cell is
        not a valid Python literal.
    """

    def _parse_cell(value):
        # Cells that are already parsed (or are not text at all) are left alone,
        # which makes re-running the notebook cell harmless.
        if not isinstance(value, str):
            return value
        # Blank cells come from the fillna('') below; literal_eval('') would
        # raise SyntaxError, so they become an empty list instead.
        if not value.strip():
            return []
        return ast.literal_eval(value)

    df.fillna('', inplace=True)

    for col in columns:
        # Decide per element rather than peeking at label 0, which fails on an
        # empty frame or on any index that does not contain the label 0.
        df[col] = [_parse_cell(value) for value in df[col]]

    return df

In [3]:
def find_edu_skills(job_df,skills_df,education,min_count=3):
    """Count the skills listed for the jobs of one education group.

    Rows of ``job_df`` whose 'education_groups' equals ``education`` are
    left-joined to ``skills_df`` on 'link'. Each link is kept once, the
    'skills' lists are flattened, and every skill is counted.

    Parameters
    ----------
    job_df : pandas.DataFrame
        Must contain the columns 'education_groups' and 'link'.
    skills_df : pandas.DataFrame
        Must contain the columns 'link' and 'skills' (a list-like per row).
    education : str
        Value of 'education_groups' to select.
    min_count : int, optional
        Smallest count a skill needs in order to be returned. The default of
        3 matches the previously hard-coded ``count > 2`` rule.

    Returns
    -------
    dict
        Maps skill to its number of occurrences across the de-duplicated
        links, most frequent first. Skills with fewer than ``min_count``
        occurrences are left out.
    """

    education_jobs = job_df.loc[job_df['education_groups'] == education]
    merged_df = pd.merge(left=education_jobs,
                         right=skills_df,
                         how='left',
                         left_on='link',
                         right_on='link')

    # A link can appear several times after the join; count each one once.
    merged_df_unique = merged_df.drop_duplicates(subset=['link'])

    # Only the 'skills' column is needed for counting. Missing values (links
    # without a skills row) are ignored by value_counts.
    skill_counts = merged_df_unique['skills'].explode().value_counts().to_dict()

    # Drop rarely seen skills: they are probably not strongly associated with
    # the education.
    return {skill: count for skill, count in skill_counts.items()
            if count >= min_count}

In [68]:
def find_job_skills(job_lookup,skills_df,job):
    """Count the skills listed for a single job title.

    ``job`` is lower-cased and compared with ``job_lookup['top_jobs']`` as
    stored, so the lookup table is expected to hold lower-case titles. The
    matching rows are left-joined to ``skills_df`` on 'link', each link is
    kept once, and the flattened 'skills' values are counted.

    Returns a dict mapping skill to its number of occurrences.
    """
    title_mask = job_lookup['top_jobs'] == job.lower()
    matching_rows = job_lookup.loc[title_mask]

    # One row per link after joining the skills onto the matching titles.
    per_link = pd.merge(
        left=matching_rows,
        right=skills_df,
        how='left',
        left_on='link',
        right_on='link',
    ).drop_duplicates(subset=['link'])

    # One row per (link, skill) pair.
    wanted_columns = ['link', 'top_jobs', 'skills']
    one_skill_per_row = per_link[wanted_columns].explode('skills')

    return one_skill_per_row['skills'].value_counts().to_dict()

In [5]:
# Root data folder. Defaults to the original local path; set the
# INSIGHT_DATA_DIR environment variable to run the notebook elsewhere.
data_path = os.environ.get('INSIGHT_DATA_DIR',
                           '/Users/amanda/Documents/Projects/insight/data')
processed_path = os.path.join(data_path, 'processed')

# Load education and occupation details

overview = pd.read_csv(os.path.join(processed_path, 'job-overview.csv'))
description = pd.read_csv(os.path.join(processed_path, 'job-description.csv'))
regulation = pd.read_csv(os.path.join(processed_path, 'job-regulation.csv'))
skills = pd.read_csv(os.path.join(processed_path, 'job-skills.csv'))

job_name_df = pd.read_csv(os.path.join(processed_path, 'education-to-job.csv'))


In [6]:
# These two tables are used exactly as loaded.
overview_df, regulation_df = overview, regulation

# Parse the string cells of these columns back into Python objects.
description_df = convert_to_list(description, ['duties', 'titles'])
skills_df = convert_to_list(skills, ['expertise', 'skills', 'knowledge'])

In [34]:
# Create a simple key for job title to link.
# Titles are lower-cased before de-duplicating so that rows differing only in
# letter case collapse into one. Using .assign builds a new frame instead of
# writing a column into the result of drop_duplicates().
job_lookup = (
    job_name_df[['top_jobs', 'link']]
    .assign(top_jobs=lambda frame: frame['top_jobs'].str.lower())
    .drop_duplicates()
)

In [71]:
# Compare the skills linked to a field of study with those of a candidate job

degree = 'Civil engineering and related studies'
alt_job = 'Administrative officers'

# Skill -> count mappings for the education group and for the alternative job
edu_skill_dict = find_edu_skills(job_name_df, skills_df, degree)
job_skill_dict = find_job_skills(job_lookup, skills_df, alt_job)

# Alphabetical skill names for each side
edu_skill_sort = sorted(edu_skill_dict)
job_skill_sort = sorted(job_skill_dict)

# Skills found on both sides, and skills that only the alternative job lists
matching_skills = list(edu_skill_dict.keys() & job_skill_dict.keys())
new_skills = set(job_skill_dict) - set(edu_skill_dict)

new_skills

{'Information Handling : Managing Information',
 'Management : Allocating and Controlling Resources'}

In [72]:
# Display the alphabetically sorted skill names found for the alternative job.
job_skill_sort

['Analysis : Analyzing Information',
 'Analysis : Planning',
 'Analysis : Projecting Outcomes',
 'Information Handling : Managing Information',
 'Management : Allocating and Controlling Resources',
 'Management : Co-ordinating and Organizing',
 'Management : Supervising']

In [66]:
def get_description(job_lookup,df,job):
    """Return the description stored for a job title.

    ``job`` is lower-cased and compared with ``job_lookup['top_jobs']`` as
    stored, so the lookup table is expected to hold lower-case titles. The
    links of the matching rows are left-joined to ``df`` on 'link' and the
    'description' of the first joined row is returned.

    Parameters
    ----------
    job_lookup : pandas.DataFrame
        Must contain the columns 'top_jobs' and 'link'.
    df : pandas.DataFrame
        Must contain the columns 'link' and 'description'.
    job : str
        Job title to look up; its letter case is ignored.

    Returns
    -------
    object
        The 'description' value of the first matching row. This is a missing
        value when the link has no row in ``df``.

    Raises
    ------
    KeyError
        If no row of ``job_lookup`` matches ``job``.
    """

    # Keep the links as a one-column frame so the merge gets two DataFrames.
    job_link = job_lookup.loc[job_lookup['top_jobs'] == job.lower(), ['link']]

    if job_link.empty:
        # Previously this surfaced as an unexplained "KeyError: 0".
        raise KeyError('No job titled {!r} found in job_lookup'.format(job))

    merged_df = pd.merge(left=job_link,
                         right=df,
                         how='left',
                         left_on='link',
                         right_on='link')

    # Positional access: take the first joined row whatever its index label.
    return merged_df['description'].iloc[0]

In [67]:
# Fetch the description stored for the 'Administrative officers' job title.
get_description(job_lookup, description_df,'Administrative officers')

'Administrative officers oversee and implement administrative procedures, establish work priorities, conduct analyses of administrative operations and co-ordinate acquisition of administrative services such as office space, supplies and security services. They are employed throughout the public and private sectors. Administrative officers who are supervisors are included in this unit group.'