In [15]:
import os
import re
import psycopg2
import requests
import json, urllib
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go

In [2]:
def back_to_list(df, col):
    """Convert columns in df back to lists from strings (this happens when saving to csv)
    """
    
    df[col] = [s.strip("'[") for s in df[col]]
    df[col] = [s.strip("]'") for s in df[col]]
    df[col] = [s.split("', '") for s in df[col]]
    
    return df[col]

In [3]:
# Load education and occupation details

jobs_df = pd.read_csv('/Users/amanda/Documents/Projects/insight/data/processed/jobbank-fields.csv')
details_df = pd.read_csv('/Users/amanda/Documents/Projects/insight/data/processed/jobbank-details.csv')
job_name_df = pd.read_csv('/Users/amanda/Documents/Projects/insight/data/processed/education-to-job.csv')


# Clean and organize skills information

In [5]:
jobs_df['description'].str.split('found under ')
edu_categories = [s.split('found under ')[1] for s in jobs_df['description']]
edu_categories = [s.strip('.') for s in edu_categories]
jobs_df['education_groups'] = edu_categories

jobs_df['top_jobs'] = back_to_list(jobs_df, 'top_jobs')
jobs_df['top_job_links'] = back_to_list(jobs_df, 'top_job_links')
jobs_df['job_percent'] = back_to_list(jobs_df, 'job_percent')

education_df = jobs_df[['degree','education_groups','top_jobs','job_percent']].drop_duplicates(subset =['education_groups','degree']) 


In [6]:
details_df['requirements'] = back_to_list(details_df, 'requirements')
details_df['skills'] = back_to_list(details_df, 'skills')

# Build dataframe for Sankey diagram

In [7]:
job_name_df = pd.read_csv('/Users/amanda/Documents/Projects/insight/data/processed/education-to-job.csv')

#job_name_df['identifier'] = job_name_df['education-groups'].str.cat(job_name_df['degree'],sep=" : ")

#job_name_df['job-percent'].fillna('', inplace=True)

#job_name_df['job-percent'] = [s.strip('%') for s in job_name_df['job-percent']]
#job_name_df['job-percent'] = pd.to_numeric(job_name_df['job-percent'], errors='coerce')

In [8]:
job_name_df

Unnamed: 0,degree,education_groups,top_jobs,job_percent,top_job_links,link,identifier
0,College/CEGEP,Accounting,Accounting and related clerks,11.99,/marketreport/summary-occupation/14122/ca,/14122/ca,Accounting : College/CEGEP
1,College/CEGEP,Accounting,Financial auditors and accountants,11.23,/marketreport/summary-occupation/131/ca,/131/ca,Accounting : College/CEGEP
2,College/CEGEP,Accounting,Accounting technicians and bookkeepers,10.46,/marketreport/summary-occupation/24500/ca,/24500/ca,Accounting : College/CEGEP
3,College/CEGEP,Accounting,Administrative officers,3.92,/marketreport/summary-occupation/12462/ca,/12462/ca,Accounting : College/CEGEP
4,College/CEGEP,Accounting,Retail salespersons,3.25,/marketreport/summary-occupation/20599/ca,/20599/ca,Accounting : College/CEGEP
...,...,...,...,...,...,...,...
1940,Master's degree,Sociology,Administrative officers,2.63,/marketreport/summary-occupation/12462/ca,/12462/ca,Sociology : Master's degree
1941,Master's degree,Sociology,Social and community service workers,2.63,/marketreport/summary-occupation/5112/ca,/5112/ca,Sociology : Master's degree
1942,Master's degree,Sociology,Business development officers and marketing re...,2.63,/marketreport/summary-occupation/3916/ca,/3916/ca,Sociology : Master's degree
1943,Master's degree,Sociology,University professors and lecturers,1.97,/marketreport/summary-occupation/4707/ca,/4707/ca,Sociology : Master's degree


In [304]:
import matplotlib.colors as mc
import colorsys

def adjust_lightness(color, amount=1.28):
    c = color
    c = colorsys.rgb_to_hls(*mc.to_rgb(c))
    rgb = colorsys.hls_to_rgb(c[0], max(0, min(1, amount * c[1])), c[2])
    return rgb

In [305]:
import numpy as np
import matplotlib.cm
import seaborn as sns
from  more_itertools import unique_everseen

def get_cmap_string(palette, domain):

    domain_unique = list(unique_everseen(domain ))
    rgbmap = sns.color_palette(palette, len(domain_unique)) #matplotlib.cm.get_cmap(palette, lut=len(domain_unique))   
    color_dict = res = dict(zip(domain_unique, rgbmap))   
    color_list = [color_dict.get(i) for i in domain]

    return color_list

In [306]:
def build_sankey(df, degree):

    data = df.where(df['education_groups']==degree).dropna()
    
    data['identifier'] = data['identifier'].str.replace(" : ","- ")

    all_nodes = data['identifier'].values.tolist() + data['top_jobs'].values.tolist()
    source_indices = [all_nodes.index(identifier) for identifier in data['identifier']]
    target_indices = [all_nodes.index(top_job) for top_job in data['top_jobs']]
    
    cmap = get_cmap_string(palette='husl', domain=all_nodes)
    cmap2 = list(map(adjust_lightness, cmap))

    fig = go.Figure(data=[go.Sankey
                          (node = dict(pad = 15,
                                       thickness = 20,
                                       line = dict(color = "black",
                                                   width = 0),
                                       label = all_nodes,
                                       color = ['rgb' + str(s).strip('[]') for s in cmap]
                                       ),
                           link = dict(source = source_indices,
                                       target = target_indices,
                                       value = data['job_percent']*100,
                                       color = ['rgb' + str(s).strip('[]') for s in cmap2]
                                       ),
                          )
                         ]
                   )

    filename = 'static/{}.png'.format(degree)

    fig.update_layout(
    autosize=False,
    width=800,
    height=800)

    return fig

In [307]:
fig = build_sankey(job_name_df,'Accounting')

fig.show()

In [82]:
job_name_df.where(job_name_df['education-groups']=='Accounting').dropna()

Unnamed: 0,degree,education-groups,top-jobs,job-percent,identifier
0,College/CEGEP,Accounting,Accounting and related clerks,11.99%,Accounting : College/CEGEP
1,College/CEGEP,Accounting,Financial auditors and accountants,11.23%,Accounting : College/CEGEP
2,College/CEGEP,Accounting,Accounting technicians and bookkeepers,10.46%,Accounting : College/CEGEP
3,College/CEGEP,Accounting,Administrative officers,3.92%,Accounting : College/CEGEP
4,College/CEGEP,Accounting,Retail salespersons,3.25%,Accounting : College/CEGEP
5,College/CEGEP,Accounting,Administrative assistants,3.06%,Accounting : College/CEGEP
6,College/CEGEP,Accounting,Retail and wholesale trade managers,2.63%,Accounting : College/CEGEP
7,College/CEGEP,Accounting,General office support workers,2.53%,Accounting : College/CEGEP
8,College/CEGEP,Accounting,Other customer and information services repres...,2.48%,Accounting : College/CEGEP
9,College/CEGEP,Accounting,Customer services representatives - financial ...,2.05%,Accounting : College/CEGEP


In [7]:
job_name_df.to_csv('/Users/amanda/Documents/Projects/insight/data/processed/education-to-job.csv', index=False)