In [7]:
import numpy as np
import os
import json
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import contractions
import plotly.io as pio
import plotly.graph_objs as go
import seaborn as sns
pio.renderers.default = "plotly_mimetype+notebook_connected"

# Load JSON data
def read_json_data(folder_path):
    data_list = []
    for file in os.listdir(folder_path):
        with open(os.path.join(folder_path, file), 'r') as f:
            json_data = json.load(f)
            jobs_results = json_data['jobs_results']
            data_list.extend(jobs_results)
    df = pd.DataFrame(data_list)
    return df

df_dc_da = read_json_data('../2023-04-14-job-search-location-USA/data_analysis')
df_dc_ds = read_json_data('../2023-04-14-job-search-location-DC/data_science')
df_dc_ml = read_json_data('../2023-04-14-job-search-location-DC/machine_learning')

df_us_da = read_json_data('../2023-04-14-job-search-location-USA/data_analysis')
df_us_ds = read_json_data('../2023-04-14-job-search-location-USA/data_science')
df_us_ml = read_json_data('../2023-04-14-job-search-location-USA/machine_learning')

# format each df accordingly
df_dc_da['job_type']='data_analysis'
df_dc_ds['job_type']='data_science'
df_dc_ml['job_type']='machine_learning'
df_us_da['job_type']='data_analysis'
df_us_ds['job_type']='data_science'
df_us_ml['job_type']='machine_learning'

df_dc_da['location_overall']='DC'
df_dc_ds['location_overall']='DC'
df_dc_ml['location_overall']='DC'
df_us_da['location_overall']='USA'
df_us_ds['location_overall']='USA'
df_us_ml['location_overall']='USA'

df_us_da= df_us_da[~df_us_da['location'].str.contains('DC')]
df_us_ds= df_us_ds[~df_us_ds['location'].str.contains('DC')]
df_us_ml= df_us_ml[~df_us_ml['location'].str.contains('DC')]


# concat dfs
df = pd.concat([df_dc_da,df_dc_ds,df_dc_ml,df_us_da,df_us_ds,df_us_ml],axis=0)
df.head()

Unnamed: 0,title,company_name,location,via,description,job_highlights,related_links,extensions,detected_extensions,job_id,job_type,location_overall
0,Master Data Analyst,Cargill,"Olathe, KS",via Career At Cargill,"Want to build a stronger, more sustainable fut...","[{'title': 'Qualifications', 'items': ['Bachel...","[{'link': 'http://www.cargill.com/', 'text': '...","[4 days ago, Full-time, Health insurance, Dent...","{'posted_at': '4 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJNYXN0ZXIgRGF0YSBBbmFseXN0Ii...,data_analysis,DC
1,Data Analyst,Rectangle Health,Anywhere,via LinkedIn,"Data Analyst\n\nAt Rectangle Health, we believ...","[{'title': 'Qualifications', 'items': ['Bachel...",[{'link': 'https://www.google.com/search?q=Rec...,"[1 day ago, Work from home, Full-time]","{'posted_at': '1 day ago', 'schedule_type': 'F...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,data_analysis,DC
2,Senior Data Analyst,Tendo Systems,Anywhere,via Startup Jobs,"As a Senior Data Analyst, you will play a cruc...","[{'title': 'Qualifications', 'items': ['5+ yea...","[{'link': 'http://tendo.com/', 'text': 'tendo....","[4 days ago, Work from home, Full-time, Health...","{'posted_at': '4 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJTZW5pb3IgRGF0YSBBbmFseXN0Ii...,data_analysis,DC
3,Senior Data Analyst,Walmart,"Bentonville, AR",via Walmart Careers,What you'll do...\n\nPosition: Senior Data Ana...,[{'items': ['What you'll do... Position: Seni...,"[{'link': 'https://www.walmart.com/', 'text': ...","[6 days ago, Full-time]","{'posted_at': '6 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJTZW5pb3IgRGF0YSBBbmFseXN0Ii...,data_analysis,DC
4,Data Analyst,Robert Half,United States,via LinkedIn,This role will be once a week in DC and Remote...,"[{'title': 'Qualifications', 'items': ['Micros...","[{'link': 'http://www.rhi.com/', 'text': 'rhi....","[3 hours ago, 75K–95K a year, Full-time]","{'posted_at': '3 hours ago', 'schedule_type': ...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,data_analysis,DC


In [106]:
# extract qualifiations, responsibilities and benefits where possible
qualifications = []
responsibilities = []
benefits = []
for row in df['job_highlights']:
    qual = []
    res = []
    ben = []
    for dictionary in row:
        if 'title' in dictionary:
            if dictionary['title'] == 'Qualifications':
                qual += dictionary['items']
            elif dictionary['title'] == 'Responsibilities':
                res += dictionary['items']
            elif dictionary['title'] == 'Benefits':
                ben += dictionary['items']
    qualifications.append(qual)
    responsibilities.append(res)
    benefits.append(ben)


schedule_types = []
remote_booleans = []
for row in df['detected_extensions']:
    if 'schedule_type' in row:
        schedule_types.append(row['schedule_type'])
    else:
        schedule_types.append(None) 
    if 'work_from_home' in row:
        remote_booleans.append(row['work_from_home'])
    else:
        remote_booleans.append(None)


# create two new columns
df['qualification'] = qualifications
df['responsibility'] = responsibilities
df['benefits'] = benefits
df['schedule_type'] = schedule_types
df['remote_status'] = remote_booleans


# Define a function to clean the text
def clean_text(text):
    # Convert the list of strings to a single string
    text = ' '.join(text)
    # Expand contractions
    text = contractions.fix(text)
    # Remove all the special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text into words
    words = word_tokenize(text)
    # Remove stop words
    words = [word for word in words if word not in stopwords.words('english')]
    # Join the words back into a string
    text = ' '.join(words)
    # remove non-English words
    return text

# Apply the function to the "responsibilities" column
df['responsibility_n'] = df['responsibility'].apply(clean_text)
df['qualification_n'] = df['qualification'].apply(clean_text)

df.head()

def word_count(df, column):
    # create a list of unique job types
    job_types = df['job_type'].unique()
    # create an empty list to hold the results
    results = []
    # loop through each job type
    for job_type in job_types:
        # create a new dataframe with only the current job type
        job_df = df[df['job_type'] == job_type]
        # create a dictionary to hold the word counts
        word_count_dict = {}
        # loop through each row in the job dataframe
        for index, row in job_df.iterrows():
            # split the text into words
            words = row[column].split()
            # loop through each word and add it to the dictionary
            for word in words:
                if word not in word_count_dict:
                    word_count_dict[word] = 1
                else:
                    word_count_dict[word] += 1
        # loop through the dictionary and add the results to the list
        for word, count in word_count_dict.items():
            results.append({'job_type': job_type, 'word': word, 'frequency': count})
    
    # create a new dataframe from the results list and return it
    return pd.DataFrame(results)

df_word_count_res = word_count(df, 'responsibility_n')
df_word_count_qual = word_count(df, 'qualification_n')


In [107]:
df_word_count_res = df_word_count_res.sort_values(['job_type', 'frequency'], ascending=[True, False])
df_word_count_res = df_word_count_res.groupby('job_type').head(30)
df_word_count_qual = df_word_count_qual.sort_values(['job_type', 'frequency'], ascending=[True, False])
df_word_count_qual = df_word_count_qual.groupby('job_type').head(30)


In [227]:
df_word_count_qual_da = df_word_count_qual[df_word_count_qual['job_type']=='data_analysis']
df_word_count_qual_ds = df_word_count_qual[df_word_count_qual['job_type']=='data_science']
df_word_count_qual_ml = df_word_count_qual[df_word_count_qual['job_type']=='machine_learning']

# Define a custom color scale
custom_colors = ['#3B5873', '#A3BCD6', '#343E29', '#82827C', '#C7D3AB','#758373','#1F3053']

# create the treemap figure with custom hover text and color scale
fig = px.treemap(df_word_count_qual_da, path=['job_type', 'word'], values='frequency', color='word',
                 color_discrete_sequence=custom_colors,
                 hover_data={'frequency': ':f'},
                 )

# remove the treemap borders
fig.update_traces(
    go.Treemap(
        marker=dict(line=dict(width=0))
    )
)
fig.update_traces(textfont=dict(color='white'))
fig.update_traces(hovertemplate='<b>%{label}</b><br>Frequency:%{customdata[0]:.0f}')

# customize the font and size of the title and labels
fig.update_layout(title={'text': "Top 30 words describing<br>data analysis qualifications", 'font': {'size': 24}},
                  font={'size': 18})

# customize the margins
fig.update_layout(
    title={
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10,},
        'font': {'size': 20}
    },
    width=600,
    height=600,
    margin=dict(t=85, l=25, r=25, b=25),
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    font=dict(size = 16, color = 'white')
) 


# show the figure
fig.show()
fig.write_html('../images/tree_map1.html')


In [225]:
df_word_count_qual_da = df_word_count_qual[df_word_count_qual['job_type']=='data_analysis']
df_word_count_qual_ds = df_word_count_qual[df_word_count_qual['job_type']=='data_science']
df_word_count_qual_ml = df_word_count_qual[df_word_count_qual['job_type']=='machine_learning']

# Define a custom color scale
custom_colors = ['#3B5873', '#A3BCD6', '#343E29', '#82827C', '#C7D3AB','#758373','#1F3053']

# create the treemap figure with custom hover text and color scale
fig = px.treemap(df_word_count_qual_ds, path=['job_type', 'word'], values='frequency', color='word',
                 color_discrete_sequence=custom_colors,
                 hover_data={'frequency': ':f'},
                 )

# remove the treemap borders
fig.update_traces(
    go.Treemap(
        marker=dict(line=dict(width=0))
    )
)
fig.update_traces(textfont=dict(color='white'))
fig.update_traces(hovertemplate='<b>%{label}</b><br>Frequency:%{customdata[0]:.0f}')

# customize the font and size of the title and labels
fig.update_layout(title={'text': "Top 30 words describing<br>data science qualifications", 'font': {'size': 24}},
                  font={'size': 18})

# customize the margins
fig.update_layout(
    title={
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10,},
        'font': {'size': 20}
    },
    width=600,
    height=600,
    margin=dict(t=85, l=25, r=25, b=25),
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    font=dict(size = 16, color = 'white')
) 


# show the figure
fig.show()
fig.write_html('../images/tree_map2.html')


In [224]:
df_word_count_qual_da = df_word_count_qual[df_word_count_qual['job_type']=='data_analysis']
df_word_count_qual_ds = df_word_count_qual[df_word_count_qual['job_type']=='data_science']
df_word_count_qual_ml = df_word_count_qual[df_word_count_qual['job_type']=='machine_learning']

# Define a custom color scale
custom_colors = ['#3B5873', '#A3BCD6', '#343E29', '#82827C', '#C7D3AB','#758373','#1F3053']

# create the treemap figure with custom hover text and color scale
fig = px.treemap(df_word_count_qual_ml, path=['job_type', 'word'], values='frequency', color='word',
                 color_discrete_sequence=custom_colors,
                 hover_data={'frequency': ':f'},
                 )

# remove the treemap borders
fig.update_traces(
    go.Treemap(
        marker=dict(line=dict(width=0))
    )
)
fig.update_traces(textfont=dict(color='white'))
fig.update_traces(hovertemplate='<b>%{label}</b><br>Frequency:%{customdata[0]:.0f}')

# customize the font and size of the title and labels
fig.update_layout(title={'text': "Top 30 words describing<br>machine learning qualifications", 'font': {'size': 24}},
                  font={'size': 18})

# customize the margins
fig.update_layout(
    title={
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10,},
        'font': {'size': 20}
    },
    width=600,
    height=600,
    margin=dict(t=85, l=25, r=25, b=25),
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    font=dict(size = 16, color = 'white')
) 


# show the figure
fig.show()
fig.write_html('../images/tree_map3.html')


In [52]:
# treemap of qual or resp for job types 
import plotly.express as px
import numpy as np

# # Define a custom color scale
# custom_colors = ['#1a1a1a', '#333333', '#4d4d4d', '#666666', '#7f7f7f', '#999999', '#b3b3b3', '#cccccc']

# # create the treemap figure with custom hover text and color scale
# fig = px.treemap(df_word_count_qual, path=['job_type', 'word'], values='frequency', color='word',
#                  color_continuous_scale=custom_colors,
#                  hover_data={'frequency': ':.2f'},
#                  hover_name='word', #textinfo='label+value')
# )

# define custom colors for each category
color_map = {'Category A': '#FF5733', 'Category B': '#FFC300', 'Category C': '#DAF7A6'}

# create the treemap figure with custom hover text and color scale
fig = px.treemap(df_word_count_qual, path=['job_type', 'word'], values='frequency', color='word',
                 color_discrete_map=color_map,
                 hover_data={'frequency': ':.2f'},
                 hover_name='word',
                 ) 


# customize the font and size of the title and labels
fig.update_layout(title={'text': "Word Frequencies by Job Type", 'font': {'size': 24}},
                  font={'size': 18})


# customize the margins and background color
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
# show the figure
fig.show()
fig.write_html('../images/tree_map.html')


NameError: name 'df_word_count_qual_da' is not defined

In [32]:
# create unique list of job types for dropdown menu
job_types = df_word_count_res['job_type'].unique().tolist()

from plotly.subplots import make_subplots
pal = list(sns.color_palette(palette='Blues_r', n_colors=20).as_hex())
# create three subplots, each with one donut chart
specs = [[{'type': 'pie'}]] * 3
fig = make_subplots(rows=3, cols=1, specs=specs)
# add the first donut chart to the left subplot
fig.add_trace(go.Pie(labels=df_word_count_res[df_word_count_res['job_type'] == job_types[0]]['word'],
                     values=df_word_count_res[df_word_count_res['job_type'] == job_types[0]]['frequency'],
                     textposition='outside', 
                     hole=.6,
                     insidetextorientation='auto',
                     hovertemplate='percent=%{percent:.0%}<br>frequency=%{value}',
                     marker=dict(colors=pal),
                     showlegend=False,
                     title='Data Analysis',
                     texttemplate="%{label}"),
              row=1, col=1)

# add the second donut chart to the right subplot
fig.add_trace(go.Pie(labels=df_word_count_res[df_word_count_res['job_type'] == job_types[1]]['word'],
                     values=df_word_count_res[df_word_count_res['job_type'] == job_types[1]]['frequency'],
                     textposition='outside', 
                     hole=.6,
                     hovertemplate='percent=%{percent:.0%}<br>frequency=%{value}',
                     marker=dict(colors=pal),
                     showlegend=False,
                     title='Data Science',
                     texttemplate="%{label}"),
              row=2, col=1)

# add the second donut chart to the right subplot
fig.add_trace(go.Pie(labels=df_word_count_res[df_word_count_res['job_type'] == job_types[2]]['word'],
                     values=df_word_count_res[df_word_count_res['job_type'] == job_types[2]]['frequency'],
                     textposition='outside', 
                     hole=.6,
                     hovertemplate='percent=%{percent:.0%}<br>frequency=%{value}',
                     marker=dict(colors=pal),
                     title='Machine Learning',
                     showlegend=False,
                     texttemplate="%{label}"),
              row=3, col=1)

# set the layout options for the entire figure
fig.update_layout(
    title={
        'text': f"Top 20 words describing responsibilities<br>by job type",
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10, 'b':80},
        'font': {'size': 20}
    },
    width=600,
    height=1000,
    margin=dict(t=130),
)

fig.show()
fig.write_html('../images/donut_chart.html')


In [203]:
# create unique list of job types for dropdown menu
job_types = df_word_count_res['job_type'].unique().tolist()

from plotly.subplots import make_subplots
pal = list(sns.color_palette(palette='Blues_r', n_colors=30).as_hex())

# add the pie chart to the figure
fig = go.Figure(go.Pie(labels=df_word_count_res[df_word_count_res['job_type'] == job_types[0]]['word'],
                     values=df_word_count_res[df_word_count_res['job_type'] == job_types[0]]['frequency'],
                     textposition='outside', 
                     hole=.4,
                     insidetextorientation='auto',
                     hovertemplate='percent=%{percent:.0%}<br>frequency=%{value}',
                     marker=dict(colors=pal),
                     showlegend=False,
                     #title='Data Analysis',
                     texttemplate="%{label}"))

# set the layout options for the entire figure
fig.update_layout(
    title={
        'text': f"Top 20 words describing<br>data analysis responsibilities",
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10, 'b':80},
        'font': {'size': 24}
    },
    width=600,
    height=600,
    margin=dict(t=130),
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    font=dict(size = 16, color = 'white')
)

fig.show()
fig.write_html('../images/donut_chart1.html')


In [206]:
# create unique list of job types for dropdown menu
job_types = df_word_count_res['job_type'].unique().tolist()

from plotly.subplots import make_subplots
pal = list(sns.color_palette(palette='Blues_r', n_colors=30).as_hex())

# add the pie chart to the figure
fig = go.Figure(go.Pie(labels=df_word_count_res[df_word_count_res['job_type'] == job_types[1]]['word'],
                     values=df_word_count_res[df_word_count_res['job_type'] == job_types[1]]['frequency'],
                     textposition='outside', 
                     hole=.4,
                     insidetextorientation='auto',
                     hovertemplate='percent=%{percent:.0%}<br>frequency=%{value}',
                     marker=dict(colors=pal),
                     showlegend=False,
                     #title='Data Analysis',
                     texttemplate="%{label}"))

# set the layout options for the entire figure
fig.update_layout(
    title={
        'text': f"Top 20 words describing<br>data science responsibilities",
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10, 'b':80},
        'font': {'size': 24}
    },
    width=600,
    height=600,
    margin=dict(t=130),
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    font=dict(size = 16, color = 'white')
)

fig.show()
fig.write_html('../images/donut_chart2.html')


In [207]:
# create unique list of job types for dropdown menu
job_types = df_word_count_res['job_type'].unique().tolist()

from plotly.subplots import make_subplots
pal = list(sns.color_palette(palette='Blues_r', n_colors=30).as_hex())

# add the pie chart to the figure
fig = go.Figure(go.Pie(labels=df_word_count_res[df_word_count_res['job_type'] == job_types[2]]['word'],
                     values=df_word_count_res[df_word_count_res['job_type'] == job_types[2]]['frequency'],
                     textposition='outside', 
                     hole=.4,
                     insidetextorientation='auto',
                     hovertemplate='percent=%{percent:.0%}<br>frequency=%{value}',
                     marker=dict(colors=pal),
                     showlegend=False,
                     #title='Data Analysis',
                     texttemplate="%{label}"))

# set the layout options for the entire figure
fig.update_layout(
    title={
        'text': f"Top 20 words describing<br>machine learning responsibilities",
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10, 'b':80},
        'font': {'size': 24}
    },
    width=600,
    height=600,
    margin=dict(t=130),
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    font=dict(size = 16, color = 'white')
)

fig.show()
fig.write_html('../images/donut_chart3.html')


In [151]:
# sankey
# what about location? this is tech so what's the split between remote and non remote? (next vis: for those that aren't remote
# where are they in the us?) 
import pandas as pd
import plotly.graph_objects as go

df_us = df[df['location_overall']=='USA']
df_us['location'] = df_us['location'].str.strip()

df_sankey = df_us.copy()
df_sankey['remote_status'] = df_sankey['remote_status'].replace(True, 'Remote').fillna('Not remote')
df_sankey.loc[df_sankey['location'] == 'United States', 'remote_status'] = 'Remote'

# Define the data as a Pandas dataframe
data = df_sankey.groupby(['remote_status', 'job_type']).size().reset_index(name='count')

# Create a dictionary of node colors
node_colors = {
    'data_analysis': '#67615B',
    'data_science': '#67615B',
    'machine_learning': '#67615B',
    'Not remote': '#606A7F',
    'Remote': '#606A7F',
}

# create labels
labels = ['Not remote', 'Remote', 'data_analysis', 'data_science', 'machine_learning']

# Create a dictionary that maps each label to its index
node_indices = {'Not remote': 0,
 'Remote': 1,
 'data_analysis': 2,
 'data_science': 3,
 'machine_learning': 4}

link_colors = ['#C7D3AB', '#C7D3AB', '#C7D3AB','#CBBDB0', '#CBBDB0', '#CBBDB0']

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        #line = dict(color = "white", width = 0.5),
        label = labels,
        color = [node_colors[label] for label in labels],
    ),
    link = dict(
        source = [node_indices[status] for status in data['job_type']],
        target = [node_indices[job] for job in data['remote_status']],
        value = data['count'],
        color=link_colors
    )))


fig.update_layout(
    title={'text': f"Distribution of remote work across job titles<br>in data analysis, data science and machine learning",
        'x': 0.50,
        'y': 0.96,
        'xanchor': 'center',
        'yanchor': 'top',
        'pad': {'r': 10, 't': 10,},
        'font': {'size': 20, 'color':"white"}
    },
    width=700,
    height=550,
    plot_bgcolor = "#272b2e",
    paper_bgcolor = "#272b2e",
    margin=dict(t=85, l=25, r=25, b=25),
    font=dict(size = 14, color = 'white')
) 

fig.show()
fig.write_html('../images/sankey.html')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [154]:
# chloropleths for job types and number of jobs that came up for that job type
import altair as alt
from vega_datasets import data

states = alt.topo_feature(data.us_10m.url, 'states')

# strip the spaces in the city column
df_us[['city', 'state']] = df_us['location'].str.split(',', expand=True)
df_us['city'] = df_us['city'].str.strip()
df_us['state'] = df_us['state'].str.strip()

df_us['state']=df_us['state'].replace('TX  (+1 other)','TX')
df_us['state']=df_us['state'].replace('IL   (+12 others)','IL')
df_us['state']=df_us['state'].replace('NY   (+2 others)','NY')

# create a pivot table with state as the index and job types as columns
pivot_df = pd.pivot_table(df_us, index='state', columns='job_type', values='title', aggfunc='count').reset_index()
pivot_df.state.unique()

pivot_df['state']=pivot_df['state'].replace('AL','Alabama')
pivot_df['state']=pivot_df['state'].replace('AR','Arkansas')
pivot_df['state']=pivot_df['state'].replace('CA','California')
pivot_df['state']=pivot_df['state'].replace('CO','Colorado')
pivot_df['state']=pivot_df['state'].replace('IL','Illinois')
pivot_df['state']=pivot_df['state'].replace('IN','Indiana')
pivot_df['state']=pivot_df['state'].replace('KS','Kansas')
pivot_df['state']=pivot_df['state'].replace('MA','Massachusetts')
pivot_df['state']=pivot_df['state'].replace('MD','Maryland')
pivot_df['state']=pivot_df['state'].replace('MI','Michigan')
pivot_df['state']=pivot_df['state'].replace('MN','Minnesota')
pivot_df['state']=pivot_df['state'].replace('MO','Missouri')
pivot_df['state']=pivot_df['state'].replace('NE','Nebraska')
pivot_df['state']=pivot_df['state'].replace('NY','New York')
pivot_df['state']=pivot_df['state'].replace('OK','Oklahoma')
pivot_df['state']=pivot_df['state'].replace('TN','Tennessee')
pivot_df['state']=pivot_df['state'].replace('TX','Texas')
pivot_df['state']=pivot_df['state'].replace('WA','Washington')
pivot_df['state']=pivot_df['state'].replace('WI','Wisconsin')
pivot_df.fillna(0,inplace=True)

import altair as alt
import pandas as pd
from vega_datasets import data
df_test = pd.read_csv('../data/population_engineers_hurricanes.csv')

states = alt.topo_feature(data.us_10m.url, 'states')
source = df_test
chart1 = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('data_analysis:Q', legend=alt.Legend(title='Job titles')),
    tooltip=['state:N', 'data_analysis:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['state','data_analysis'])
).project(
    type='albersUsa'
).properties(
    width=500,
    height=300,
    title='Number of Data Analysis Job Titles'
)
chart2 = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('data_science:Q', legend=alt.Legend(title='Job titles')),
    tooltip=['state:N', 'data_science:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['state','data_science'])
).project(
    type='albersUsa'
).properties(
    width=500,
    height=300,
    title='Number of Data Science Job Titles'
)
chart3 = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('machine_learning:Q', legend=alt.Legend(title='Job titles')),
    tooltip=['state:N', 'machine_learning:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['state','machine_learning'])
).project(
    type='albersUsa'
).properties(
    width=500,
    height=300,
    title='Number of Machine Learning Job Titles'
)

concat = (chart1 | chart2 | chart3)
concat
#concat.save('../images/chloropleth.html')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [181]:
import altair as alt
import pandas as pd
from vega_datasets import data
df_test = pd.read_csv('../data/population_engineers_hurricanes.csv')

states = alt.topo_feature(data.us_10m.url, 'states')
source = df_test
chart1 = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('data_analysis:Q', legend=alt.Legend(title='Job titles')),
    tooltip=['state:N', 'data_analysis:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['state','data_analysis'])
).project(
    type='albersUsa'
).properties(
    width=600,
    height=450,
    title='A chroropleth map of data analysis job titles per state',
    background='#272b2e'
).configure_title(
    fontSize=22,
    color='white', # set the title font color to white
    anchor='middle',
).configure_legend(
    labelColor='white',
    titleColor='white',
    labelFontSize=14, 
).configure_view(stroke=None)

chart1.save('../images/chloropleth1.html')
chart1


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [183]:
import altair as alt
import pandas as pd
from vega_datasets import data
df_test = pd.read_csv('../data/population_engineers_hurricanes.csv')

states = alt.topo_feature(data.us_10m.url, 'states')
source = df_test
chart2 = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('data_science:Q', legend=alt.Legend(title='Job titles')),
    tooltip=['state:N', 'data_science:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['state','data_science'])
).project(
    type='albersUsa'
).properties(
    width=600,
    height=450,
    title='A chroropleth map of data science job titles per state',
    background='#272b2e'
).configure_title(
    fontSize=22,
    color='white', # set the title font color to white
    anchor='middle',
).configure_legend(
    labelColor='white',
    titleColor='white',
    labelFontSize=14, 
).configure_view(stroke=None)

chart2.save('../images/chloropleth2.html')
chart2


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [192]:
# companies and job types
import pandas as pd
from pyvis.network import Network

df_company = df.groupby(['job_type','company_name']).size().reset_index(name='edge_weight')

# Create object
net = Network(height='700px', width='100%', bgcolor='#272B2E', font_color='white',
              notebook=True, cdn_resources="remote", select_menu=False, filter_menu=True)

# Add nodes 
companies = df_company['company_name'].unique()
for company in companies:
    net.add_node(company, title=company, color='#C7D3AB', shape='dot', border_width=0)

job_types = df_company['job_type'].unique()

for jobtype in job_types:
    net.add_node(jobtype, title=jobtype, color='#A3BCD6', shape='dot', size=40, border_width=0)

# Add edges between the company and job title nodes based on the data
for i, row in df_company.iterrows():
    net.add_edge(row['company_name'], row['job_type'], value=row['edge_weight'], title="weight: {}".format(row['edge_weight']))

net.show('../images/my_network.html')
