In [24]:
#Dependencies and Setup
from api_keys import KAGGLE_USERNAME, KAGGLE_KEY
import os
import json
import plotly.express as px
import pandas as pd
import ipywidgets as widgets
from IPython.display import display


# Publish the credentials imported from api_keys as the KAGGLE_USERNAME /
# KAGGLE_KEY environment variables for the Kaggle API
os.environ.update(
    {
        "KAGGLE_USERNAME": KAGGLE_USERNAME,
        "KAGGLE_KEY": KAGGLE_KEY,
    }
)

In [2]:
# Read the two source tables from the working directory
job_skills_df = pd.read_csv("job_skills.csv")
job_postings_df = pd.read_csv("job_postings.csv")

# Show every column when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# Inner-join postings with their skills: only rows whose 'job_link'
# appears in both tables are kept
job_skills_postings_merged_df = job_postings_df.merge(
    job_skills_df,
    how="inner",
    on="job_link",
)
job_skills_postings_merged_df.head()

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."


In [3]:
# Keep only the columns used by the rest of the analysis
columns_to_keep = [
    "job_title",
    "company",
    "job_location",
    "first_seen",
    "search_country",
    "search_position",
    "job_level",
    "job_type",
    "job_skills",
]
job_skills_postings_df = job_skills_postings_merged_df[columns_to_keep]
job_skills_postings_df

Unnamed: 0,job_title,company,job_location,first_seen,search_country,search_position,job_level,job_type,job_skills
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."
...,...,...,...,...,...,...,...,...,...
12212,"Data Reporting Manager, FOOTBALL ASSOCIATION",Guardian Jobs,"Wembley, England, United Kingdom",2024-01-16,United Kingdom,Manager Forms Analysis,Mid senior,Onsite,"Dashboard development, Reporting, Power BI, SQ..."
12213,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.","Kalispell, MT",2024-01-14,United States,Teller,Mid senior,Onsite,"Investigation, Antimoney laundering, Fraud, Ba..."
12214,Senior Data Scientist,Highnote,"San Francisco, CA",2024-01-16,United States,Mathematician,Mid senior,Onsite,"Data Science, Quantitative Modeling, SQL, Data..."
12215,Senior Data Engineer,CompSource Mutual Insurance Company,"Oklahoma City, OK",2024-01-16,United States,Protection Engineer,Mid senior,Onsite,"Data Engineering, Data Quality, SQL, Python, T..."


In [4]:
# Non-null entries per column; a value below the row count means missing data
job_skills_postings_df.notna().sum()

job_title          12217
company            12217
job_location       12216
first_seen         12217
search_country     12217
search_position    12217
job_level          12217
job_type           12217
job_skills         12212
dtype: int64

In [5]:
# Rename columns to presentation-friendly titles.
# Names that are already renamed are ignored, so re-running this cell is harmless.
job_skills_postings_df = job_skills_postings_df.rename(columns={"job_title": "Job Title",
                                                        "company": "Company",
                                                        "job_location": "Job Location",
                                                        "first_seen": "Job Posting Seen",
                                                        "search_country": "Country",
                                                        "search_position": "Position",
                                                        "job_level": "Job Level",
                                                        "job_type": "Job Type",
                                                        "job_skills": "Job Skills"
                                                               })

# Split 'Job Location' at the first ", " into a city part and the remainder
# (state, or region + country for non-US rows such as "Wembley, England, United Kingdom").
# The column is NOT cast to str first: .str.split propagates missing values, so a
# posting without a location keeps NaN in all three columns instead of becoming
# the literal text "nan".
split_location = job_skills_postings_df['Job Location'].str.split(', ', n=1, expand=True)
job_skills_postings_df['Job Location City'] = split_location[0]
job_skills_postings_df['Job Location State'] = split_location[1]

# Reorder columns to place 'Job Location City' and 'Job Location State' right
# after 'Job Location'. The order is built by column name rather than by
# position, so the result is the same no matter how often the cell is run.
columns = list(job_skills_postings_df.columns)
location_parts = ['Job Location City', 'Job Location State']
other_columns = [name for name in columns if name not in location_parts]
insert_at = other_columns.index('Job Location') + 1
new_order = other_columns[:insert_at] + location_parts + other_columns[insert_at:]
job_skills_postings_df = job_skills_postings_df[new_order]

job_skills_postings_df

Unnamed: 0,Job Title,Company,Job Location,Job Location City,Job Location State,Job Posting Seen,Country,Position,Job Level,Job Type,Job Skills
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",San Francisco,CA,2024-01-14,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",New York,NY,2024-01-14,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",Harrisburg,PA,2024-01-12,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,Lead Data Engineer,Dice,"Plano, TX",Plano,TX,2024-01-14,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."
...,...,...,...,...,...,...,...,...,...,...,...
12212,"Data Reporting Manager, FOOTBALL ASSOCIATION",Guardian Jobs,"Wembley, England, United Kingdom",Wembley,"England, United Kingdom",2024-01-16,United Kingdom,Manager Forms Analysis,Mid senior,Onsite,"Dashboard development, Reporting, Power BI, SQ..."
12213,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.","Kalispell, MT",Kalispell,MT,2024-01-14,United States,Teller,Mid senior,Onsite,"Investigation, Antimoney laundering, Fraud, Ba..."
12214,Senior Data Scientist,Highnote,"San Francisco, CA",San Francisco,CA,2024-01-16,United States,Mathematician,Mid senior,Onsite,"Data Science, Quantitative Modeling, SQL, Data..."
12215,Senior Data Engineer,CompSource Mutual Insurance Company,"Oklahoma City, OK",Oklahoma City,OK,2024-01-16,United States,Protection Engineer,Mid senior,Onsite,"Data Engineering, Data Quality, SQL, Python, T..."


In [6]:
# Check how the posting-date column is currently stored
posting_seen_column = job_skills_postings_df["Job Posting Seen"]
print(posting_seen_column.dtype)

object


In [7]:
# Parse the 'Job Posting Seen' strings into datetimes, then confirm the new dtype
posting_seen_parsed = pd.to_datetime(job_skills_postings_df["Job Posting Seen"])
job_skills_postings_df["Job Posting Seen"] = posting_seen_parsed
print(job_skills_postings_df["Job Posting Seen"].dtype)

datetime64[ns]


In [8]:
# Number of postings per search country, most frequent first
country_column = job_skills_postings_df["Country"]
country_counts = country_column.value_counts()
print(country_counts)

Country
United States     10291
United Kingdom      995
Canada              630
Australia           301
Name: count, dtype: int64


In [9]:
# Keep only the rows whose Country is 'United States'
is_us_posting = job_skills_postings_df["Country"].eq("United States")
us_job_skills_df = job_skills_postings_df[is_us_posting]

# Sanity check: 'United States' should be the only country left
us_job_skills_counts = us_job_skills_df["Country"].value_counts()
print(us_job_skills_counts)

Country
United States    10291
Name: count, dtype: int64


In [10]:
# Peek at the raw 'Job Skills' values to see how they are structured
job_skills_preview = us_job_skills_df["Job Skills"].head()
print(job_skills_preview)

0    Machine Learning, Programming, Python, Scala, ...
1    C++, Python, PyTorch, TensorFlow, MXNet, CUDA,...
2    ETL, Data Integration, Data Transformation, Da...
3    Data Lakes, Data Bricks, Azure Data Factory Pi...
4    Java, Scala, Python, RDBMS, NoSQL, Redshift, S...
Name: Job Skills, dtype: object


In [11]:
# Split 'Job Skills' by comma into a list per posting.
# The lists are kept in a separate Series instead of being written back into
# us_job_skills_df: that frame is a filtered slice of job_skills_postings_df, and
# overwriting its column would make a second run of this cell call .str.split
# on lists (which yields NaN for every row).
skill_lists = us_job_skills_df["Job Skills"].str.split(",")

# Explode to create a new row for each skill
skills_df = us_job_skills_df.assign(**{"Job Skills": skill_lists}).explode("Job Skills")

# Strip whitespace around each skill
skills_df["Job Skills"] = skills_df["Job Skills"].str.strip()

# Drop empty strings and missing values (postings with no skills listed);
# NaN != "" evaluates to True, so missing values need their own check
has_skill = skills_df["Job Skills"].notna() & (skills_df["Job Skills"] != "")
skills_df = skills_df[has_skill]

In [12]:
# Tally how often each skill appears and keep the 15 most frequent
skill_frequencies = skills_df["Job Skills"].value_counts()
top_skills = skill_frequencies.head(15)
print(top_skills)

Job Skills
Python                4071
SQL                   3855
Communication         2013
Data Analysis         1762
Machine Learning      1694
AWS                   1561
Tableau               1394
Java                  1281
R                     1275
Data Visualization    1261
Spark                 1229
Data Science          1098
Data Engineering      1065
Project Management    1023
Teamwork               982
Name: count, dtype: int64


In [13]:
# Preview the exploded frame: one row per (posting, skill)
skills_df.head(n=5)

Unnamed: 0,Job Title,Company,Job Location,Job Location City,Job Location State,Job Posting Seen,Country,Position,Job Level,Job Type,Job Skills
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Machine Learning
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Programming
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Python
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Scala
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Java


In [14]:
# The 15 most frequent skills in skills_df.
# NOTE(review): this repeats the top-15 computation from an earlier cell;
# one of the two cells can probably be removed.
top_skills = skills_df["Job Skills"].value_counts().iloc[:15]
print(top_skills)

Job Skills
Python                4071
SQL                   3855
Communication         2013
Data Analysis         1762
Machine Learning      1694
AWS                   1561
Tableau               1394
Java                  1281
R                     1275
Data Visualization    1261
Spark                 1229
Data Science          1098
Data Engineering      1065
Project Management    1023
Teamwork               982
Name: count, dtype: int64


In [15]:
# Hand-picked list of ten skills to keep for the rest of the analysis.
# NOTE(review): this is a curated list, not simply the ten most frequent
# values of 'Job Skills' - confirm the selection is intended.
top_skills = [
    'Python',
    'SQL',
    'Data Analysis',
    'Machine Learning',
    'Data Visualization',
    'AWS',
    'Project Management',
    'Data Science',
    'Data Engineering',
    'Tableau',
]

# Keep only the rows of skills_df whose skill is in the list above
is_top_skill = skills_df["Job Skills"].isin(top_skills)
filtered_top_skills_df = skills_df[is_top_skill]

# Show the first rows of the filtered frame
filtered_top_skills_df.head()

Unnamed: 0,Job Title,Company,Job Location,Job Location City,Job Location State,Job Posting Seen,Country,Position,Job Level,Job Type,Job Skills
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Machine Learning
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Python
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Data Engineering
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",New Haven,CT,2024-01-14,United States,Agricultural-Research Engineer,Mid senior,Onsite,Data Visualization
1,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",San Francisco,CA,2024-01-14,United States,Set-Key Driver,Mid senior,Onsite,Python


In [16]:
# Write the filtered rows to CSV, leaving out the DataFrame index
export_path = 'us_top_job_skills.csv'
filtered_top_skills_df.to_csv(export_path, index=False)

In [17]:
# Report how many (posting, skill) rows survived the filter
row_count = filtered_top_skills_df.shape[0]
print(f"Number of rows: {row_count}")

Number of rows: 18784


In [18]:
# Frequency of each selected skill within the filtered rows
selected_skill_column = filtered_top_skills_df["Job Skills"]
top10_skills = selected_skill_column.value_counts()
print(top10_skills)

Job Skills
Python                4071
SQL                   3855
Data Analysis         1762
Machine Learning      1694
AWS                   1561
Tableau               1394
Data Visualization    1261
Data Science          1098
Data Engineering      1065
Project Management    1023
Name: count, dtype: int64


In [19]:
# The ten job locations with the most rows in the filtered frame
location_frequencies = filtered_top_skills_df["Job Location"].value_counts()
top10_joblocations = location_frequencies.head(10)
print(top10_joblocations)

Job Location
New York, NY         636
San Francisco, CA    522
Chicago, IL          492
Washington, DC       458
Dallas, TX           381
Seattle, WA          373
Atlanta, GA          308
Austin, TX           302
Boston, MA           295
Houston, TX          265
Name: count, dtype: int64


In [20]:
# Names of the ten locations with the most rows in filtered_top_skills_df
# (this rebinds top10_joblocations to an Index of names rather than counts)
top10_joblocations = filtered_top_skills_df["Job Location"].value_counts().head(10).index

# Restrict the filtered rows to those ten locations
in_top_location = filtered_top_skills_df["Job Location"].isin(top10_joblocations)
top10_skills_by_location = filtered_top_skills_df[in_top_location]

# Count rows per (location, skill) pair, then order by location name and,
# within each location, by descending count
pair_sizes = top10_skills_by_location.groupby(["Job Location", "Job Skills"]).size()
skills_by_location_count = pair_sizes.reset_index(name="Count")
skills_by_location_count = skills_by_location_count.sort_values(
    by=["Job Location", "Count"],
    ascending=[True, False],
)
print(skills_by_location_count)

      Job Location          Job Skills  Count
8      Atlanta, GA                 SQL     64
7      Atlanta, GA              Python     53
1      Atlanta, GA       Data Analysis     32
9      Atlanta, GA             Tableau     30
5      Atlanta, GA    Machine Learning     28
..             ...                 ...    ...
95  Washington, DC    Machine Learning     44
90  Washington, DC                 AWS     30
93  Washington, DC        Data Science     30
96  Washington, DC  Project Management     23
92  Washington, DC    Data Engineering     20

[100 rows x 3 columns]


In [21]:
# Count how many of the filtered rows fall under each 'Job Type' value.
# .head() keeps at most the five most frequent types; applying it once is enough.
jobtypes = filtered_top_skills_df["Job Type"].value_counts().head()
jobtypes

Job Type
Onsite    18773
Hybrid        7
Remote        4
Name: count, dtype: int64

In [22]:
# Multi-select of the distinct skills in skills_by_location_count;
# the first ten options are pre-selected (the widget expects a tuple as value)
bar_skill_options = list(skills_by_location_count["Job Skills"].unique())
skills_filter = widgets.SelectMultiple(
    options=bar_skill_options,
    value=tuple(bar_skill_options[:10]),
    description='Job Skills',
    layout=widgets.Layout(width='50%', height='200px'),
    style={'description_width': 'initial'},
)

# Callback that redraws the grouped bar chart for the chosen skills
def update_bar_chart(selected_skills):
    """Show a grouped bar chart of skill counts per job location.

    Parameters
    ----------
    selected_skills : str or iterable of str
        Skill name(s) to include. A bare string is treated as a single
        selection.

    Reads the module-level ``skills_by_location_count`` frame, keeps the rows
    whose "Job Skills" value is selected, and displays the figure.
    """
    # A lone string is wrapped so that isin() receives a list
    chosen = [selected_skills] if isinstance(selected_skills, str) else selected_skills

    # Rows for the chosen skills only
    skill_mask = skills_by_location_count["Job Skills"].isin(chosen)
    chart_rows = skills_by_location_count[skill_mask]

    # One bar group per skill, one colored bar per location
    chart = px.bar(
        chart_rows,
        x="Job Skills",
        y="Count",
        color="Job Location",
        barmode="group",
        title="Top 10 Job Skills within the Top 10 Job Locations",
        labels={"Count": "Number of Occurrences", "Job Skills": "Skills"},
    )
    chart.show()

# Bind the multi-select to update_bar_chart: the widget's value is passed
# as the 'selected_skills' argument and the chart is rendered into an Output
bar_chart_controls = {"selected_skills": skills_filter}
interactive_bar_chart = widgets.interactive_output(update_bar_chart, bar_chart_controls)

# Show the selector first, then the chart output below it
display(skills_filter, interactive_bar_chart)

SelectMultiple(description='Job Skills', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), layout=Layout(height='200px', w…

Output()

In [23]:
# Multi-select for the sunburst chart: options are the distinct skills in
# skills_by_location_count, with the first ten pre-selected as a tuple
sunburst_skill_options = list(skills_by_location_count["Job Skills"].unique())
skills_filter = widgets.SelectMultiple(
    options=sunburst_skill_options,
    value=tuple(sunburst_skill_options[:10]),
    description='Job Skills',
    layout=widgets.Layout(width='50%', height='150px'),
    style={'description_width': 'initial'},
)

# Function to update the sunburst chart for the selected job skills
def update_chart(selected_skills):
    """Show a sunburst of skill counts broken down by job location.

    Parameters
    ----------
    selected_skills : str or iterable of str
        Skill name(s) to include. A bare string is treated as a single
        selection.

    Reads the module-level ``skills_by_location_count`` frame, keeps the rows
    whose "Job Skills" value is selected, and displays the figure. The inner
    ring is the skill and the outer ring is the job location.
    """
    # Ensure the selection is a list, even if only one selection made
    if isinstance(selected_skills, str):
        selected_skills = [selected_skills]

    # Filter the data based on selected job skills
    filtered_data = skills_by_location_count[
        skills_by_location_count["Job Skills"].isin(selected_skills)
    ]

    # Create sunburst chart
    fig = px.sunburst(
        filtered_data,
        path=["Job Skills", "Job Location"],
        values="Count",
        # The hierarchy is skill -> location only; job types are not plotted,
        # so the title does not mention them
        title="Top Job Skills with Top Job Locations",
        color="Job Skills",
        width=1000,
        height=800
    )

    # Set the text orientation to a consistent angle
    fig.update_traces(insidetextorientation="radial")
    fig.show()

# Bind the multi-select to update_chart via widgets.interactive_output: the
# widget's value is passed as 'selected_skills' and the chart goes to an Output
sunburst_controls = {"selected_skills": skills_filter}
interactive_chart = widgets.interactive_output(update_chart, sunburst_controls)

# Show the selector first, then the chart output below it
display(skills_filter, interactive_chart)

SelectMultiple(description='Job Skills', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), layout=Layout(height='150px', w…

Output()