In [15]:
#Dependencies and Setup
from api_keys import KAGGLE_USERNAME, KAGGLE_KEY
import os
import json
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import time

#Set environment variables for Kaggle API
os.environ["KAGGLE_USERNAME"] = KAGGLE_USERNAME
os.environ["KAGGLE_KEY"] = KAGGLE_KEY

#Download Kaggle datasets
!kaggle datasets download -d asaniczka/data-science-job-postings-and-skills --unzip

Dataset URL: https://www.kaggle.com/datasets/asaniczka/data-science-job-postings-and-skills
License(s): ODC Attribution License (ODC-By)
Downloading data-science-job-postings-and-skills.zip to /Users/amandadelgado/Desktop/project-3
 98%|█████████████████████████████████████▏| 19.0M/19.4M [00:02<00:00, 10.7MB/s]
100%|██████████████████████████████████████| 19.4M/19.4M [00:02<00:00, 9.11MB/s]


In [16]:
# Show every column when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# Read the postings table and the skills table from their CSV files
job_postings_df = pd.read_csv("job_postings.csv")
job_skills_df = pd.read_csv("job_skills.csv")

# Inner-join the two tables on their shared 'job_link' key, so only postings
# present in both files are kept
job_skills_postings_merged_df = job_postings_df.merge(
    job_skills_df,
    how="inner",
    on="job_link",
)
job_skills_postings_merged_df.head()

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."


In [17]:
# Columns carried forward into the analysis; the remaining merged columns
# (job_link, processing status flags, ...) are left behind
selected_columns = [
    "job_title",
    "company",
    "job_location",
    "first_seen",
    "search_city",
    "search_country",
    "search_position",
    "job_level",
    "job_type",
    "job_skills",
]
job_skills_postings_df = job_skills_postings_merged_df[selected_columns]
job_skills_postings_df

Unnamed: 0,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."
...,...,...,...,...,...,...,...,...,...,...
12212,"Data Reporting Manager, FOOTBALL ASSOCIATION",Guardian Jobs,"Wembley, England, United Kingdom",2024-01-16,High Wycombe,United Kingdom,Manager Forms Analysis,Mid senior,Onsite,"Dashboard development, Reporting, Power BI, SQ..."
12213,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.","Kalispell, MT",2024-01-14,Montana,United States,Teller,Mid senior,Onsite,"Investigation, Antimoney laundering, Fraud, Ba..."
12214,Senior Data Scientist,Highnote,"San Francisco, CA",2024-01-16,San Rafael,United States,Mathematician,Mid senior,Onsite,"Data Science, Quantitative Modeling, SQL, Data..."
12215,Senior Data Engineer,CompSource Mutual Insurance Company,"Oklahoma City, OK",2024-01-16,Arcadia,United States,Protection Engineer,Mid senior,Onsite,"Data Engineering, Data Quality, SQL, Python, T..."


In [18]:
# Non-null entries per column; a column whose count is below the others
# contains missing values
job_skills_postings_df.count(axis="index")

job_title          12217
company            12217
job_location       12216
first_seen         12217
search_city        12217
search_country     12217
search_position    12217
job_level          12217
job_type           12217
job_skills         12212
dtype: int64

In [19]:
# Map the raw snake_case column names to presentation-friendly labels
column_labels = {
    "job_title": "Job Title",
    "company": "Company",
    "job_location": "Job Location",
    "first_seen": "Job Posting Seen",
    "search_city": "City",
    "search_country": "Country",
    "search_position": "Position",
    "job_level": "Job Level",
    "job_type": "Job Type",
    "job_skills": "Job Skills",
}
job_skills_postings_df = job_skills_postings_df.rename(columns=column_labels)
job_skills_postings_df

Unnamed: 0,Job Title,Company,Job Location,Job Posting Seen,City,Country,Position,Job Level,Job Type,Job Skills
0,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."
...,...,...,...,...,...,...,...,...,...,...
12212,"Data Reporting Manager, FOOTBALL ASSOCIATION",Guardian Jobs,"Wembley, England, United Kingdom",2024-01-16,High Wycombe,United Kingdom,Manager Forms Analysis,Mid senior,Onsite,"Dashboard development, Reporting, Power BI, SQ..."
12213,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.","Kalispell, MT",2024-01-14,Montana,United States,Teller,Mid senior,Onsite,"Investigation, Antimoney laundering, Fraud, Ba..."
12214,Senior Data Scientist,Highnote,"San Francisco, CA",2024-01-16,San Rafael,United States,Mathematician,Mid senior,Onsite,"Data Science, Quantitative Modeling, SQL, Data..."
12215,Senior Data Engineer,CompSource Mutual Insurance Company,"Oklahoma City, OK",2024-01-16,Arcadia,United States,Protection Engineer,Mid senior,Onsite,"Data Engineering, Data Quality, SQL, Python, T..."


In [20]:
# Inspect the dtype pandas assigned to the 'Job Posting Seen' column
print(job_skills_postings_df.dtypes["Job Posting Seen"])

object


In [21]:
# Parse the 'Job Posting Seen' values into pandas datetimes, store them back
# on the frame, then print the dtype to confirm the conversion
posting_dates = pd.to_datetime(job_skills_postings_df["Job Posting Seen"])
job_skills_postings_df["Job Posting Seen"] = posting_dates
print(job_skills_postings_df["Job Posting Seen"].dtype)

datetime64[ns]


In [24]:
# Tally how many postings fall under each value of the 'Country' column
country_counts = job_skills_postings_df.loc[:, "Country"].value_counts()
print(country_counts)

Country
United States     10291
United Kingdom      995
Canada              630
Australia           301
Name: count, dtype: int64


In [28]:
# Keep only the postings whose Country is 'United States'
is_us_posting = job_skills_postings_df["Country"].eq("United States")
us_job_skills_df = job_skills_postings_df.loc[is_us_posting]

# Sanity check: 'United States' should be the only country left in the counts
us_job_skills_counts = us_job_skills_df["Country"].value_counts()
print(us_job_skills_counts)

Country
United States    10291
Name: count, dtype: int64


In [29]:
# Peek at the first five 'Job Skills' entries to see how the skills are stored
job_skills_preview = us_job_skills_df["Job Skills"].head(5)
print(job_skills_preview)

0    Machine Learning, Programming, Python, Scala, ...
1    C++, Python, PyTorch, TensorFlow, MXNet, CUDA,...
2    ETL, Data Integration, Data Transformation, Da...
3    Data Lakes, Data Bricks, Azure Data Factory Pi...
4    Java, Scala, Python, RDBMS, NoSQL, Redshift, S...
Name: Job Skills, dtype: object


In [31]:
# Build a long-format table with one row per (US posting, skill) pair.

# Filter the DataFrame for rows where Country is 'United States'.
# .copy() yields an independent frame, so the column assignment below cannot
# write through to job_skills_postings_df or trigger SettingWithCopyWarning.
us_job_skills_df = job_skills_postings_df[job_skills_postings_df["Country"] == "United States"].copy()

# Split the comma-separated 'Job Skills' string into a list of skills
# (postings with no skills recorded stay NaN)
us_job_skills_df.loc[:, "Job Skills"] = us_job_skills_df["Job Skills"].str.split(",")

# Explode to create a new row for each skill; each new row keeps the index
# label of the posting it came from
skills_df = us_job_skills_df.explode("Job Skills")

# Strip whitespace around each skill
skills_df["Job Skills"] = skills_df["Job Skills"].str.strip()

# Drop missing skills as well as empty strings. NaN != "" evaluates to True,
# so comparing against "" alone would keep a row for every posting that has
# no skills recorded.
has_skill = skills_df["Job Skills"].notna() & (skills_df["Job Skills"] != "")
skills_df = skills_df[has_skill]

In [32]:
# Count occurrences of each skill and keep the TOP_N_SKILLS most frequent.
# value_counts() sorts in descending order, so head() returns the most
# common skills first.
TOP_N_SKILLS = 15
top_skills = skills_df["Job Skills"].value_counts().head(TOP_N_SKILLS)
print(top_skills)

Job Skills
Python                4071
SQL                   3855
Communication         2013
Data Analysis         1762
Machine Learning      1694
AWS                   1561
Tableau               1394
Java                  1281
R                     1275
Data Visualization    1261
Spark                 1229
Data Science          1098
Data Engineering      1065
Project Management    1023
Teamwork               982
Name: count, dtype: int64
