In [1]:
# Importing Libraries

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import ast
from datasets import load_dataset


# Creating Dataframe

ds = load_dataset("lukebarousse/data_jobs")
df = ds['train'].to_pandas()

# Cleaning the DataFrame

df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])   #converting job posted date to a datetime object
df['job_skills'] = df['job_skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x) # converting job skills to a list object

# Drop none values from column job_skills
# df.dropna(subset='job_skills',inplace=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Filtering DataFrame based on a country 
country  = "United States"  # country variable
df_cn = df[df['job_country'] == country].copy()
df_cn.info()

# Exploding the job_skills column which is currently a list
df_cn_expl = df_cn.explode('job_skills')

<class 'pandas.core.frame.DataFrame'>
Index: 206292 entries, 0 to 785705
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        206292 non-null  object        
 1   job_title              206292 non-null  object        
 2   job_location           205505 non-null  object        
 3   job_via                206285 non-null  object        
 4   job_schedule_type      205099 non-null  object        
 5   job_work_from_home     206292 non-null  bool          
 6   search_location        206292 non-null  object        
 7   job_posted_date        206292 non-null  datetime64[ns]
 8   job_no_degree_mention  206292 non-null  bool          
 9   job_health_insurance   206292 non-null  bool          
 10  job_country            206292 non-null  object        
 11  salary_rate            25372 non-null   object        
 12  salary_year_avg        15766 non-null   float64  

In [3]:
# Creating a list of job titles to explore

df_top3_job_titles = df_cn_expl['job_title_short'].value_counts().sort_values(ascending=False).head(3).reset_index(name='job_count')
job_titles = df_top3_job_titles['job_title_short'].tolist()


In [4]:
#Creating a DataFrame to capture highest count of skills for all Job Titles

df_skill_count = df_cn_expl.groupby(['job_skills','job_title_short']).size().reset_index(name='skill_count')
df_skill_count.sort_values(by='skill_count',ascending=False,inplace=True)


In [5]:
# Merging 'df_skill_count' and 'df_top3_job_titles' to create a DataFrame with percentage values for the top five skills

df_skill_count = df_skill_count.merge(df_top3_job_titles,how='left',on='job_title_short')
df_skill_count['skill likelihood'] = (df_skill_count['skill_count'] / df_skill_count['job_count']) * 100

In [6]:
df_skill_count

Unnamed: 0,job_skills,job_title_short,skill_count,job_count,skill likelihood
0,python,Data Scientist,42379,322290.0,13.149338
1,sql,Data Analyst,34452,254289.0,13.548364
2,sql,Data Scientist,30034,322290.0,9.318936
3,excel,Data Analyst,27519,254289.0,10.821939
4,r,Data Scientist,26022,322290.0,8.074095
...,...,...,...,...,...
1865,clojure,Software Engineer,1,,
1866,vb.net,Senior Data Scientist,1,,
1867,fortran,Machine Learning Engineer,1,,
1868,planner,Cloud Engineer,1,,


In [7]:
df_cn_expl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1073565 entries, 0 to 785705
Data columns (total 17 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   job_title_short        1073565 non-null  object        
 1   job_title              1073565 non-null  object        
 2   job_location           1069655 non-null  object        
 3   job_via                1073523 non-null  object        
 4   job_schedule_type      1068781 non-null  object        
 5   job_work_from_home     1073565 non-null  bool          
 6   search_location        1073565 non-null  object        
 7   job_posted_date        1073565 non-null  datetime64[ns]
 8   job_no_degree_mention  1073565 non-null  bool          
 9   job_health_insurance   1073565 non-null  bool          
 10  job_country            1073565 non-null  object        
 11  salary_rate            129800 non-null   object        
 12  salary_year_avg        85378 non-n

In [8]:
df['job_title_short'].value_counts()

job_title_short
Data Analyst                 196075
Data Engineer                186241
Data Scientist               172286
Business Analyst              49063
Software Engineer             44929
Senior Data Engineer          44563
Senior Data Scientist         36957
Senior Data Analyst           29216
Machine Learning Engineer     14080
Cloud Engineer                12331
Name: count, dtype: int64