In [64]:
import pandas as pd
import numpy as np

In [65]:
# Load the raw job postings export (path is relative to the notebook's working directory)
raw_csv_path = 'job_postings.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Job Posting ID,Job Posting Date,Job Title,Job Title Full,Job Title Additional Info,Job Position Type,Job Position Level,Years of Experience,Job Skills,Job Location,Minimum Pay,Maximum Pay,Pay Rate,Number of Applicants,Company Name,Company Industry,Company Size
0,2701524240,2017-01-01,Software Engineer,Software Engineer,Java Full Stack | Remote,Full-time,Entry level,1,"database, javascript, agile, linux, server, no...",United States,,,,6.0,"Cardinal Financial Company, Limited Partnership",Financial Services,"1,001-5,000 employees"
1,2719108338,2017-01-01,Data Engineer,Senior Azure Data Engineer,,Full-time,Mid-Senior level,2,"data_lake, cloud, python, spark, github, wareh...",United States,,,,1.0,Brinks Home,Consumer Electronics,"1,001-5,000 employees"
2,2719503370,2017-01-01,Software Engineer,Software Engineer I,,Full-time,Entry level,5,"mongo, oracle, microsoft, css, javascript, htm...",United States,,,,16.0,Paycor,Computer Software,"1,001-5,000 employees"
3,2734877741,2017-01-01,Business Analyst,Associate Business Analyst,Telecommute,Full-time,Entry level,2,"agile, excel","Phoenix, AZ",,,,1.0,Optum,Hospital & Health Care,"10,001+ employees"
4,2752415616,2017-01-01,Developer,Swift Developer,,Contract,Mid-Senior level,4,"excel, back-end, ios, swift, programming","Richmond, CA",,,,,Toptal,Internet,"1,001-5,000 employees"


In [66]:
# Summarise dtypes and non-null counts of the raw frame before pruning it
df.info()

# Drop columns that are not needed for the analysis or are mostly empty
# (in the summary, the pay columns and the additional title info have far
# fewer non-null values than the row count)
unused_columns = [
    'Job Title Full',
    'Job Title Additional Info',
    'Minimum Pay',
    'Maximum Pay',
    'Pay Rate',
    'Job Posting ID',
]
df = df.drop(columns=unused_columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25114 entries, 0 to 25113
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Job Posting ID             25114 non-null  int64  
 1   Job Posting Date           25114 non-null  object 
 2   Job Title                  25114 non-null  object 
 3   Job Title Full             25114 non-null  object 
 4   Job Title Additional Info  8187 non-null   object 
 5   Job Position Type          25114 non-null  object 
 6   Job Position Level         25114 non-null  object 
 7   Years of Experience        25114 non-null  int64  
 8   Job Skills                 22904 non-null  object 
 9   Job Location               25114 non-null  object 
 10  Minimum Pay                1812 non-null   float64
 11  Maximum Pay                1812 non-null   float64
 12  Pay Rate                   1812 non-null   object 
 13  Number of Applicants       17529 non-null  flo

In [67]:
# Parse the posting date strings into a datetime column (millisecond resolution requested)
posting_dates = df['Job Posting Date']
df['Job Posting Date'] = posting_dates.astype('datetime64[ms]')

In [68]:
# Create Job Category Column

# One boolean flag per keyword: case-insensitive substring match on the job
# title, with missing titles treated as "no match"
condition1, condition2, condition3 = (
    df['Job Title'].str.contains(keyword, case=False, na=False)
    for keyword in ('analyst', 'scientist', 'engineer')
)

# Group job titles into 3 major categories; anything unmatched becomes 'Other'.
# np.select takes the first condition that is True, so a title matching several
# keywords gets the earliest label in the list.
category_labels = ["Analyst", "Data Scientist", "Engineer"]
df['Job Category'] = np.select(
    condlist=[condition1, condition2, condition3],
    choicelist=category_labels,
    default='Other',
)

In [69]:
# Split "City, ST" locations into City and State columns.
# Values without a comma (e.g. "United States") end up entirely in City and
# leave State missing.
split_location = df['Job Location'].str.split(",", expand=True)

# Strip surrounding whitespace from both parts: splitting "Phoenix, AZ" on ","
# would otherwise store State as " AZ" (leading space), which breaks grouping
# and filtering on the exported column.
df['City'] = split_location[0].str.strip()
df['State'] = split_location[1].str.strip()

In [70]:
# Independent (deep) copy of the cleaned frame, used as the starting point for the skills table
df_skills = df.copy(deep=True)

In [71]:
# Build a long-format skills table: one row per (posting, skill), keeping the
# posting's row index so it can be joined back to df.
#
# The table is rebuilt from df on every run instead of updating df_skills in
# place. Re-running the in-place version on an already exploded frame would
# explode it again and multiply the rows.

# Split string of skills into a list to prepare for explode
df_skills = pd.DataFrame({'Skills': df['Job Skills'].str.split(',')})

# Explode the list into separate rows (the original row index is repeated per skill;
# postings with missing skills stay as a single row with a missing value)
df_skills = df_skills.explode('Skills')

# Remove whitespaces left around each skill by the comma split
df_skills['Skills'] = df_skills['Skills'].str.strip()

df_skills

Unnamed: 0,Skills
0,database
0,javascript
0,agile
0,linux
0,server
...,...
25113,programming
25113,etl
25113,no-sql
25113,snowflake


In [72]:
# Export dataframes to csv for further analysis in Tableau.
# The row index is written as the first column of each file; in the skills
# file it is the index of the posting each skill came from.
exports = (
    (df, 'job_posting_clean.csv'),
    (df_skills, 'job_posting_skills.csv'),
)
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=True)