# Libraries

In [3]:
import pandas as pd

from thefuzz import process



# Configurations

In [148]:
# Pandas configuration for long phrases
pd.set_option('display.max_colwidth', 50)

# Data

In [5]:
df = pd.read_csv('data/Jobs.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,company,announcement,description
0,0,"Senior Analyst, Data Science and Analytics",TransUnion,The Muse,TransUnion's Job Applicant Privacy Notice Wha...
1,1,Senior Data Scientist,"Grubhub Holdings, Inc.",ZipRecruiter,About The Opportunity We're all about connect...
2,2,Lead Data Science Analyst,Discover Financial Services,LinkedIn,"Discover. A brighter future. With us, you’ll ..."
3,3,Data Science Intern,AbelsonTaylor,Startup Jobs,Are you a 2023 college graduate or rising coll...
4,4,Data Scientist,NORC at the University of Chicago,SimplyHired,"JOB DESCRIPTION: At NORC, Data Scientists pla..."


# Cleaning Data

In [7]:
df = df.drop('Unnamed: 0', axis=1)

In [8]:
df.head()

Unnamed: 0,title,company,announcement,description
0,"Senior Analyst, Data Science and Analytics",TransUnion,The Muse,TransUnion's Job Applicant Privacy Notice Wha...
1,Senior Data Scientist,"Grubhub Holdings, Inc.",ZipRecruiter,About The Opportunity We're all about connect...
2,Lead Data Science Analyst,Discover Financial Services,LinkedIn,"Discover. A brighter future. With us, you’ll ..."
3,Data Science Intern,AbelsonTaylor,Startup Jobs,Are you a 2023 college graduate or rising coll...
4,Data Scientist,NORC at the University of Chicago,SimplyHired,"JOB DESCRIPTION: At NORC, Data Scientists pla..."


In [9]:
# Data types of columns
df.dtypes

title           object
company         object
announcement    object
description     object
dtype: object

In [22]:
# Dataframe information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 790 entries, 0 to 789
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         790 non-null    object
 1   company       790 non-null    object
 2   announcement  790 non-null    object
 3   description   790 non-null    object
dtypes: object(4)
memory usage: 24.8+ KB


In [10]:
# Get duplicated values
df[df.duplicated()]

Unnamed: 0,title,company,announcement,description


In [11]:
# Missing values
df[df['description'].isna()]

# There is no NA values. 

Unnamed: 0,title,company,announcement,description


In [12]:
# All type of unique titles of the job
df['title'].nunique()

515

In [13]:
df['title'].unique()

array(['Senior Analyst, Data Science and Analytics',
       'Senior Data Scientist', 'Lead Data Science Analyst',
       'Data Science Intern', 'Data Scientist',
       'Senior Solutions Architect (Data Science)',
       'Data Scientist - Research, Development & Construction',
       'Data Science Manager - S&A Strategy',
       'Senior/Principal Data Scientist', 'Data Science Analyst',
       'Senior Data Scientist - Knowledge Management',
       'Data Scientist, Consultant', 'Senior-Data Scientist',
       'Staff Data Scientist', 'Data Science Internship',
       'Undergrad Intern – Data Science Program (Summer 2023)',
       'Data Science, Department of Information Technology - Adjunct Faculty',
       'Summer 2023 Intelligent Sensors Data Science Intern',
       'Associate Director Data Science',
       'Data Science Senior Advisor (Solution Value Analytics) - Evernorth',
       'Director, Data Science',
       'Senior Manager, Institutional Data Analytics and Reporting',
       'D

There are 515 unique job title, but the most are repeated, so we have to classify this titles in some unique occurences. 

# Creating levels based on job title

## Filters for data analyst and data scientist

In [155]:
df['title'] = df['title'].str.lower()
df['title']

0      senior analyst, data science and analytics
1                           senior data scientist
2                       lead data science analyst
3                             data science intern
4                                  data scientist
                          ...                    
785                  research and data specialist
786             quality assurance data specialist
787                           senior data analyst
788                  cost controller/data analyst
789                               data specialist
Name: title, Length: 790, dtype: object

In [156]:
# Checking if the tile of the job contains analyst

job_analyst = df[df['title'].str.contains('analyst')]
job_scientist = df[df['title'].str.contains('scientist|science')]

In [157]:
# Checking what is duplicated between job_analyst and job_scientist

title_analyst_scientist = set(job_analyst['title']).intersection(job_scientist['title'])

job_analyst_or_scientist = pd.DataFrame()

for i in title_analyst_scientist:
    job_analyst_or_scientist = pd.concat([job_analyst_or_scientist, df[df['title'] == i]])

In [158]:
# Now, I want to remove from job_anayst and job_scientist the in tersection cases

job_analyst = job_analyst[~job_analyst['title'].isin(job_analyst_or_scientist['title'])]
job_scientist = job_scientist[~job_scientist['title'].isin(job_analyst_or_scientist['title'])]


In [159]:
job_scientist['title'].str.contains('principal')

1      False
3      False
4      False
5      False
6      False
       ...  
326    False
584    False
610     True
694    False
700    False
Name: title, Length: 315, dtype: bool

## Creating levels 

In [160]:
# Function that creates the levels of jobs, respecting this relationship
# level 1 - junior position
# level 2 - intermediate position
# level 3 - senior position
# level 4 - principal or leader position

def job_level(df):
    df['level'] = df.apply(lambda x:
                           1 if 'intern' in x['title'] or 'internship' in x['title'] else
                           2 if 'junior' in x['title'] or 'associate' in x['title'] or 'i' in x['title'] else
                           4 if 'senior' in x['title'] or 'sr' in x['title'] or 'snr' in x['title'] or 'ii' in x['title'] else
                           5 if 'principal' in x['title'] else
                           6 if 'lead' in x['title'] else
                           3, axis=1)
    
    return df

In [161]:
# Applying the levels on the dataframes 
job_level(job_analyst)

Unnamed: 0,title,company,announcement,description,level
163,predictive data analyst,County of Los Angeles,Government Jobs,TYPE OF RECRUITMENT: Open Competitive Job Oppo...,2
327,data analyst,Robert Half,LinkedIn,Job Title: Data Analyst Location: Saddle Broo...,3
328,senior business data analyst,MATRIX Resources,Matrix Resources,Work with the brightest minds at one of the la...,2
329,junior data analyst,IFG - International Financial Group,LinkedIn,"Title: Junior Data Analyst Location: Redmond,...",2
330,data analyst (monitoring),General Dynamics Information Technology,GDIT,The Data Analyst (Monitoring) will support the...,2
...,...,...,...,...,...
769,erp data analyst iii,TekWissen ®,LinkedIn,ob Title: ERP Data Analyst III Location: Morr...,2
771,"data analyst - part-time, alpha - an edu start...",Crossover,LinkedIn,Crossover is the world's #1 source of remote j...,2
774,wss business and data analyst lead,HP,LinkedIn,"At HP, we believe in the power of ideas. We us...",2
787,senior data analyst,Gopuff,Startup Jobs,The Senior Data Analyst will join as an analyt...,2


In [132]:
job_scientist

Unnamed: 0,title,company,announcement,description,level
1,senior data scientist,"Grubhub Holdings, Inc.",ZipRecruiter,About The Opportunity We're all about connect...,3
3,data science intern,AbelsonTaylor,Startup Jobs,Are you a 2023 college graduate or rising coll...,4
4,data scientist,NORC at the University of Chicago,SimplyHired,"JOB DESCRIPTION: At NORC, Data Scientists pla...",4
5,senior solutions architect (data science),The Trade Desk,LinkedIn,The Trade Desk is a global technology company ...,3
6,senior data scientist,Tango Card,Startup Jobs,The JobTango Card is hiring our first ever Sen...,3
...,...,...,...,...,...
326,lead data scientist - 100% remote - fte - $200...,Dice,LinkedIn,Dice is the leading career destination for tec...,4
584,data scientist jobs,ManTech International,Clearance Jobs,"Secure our Nation, Ignite your Future Join th...",4
610,senior/principal scientist- visualization and ...,Genentech,My Stateline Jobs,The Position Genentech (Research and Early De...,3
694,data science intern,Geospark Analytics Inc.,SimplyHired,Company Overview: Join the world’s only threa...,4


In [109]:
a = job_scientist.copy()

if job_scientist['title'].str.contains('junior|associate'):
    a['level'] = 1

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().