In [1948]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

Importing and combining 3 datasets from 3 different sites scraped by 3 different people

In [1949]:
jc = pd.read_csv('jobscentral.csv')
js = pd.read_csv('Jobstreet_Ziig.csv')
ft = pd.read_csv('salary_df_car_fut.csv')

In [1950]:
jc.head(1)

Unnamed: 0,Job Nature,Position Level,Job Category,Qualification,Salary,Company,Title,Description
0,Permanent,"Entry Level, Experienced","\r\n\r\nIT-Administration, \r\n \r\nIT-Soft...","Diploma, Bachelor's / Honours","\r\n\r\n 1,800 SGD - 2,000 SGD / Month\...",CareerBuilder Singapore,Junior Web Developer,[' • \xa0Provide first level of telephone and ...


In [1951]:
js.head(1)

Unnamed: 0.1,Unnamed: 0,Company,Title,Location,Address,Industry,Job Description,Seniority,Salary,Requirements,JD
0,0,Career Edge Asia Pte Ltd,Senior Data Analyst ($7K - $8K / CBD/ MNC ) re...,Singapore - Across Singapore,,Human Resources Management/Consulting,"<div class=""unselectable wrap-text"" id=""job_de...",Min 3 years (Senior Executive),6000-8000,Degree in IT/ System engineering / Business /...,Responsibilities Responsible for dashboards an...


In [1952]:
ft.head(1)

Unnamed: 0.1,Unnamed: 0,Company,Title,Address,Emp_type,Seniority,Industry,Salary,Responsibility,Requirements
0,0,SKILLSFUTURE SINGAPORE AGENCY,"Executive, (Quality Management Division) (6-mo...","ONE MARINA BOULEVARD, 1 MARINA BOULEVARD 018989",Contract,NONE,Public / Civil Service,NONE,Roles & ResponsibilitiesResponsibilities\r\n\r...,NONE


In [1953]:
js.columns

Index(['Unnamed: 0', 'Company', 'Title', 'Location', 'Address', 'Industry',
       'Job Description', 'Seniority', 'Salary', 'Requirements', 'JD'],
      dtype='object')

In [1954]:
ft.columns

Index(['Unnamed: 0', 'Company', 'Title', 'Address', 'Emp_type', 'Seniority',
       'Industry', 'Salary', 'Responsibility', 'Requirements'],
      dtype='object')

In [1955]:
jc.columns

Index(['Job Nature', 'Position Level', 'Job Category', 'Qualification',
       'Salary', 'Company', 'Title', 'Description'],
      dtype='object')

In [1956]:
js.drop('Unnamed: 0', axis=1, inplace=True)
ft.drop('Unnamed: 0', axis=1, inplace=True)

In [1957]:
jc['Emp_type'] = jc['Job Nature']
jc.drop('Job Nature', axis=1, inplace=True)

In [1958]:
jc['Seniority'] = jc['Position Level']
jc.drop('Position Level', axis=1, inplace=True)

In [1959]:
js.drop('Job Description', axis=1, inplace=True)

In [1960]:
js['Description'] = js['JD']
js.drop('JD', axis=1, inplace=True)

In [1961]:
js['Description'] = js.Requirements + js.Description
js.drop('Requirements', axis=1, inplace=True)

In [1962]:
ft['Description'] = ft.Requirements + ft.Responsibility
ft.drop(['Requirements', 'Responsibility'], axis=1, inplace=True)

In [1963]:
ft.columns

Index(['Company', 'Title', 'Address', 'Emp_type', 'Seniority', 'Industry',
       'Salary', 'Description'],
      dtype='object')

Combined the 3 datasets into 1, proceeding to do EDA and cleaning

In [1964]:
df = pd.concat([js, jc, ft])

In [1965]:
df.reset_index(drop=True, inplace=True)

In [1966]:
df.shape

(4945, 11)

In [1967]:
df = df[['Title', 'Company', 'Job Category', 'Seniority', 'Industry', 'Emp_type', 'Location', 'Address', 'Description', 'Qualification', 'Salary']]

In [1968]:
def letterizer(input):
    return ''.join(char for char in input if char in 'abcedfghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')

Cleaning the job title, then converting to dummy variables

In [1969]:
df['Title'] = df.Title.apply(letterizer)
df['Title'] = ['Data Analyst' if 'data' in title.lower() and 'analyst' in title.lower() else title for title in df.Title]
df['Title'] = ['Data Scientist' if 'data' in title.lower() and 'scientist' in title.lower() else title for title in df.Title]
df['Title'] = ['Business Analyst' if 'business' in title.lower() and 'analyst' in title.lower() else title for title in df.Title]
df['Title'] = ['Data Engineer' if 'data' in title.lower() and 'engineer' in title.lower() else title for title in df.Title]
df['Title'] = ['Reseach Scientist' if 'research' in title.lower() and 'scientist' in title.lower() else title for title in df.Title]
df['Title'] = ['Data(others)' if 'data' in title.lower() and 'analyst' not in title.lower() and 'scientist' not in title.lower() and 'engineer' not in title.lower() else title for title in df.Title]
df['Title'] = ['Business(others)' if 'business' in title.lower() and 'analyst' not in title.lower() else title for title in df.Title]
df['Title'] = ['Analyst(others)' if 'analyst' in title.lower() and 'data' not in title.lower() and 'business' not in title.lower() else title for title in df.Title]
df['Title'] = ['Others' if title not in ['Data Analyst', 'Data Scientist', 
                                         'Business Analyst', 'Data Engineer', 'Reseach Scientist',
                                         'Data(others)', 'Business(others)', 'Analyst(others)'] else title for title in df.Title]

In [1970]:
for title in df.Title.unique():
    if title != 'Others':
        df['title_'+title] = [1 if title == row else 0 for row in df.Title]

In [1971]:
df.drop('Title', axis=1, inplace=True)

Cleaning the company names

In [1972]:
df['Company'] = df.Company.apply(letterizer)

Cleaning the job categories - too many null values, so to preserve any possible value all non-null values will be thrown into the job description string before dropping the whole column

In [1973]:
print(df['Job Category'].isnull().sum())
df['Job Category'] = df['Job Category'].astype(str)
df['Job Category'] = df['Job Category'].apply(lambda x: x.replace('\r',''))
df['Job Category'] = df['Job Category'].apply(lambda x: x.replace('\n',''))

4200


In [1974]:
df['Description'] = [j if i == 'nan' else i + j for i, j in zip(df['Job Category'], df.Description)]
df.drop('Job Category', axis=1, inplace=True)

Cleaning location and address - dummy variables for location. Address will be left as a string.

In [1975]:
df['Location'] = [''.join(char for char in str(location) if char.lower() in ' abcedfghijklmnopqrstuvwxyz') for location in df.Location]

In [1976]:
df['Location'] = [location.replace('Singapore', '') for location in df.Location]
df['Location'] = ['Unknown' if Location.lower() == 'none' or Location.lower() == 'nan' or Location.lower() == ''else Location for Location in df.Location]
df['Location'] = ['Central' if 'central' in location.lower() else location for location in df.Location]
df['Location'] = ['Northeast' if 'northeast' in location.lower() else location for location in df.Location]
df['Location'] = ['Northwest' if 'northwest' in location.lower() else location for location in df.Location]
df['Location'] = ['East' if 'east' in location.lower() else location for location in df.Location]
df['Location'] = ['West' if 'west' in location.lower() else location for location in df.Location]
df['Location'] = ['North' if 'north' in location.lower() else location for location in df.Location]
df['Location'] = ['Across' if 'across' in location.lower() else location for location in df.Location]
df['Location'] = ['Overseas' if location not in df.Location.unique()[:6] else location for location in df.Location]

In [1977]:
df.Location.unique()

array(['Across', 'Central', 'West', 'Unknown', 'East', 'North',
       'Overseas'], dtype=object)

In [1978]:
for location in df.Location.unique():
    if location != 'Unknown':
        df['location_'+location] = [1 if location == row else 0 for row in df.Location]

In [1979]:
df.drop('Location', axis=1, inplace=True)

In [1980]:
df['Address'] = [''.join(char for char in str(location) if char.lower() in ' abcedfghijklmnopqrstuvwxyz') for location in df.Address]
df['Address'] = ['Unknown' if address.lower() == 'none' or address.lower() == 'nan' or address.lower() == ''else address for address in df.Address]

Cleaning the qualifications - too messy and too many null fields. Chunk into the job description string and drop.

In [1981]:
print(df.Qualification.isnull().sum())
df['Description'] = [j if type(i) is float else i + j for i, j in zip(df.Qualification, df.Description)]
df.drop('Qualification', axis=1, inplace=True)

4200


Cleaning the salary data

In [1990]:
df.Salary.value_counts()

 0.0         1046
 1.0          653
 4000.0       410
 6000.0       147
 4500.0       127
 5000.0       124
-1.0          109
 3750.0       109
 6500.0        99
 5500.0        97
 3500.0        96
 4250.0        95
 7500.0        84
 7000.0        76
 5250.0        74
 4750.0        68
 3250.0        64
 6750.0        59
 6250.0        53
 8000.0        51
 3000.0        50
 9000.0        40
 4800.0        39
 10000.0       39
 2750.0        36
 7250.0        31
 3600.0        25
 5750.0        24
 8500.0        24
 2500.0        23
             ... 
 9650.0         1
 1800.0         1
 9300.0         1
 16500.0        1
 155000.0       1
 228.0          1
 7275.0         1
 117000.0       1
 19000.0        1
 4475.0         1
 7600.0         1
 6132.0         1
 7900.0         1
 6086.5         1
 57500.0        1
 8531.0         1
 17250.0        1
 17750.0        1
 13300.0        1
 27000.0        1
 11900.0        1
 15250.0        1
 13850.0        1
 12333.5        1
 6450.0   

In [1983]:
df['Salary'] = ['0' if salary == '0.0' or str(salary).lower() == 'none' else salary for salary in df.Salary]

In [1984]:
df['Salary'] = df.Salary.apply(lambda x: -1 if 'Below' in str(x) else x)
df['Salary'] = df.Salary.apply(lambda x: 4000 if 'Around' in str(x) else x)
df['Salary'] = df.Salary.apply(lambda x: 1 if 'Above' in str(x) else x)
df['Salary'] = df.Salary.apply(lambda x: ''.join([char for char in str(x) if char in '0123456789 - to']))

In [1985]:
def split1(value):
    splitted = value.split('-')
    if len(splitted) == 2:
        try:
            return str((int(splitted[0]) + int(splitted[1]))/2)
        except:
            return value
    else:
        return value

In [1986]:
def split2(value):
    splitted = value.split('to')
    if len(splitted) == 2:
        try:
            return str((int(splitted[0]) + int(splitted[1]))/2)
        except:
            return value
    else:
        return value

In [1987]:
df['Salary'] = df.Salary.apply(split1)
df['Salary'] = df.Salary.apply(split2)

In [1989]:
df['Salary'] = df.Salary.apply(lambda x: ''.join([char for char in x if char in '0123456789-.']))
df['Salary'] = df.Salary.apply(split1)
df['Salary'] = df.Salary.apply(float)

In [1803]:
df.Salary.describe()

count      3899.000000
mean       6750.578995
std       12394.154588
min           1.000000
25%        4000.000000
50%        5400.000000
75%        6250.000000
max      300000.000000
Name: Salary, dtype: float64

In [1804]:
df = df[df.Salary>1000]

In [1805]:
# some seem to be yearly salary

df[df.Salary>15000]

Unnamed: 0,Company,Job Category,Seniority,Industry,Emp_type,Location,Address,Description,Qualification,Salary,title_Data Analyst,title_Analyst(others),title_Business Analyst,title_Data Scientist,title_Reseach Scientist,title_Data(others),title_Business(others),title_Data Engineer
96,Morgan McKinley,,Min 2 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,,,57500.0,1,0,0,0,0,0,0,0
203,Morgan McKinley,,Min 5 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,Min 7-10 years of experience as Business Anal...,,135000.0,1,0,0,0,0,0,0,0
219,Morgan McKinley,,Min 5 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,Min 7-10 years of experience as Business Anal...,,135000.0,1,0,0,0,0,0,0,0
363,Cobalt Consulting Asia Pte Ltd,,Min 4 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Market Street Grace Global Raffles Singapore,,,66000.0,0,1,0,0,0,0,0,0
886,Macdonald And Company Pte Ltd,,Min 4 years (Manager),Human Resources Management/Consulting,,Singapore Across Singapore,,Experience creating end to end data science s...,,20000.0,0,0,0,1,0,0,0,0
1357,Morgan McKinley,,Min 7 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,7+ years of business analysis experience whic...,,130000.0,0,0,1,0,0,0,0,0
1358,Morgan McKinley,,Min 7 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,7+ years of business analysis experience whic...,,130000.0,0,0,1,0,0,0,0,0
1377,Morgan McKinley,,Min 7 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,7+ years of business analysis experience whic...,,130000.0,0,0,1,0,0,0,0,0
1378,Morgan McKinley,,Min 7 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,7+ years of business analysis experience whic...,,130000.0,0,0,1,0,0,0,0,0
1467,Morgan McKinley,,Min 5 years (Non-Executive),Human Resources Management/Consulting,,Singapore Across Singapore,Raffles Place One Raffles Place Tower Singap...,Min 7-10 years of experience as Business Anal...,,135000.0,1,0,0,0,0,0,0,0


In [1806]:
df['Salary'] = df.Salary.apply(lambda x: x/12 if x > 20000 else x)

In [1807]:
df[df.Salary > 20000]

# double-checked the posting to verify... guess the numbers are correct.

# AMAZON WEB SERVICES SINGAPORE PRIVATE LIMITEDSr. Data Architect, Data Warehousing & MPP

# AIA TOWER, 1 ROBINSON ROAD 048542

# Permanent

# Professional

# Information Technology
# $250,000to$300,000

# Annually

Unnamed: 0,Company,Job Category,Seniority,Industry,Emp_type,Location,Address,Description,Qualification,Salary,title_Data Analyst,title_Analyst(others),title_Business Analyst,title_Data Scientist,title_Reseach Scientist,title_Data(others),title_Business(others),title_Data Engineer
3580,DATASPARK PTE LTD,,Middle Management,Information Technology,Permanent,,COMCENTRE EXETER ROAD,RequirementsQualifications 7+ years’ experien...,,25000.0,0,0,0,0,0,1,0,0
3596,AMAZON WEB SERVICES SINGAPORE PRIVATE LIMITED,,Professional,Information Technology,Permanent,,AIA TOWER ROBINSON ROAD,RequirementsBasic Qualifications BA/BS degree...,,22916.666667,0,0,0,0,0,1,0,0


In [1808]:
df['Description'] = df.Description.apply(lambda x: str(x).replace('\r', ''))
df['Description'] = df.Description.apply(lambda x: str(x).replace('\n', ''))

In [1809]:
df['Min Years'] = [int(''.join([char for char in seniority if char in '0123456789'])) if bool(re.search(r'\d', seniority)) is True else np.NaN for seniority in df.Seniority]

In [1810]:
df['Seniority'] = [''.join(char for char in seniority if char.lower() in 'abcedfghijklmnopqrstuvwxyz ') for seniority in df.Seniority]
df['Seniority'] = [seniority.replace('Min', '') for seniority in df.Seniority]
df['Seniority'] = [seniority.replace('  years ', '') for seniority in df.Seniority]
df['Seniority'] = [seniority.replace('  year ', '') for seniority in df.Seniority]
df['Seniority'] = ['Entry Level' if 'entry' in seniority.lower() and 'level' in seniority.lower() else seniority for seniority in df.Seniority]

In [1811]:
df = df[(df.Seniority != 'None')]
df = df[(df.Seniority != 'NONE')]

In [1812]:
df['Seniority'] = ['Professional' if 'professional' in seniority.lower() else seniority for seniority in df.Seniority]
df['Seniority'] = ['Senior Executive' if 'Senior Executive' in seniority else seniority for seniority in df.Seniority]
df['Seniority'] = ['Senior Management' if 'Senior Management' in seniority else seniority for seniority in df.Seniority]
df['Seniority'] = ['Junior Executive' if 'Junior Executive' in seniority else seniority for seniority in df.Seniority]
df['Seniority'] = ['Middle Management' if 'Middle Management' in seniority else seniority for seniority in df.Seniority]
df['Seniority'] = ['Senior Manager' if 'Senior Manag' in seniority else seniority for seniority in df.Seniority]

In [1765]:
df.Seniority

0                          Min 3 years (Senior Executive)
1                          Min 2 years (Junior Executive)
2                                             Entry Level
3                          Min 5 years (Senior Executive)
4                                             Entry Level
5                          Min 2 years (Junior Executive)
6                           Min 1 year (Senior Executive)
7                          Min 7 years (Junior Executive)
8                          Min 5 years (Senior Executive)
9                          Min 7 years (Senior Executive)
10                                       Senior Executive
11                         Min 5 years (Senior Executive)
12                                         Senior Manager
13                                       Senior Executive
14                         Min 3 years (Senior Executive)
15                         Min 3 years (Junior Executive)
16                         Min 2 years (Senior Executive)
17            

In [1724]:
impute_dict = dict(zip(df.groupby('Title')['Min Years'].mean().index, df.groupby('Title')['Min Years'].mean()))

In [1725]:
df['Min Years'] = [impute_dict[title] if year != year else year for year, title in zip(df['Min Years'], df['Title'])]

In [1726]:
# df['Min Years'] = [df['Min Years'].mean() if year != year else year for year in df['Min Years']]
df['Industry'] = ['Unknown' if industry != industry else industry for industry in df['Industry']]
df['Emp_type'] = ['Unknown' if Emp_type != Emp_type else Emp_type for Emp_type in df['Emp_type']]
df['Company'] = ['Unknown' if Company.lower() == 'none' or Company.lower() == 'nan' or Company.lower() == '' else Company for Company in df.Company]
df['Description'] = ['Unknown' if Description.lower() == 'none' or Description.lower() == 'nan' or Description.lower() == '' else Description for Description in df.Description]

In [1727]:
df.to_csv('final_df.csv', index=False)

In [1728]:
pd.read_csv('final_df.csv')

Unnamed: 0,Title,Company,Seniority,Industry,Emp_type,Location,Address,Description,Salary,Min Years
0,Data Analyst,Career Edge Asia Pte Ltd,Senior Executive,Human Resources Management/Consulting,Unknown,Singapore Across Singapore,Unknown,Degree in IT/ System engineering / Business /...,7000.0,3.000000
1,Data Analyst,YGP Pte Ltd,Junior Executive,Automobile/Automotive Ancillary/Vehicle,Unknown,Singapore Central Downtown Core Tanjong Pagar,Anson Road Singapore,Minimum 2 years of relevant experiences in Bu...,3500.0,2.000000
2,Data Analyst,Unknown,Entry Level,Wood/Fibre/Paper,Unknown,Singapore Central,Bukit Merah Central Podium,Unknown,4200.0,3.212329
3,Data Analyst,nSearch Global Pte Ltd,Senior Executive,Human Resources Management/Consulting,Unknown,Singapore Central,Unknown,"Degree in Computer/Computer Science, Electroni...",6000.0,5.000000
4,Data Analyst,Adecco Personnel Pte Ltd,Entry Level,Human Resources Management/Consulting,Unknown,Singapore Across Singapore Singapore,Unknown,Unknown,4000.0,3.212329
5,Data Analyst,Recruitment Firm,Junior Executive,Human Resources Management/Consulting,Unknown,Singapore Central,Marina View Asia Square Tower Singapore,Unknown,3250.0,2.000000
6,Data Analyst,RECRUIT EXPRESS PTE LTD,Senior Executive,Human Resources Management/Consulting,Unknown,Singapore Central,Unknown,Candidate must possess at least a Bachelor's ...,4000.0,1.000000
7,Data Analyst,Sopra Steria Asia Pte Ltd,Junior Executive,"Consulting (IT, Science, Engineering & Technical)",Unknown,Singapore West Others One North,Fusionopolis Way Symbiosis,"Develop data mining, data analysis and data pr...",6000.0,7.000000
8,Data Analyst,United Overseas Bank Limited UOB,Senior Executive,Banking/Financial Services,Unknown,Singapore Across Singapore Singapore,Unknown,Minimum Bachelor Degree or equivalent profess...,6000.0,5.000000
9,Data Analyst,United Overseas Bank Limited UOB,Senior Executive,Banking/Financial Services,Unknown,Singapore Across Singapore Singapore,Unknown,Unknown,6000.0,7.000000
