In [2]:
import re
import pandas as pd

In [29]:
# read csv files list_of_jobs_jobstreet.csv
job_dataset = pd.read_csv('jobs_jobstreet_prep.csv')

In [30]:
len(job_dataset)

7617

In [31]:
# drop duplicates based on job id
job_dataset = job_dataset.drop_duplicates(subset='Job ID', keep='first')
job_dataset.reset_index(drop=True, inplace=True)

In [41]:
job_dataset.isnull().sum()

Job Title                    0
Company Name                 0
Location                     0
Salary                       0
Job Classification           0
Job Sub Classification       0
Facility                     0
Posted Date                  0
Job Type                  6237
Job Description           6237
More Detail                  0
Job ID                       0
Job Sub Category             0
dtype: int64

In [42]:
job_dataset['Company Name'].fillna('not_written', inplace=True)
job_dataset['Salary'].fillna('not_written', inplace=True)
job_dataset['Facility'].fillna('not_written', inplace=True)
job_dataset['Job ID'].fillna('not_written', inplace=True)
job_dataset['Job ID'].fillna('not_written', inplace=True)



In [6]:
job_sub_categories = {
    "golang" : ["golang"],
    ".net" : [".net", "asp"],
    "php" : ["php", "code igniter","laravel", "lumen"], #CI
    "java" : ["java", "springboot"],
    "python" : ["python", "flask"],
    "nodejs" : ["node js", "express js", "expressjs", "nodejs", "node", "node-js"],
    "reactjs" : ["reactjs", "react-js", "react js", "react"],
    "nextjs" : ["nextjs", "next-js", "next js", "next"],
    "angularjs" : ["angular", "angularjs"],
    "fluter" : ["fluter"],
    "kotlin" : ["kotlin"],
    "vuejs" : ["vue-js", "vuejs", "vue", "vue js"],
    "backend" : ["back-end", "backend", "back end", "ruby"],
    "frontend" : ["front-end", "frontend", "front end", "svelte", "nuxt"],
    "mobile" : ["mobile", "mobile dev", "mobile apps", "react native", "native", "android", "ios"],
    "data" : ["data-engineer", "data engineer", "de", "data", "scientist", "analyst", "sql"],
    "other_it" : ["software engineer", "software developer", "software", "full-stack", "fullstack", "programmer", "javascript"]
}

# # Create a new column 'Job Category' with default value 'Other'
job_dataset['Job Sub Category'] = 'Other'

# # Convert job titles and categories to lowercase for case-insensitive comparison
job_dataset['Job Title'] = job_dataset['Job Title'].str.lower()
job_sub_categories = {key.upper(): [title.lower() for title in titles] for key, titles in job_sub_categories.items()}

# # Iterate over job categories and update 'Job Category' column based on job title
for category, keywords in job_sub_categories.items():
    try:
        for keyword in keywords:
            job_dataset.loc[job_dataset['Job Title'].str.contains(keyword, na=False, regex=False), 'Job Sub Category'] = category
    except ValueError as e:
        print(f"ValueError: {e}")
        print(f"Problematic rows: {job_dataset[job_dataset['Job Title'].str.contains(keyword)].index}")


In [33]:
job_dataset['Job Sub Category'].value_counts()

Job Sub Category
Other                 5516
DATA                  1375
PHP                    556
OTHER_IT               109
JAVA                    12
BACKEND                  9
MOBILE                   7
Information System       6
FRONTEND                 6
.NET                     5
GOLANG                   4
PYTHON                   4
AI                       2
NODEJS                   2
RUBY                     2
IT_SUPPORT               1
QA                       1
Name: count, dtype: int64

In [43]:
def clean_column(job_dataset, column_name):
    corpus = []
    for i in range(0, len(job_dataset)):
        text = re.sub('[^a-zA-Z]', ' ', job_dataset[column_name][i])
        text = text.title()
        text = text.split()
        text = ' '.join(text)
        corpus.append(text)

    job_dataset[column_name] = corpus
    return job_dataset

In [45]:
column_name = ['Job Title', 'Company Name', 'Location','Facility','Job Classification','Job Sub Classification']
for i in column_name:
    job_dataset = clean_column(job_dataset, i)

In [49]:
job_dataset['Company Name'] = job_dataset['Company Name'].str.upper()

In [48]:
# make company name value in uppercase
job_dataset

Unnamed: 0,Job Title,Company Name,Location,Salary,Job Classification,Job Sub Classification,Facility,Posted Date,Job Type,Job Description,More Detail,Job ID,Job Sub Category
0,Admin Data Entry,PT FIRST PROPERTI ABADI,Yogyakarta,"Rp1,800,000 – Rp2,000,000 per month",Administrasi Dukungan Perkantoran,Entri Data Pengolahan Kata,Not Written,2024-03-14,Full time,PT. KONGLO MUDA PROPERTI adalah salah satu per...,https://www.jobstreet.co.id/id/job/74425476?ty...,74425476,DATA
1,Data Engineer,PT PERMODALAN NASIONAL MADANI,Jakarta Selatan,not_written,Teknologi Informasi Komunikasi,Teknik Perangkat Lunak,Asuransi Kesehatan Tunjangan Kinerja Insentif,2024-03-16,Kontrak,Qualifications:1. Pendidikan minimal S1 di bid...,https://www.jobstreet.co.id/id/job/74488809?ty...,74488809,DATA
2,Data Center Manager,EQUINIX SDN BHD,Jakarta Raya,not_written,Teknologi Informasi Komunikasi,Manajemen,Career Growth Opportunities Outstanding Compan...,2024-03-18,Full time,Equinix is the world’s digital infrastructure ...,https://www.jobstreet.co.id/id/job/74395033?ty...,74395033,DATA
3,Data Scientist,PT SERASI AUTORAYA MEMBER OF ASTRA,Jakarta Timur,not_written,Sains Teknologi,Matematika Statistik Teknik Informasi,Not Written,2024-03-14,Full time,"Job Description: Provide data modelling, minin...",https://www.jobstreet.co.id/id/job/74428274?ty...,74428274,DATA
4,Business Analyst Manager,PT HEMA MEDHAJAYA,Cikupa,not_written,Teknologi Informasi Komunikasi,Analis Bisnis Sistem,Career Growth Bpjs Working Days Monday To Friday,2024-03-16,Full time,Qualifications:Minimum bachelor's degree in bu...,https://www.jobstreet.co.id/id/job/74490352?ty...,74490352,DATA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,Sr Sales Manager Wedding Social Event,HILTON BALI RESORT,Jakarta Utara,not_written,Hospitaliti Pariwisata,Manajemen,Not Written,2024-03-15,,,https://www.jobstreet.co.id/id/job/73186431?ty...,73186431,PHP
7613,Ui Ux Consultant,PT ERNST YOUNG INDONESIA,Jakarta Raya,not_written,Teknologi Informasi Komunikasi,Arsitek,Not Written,2024-03-15,,,https://www.jobstreet.co.id/id/job/72380507?ty...,72380507,Other
7614,Retail Credit Function Head,PT BANK AMAR INDONESIA,Jakarta Raya,not_written,Penjualan,Pengembangan Bisnis Baru,Not Written,2024-03-15,,,https://www.jobstreet.co.id/id/job/73569700?ty...,73569700,Other
7615,Account Manager Oil Gas,PT TRIMAXINDO INTERNATIONAL INDONESIA,Jakarta Raya,not_written,Penjualan,Pengembangan Bisnis Baru,Not Written,2024-03-15,,,https://www.jobstreet.co.id/id/job/72358532?ty...,72358532,Other


## Cleaning Salary

In [50]:
def checking_salary_period(salary):
    if 'year' in salary:
        return 'year'
    elif 'month' in salary:
        return 'month'
    elif 'week' in salary:
        return 'week'
    elif 'day' in salary:
        return 'day'
    elif 'hour' in salary:
        return 'hour'
    else:
        return 'not_written'

In [51]:
job_dataset['period_of_salary'] = job_dataset['Salary'].apply(checking_salary_period)

In [52]:
def cleaning_salary_period(salary):
    if 'per month' in salary:
        salary = re.sub(r'per month', '', salary)
    elif 'per year' in salary:
        salary = re.sub(r'per year', '', salary)
    elif 'per week' in salary:
        salary = re.sub(r'per week', '', salary)
    elif 'per day' in salary:
        salary = re.sub(r'per day', '', salary)
    elif 'per hour' in salary:
        salary = re.sub(r'per hour', '', salary)
    return salary

In [53]:
job_dataset['Salary'] = job_dataset['Salary'].apply(cleaning_salary_period)

In [54]:
job_dataset['Compensation'] = job_dataset[(job_dataset['Salary'] != 'not_written') & ~(job_dataset['Salary'].str.contains('-')) & ~(job_dataset['Salary'].str.contains('–')) & ~(job_dataset['Salary'].str.contains('[0-9]'))]['Salary']

In [55]:
def checking_currency(salary):
    currency_map = {
        'Rp': 'IDR',
        'IDR': 'IDR',
        'USD': 'USD',
        '$': 'USD',
        'RM': 'RM',
        'SGD': 'SGD',
        'MYR': 'MYR',
        'AUD': 'AUD',
        'EUR': 'EUR',
        'HKD': 'HKD',
        '฿': 'THB',
        'THB': 'THB'
    }
    for key, value in currency_map.items():
        if key in salary:
            return value
    return 'not_written'



In [56]:
job_dataset['salary_currency'] = job_dataset['Salary'].apply(checking_currency)

In [57]:
def cleaning_currency(salary):
    currency_symbols = ['IDR', 'Rp', '$', '(USD)', 'SGD', 'MYR', 'AUD', 'EUR', 'RM', 'HKD', '฿', '(THB)', '()']
    
    for symbol in currency_symbols:
        salary = salary.replace(symbol, '')

    if ' ' in salary:
        salary = salary.replace(' ', '')
    elif salary == 'not_written':
        salary = salary

    return salary

In [58]:
job_dataset['Salary'] = job_dataset['Salary'].apply(cleaning_currency)

In [60]:
# deleting . , and space in salary
job_dataset['Salary'] = job_dataset['Salary'].str.replace('[,. ]', '', regex=True).str.strip()

In [61]:
# replace Nan value in compensation with 'not_written'
job_dataset['Compensation'].fillna('not_written', inplace=True)

In [64]:
# job_dataset['Salary'].str.contains('–') replace '–' with '-'
job_dataset['Salary'] = job_dataset['Salary'].str.replace('–', '-')

In [None]:
def average_salary(row):
    try:
        salary = row['Salary']
        if '-' in salary:
            salary = salary.split('-')
            salary = list(map(int, salary))
            salary = sum(salary)/len(salary)
        else:
            salary = salary  # Convert salary to integer
        return salary
    except ValueError as e:
        print(f"Invalid value for salary: {salary} in row {row}. Error: {e}")
        return None  # Return None for invalid salaries

job_dataset['Salary'] = job_dataset.apply(average_salary, axis=1)



In [None]:
job_dataset

## Job ID

In [72]:
# def extract_job_id(url):
#     parts = url.split('/')
#     job_id_with_query = parts[-1]
#     job_id = job_id_with_query.split('?')[0]
#     return job_id

# job_dataset['Job ID'] = job_dataset['More Detail'].apply(extract_job_id)
# job_dataset = job_dataset.drop_duplicates(subset='Job ID', keep='first')

In [None]:
# job_sub_categories = {
#     'business intelligence': ['business intelligence','bisnis analis', 'business analyst', 'it business'],
#     'cloud computing': ['cloud', 'cloud consultant', 'cloud devops engineer', 'cloud engineer', 'cloud infrastructure engineer', 'cloud security specialist', 'cloud solutions architect'],
#     'cyber security': ['cyber', 'cyber security', 'data privacy', 'network security', 'security'],
#     'data': ['data', 'data analyst', 'data engineer', 'data scientist', 'database administrator'],
#     'design': ['design', 'desain', 'designer', 'editor', 'grafis', 'graphic', 'interior','ui', 'user experience', 'ux', 'ux researcher', 'ux writer', 'user interface'],
#     'ui ux':['ui', 'user experience', 'ux', 'ux researcher', 'ux writer', 'user interface'],
#     'information system': ['erp', 'information system', 'system analyst', 'odoo', 'sap','system'],
#     'it': ['ai', 'ai engineer', 'ai ethics specialist', 'ai ml', 'ai project', 'ai research scientist', 'ai trainer', 'backend', 'back end', 'cloud', 'dev ops', 'devops', 'devops engineer', 'flutter', 'front end', 'front-end', 'full stack', 'fullstack', 'golang', 'help desk technician', 'ios developer', 'it', 'it manager', 'it risk management', 'it service desk', 'it staff', 'java', 'javascript developer', 'machine learning', 'machine learning engineer', 'ml analyst', 'ml consultant', 'ml engineer', 'ml modeler', 'ml researcher', 'mobile apps', 'mobile developer', 'mobile programmer', 'nextjs', 'php developer', 'python', 'react', 'security specialist', 'software developer', 'software engineer', 'systems analysis', 'technical support', 'web', 'web desain', 'web designer', 'web developer'],
#     # 'product management': ['it presales(ordo)', 'product', 'product analyst', 'product owner', 'project'],
#     'quality control': ['quality', 'quality assurance', 'quality control', 'software quality assurance', 'qa', 'qc'],
# }

# job_dataset['Job Sub Category'] = 'Other'

# job_dataset['Job Title'] = job_dataset['Job Title'].str.lower()
# job_sub_categories = {key.upper(): [title.lower() for title in titles] for key, titles in job_sub_categories.items()}

# for category, keywords in job_sub_categories.items():
#     for keyword in keywords:
#         job_dataset.loc[job_dataset['Job Title'].str.contains(keyword), 'Job Sub Category'] = category

# # Display the updated DataFrame

In [80]:
# job_sub_categories = {
#     'data': ['data mining','big data','data science','data scientist','data engineer','data analyst','data-engineer','scientist','sql'],
#     # 'web development': ['web developer', 'backend','back-end', 'full stack', 'web designer','fullstack','web developer ','web desain','back end','web programmer','web','it staff'],
#     'frontend':['frontend','front end','front-end','svelte','nuxt'],
#     'backend':['backend','back end','back-end','ruby'],
#     # 'fullstack':['fullstack','full stack','full-stack','full stack developer','fullstack developer'],
#     'ui ux': ['ui ux','uiux','ui/ux designer','ui','user experience ','ux','user interface'],
#     'ai/ml engineering': ['ai engineer', 'machine learning','artificial intelligence','deep learning','nlp','natural language processing','computer vision'],
#     'business analyst': ['business analyst','bisnis analis','it business','business intelligence'],   
#     'cloud engineering': ['cloud','cloud engineer'],
#     'cyber security': ['cyber security', 'network security','cyber security ',' security','data privacy','security specialist'],
#     'devops': ['devops','dev ops'],
#     'it management': ['it manager','it risk management'],
#     'it support': ['it support','it application support','helpdesk technician'],
#     'network administration': ['network administrator','network '],
#     'product management': ['product manager', 'product owner', 'product analyst','project manager'],
#     'software quality_assurance': ['software quality assurance'],
#     'software engineering': ['software engineer', 'software developer','javascript','developer'],
#      "golang" : ["golang"],
#     ".net" : [".net", "asp"],
#     "php" : ["php", "code igniter", "ci", "laravel", "lumen"],
#     "java" : ["java", "springboot"],
#     "python" : ["python", "flask"],
#     "nodejs" : ["node js", "express js", "expressjs", "nodejs", "node", "node-js"],
#       "reactjs" : ["reactjs", "react-js", "react js", "react"],
#     "nextjs" : ["nextjs", "next-js", "next js", "next"],
#     "angularjs" : ["angular", "angularjs"],
#     "fluter" : ["fluter"],
#     "kotlin" : ["kotlin"],
#     'programmer' : ['programmer'],
#     'systems analysis': ['system analyst', 'system administrator','system engineer'],
#     'mobile development': ['mobile', 'ios','android','mobile apps','mobile dev','react native','native'],
#     'game development': ['game'],
#     'information system': ['erp', 'odoo','information system support','information system'],
#     'other_it':['"software engineer", "software developer", "software", "full-stack", "fullstack", "programmer", "javascript"']
# }

# job_dataset['Job Sub Category'] = 'Other'

# job_dataset['Job Title'] = job_dataset['Job Title'].str.lower()
# job_sub_categories = {key.upper(): [title.lower() for title in titles] for key, titles in job_sub_categories.items()}

# for category, keywords in job_sub_categories.items():
#     for keyword in keywords:
#         job_dataset.loc[job_dataset['Job Title'].str.contains(keyword), 'Job Sub Category'] = category

# # Display the updated DataFrame

In [69]:
# job sub category with 'PHP' value replace to 'Other' 

job_dataset.loc[job_dataset['Job Sub Category'] == 'PHP', 'Job Sub Category'] = 'Other'

In [70]:
job_dataset['Job Sub Category'].value_counts()

Job Sub Category
Other                 6072
DATA                  1375
OTHER_IT               109
JAVA                    12
BACKEND                  9
MOBILE                   7
Information System       6
FRONTEND                 6
.NET                     5
PYTHON                   4
GOLANG                   4
AI                       2
NODEJS                   2
RUBY                     2
QA                       1
IT_SUPPORT               1
Name: count, dtype: int64

In [72]:
job_dataset
# column name = job title, company name, location, posted_date, job type, job id, job sub category, job description, more detail 
# create new dataframe using above column

cleaned_job_dataset = job_dataset[['Job Title', 'Company Name', 'Location', 'Posted Date', 'Job Type', 'Job ID', 'Job Sub Category','More Detail']]

In [73]:
cleaned_job_dataset

Unnamed: 0,Job Title,Company Name,Location,Posted Date,Job Type,Job ID,Job Sub Category,More Detail
0,Admin Data Entry,PT FIRST PROPERTI ABADI,Yogyakarta,2024-03-14,Full time,74425476,DATA,https://www.jobstreet.co.id/id/job/74425476?ty...
1,Data Engineer,PT PERMODALAN NASIONAL MADANI,Jakarta Selatan,2024-03-16,Kontrak,74488809,DATA,https://www.jobstreet.co.id/id/job/74488809?ty...
2,Data Center Manager,EQUINIX SDN BHD,Jakarta Raya,2024-03-18,Full time,74395033,DATA,https://www.jobstreet.co.id/id/job/74395033?ty...
3,Data Scientist,PT SERASI AUTORAYA MEMBER OF ASTRA,Jakarta Timur,2024-03-14,Full time,74428274,DATA,https://www.jobstreet.co.id/id/job/74428274?ty...
4,Business Analyst Manager,PT HEMA MEDHAJAYA,Cikupa,2024-03-16,Full time,74490352,DATA,https://www.jobstreet.co.id/id/job/74490352?ty...
...,...,...,...,...,...,...,...,...
7612,Sr Sales Manager Wedding Social Event,HILTON BALI RESORT,Jakarta Utara,2024-03-15,,73186431,Other,https://www.jobstreet.co.id/id/job/73186431?ty...
7613,Ui Ux Consultant,PT ERNST YOUNG INDONESIA,Jakarta Raya,2024-03-15,,72380507,Other,https://www.jobstreet.co.id/id/job/72380507?ty...
7614,Retail Credit Function Head,PT BANK AMAR INDONESIA,Jakarta Raya,2024-03-15,,73569700,Other,https://www.jobstreet.co.id/id/job/73569700?ty...
7615,Account Manager Oil Gas,PT TRIMAXINDO INTERNATIONAL INDONESIA,Jakarta Raya,2024-03-15,,72358532,Other,https://www.jobstreet.co.id/id/job/72358532?ty...
