In [23]:
import re
import pandas as pd

In [65]:
# read csv files list_of_jobs_jobstreet.csv
job_dataset = pd.read_csv('jobstreet_cleaned_3.csv')

In [58]:
len(job_dataset)

2128

In [59]:
job_dataset.isnull().sum()

Job Title                 0
Company Name              1
Location                  0
Salary                    0
Job Classification        0
Job Sub Classification    0
Facility                  0
Posted Date               0
Job Type                  0
Job Description           0
More Detail Link          0
og_period_salary          0
Compensation              0
og_salary_currency        0
Job ID                    0
Job Category              0
Scraped Date              0
dtype: int64

In [60]:
job_dataset['Company Name'].fillna('not_written', inplace=True)
job_dataset['Salary'].fillna('not_written', inplace=True)
job_dataset['Facility'].fillna('not_written', inplace=True)

In [66]:
job_dataset.drop_duplicates(inplace=True)
job_dataset.reset_index(drop=True, inplace=True)
job_dataset.shape

(2128, 17)

In [69]:
def clean_column(job_dataset, column_name):
    corpus = []
    for i in range(0, len(job_dataset)):
        text = re.sub('[^a-zA-Z]', ' ', job_dataset[column_name][i])
        text = text.title()
        text = text.split()
        text = ' '.join(text)
        corpus.append(text)

    job_dataset[column_name] = corpus
    return job_dataset

In [70]:
job_dataset.columns

Index(['Job Title', 'Company Name', 'Location', 'Salary', 'Job Classification',
       'Job Sub Classification', 'Facility', 'Posted Date', 'Job Type',
       'Job Description', 'More Detail', 'og_period_salary', 'Compensation',
       'og_salary_currency', 'Job ID', 'Job Category', 'Scraped Date'],
      dtype='object')

In [None]:
column_name = ['Job Title', 'Company Name', 'Location','Job Description', 'Facility','Job Classification','Job Sub Classification']
for i in column_name:
    job_dataset = clean_column(job_dataset, i)

## Cleaning Salary

In [31]:
def checking_salary_period(salary):
    if 'year' in salary:
        return 'year'
    elif 'month' in salary:
        return 'month'
    elif 'week' in salary:
        return 'week'
    elif 'day' in salary:
        return 'day'
    elif 'hour' in salary:
        return 'hour'
    else:
        return 'not_written'

In [32]:
job_dataset['og_period_salary'] = job_dataset['Salary'].apply(checking_salary_period)

In [33]:
def cleaning_salary_period(salary):
    if 'per month' in salary:
        salary = re.sub(r'per month', '', salary)
    elif 'per year' in salary:
        salary = re.sub(r'per year', '', salary)
    elif 'per week' in salary:
        salary = re.sub(r'per week', '', salary)
    elif 'per day' in salary:
        salary = re.sub(r'per day', '', salary)
    elif 'per hour' in salary:
        salary = re.sub(r'per hour', '', salary)
    return salary

In [34]:
job_dataset['Salary'] = job_dataset['Salary'].apply(cleaning_salary_period)

In [35]:
job_dataset['Compensation'] = job_dataset[(job_dataset['Salary'] != 'not_written') & ~(job_dataset['Salary'].str.contains('-')) & ~(job_dataset['Salary'].str.contains('–')) & ~(job_dataset['Salary'].str.contains('[0-9]'))]['Salary']

In [36]:
def checking_currency(salary):
    currency_map = {
        'Rp': 'IDR',
        'IDR': 'IDR',
        'USD': 'USD',
        '$': 'USD',
        'RM': 'RM',
        'SGD': 'SGD',
        'MYR': 'MYR',
        'AUD': 'AUD',
        'EUR': 'EUR',
        'HKD': 'HKD',
        '฿': 'THB',
        'THB': 'THB'
    }
    for key, value in currency_map.items():
        if key in salary:
            return value
    return 'not_written'



In [37]:
job_dataset['og_salary_currency'] = job_dataset['Salary'].apply(checking_currency)

In [38]:
def cleaning_currency(salary):
    currency_symbols = ['IDR', 'Rp', '$', '(USD)', 'SGD', 'MYR', 'AUD', 'EUR', 'RM', 'HKD', '฿', '(THB)', '()']
    
    for symbol in currency_symbols:
        salary = salary.replace(symbol, '')

    if ' ' in salary:
        salary = salary.replace(' ', '')
    elif salary == 'not_written':
        salary = salary

    return salary

In [39]:
job_dataset['Salary'] = job_dataset['Salary'].apply(cleaning_currency)

In [40]:
job_dataset['Salary'] = job_dataset['Salary'].str.replace('[,. ]', '', regex=True).str.strip()

In [41]:
# replace Nan value in compensation with 'not_written'
job_dataset['Compensation'].fillna('not_written', inplace=True)

## Job ID

In [72]:
def extract_job_id(url):
    parts = url.split('/')
    job_id_with_query = parts[-1]
    job_id = job_id_with_query.split('?')[0]
    return job_id

In [73]:
job_dataset['Job ID'] = job_dataset['More Detail'].apply(extract_job_id)

In [74]:
job_dataset = job_dataset.drop_duplicates(subset='Job ID', keep='first')

In [45]:
# job_dataset['Salary'].str.contains('–') replace '–' with '-'
job_dataset['Salary'] = job_dataset['Salary'].str.replace('–', '-')

In [46]:
def average_salary(row):
    try:
        salary = row['Salary']
        if '-' in salary:
            salary = salary.split('-')
            salary = list(map(int, salary))
            salary = sum(salary)/len(salary)
        else:
            salary = salary  # Convert salary to integer
        return salary
    except ValueError as e:
        print(f"Invalid value for salary: {salary} in row {row}. Error: {e}")
        return None  # Return None for invalid salaries




In [47]:
job_dataset['Salary'] = job_dataset.apply(average_salary, axis=1)

In [48]:
# job_dataset.to_csv('jobstreet_cleaned_2.csv', index=False)

In [None]:
job_sub_categories = {
    'business intelligence': ['business intelligence','bisnis analis', 'business analyst', 'it business'],
    'cloud computing': ['cloud', 'cloud consultant', 'cloud devops engineer', 'cloud engineer', 'cloud infrastructure engineer', 'cloud security specialist', 'cloud solutions architect'],
    'cyber security': ['cyber', 'cyber security', 'data privacy', 'network security', 'security'],
    'data': ['data', 'data analyst', 'data engineer', 'data scientist', 'database administrator'],
    'design': ['design', 'desain', 'designer', 'editor', 'grafis', 'graphic', 'interior','ui', 'user experience', 'ux', 'ux researcher', 'ux writer', 'user interface'],
    'ui ux':['ui', 'user experience', 'ux', 'ux researcher', 'ux writer', 'user interface'],
    'information system': ['erp', 'information system', 'system analyst', 'odoo', 'sap','system'],
    'it': ['ai', 'ai engineer', 'ai ethics specialist', 'ai ml', 'ai project', 'ai research scientist', 'ai trainer', 'backend', 'back end', 'cloud', 'dev ops', 'devops', 'devops engineer', 'flutter', 'front end', 'front-end', 'full stack', 'fullstack', 'golang', 'help desk technician', 'ios developer', 'it', 'it manager', 'it risk management', 'it service desk', 'it staff', 'java', 'javascript developer', 'machine learning', 'machine learning engineer', 'ml analyst', 'ml consultant', 'ml engineer', 'ml modeler', 'ml researcher', 'mobile apps', 'mobile developer', 'mobile programmer', 'nextjs', 'php developer', 'python', 'react', 'security specialist', 'software developer', 'software engineer', 'systems analysis', 'technical support', 'web', 'web desain', 'web designer', 'web developer'],
    # 'product management': ['it presales(ordo)', 'product', 'product analyst', 'product owner', 'project'],
    'quality control': ['quality', 'quality assurance', 'quality control', 'software quality assurance', 'qa', 'qc'],
}

job_dataset['Job Sub Category'] = 'Other'

job_dataset['Job Title'] = job_dataset['Job Title'].str.lower()
job_sub_categories = {key.upper(): [title.lower() for title in titles] for key, titles in job_sub_categories.items()}

for category, keywords in job_sub_categories.items():
    for keyword in keywords:
        job_dataset.loc[job_dataset['Job Title'].str.contains(keyword), 'Job Sub Category'] = category

# Display the updated DataFrame

In [80]:
job_sub_categories = {
    'data': ['data mining','big data','data science','data scientist','data engineer','data analyst','data-engineer','scientist','sql'],
    # 'web development': ['web developer', 'backend','back-end', 'full stack', 'web designer','fullstack','web developer ','web desain','back end','web programmer','web','it staff'],
    'frontend':['frontend','front end','front-end','svelte','nuxt'],
    'backend':['backend','back end','back-end','ruby'],
    # 'fullstack':['fullstack','full stack','full-stack','full stack developer','fullstack developer'],
    'ui ux': ['ui ux','uiux','ui/ux designer','ui','user experience ','ux','user interface'],
    'ai/ml engineering': ['ai engineer', 'machine learning','artificial intelligence','deep learning','nlp','natural language processing','computer vision'],
    'business analyst': ['business analyst','bisnis analis','it business','business intelligence'],   
    'cloud engineering': ['cloud','cloud engineer'],
    'cyber security': ['cyber security', 'network security','cyber security ',' security','data privacy','security specialist'],
    'devops': ['devops','dev ops'],
    'it management': ['it manager','it risk management'],
    'it support': ['it support','it application support','helpdesk technician'],
    'network administration': ['network administrator','network '],
    'product management': ['product manager', 'product owner', 'product analyst','project manager'],
    'software quality_assurance': ['software quality assurance'],
    'software engineering': ['software engineer', 'software developer','javascript','developer'],
     "golang" : ["golang"],
    ".net" : [".net", "asp"],
    "php" : ["php", "code igniter", "ci", "laravel", "lumen"],
    "java" : ["java", "springboot"],
    "python" : ["python", "flask"],
    "nodejs" : ["node js", "express js", "expressjs", "nodejs", "node", "node-js"],
      "reactjs" : ["reactjs", "react-js", "react js", "react"],
    "nextjs" : ["nextjs", "next-js", "next js", "next"],
    "angularjs" : ["angular", "angularjs"],
    "fluter" : ["fluter"],
    "kotlin" : ["kotlin"],
    'programmer' : ['programmer'],
    'systems analysis': ['system analyst', 'system administrator','system engineer'],
    'mobile development': ['mobile', 'ios','android','mobile apps','mobile dev','react native','native'],
    'game development': ['game'],
    'information system': ['erp', 'odoo','information system support','information system'],
    'other_it':['"software engineer", "software developer", "software", "full-stack", "fullstack", "programmer", "javascript"']
}

job_dataset['Job Sub Category'] = 'Other'

job_dataset['Job Title'] = job_dataset['Job Title'].str.lower()
job_sub_categories = {key.upper(): [title.lower() for title in titles] for key, titles in job_sub_categories.items()}

for category, keywords in job_sub_categories.items():
    for keyword in keywords:
        job_dataset.loc[job_dataset['Job Title'].str.contains(keyword), 'Job Sub Category'] = category

# Display the updated DataFrame

In [81]:
job_dataset['Job Sub Category'].value_counts()  

Job Sub Category
Other                     1908
PHP                        136
UI UX                       17
INFORMATION SYSTEM          13
PROGRAMMER                   9
IT SUPPORT                   8
PRODUCT MANAGEMENT           7
.NET                         5
IT MANAGEMENT                4
CYBER SECURITY               3
DATA                         3
BUSINESS ANALYST             3
SYSTEMS ANALYSIS             2
MOBILE DEVELOPMENT           2
BACKEND                      2
PYTHON                       1
FRONTEND                     1
DEVOPS                       1
JAVA                         1
NETWORK ADMINISTRATION       1
CLOUD ENGINEERING            1
Name: count, dtype: int64

In [56]:
job_dataset['Job Sub Category']

0               Other
1               Other
2               Other
3               Other
4               Other
            ...      
2162            Other
2163    IT MANAGEMENT
2164            Other
2165            Other
2166            Other
Name: Job Sub Category, Length: 2128, dtype: object