# Import libraries

In [1]:
import pandas as pd
#from autoscraper import AutoScraper
from bs4 import BeautifulSoup
import requests
import numpy as np
import seaborn as sns
import re,random, time
from tqdm import tqdm

# Web Scraping Code

In [2]:
def _nested_description(item, key):
    """Return item[key]['description'], or '' if the key is absent, null, or has no description."""
    return (item.get(key) or {}).get('description', '')


def scrape_article_ids(api_url, max_pages, timeout=30):
    """Page through the search API and collect per-listing metadata.

    For every page number from 1 to ``max_pages`` the URL ``f'{api_url}&page={n}'``
    is requested. A page that cannot be fetched (network error, non-200 status,
    or a body that is not valid JSON) is reported with ``print`` and skipped, so
    one bad page does not discard what was already collected.

    Args:
        api_url: Search endpoint URL including its query string; ``&page=N`` is
            appended to it verbatim.
        max_pages: Number of pages to request (inclusive upper bound).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the loop indefinitely.

    Returns:
        A 10-tuple of parallel lists, one entry per listing, in this order:
        job ids, titles, companies, locations, categories, sub-categories,
        job types, salaries, roles, listing dates. Optional fields that are
        missing from a listing are stored as ''.

    Raises:
        KeyError: if a listing has no ``'id'`` or ``'title'`` key.
    """
    job_id = []
    titles = []
    companies = []
    locations = []
    categorys = []
    subCategorys = []
    job_types = []
    salarys = []
    roles = []
    listingDates = []
    columns = (job_id, titles, companies, locations, categorys,
               subCategorys, job_types, salarys, roles, listingDates)

    for page_number in tqdm(range(1, max_pages + 1)):
        page_url = f'{api_url}&page={page_number}'

        # Send an HTTP request to the API endpoint
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data from the API. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data from the API. Status Code: {response.status_code}")
            continue

        # Parse the JSON response
        try:
            data = response.json()
        except ValueError as exc:
            print(f"Failed to parse the API response as JSON. Error: {exc}")
            continue

        for item in data.get('data') or []:
            # Build the whole row before appending anything, so an exception on
            # one field can never leave the parallel lists with unequal lengths.
            row = (
                item['id'],
                item['title'],
                _nested_description(item, 'advertiser'),
                item.get('location', ''),
                _nested_description(item, 'classification'),
                _nested_description(item, 'subClassification'),
                item.get('workType', ''),
                item.get('salary', ''),
                item.get('roleId', ''),
                item.get('listingDate', ''),
            )
            for column, value in zip(columns, row):
                column.append(value)

    return job_id, titles, companies, locations, categorys, subCategorys, job_types, salarys, roles, listingDates



In [3]:
def fetch_job_article(job_id, timeout=30):
    """Download the HTML of one job posting page.

    Args:
        job_id: Identifier interpolated into
            ``https://www.jobstreet.com.my/job/{job_id}``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the scraping loop forever.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None`` (a message describing the failure is printed).
    """
    article_url = f'https://www.jobstreet.com.my/job/{job_id}'
    try:
        response = requests.get(article_url, timeout=timeout)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        # Deliberately broad: a single bad posting must not abort a long
        # scraping run; the caller treats None as "no description".
        print(f"Error fetching {job_id}: {e}")
        return None

    # No proxies are involved in the request, so report what actually happened.
    print(f"Failed to retrieve job article {job_id}. Status Code: {response.status_code}")
    return None

def extract_text(html_content, description_class='y735df0 _1pehz540'):
    """Extract the job description text from a posting's HTML.

    Args:
        html_content: HTML source of the posting page.
        description_class: Value passed as ``class_`` when searching for the
            ``<div>`` that holds the description. The default is the class
            string this notebook was written against; it looks auto-generated,
            so it presumably changes when the site is redeployed -- pass the
            current value instead of editing the function.

    Returns:
        The text of the first matching ``<div>``, with each text fragment
        stripped and the fragments joined by newlines, or the literal
        ``"No description found"`` when no such ``<div>`` exists.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    div_tag = soup.find('div', class_=description_class)
    if not div_tag:
        return "No description found"
    return div_tag.get_text(strip=True, separator='\n')

def scrape_and_store_text(job_ids, min_delay=2, max_delay=7):
    """Fetch the description of every posting and assemble the dataset columns.

    Besides ``job_ids`` this function reads the notebook-level lists ``titles``,
    ``companies``, ``locations``, ``categorys``, ``subCategorys``, ``job_types``,
    ``salarys``, ``roles`` and ``listingDates``. They must line up with
    ``job_ids`` row-for-row, so their lengths are checked before any request is
    made, rather than discovering a mismatch only after the whole scrape.

    Args:
        job_ids: Iterable of job identifiers to fetch.
        min_delay: Lower bound, in seconds, of the random pause after each fetch.
        max_delay: Upper bound, in seconds, of the random pause after each fetch.

    Returns:
        A dict of equal-length lists keyed by ``job_id``, ``job_title``,
        ``company``, ``descriptions``, ``location``, ``category``,
        ``subcategory``, ``role``, ``type``, ``salary`` and ``listingDate``
        (in that order). Postings that could not be fetched get the
        description ``"No description"``.

    Raises:
        ValueError: if any notebook-level metadata list differs in length from
            ``job_ids``.
    """
    job_ids = list(job_ids)

    metadata = {
        'job_title': titles,
        'company': companies,
        'location': locations,
        'category': categorys,
        'subcategory': subCategorys,
        'role': roles,
        'type': job_types,
        'salary': salarys,
        'listingDate': listingDates,
    }
    expected = len(job_ids)
    mismatched = {name: len(values) for name, values in metadata.items() if len(values) != expected}
    if mismatched:
        raise ValueError(
            f"Metadata lists do not match the {expected} job ids: {mismatched}"
        )

    descriptions = []
    for jid in tqdm(job_ids):
        job_article_content = fetch_job_article(jid)

        # Random pause between requests to avoid hammering the site.
        time.sleep(random.uniform(min_delay, max_delay))

        if job_article_content:
            descriptions.append(extract_text(job_article_content))
        else:
            descriptions.append("No description")

    # Key order determines the column order of the DataFrame built from this dict.
    return {
        'job_id': job_ids,
        'job_title': titles,
        'company': companies,
        'descriptions': descriptions,
        'location': locations,
        'category': categorys,
        'subcategory': subCategorys,
        'role': roles,
        'type': job_types,
        'salary': salarys,
        'listingDate': listingDates,
    }

In [4]:
# Search API URL taken from the browser's Network -> Headers panel.
# The query string embeds captured userid / usersessionid values and pageSize=30;
# scrape_article_ids appends '&page=<n>' to this string for each page it requests.
api_url = 'https://www.jobstreet.com.my/api/chalice-search/v4/search?siteKey=MY-Main&sourcesystem=houston&userqueryid=d751713988987e9331980363e24189ce-6144301&userid=744fffcb-217e-4277-aafd-60aaceaac6f9&usersessionid=744fffcb-217e-4277-aafd-60aaceaac6f9&eventCaptureSessionId=744fffcb-217e-4277-aafd-60aaceaac6f9&seekSelectAllPages=true&pageSize=30&include=seodata&locale=en-MY&solId=568d2fe8-e8ef-4998-8e24-3e1ccfb1348b'
# Number of result pages to request (pages 1..max_pages).
max_pages=100
# Ten parallel lists, one entry per listing; later cells read these names as globals.
job_id, titles, companies, locations, categorys, subCategorys,job_types,salarys,roles,listingDates = scrape_article_ids(api_url, max_pages)

100%|██████████| 100/100 [00:45<00:00,  2.22it/s]


In [5]:
# Number of distinct job ids collected (set() collapses any repeats across pages).
len(set(job_id))

3200

In [6]:
# Window of postings to keep: entries [a:b], i.e. at most the first 3000.
a, b = 0, 3000

# Apply the same slice to every parallel list so they stay aligned row-for-row.
(job_id, titles, companies, locations, categorys,
 subCategorys, job_types, salarys, roles, listingDates) = [
    column[a:b]
    for column in (job_id, titles, companies, locations, categorys,
                   subCategorys, job_types, salarys, roles, listingDates)
]

In [7]:
# Distinct job ids remaining once the lists have been limited to the [a:b] window.
len(set(job_id))

3000

In [8]:
# Fetch each posting page and extract its description. Long-running:
# scrape_and_store_text pauses a random 2-7 seconds after every request.
data = scrape_and_store_text(job_id)


 28%|██▊       | 847/3000 [1:13:12<3:32:55,  5.93s/it]

Failed to retrieve job article using all proxies.


 91%|█████████ | 2729/3000 [3:54:38<25:46,  5.71s/it]

Failed to retrieve job article using all proxies.


100%|██████████| 3000/3000 [4:17:26<00:00,  5.15s/it]


In [9]:
# Turn the dict of equal-length lists into a DataFrame (one column per key).
new_df = pd.DataFrame(data)

In [10]:
# Display the newly scraped postings.
new_df

Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate
0,76145709,Account Executive,Steel Hawk Engineering Sdn Bhd,Key responsibilities:\nAble to handle full set...,Kemaman,Accounting,Bookkeeping & Small Practice Accounting,account-executive,Full time,"RM 2,500 – RM 2,800 per month",2024-05-26T04:07:42Z
1,76146797,"Associate Supervisor, Regulatory Practice",Bank Negara Malaysia,ROLE PURPOSE:\nTo assess regulatory applicatio...,Kuala Lumpur,Banking & Financial Services,Financial Planning,supervisor,Full time,,2024-05-26T08:44:41Z
2,76146002,ADMIN ASSISTANT,GRAND BULLION GOLD SDN. BHD.,GB GOLD MALAYSIA\nWe are looking for a\nAdmin ...,Shah Alam/Subang,Administration & Office Support,Administrative Assistants,administration-officer,Full time,"RM 2,000 – RM 3,000 per month",2024-05-26T05:47:34Z
3,76146806,"Associate Supervisor, Surveillance and Enforce...",Bank Negara Malaysia,ROLE PURPOSE:\nTo conduct surveillance and tak...,Kuala Lumpur,Banking & Financial Services,Compliance & Risk,supervisor,Full time,,2024-05-26T08:49:24Z
4,76127938,"Assistant Manager, Marketing Communications",Boustead Ikano Sdn Bhd,"PURPOSE OF THE JOB\nLead, mentor and develop a...",Kuala Lumpur,Marketing & Communications,Marketing Communications,assistant-marketing-manager,Full time,"RM 5,800 – RM 7,000 per month",2024-05-24T09:56:22Z
...,...,...,...,...,...,...,...,...,...,...,...
2995,75790244,Industrial Engineer II,Mattel (Malaysia) Sdn Bhd,Ensure compliance to QMS & EHS legal and other...,Seberang Perai,Engineering,Industrial Engineering,industrial-engineer,Full time,"RM 5,000 – RM 6,500 per month",2024-05-13T02:14:42Z
2996,75932952,Customer Service Coordinator (Vietnamese Speak...,Hapag-Lloyd Business Services (Malaysia) Sdn. Bhd,Main Objectives of this position:\nProvides th...,Petaling,Call Centre & Customer Service,Customer Service - Call Centre,customer-service-coordinator,Full time,,2024-05-16T10:56:20Z
2997,76111058,Content Creator Social Media Marketing,Beeul Enterprise Sdn Bhd,"🌟 Hey there, ready to join our team as a Socia...",Petaling,Marketing & Communications,Digital & Search Marketing,marketing-and-social-media,Full time,"RM 2,800 – RM 4,000 per month",2024-05-24T05:14:10Z
2998,75790426,Sustainability Engineer I,Mattel (Malaysia) Sdn Bhd,This person has been assigned to assist the EH...,Seberang Perai,Engineering,Electrical/Electronic Engineering,sustainability-engineer,Full time,"RM 3,400 – RM 5,100 per month",2024-05-13T02:20:01Z


## Load and Update Old dataset

In [11]:
# Load the earlier version of the dataset from the /kaggle/input directory.
df_prev = pd.read_csv("/kaggle/input/jobstreet-all-job-dataset/jobstreet_all_job_dataset.csv")
df_prev

Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Petaling,Engineering,Electrical/Electronic Engineering,services-engineer,Full time,"RM 3,000 – RM 3,500 per month",2024-03-22T05:32:09Z
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Hulu Langat,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Full time,"RM 2,800 – RM 3,500 per month",2024-03-23T03:56:39Z
...,...,...,...,...,...,...,...,...,...,...,...
48283,75947041,Staff Nurse,Eternal Clinic Sdn Bhd,"REQUIREMENTS:\n(Under K- Youth Programme, an i...",Selangor,Healthcare & Medical,Nursing - General Medical & Surgical,staff-nurse,Full time,,2024-05-17T02:40:43Z
48284,75921336,Finance Executive,Zeito Plastic Components Sdn Bhd,ZEITO PLASTIC COMPONENTS is a plastic componen...,Hulu Langat,"Manufacturing, Transport & Logistics",Management,finance-executive,Full time,"RM 3,500 – RM 3,800 per month",2024-05-16T04:35:28Z
48285,75956420,Assistant Quantity Surveyor,Inta Bina Sdn Bhd,Job Responsibilities\nAssist Senior QS / CM in...,Petaling,Construction,Surveying,assistant-quantity-surveyor,Full time,,2024-05-17T06:28:46Z
48286,75961993,ISO Management System Lead,Faeth Asia Pacific Sdn. Bhd.,Job Descriptions\nMust be IRCA certified lead ...,Seberang Perai,"Manufacturing, Transport & Logistics",Quality Assurance & Control,systems-lead,Full time,,2024-05-17T09:11:05Z


In [12]:
# Stack the newly scraped rows under the previous dataset;
# ignore_index=True renumbers the combined rows from 0.
df_new = pd.concat([df_prev, new_df], ignore_index=True)
df_new

Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Petaling,Engineering,Electrical/Electronic Engineering,services-engineer,Full time,"RM 3,000 – RM 3,500 per month",2024-03-22T05:32:09Z
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Hulu Langat,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Full time,"RM 2,800 – RM 3,500 per month",2024-03-23T03:56:39Z
...,...,...,...,...,...,...,...,...,...,...,...
51283,75790244,Industrial Engineer II,Mattel (Malaysia) Sdn Bhd,Ensure compliance to QMS & EHS legal and other...,Seberang Perai,Engineering,Industrial Engineering,industrial-engineer,Full time,"RM 5,000 – RM 6,500 per month",2024-05-13T02:14:42Z
51284,75932952,Customer Service Coordinator (Vietnamese Speak...,Hapag-Lloyd Business Services (Malaysia) Sdn. Bhd,Main Objectives of this position:\nProvides th...,Petaling,Call Centre & Customer Service,Customer Service - Call Centre,customer-service-coordinator,Full time,,2024-05-16T10:56:20Z
51285,76111058,Content Creator Social Media Marketing,Beeul Enterprise Sdn Bhd,"🌟 Hey there, ready to join our team as a Socia...",Petaling,Marketing & Communications,Digital & Search Marketing,marketing-and-social-media,Full time,"RM 2,800 – RM 4,000 per month",2024-05-24T05:14:10Z
51286,75790426,Sustainability Engineer I,Mattel (Malaysia) Sdn Bhd,This person has been assigned to assist the EH...,Seberang Perai,Engineering,Electrical/Electronic Engineering,sustainability-engineer,Full time,"RM 3,400 – RM 5,100 per month",2024-05-13T02:20:01Z


In [13]:
# Keep one row per job_id. keep='first' retains the earliest occurrence; since the
# previous dataset was placed first in the concat, a posting present in both
# keeps its old row and the re-scraped copy is dropped.
df_new = df_new.drop_duplicates(subset=['job_id'], keep='first')
len(df_new)

50392

In [14]:
# Drop rows whose description carries the scraper's failure placeholder
# ("No description" / "No description found").
#   regex=False - match the text literally instead of compiling it as a pattern.
#   na=False    - a missing description (NaN, e.g. an empty cell read back from
#                 the CSV) counts as "no match", so the mask stays boolean and
#                 `~` cannot fail on NaN; such rows are kept.
has_placeholder = df_new['descriptions'].str.contains('No description', na=False, regex=False)
df_new = df_new[~has_placeholder]
len(df_new)

50391

In [15]:
from datetime import datetime

# Run date as YYYY-MM-DD. Not used by the export below; kept so the name stays defined.
current_date = datetime.now().strftime("%Y-%m-%d")

# Write the merged dataset under a fixed file name, without the index column.
# index=False is the documented way to omit the index (the previous index=None
# only worked because None is falsy); the file name needs no f-string.
df_new.to_csv("jobstreet_all_job_dataset.csv", index=False)