## Jobstreet scraping function

### Import modules

In [23]:
import random
import re
import time
from datetime import datetime, timedelta

import htmldate
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from htmldate import find_date
from lxml.html import fromstring
from requests_html import HTMLSession

from headers import headers_list

In [3]:
# Get number of jobs and pages for the particular job
def get_job_num(base_url, max_jobs=300):
    """Return the number of matching jobs and how many result pages to scrape.

    The job count is the last comma-grouped integer found in the text
    returned by ``extract_page_str(base_url)``.

    Args:
        base_url: Search-results URL handed to ``extract_page_str``.
        max_jobs: Upper bound on how many jobs to page through
            (default 300, i.e. at most 30 pages of 10).

    Returns:
        Tuple ``(job_num, max_pages)``. Both are 0 when the text holds
        no number.
    """
    jobs_per_page = 10
    jobs_num_str = extract_page_str(base_url)
    matches = re.findall(r'(?<!,)\b(\d{1,3}(?:,\d{3})*)\b(?!,)', jobs_num_str)
    # No number in the text: report zero jobs instead of raising IndexError.
    job_num = int(matches[-1].replace(',', '')) if matches else 0
    print("No. of Jobs to Scrape:", job_num)
    # Cap the crawl so a huge result set does not trigger hundreds of requests.
    max_pages = int(np.ceil(min(job_num, max_jobs) / jobs_per_page))
    print("No. of Pages to Scrape:", max_pages)
    return job_num, max_pages

In [4]:
# Get the page content in soup format for the particular page
def get_page_soup(x, base_url):
    """Download one results page and return it parsed with BeautifulSoup.

    ``x`` is the zero-based page index: page 0 requests ``base_url``
    unchanged, later pages append ``&start=<x*10>``. A random entry of
    ``headers_list`` is sent as the request headers.
    """
    # NOTE(review): the suffix begins with '&' even when base_url carries no
    # '?' query string -- confirm the site accepts this form.
    suffix = "" if x == 0 else "&start=" + str(x*10)
    response = requests.get(
        base_url + suffix,
        headers=random.choice(headers_list),
        timeout=100,
    )
    return BeautifulSoup(response.content, "html.parser")

In [5]:
# Get the overall job info of a particular job
def get_job_info(job, url="https://www.jobstreet.com"):
    """Collect the details of one job listing into a dict.

    Args:
        job: Parsed job card from a results page. It must contain the
            ``<a class="DvvsL_0 _1p9OP">`` link to the job's detail page.
        url: Site root that the card's relative ``href`` is appended to.

    Returns:
        Dict with keys ``Job_Title``, ``Company``, ``Location``,
        ``Date_Posted``, ``Description`` and ``Job_URL``. ``Date_Posted``
        is ``None`` when no date can be detected on the detail page.

    Raises:
        TypeError: if the card has no such link (``find`` returns ``None``).
    """
    job_url = url + job.find('a', attrs={'class': 'DvvsL_0 _1p9OP'})['href']
    # Request the English version of the detail page.
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    # The context manager closes the session, so one is not leaked per job.
    with HTMLSession() as session:
        response = session.get(job_url, headers=headers, timeout=100)
    job_soup = BeautifulSoup(response.content, 'html.parser')

    # Detect the posting date in the page already downloaded; this avoids a
    # second fetch of job_url and any reliance on names outside this scope.
    try:
        date = find_date(response.text, url=job_url)
    except Exception:
        date = None

    return {'Job_Title': extract_title(job),
            'Company': extract_company(job),
            'Location': extract_location(job),
            'Date_Posted': date,
            'Description': extract_description_txt(job_soup),
            'Job_URL': job_url
    }
    

In [6]:
# Fetch a set of free proxy addresses ("ip:port") from sslproxies.org
def get_proxies():
    """Scrape https://sslproxies.org/ and return a set of ``"ip:port"`` strings.

    Only table rows whose 7th cell contains the text "yes" are kept. Each
    address joins the first text node of cell 1 and of cell 2 with a colon.
    """
    listing = requests.get('https://sslproxies.org/')
    tree = fromstring(listing.text)
    return {
        row.xpath('.//td[1]/text()')[0] + ":" + row.xpath('.//td[2]/text()')[0]
        for row in tree.xpath('//tbody/tr')
        if row.xpath('.//td[7][contains(text(),"yes")]')
    }

In [7]:
# Get Title
def extract_title(job):
    """Return the job title text from a job card, or ``None`` if absent.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.

    Returns:
        ``.text`` of the first ``div`` carrying the title class, or ``None``
        when no such element is found.
    """
    try:
        return job.find('div', attrs={'class': 'sx2jih0 _2j8fZ_0 sIMFL_0 _1JtWu_0'}).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

In [8]:
# Get Company name
def extract_company(job):
    """Return the company name text from a job card, or ``None`` if absent.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.

    Returns:
        ``.text`` of the first ``span`` carrying the company class, or
        ``None`` when no such element is found.
    """
    # NOTE(review): the exploratory cell later in this notebook reads the
    # company from class 'sx2jih0 zcydq82q _18qlyvc0 _18qlyvcv _18qlyvc1
    # _18qlyvc8' (zcydq82q, not zcydq82c) -- confirm which string is current.
    try:
        return job.find('span', attrs={'class': 'sx2jih0 zcydq82c _18qlyvc0 _18qlyvcv _18qlyvc1 _18qlyvc8'}).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

In [9]:
# Get Company location
def extract_location(job):
    """Return the location text from a job card, or ``None`` if absent.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.

    Returns:
        ``.text`` of the first ``span`` carrying the location class, or
        ``None`` when no such element is found.
    """
    # NOTE(review): the exploratory cell later in this notebook reads the
    # location from class 'sx2jih0 zcydq82q zcydq810 iwjz4h0' -- confirm
    # which class string is current.
    try:
        return job.find('span', attrs={'class': 'sx2jih0 zcydq82c zcydq8r iwjz4h0'}).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

In [10]:
# Get Date posted
def extract_date(job):
    """Return the publication date detected in ``job``'s HTML, or ``None``.

    Args:
        job: Parsed element or document. Its HTML serialisation
            (``str(job)``) is handed to ``htmldate.find_date``.

    Returns:
        Whatever ``find_date`` returns for that markup, or ``None`` if
        detection raises.
    """
    # Parse the markup that was passed in; nothing here may depend on a
    # URL variable living outside this function's scope.
    # NOTE(review): find_date may reject small fragments that are not a full
    # HTML document -- confirm, and prefer passing a whole page.
    try:
        return find_date(str(job))
    except Exception:
        return None

In [11]:
# Get description
def extract_description_txt(job):
    """Return the job description text, or ``'No Description'`` if absent.

    Args:
        job: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        ``get_text()`` of the first ``div`` with class ``vDEj0_0``, or the
        string ``'No Description'`` when no such element is found.
    """
    try:
        return job.find('div', attrs={'class': 'vDEj0_0'}).get_text()
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return 'No Description'

In [12]:
# Get the text of the element that holds the job count
def extract_page_str(url):
    """Fetch ``url`` and return the text of the job-count ``span``.

    A random entry of ``headers_list`` is sent as the request headers.

    Args:
        url: Search-results URL to download.

    Returns:
        ``get_text()`` of the matching ``span``, or the fallback string
        ``"['1', '0']"`` when no such element is found.
    """
    headers = random.choice(headers_list)
    html = requests.get(url, headers=headers, timeout=100)
    soup = BeautifulSoup(html.content, "html.parser")
    # NOTE(review): a recorded run in this notebook reports 0 jobs for a page
    # that lists job cards -- this class string may be stale; confirm.
    try:
        return soup.find("span", {"class": "sx2jih0 zcydq82c _18qlyvc0 _18qlyvcv _18qlyvc1 _1yp9k9z4 _18qlyvc7"}).get_text()
    except AttributeError:
        # find() returned None. The fallback text is kept unchanged because
        # callers read the last number in it ("0") as the job count.
        return str(['1', '0'])

In [14]:
# Get Jobstreet url based on country
def fetch_country_url(country):
    """Map a country name to its Jobstreet site root.

    Returns ``None`` for any name other than the five listed below; the
    comparison is exact, including letter case.
    """
    site_roots = (
        ('Singapore', 'https://www.jobstreet.com.sg'),
        ('Malaysia', 'https://www.jobstreet.com.my'),
        ('Indonesia', 'https://www.jobstreet.co.id'),
        ('Philippines', 'https://www.jobstreet.com.ph'),
        ('Vietnam', 'https://www.jobstreet.vn'),
    )
    for name, root in site_roots:
        if country == name:
            return root
    return None

In [15]:
# Get number of jobs and pages for the particular job
def get_job_num(base_url, max_jobs=300):
    """Return the number of matching jobs and how many result pages to scrape.

    The job count is the last comma-grouped integer found in the text
    returned by ``extract_page_str(base_url)``.

    Args:
        base_url: Search-results URL handed to ``extract_page_str``.
        max_jobs: Upper bound on how many jobs to page through
            (default 300, i.e. at most 30 pages of 10).

    Returns:
        Tuple ``(job_num, max_pages)``. Both are 0 when the text holds
        no number.
    """
    jobs_per_page = 10
    jobs_num_str = extract_page_str(base_url)
    matches = re.findall(r'(?<!,)\b(\d{1,3}(?:,\d{3})*)\b(?!,)', jobs_num_str)
    # No number in the text: report zero jobs instead of raising IndexError.
    job_num = int(matches[-1].replace(',', '')) if matches else 0
    print("No. of Jobs to Scrape:", job_num)
    # Cap the crawl so a huge result set does not trigger hundreds of requests.
    max_pages = int(np.ceil(min(job_num, max_jobs) / jobs_per_page))
    print("No. of Pages to Scrape:", max_pages)
    return job_num, max_pages

# Get the page content in soup format for the particular page
def get_page_soup(x, base_url):
    """Download one results page and return it parsed with BeautifulSoup.

    ``x`` is the zero-based page index: page 0 requests ``base_url``
    unchanged, later pages append ``&start=<x*10>``. A random entry of
    ``headers_list`` is sent as the request headers.
    """
    # NOTE(review): the suffix begins with '&' even when base_url carries no
    # '?' query string -- confirm the site accepts this form.
    suffix = "" if x == 0 else "&start=" + str(x*10)
    response = requests.get(
        base_url + suffix,
        headers=random.choice(headers_list),
        timeout=100,
    )
    return BeautifulSoup(response.content, "html.parser")

# Get the overall job info of a particular job
def get_job_info(job, url="https://www.jobstreet.com"):
    """Collect the details of one job listing into a dict.

    Args:
        job: Parsed job card from a results page. It must contain the
            ``<a class="DvvsL_0 _1p9OP">`` link to the job's detail page.
        url: Site root that the card's relative ``href`` is appended to.

    Returns:
        Dict with keys ``Job_Title``, ``Company``, ``Location``,
        ``Date_Posted``, ``Description`` and ``Job_URL``. ``Date_Posted``
        is ``None`` when no date can be detected on the detail page.

    Raises:
        TypeError: if the card has no such link (``find`` returns ``None``).
    """
    job_url = url + job.find('a', attrs={'class': 'DvvsL_0 _1p9OP'})['href']
    # Request the English version of the detail page.
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    # The context manager closes the session, so one is not leaked per job.
    with HTMLSession() as session:
        response = session.get(job_url, headers=headers, timeout=100)
    job_soup = BeautifulSoup(response.content, 'html.parser')

    # Detect the posting date in the page already downloaded; this avoids a
    # second fetch of job_url and any reliance on names outside this scope.
    try:
        date = find_date(response.text, url=job_url)
    except Exception:
        date = None

    return {'Job_Title': extract_title(job),
            'Company': extract_company(job),
            'Location': extract_location(job),
            'Date_Posted': date,
            'Description': extract_description_txt(job_soup),
            'Job_URL': job_url
    }

# Fetch a set of free proxy addresses ("ip:port") from sslproxies.org
def get_proxies():
    """Scrape https://sslproxies.org/ and return a set of ``"ip:port"`` strings.

    Only table rows whose 7th cell contains the text "yes" are kept. Each
    address joins the first text node of cell 1 and of cell 2 with a colon.
    """
    listing = requests.get('https://sslproxies.org/')
    tree = fromstring(listing.text)
    return {
        row.xpath('.//td[1]/text()')[0] + ":" + row.xpath('.//td[2]/text()')[0]
        for row in tree.xpath('//tbody/tr')
        if row.xpath('.//td[7][contains(text(),"yes")]')
    }

# Get Title
def extract_title(job):
    """Return the job title text from a job card, or ``None`` if absent.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.

    Returns:
        ``.text`` of the first ``div`` carrying the title class, or ``None``
        when no such element is found.
    """
    try:
        return job.find('div', attrs={'class': 'sx2jih0 _2j8fZ_0 sIMFL_0 _1JtWu_0'}).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

# Get Company name
def extract_company(job):
    """Return the company name text from a job card, or ``None`` if absent.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.

    Returns:
        ``.text`` of the first ``span`` carrying the company class, or
        ``None`` when no such element is found.
    """
    # NOTE(review): the exploratory cell later in this notebook reads the
    # company from class 'sx2jih0 zcydq82q _18qlyvc0 _18qlyvcv _18qlyvc1
    # _18qlyvc8' (zcydq82q, not zcydq82c) -- confirm which string is current.
    try:
        return job.find('span', attrs={'class': 'sx2jih0 zcydq82c _18qlyvc0 _18qlyvcv _18qlyvc1 _18qlyvc8'}).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

# Get Company location
def extract_location(job):
    """Return the location text from a job card, or ``None`` if absent.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.

    Returns:
        ``.text`` of the first ``span`` carrying the location class, or
        ``None`` when no such element is found.
    """
    # NOTE(review): the exploratory cell later in this notebook reads the
    # location from class 'sx2jih0 zcydq82q zcydq810 iwjz4h0' -- confirm
    # which class string is current.
    try:
        return job.find('span', attrs={'class': 'sx2jih0 zcydq82c zcydq8r iwjz4h0'}).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

# Get Date posted
def extract_date(job):
    """Return the publication date detected in ``job``'s HTML, or ``None``.

    Args:
        job: Parsed element or document. Its HTML serialisation
            (``str(job)``) is handed to ``htmldate.find_date``.

    Returns:
        Whatever ``find_date`` returns for that markup, or ``None`` if
        detection raises.
    """
    # Parse the markup that was passed in; nothing here may depend on a
    # URL variable living outside this function's scope.
    # NOTE(review): find_date may reject small fragments that are not a full
    # HTML document -- confirm, and prefer passing a whole page.
    try:
        return find_date(str(job))
    except Exception:
        return None

# Get description
def extract_description_txt(job):
    """Return the job description text, or ``'No Description'`` if absent.

    Args:
        job: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        ``get_text()`` of the first ``div`` with class ``vDEj0_0``, or the
        string ``'No Description'`` when no such element is found.
    """
    try:
        return job.find('div', attrs={'class': 'vDEj0_0'}).get_text()
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return 'No Description'

# Get the text of the element that holds the job count
def extract_page_str(url):
    """Fetch ``url`` and return the text of the job-count ``span``.

    A random entry of ``headers_list`` is sent as the request headers.

    Args:
        url: Search-results URL to download.

    Returns:
        ``get_text()`` of the matching ``span``, or the fallback string
        ``"['1', '0']"`` when no such element is found.
    """
    headers = random.choice(headers_list)
    html = requests.get(url, headers=headers, timeout=100)
    soup = BeautifulSoup(html.content, "html.parser")
    # NOTE(review): a recorded run in this notebook reports 0 jobs for a page
    # that lists job cards -- this class string may be stale; confirm.
    try:
        return soup.find("span", {"class": "sx2jih0 zcydq82c _18qlyvc0 _18qlyvcv _18qlyvc1 _1yp9k9z4 _18qlyvc7"}).get_text()
    except AttributeError:
        # find() returned None. The fallback text is kept unchanged because
        # callers read the last number in it ("0") as the job count.
        return str(['1', '0'])

# Get Jobstreet url based on country
def fetch_country_url(country):
    """Map a country name to its Jobstreet site root.

    Returns ``None`` for any name other than the five listed below; the
    comparison is exact, including letter case.
    """
    site_roots = (
        ('Singapore', 'https://www.jobstreet.com.sg'),
        ('Malaysia', 'https://www.jobstreet.com.my'),
        ('Indonesia', 'https://www.jobstreet.co.id'),
        ('Philippines', 'https://www.jobstreet.com.ph'),
        ('Vietnam', 'https://www.jobstreet.vn'),
    )
    for name, root in site_roots:
        if country == name:
            return root
    return None

## Jobstreet scraper

### Configuration

In [35]:
# Target market; fetch_country_url returns None for names it does not list
country = "Malaysia"
COUNTRY_URL = fetch_country_url(country)
# Search-URL template: scrape_jobs fills '{}' with the hyphenated job title
BASE_URL = COUNTRY_URL + '/en/job-search/{}-jobs/'
BASE_URL

'https://www.jobstreet.com.my/en/job-search/{}-jobs/'

### Scraping functions

In [36]:
def scrape_jobs(title):
    """Scrape the listed jobs for ``title`` into a DataFrame.

    Builds the search URL from the module-level ``BASE_URL``, asks
    ``get_job_num`` how many pages to visit, then collects one record per
    job card through ``get_job_info``.

    Args:
        title: Job title to search for; it is lower-cased and its spaces
            become hyphens in the URL.

    Returns:
        DataFrame with one row per job (the ``get_job_info`` fields plus
        ``Role``), with newline characters removed from string values.
        Empty when no pages are scraped.
    """
    initial = datetime.now()
    interval = initial
    url = BASE_URL.format(title.lower().replace(' ', '-'))
    print(url)
    # Get number of pages to scrape
    _job_num, max_pages = get_job_num(url)
    output = []
    for i in range(max_pages):
        page_soup = get_page_soup(i, url)
        # Job cards are <article> elements. A space-separated class list is
        # not a compound class selector (CSS reads the later tokens as
        # descendant tag names), so select by tag instead.
        for job in page_soup.find_all('article'):
            # Skip articles without the detail-page link get_job_info needs.
            if job.find('a', attrs={'class': 'DvvsL_0 _1p9OP'}) is None:
                continue
            infos = get_job_info(job, COUNTRY_URL)
            infos['Role'] = title
            output.append(infos)
            # Randomised pause between detail-page requests.
            time.sleep(random.uniform(2, 4))
        print("Successfuly Scrapped Jobs Page No {}/{}. Time taken: {}".format(i+1, max_pages, datetime.now() - interval))
        interval = datetime.now()
    df_output = pd.DataFrame.from_dict(output)
    df_output = df_output.replace('\n', '', regex=True)
    print("Total time taken: {}".format(datetime.now() - initial))
    return df_output

In [37]:
# Probe the '+'-separated search path (the recorded response is a 404)
url = "https://www.jobstreet.com.my/en/job-search/Business+Manager/"
page = requests.get(url)
page

<Response [404]>

In [38]:
# Probe the lower-case, hyphenated '<title>-jobs' path (the recorded response is a 200)
url = "https://www.jobstreet.com.my/en/job-search/business-manager-jobs/"
page = requests.get(url)
page

<Response [200]>

In [40]:
# Check what get_job_num detects for the search URL set in the cell above
job_num, max_pages = get_job_num(url)
job_num, max_pages

No. of Jobs to Scrape: 0
No. of Pages to Scrape: 0


(0, 0)

In [47]:
# Fetch the first results page (index 0) and dump its HTML for inspection
page_soup = get_page_soup(0, url)
print(page_soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="/static/shared-web/favicon-4e1897dfd0901e8a3bf7e604d3a90636.ico" rel="icon"/>
  <link href="/static/shared-web/iphone-2a9b65f22fc35e35808fcc317eb63810.png" rel="apple-touch-icon"/>
  <link href="/static/shared-web/ipad-d50023448fe0126ad1da4390a4af7f72.png" rel="apple-touch-icon" sizes="76x76"/>
  <link href="/static/shared-web/iphoneRetina-e8d65115bab819c629d8265de1e94120.png" rel="apple-touch-icon" sizes="120x120"/>
  <link href="/static/shared-web/ipadRetina-a71dfaf93883a40d06c0c7b6a97fad99.png" rel="apple-touch-icon" sizes="152x152"/>
  <meta content="/static/shared-web/banner-0c2ac79883746c7700892a4915e53610.png" name="twitter:image"/>
  <meta content="summary" name="twitter:card"/>
  <meta content="@JobStreetMY" name="twitter:site"/>
  <meta content="/static/shared-web/banner-0c2ac79883746c7700892a4915e53610.png" property=

In [53]:
# Earlier attempt, selecting job cards by their full class list:
# job_elems = page_soup.find_all('div', class_='sx2jih0 zcydq84g zcydq83g zcydq86g zcydq85g zcydq81w zcydq82o zcydq8cg zcydq8c4')
# Select every <article> element on the results page instead
job_elems = page_soup.find_all('article')
job_elems

[<article class="sx2jih0 sx2jih1 zcydq88 zcydq83q zcydq824 _58veS_0" data-automation="job-card-0"><div class="sx2jih0 zcydq832 zcydq8d2 zcydq82a zcydq810 zcydq83q"><div class="sx2jih0 zcydq82a"><div class="sx2jih0 zcydq852 zcydq842 zcydq872 zcydq862 zcydq82a zcydq832 zcydq8d2 zcydq8cq"><div class="sx2jih0"><div class="sx2jih0 zcydq89i" data-automation="job-card-logo"><img alt="Julie's Marketing Sdn. Bhd.'s logo" class="sx2jih0 _1qfRc_0" src="https://image-service-cdn.seek.com.au/d2976227d77cf304696a2fd99c431c845b28e67a/ee4dce1061f3f616224767ad58cb2fc751b8d2dc"/></div><div class="sx2jih0 zcydq89i"><h1 class="sx2jih0 zcydq82q _18qlyvc0 _18qlyvcv _18qlyvc3 _18qlyvc8"><a class="DvvsL_0 _1p9OP" href="/en/job/business-development-manager-4659839?jobId=jobstreet-my-job-4659839&amp;sectionRank=1&amp;token=0~c3625ce6-20d8-4567-b115-0c20cae23387&amp;fr=SRP%20Job%20Listing" rel="nofollow noopener noreferrer" target="_top"><div class="sx2jih0 _2j8fZ_0 sIMFL_0 _1JtWu_0">Business Development Manager

In [71]:
def get_text(job, type_, class_):
    """Return the text of the first matching child element, or ``None``.

    Args:
        job: Parsed element exposing a BeautifulSoup-style ``find``.
        type_: Tag name to look for (first positional argument to ``find``).
        class_: Class string passed to ``find`` as ``class_``.

    Returns:
        ``.text`` of the first match, or ``None`` when nothing matches.
    """
    try:
        return job.find(type_, class_=class_).text
    except AttributeError:
        # find() returned None (no match). Catching only this keeps
        # KeyboardInterrupt and unrelated bugs from being silently swallowed.
        return None

In [70]:
# Pretty-print the first job card to inspect its tag and class structure
print(job_elems[0].prettify())

<article class="sx2jih0 sx2jih1 zcydq88 zcydq83q zcydq824 _58veS_0" data-automation="job-card-0">
 <div class="sx2jih0 zcydq832 zcydq8d2 zcydq82a zcydq810 zcydq83q">
  <div class="sx2jih0 zcydq82a">
   <div class="sx2jih0 zcydq852 zcydq842 zcydq872 zcydq862 zcydq82a zcydq832 zcydq8d2 zcydq8cq">
    <div class="sx2jih0">
     <div class="sx2jih0 zcydq89i" data-automation="job-card-logo">
      <img alt="Julie's Marketing Sdn. Bhd.'s logo" class="sx2jih0 _1qfRc_0" src="https://image-service-cdn.seek.com.au/d2976227d77cf304696a2fd99c431c845b28e67a/ee4dce1061f3f616224767ad58cb2fc751b8d2dc"/>
     </div>
     <div class="sx2jih0 zcydq89i">
      <h1 class="sx2jih0 zcydq82q _18qlyvc0 _18qlyvcv _18qlyvc3 _18qlyvc8">
       <a class="DvvsL_0 _1p9OP" href="/en/job/business-development-manager-4659839?jobId=jobstreet-my-job-4659839&amp;sectionRank=1&amp;token=0~c3625ce6-20d8-4567-b115-0c20cae23387&amp;fr=SRP%20Job%20Listing" rel="nofollow noopener noreferrer" target="_top">
        <div class="s

In [81]:
# Build one record per job card found on the results page
output = []

for job in job_elems:
    output.append(dict(
        title=get_text(job, 'div', 'sx2jih0 _2j8fZ_0 sIMFL_0 _1JtWu_0'),
        company=get_text(job, 'span', 'sx2jih0 zcydq82q _18qlyvc0 _18qlyvcv _18qlyvc1 _18qlyvc8'),
        location=get_text(job, 'span', 'sx2jih0 zcydq82q zcydq810 iwjz4h0'),
        # The card's href is site-relative, so prefix the Malaysian site root
        job_url="https://www.jobstreet.com.my" + job.find('a', class_='DvvsL_0 _1p9OP')['href'],
    ))

df_job = pd.DataFrame.from_dict(output)
df_job

Unnamed: 0,title,company,location,job_url
0,Business Development Manager,Julie's Marketing Sdn. Bhd.,Petaling Jaya,https://www.jobstreet.com.my/en/job/business-d...
1,BUSINESS DEVELOPMENT MANAGER,TEM Electronics (M) Sdn Bhd,Sungai Petani,https://www.jobstreet.com.my/en/job/business-d...
2,Business Development Manager,Rentwise Sdn Bhd,Shah Alam/Subang,https://www.jobstreet.com.my/en/job/business-d...
3,"MANAGER,BUSINESS DEVELOPMENT",ANIKA INSURANCE BROKERS SDN BHD,Kuala Lumpur,https://www.jobstreet.com.my/en/job/manager-bu...
4,Business Development Manager,Agensi Pekerjaan Job Opportunities & Biz Solut...,Petaling Jaya,https://www.jobstreet.com.my/en/job/business-d...
5,"Senior Manager,Business Planning",Green Packet International Sdn Bhd,Selangor,https://www.jobstreet.com.my/en/job/senior-man...
6,Business Development Manager - CISCO (Penang),Ingram Micro Malaysia Sdn Bhd,Selangor,https://www.jobstreet.com.my/en/job/business-d...
7,Business Development Manager,Arissto (Malaysia) Sdn. Bhd.,Selangor,https://www.jobstreet.com.my/en/job/business-d...
8,Business Development Manager,GDT SDN. BHD.,Bukit Mertajam,https://www.jobstreet.com.my/en/job/business-d...
9,SME Business Relationship Manager,Planworth Global Factoring Sdn. Bhd.,Kuala Lumpur,https://www.jobstreet.com.my/en/job/sme-busine...


### Actual Scraping

In [39]:
# Run the full scraper for a single job title
job_title = 'Business Manager'
df_job = scrape_jobs(job_title)

https://www.jobstreet.com.my/en/job-search/business-manager-jobs/
No. of Jobs to Scrape: 0
No. of Pages to Scrape: 0
Total time taken: 0:00:01.392948
