In [30]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import math
import requests
import pandas as pd

### Method 1: Open the root URL "https://www.mycareersfuture.sg" and type the query into the search box to get the results

In [27]:
# Selenium needs a browser-specific driver binary; see these two pages for
# installation instructions and downloads:
# https://selenium-python.readthedocs.io/installation.html#drivers
# https://sites.google.com/a/chromium.org/chromedriver/downloads

# NOTE(review): absolute, machine-specific path to the chromedriver binary —
# adjust it for the machine running the notebook.
google_chrome_driver_path = '/Users/alexjzy/Desktop/Py-Projects/text_mining/chromedriver'
root_url = 'https://www.mycareersfuture.sg'

# Start a Chrome session and open the site's landing page.
driver=webdriver.Chrome(google_chrome_driver_path)
driver.get(root_url)
# Fixed pause so the page can finish loading before elements are looked up.
time.sleep(2) 
# Find the search bar (element named "search-text") and type "machine learning".
# NOTE(review): find_element_by_* is the Selenium 3 locator API — confirm the
# installed Selenium version still provides it.
driver.find_element_by_name('search-text').send_keys('machine learning') 
time.sleep(1) 
# Find the search button (id "search-button") and click it.
driver.find_element_by_id('search-button').click()
time.sleep(1) 
# Capture the HTML of the page currently shown by the browser (the search results).
html = driver.page_source

### Method 2: Put the query directly into the URL
For example, see the URL https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=0
- Set *search=* to the text you want to search for, such as 'machine%20learning'. "%20" is the URL (percent) encoding of a space; refer to https://zh.wikipedia.org/wiki/%E7%99%BE%E5%88%86%E5%8F%B7%E7%BC%96%E7%A0%81 
- Set *page=* to the index of the results page you want to jump to. Watch out for pages that return **no result**.

In [2]:
# Search-results URL template; "{}" is replaced by the page index.
basic_url = 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page={}'
# One URL for each of the first 20 result pages (page indices 0 through 19).
urls = [basic_url.format(page_index) for page_index in range(20)]
# Example of loading the first page with the Selenium driver:
# driver.get(urls[0])
# html = driver.page_source


### Start to extract the contents

In [9]:
def fetch_data(url, head, payload, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        head: Dictionary of HTTP headers passed to ``requests.get``.
        payload: Dictionary of query-string parameters passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the notebook.

    Returns:
        The parsed JSON body when the status code is 200; otherwise the
        dictionary ``{'info': 'error', 'error_code': <status code>}``.
    """
    response = requests.get(url, headers=head, params=payload, timeout=timeout)
    if response.status_code != 200:
        # Non-200 responses are reported as a small error dictionary, not raised.
        return {'info': 'error', 'error_code': response.status_code}
    return response.json()

In [19]:
# NOTE(review): absolute, machine-specific path to the chromedriver binary —
# adjust it for the machine running the notebook.
google_chrome_driver_path = '/Users/alexjzy/Desktop/Py-Projects/text_mining/chromedriver'
# Start a Chrome session; `driver` is the module-level browser handle.
# NOTE(review): each run of this cell starts another browser session and does
# not close any session that is already open.
driver=webdriver.Chrome(google_chrome_driver_path)
# JSON API endpoint for the "machine learning" search, sorted by new_posting_date.
query_url = 'https://api.mycareersfuture.sg/jobs?search=machine%20learning&sortBy=new_posting_date'
# No extra headers or query parameters are sent.
response = fetch_data(query_url, {}, {})
# Number of result pages, assuming 20 postings per page — TODO confirm page size.
# NOTE(review): on a non-200 response fetch_data returns an error dict without
# a 'count' key, so this line would raise KeyError.
result_num = math.ceil(response['count']/20)

In [11]:
# Search-results URL template; "{}" is replaced by the page index.
basic_url = 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page={}'
# One URL per result page: page indices 0 through result_num - 1.
urls = [basic_url.format(page_index) for page_index in range(result_num)]
print(result_num)
print("urls shape: ", len(urls))

11
urls shape:  11


In [43]:
# Query the JSON jobs API for the "machine learning" search and recompute the
# number of result pages.
query_url = 'https://api.mycareersfuture.sg/jobs?search=machine%20learning&sortBy=new_posting_date'
# No extra headers or query parameters are sent.
response = fetch_data(query_url, {}, {})
# Assumes 20 postings per page — TODO confirm page size.
# NOTE(review): an error response from fetch_data has no 'count' key and would
# raise KeyError here.
result_num = math.ceil(response['count']/20)

In [44]:
# Display the list of search-result page URLs.
urls

['https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=0',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=1',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=2',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=3',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=4',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=5',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=6',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=7',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=8',
 'https://www.mycareersfuture.sg/search?search=machine%20learning&sortBy=new_posting_date&page=9',
 'https://

In [45]:
def get_job_description(uuid):
    """Fetch one job posting from the API and extract its text, skills and salary.

    Args:
        uuid: Job identifier substituted into the ``/job/{uuid}`` API path.

    Returns:
        A 5-tuple ``(description, requirements, skills, salary_max, salary_min)``:
        the first two are the posting's HTML fields reduced to stripped text,
        ``skills`` is the list of ``'skill'`` values, and the last two are the
        ``max_monthly_salary`` and ``min_monthly_salary`` fields.

    Raises:
        KeyError: If the response lacks an expected field, for example when
            ``fetch_data`` returned its error dictionary.
    """
    def _plain_text(markup):
        # BeautifulSoup cannot parse None, so a null field becomes empty text.
        return BeautifulSoup(markup or '').get_text(strip=True)

    api_jd_url = 'https://api.mycareersfuture.sg/job/{}'.format(uuid)
    # Named `job` rather than `json` to avoid shadowing the stdlib module name.
    job = fetch_data(api_jd_url, {}, {})
    jd = _plain_text(job['job_description'])
    jr = _plain_text(job['other_requirements'])
    jsk = [item['skill'] for item in job['skills']]
    # Fix: the two salary fields were previously swapped (max read the minimum
    # field and min read the maximum field).
    sal_max = job['max_monthly_salary']
    sal_min = job['min_monthly_salary']
    return jd, jr, jsk, sal_max, sal_min
    

In [46]:
def get_detail(card):
    """Collect the fields of one job card into a dictionary.

    Args:
        card: Parsed HTML element for a single job card; it must support
            ``find`` and ``find_all`` (a BeautifulSoup tag in this notebook).

    Returns:
        A dict with the keys company, job_title, location, employment_type,
        seniority, category, job_description, job_requirement, job_skills,
        job_uuid, salary_min and salary_max. The four fields from location
        to category are ``None`` when the card has no matching ``<p>``.
    """
    def _optional_text(field_name):
        # Text of the first <p name=field_name>, or None when there is none.
        matches = card.find_all("p", {"name": field_name})
        return matches[0].get_text() if len(matches) > 0 else None

    # Company and job title are looked up without a fallback.
    detail = {
        "company": card.find("p", {"name": "company"}).get_text(),
        "job_title": card.find("h1", {"name": "job_title"}).get_text(),
    }
    for field_name in ("location", "employment_type", "seniority", "category"):
        detail[field_name] = _optional_text(field_name)

    # The last dash-separated token of the first link's href is used as the job id.
    link_target = card.find("a", href=True)['href']
    job_uuid = link_target.split('-')[-1]

    # Fetch the raw-text description and requirements, plus skills and salary.
    job_description, job_requirement, job_skills, salary_max, salary_min = get_job_description(job_uuid)
    detail.update({
        "job_description": job_description,
        "job_requirement": job_requirement,
        "job_skills": job_skills,
        "job_uuid": job_uuid,
        "salary_min": salary_min,
        "salary_max": salary_max,
    })
    return detail

In [47]:
def get_card_info(page_url, res):
    """Load one search-results page and append a detail dict per job card to ``res``.

    Args:
        page_url: URL of the results page to open with the module-level
            Selenium ``driver``.
        res: List that receives one ``get_detail`` dictionary for each job
            card found; it is modified in place.

    Returns:
        None. When the page has no ``card-list`` container, ``res`` is left
        unchanged instead of raising ``AttributeError``.
    """
    driver.get(page_url)
    # Fixed pause so the page can finish loading before its source is read.
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source)
    card_jobs = soup.find("div", {"class": "card-list"})
    if card_jobs is None:
        # No card container on this page (e.g. a page with no results).
        return
    for card in card_jobs.find_all("div", {"class": "card relative"}):
        res.append(get_detail(card))

In [48]:
# Accumulator for the per-job dictionaries; get_card_info appends to it in place.
result = []
# Visit every results page in turn and collect the details of each job card.
for url in urls:
    get_card_info(url, result)


In [49]:
# One row per scraped job; columns follow the keys of the collected dictionaries.
career_res = pd.DataFrame(result)
# Collapse each job's list of skills into a single comma-separated string.
career_res['job_skills'] = career_res['job_skills'].apply(', '.join)

In [70]:
# Take the job description of the row labelled 0 as a sample to inspect.
test = career_res["job_description"][0]

In [71]:
# Parse the sample text with BeautifulSoup and display its stripped text.
# NOTE(review): get_job_description already applies get_text(strip=True) to
# this field, so this second pass looks redundant — confirm it is still needed.
soup_test = BeautifulSoup(test)
soup_test.get_text(strip=True)

'Join Fraud Investigation & Dispute Services (FIDS) within Assurance and you will help clients work through complex issues of fraud, regulatory compliance and business disputes. You will be part of a multidisciplinary, culturally aligned team that works with clients and their legal advisors.The opportunityYou will experience ongoing professional development through diverse experiences, world-class learning and individually tailored coaching. Thatâ€™s how we develop outstanding leaders who team to deliver on our promises to all of our stakeholders, and in so doing, play a critical role in building a better working world for our people, for our clients and for our communities. Sound interesting? Well this is just the beginning. Because whenever you join, however long you stay, the exceptional EY experience lasts a lifetime.Skills and attributes for successWork on client engagement teams in carrying out both reactive and proactive forensic analysis of large amounts of structured and unstr