In [1]:
import csv
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Chrome launch configuration consumed by the cell that creates `driver`.
# Location of a standard Windows Chrome install; adjust for other machines.
CHROME_BINARY = Path(r"C:\Program Files\Google\Chrome\Application\chrome.exe")

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# Pin the browser binary only when it really exists at this location. On any
# other machine the setting is left untouched so the default browser lookup
# applies instead of failing on a path that is not there.
if CHROME_BINARY.is_file():
    options.binary_location = str(CHROME_BINARY)

In [3]:
# Start the shared Chrome session; `collect_job_details` and `main` both use
# this module-level `driver`.
driver = webdriver.Chrome(options=options)
driver.maximize_window()
# Fixed 5-second pause after launch.
# NOTE(review): presumably gives the browser window time to settle — confirm it is still needed.
time.sleep(5)

In [4]:
def _card_text(card, tag_name, automation_id, default=''):
    """Return the stripped text of the element whose ``data-automation`` is ``automation_id``, or ``default``."""
    tag = card.find(tag_name, attrs={'data-automation': automation_id})
    return tag.text.strip() if tag is not None else default


def collect_job_details(card, url):
    """Build one job record from a search-result card plus its detail and review pages.

    The card supplies title, company, location, salary and posting date. When
    the title link carries an ``href``, the module-level ``driver`` opens the
    job page to read the summary and, if that page links to company reviews,
    opens the review page to read the rating fields.

    Args:
        card: Parsed search-result element exposing ``find(name, attrs=...)``.
        url: Search-results URL that the browser is sent back to before returning.

    Returns:
        dict: Always contains the keys ``JobTitle``, ``company``, ``location``,
        ``salary``, ``date_posted``, ``ExtractDate``, ``job_url``, ``summary``,
        ``review_url``, ``overall_rating`` and ``num_ratings``. Fields that
        could not be found are ``''``, except ``salary`` which falls back to
        ``'undisclosed'``.

    Notes:
        A timeout while waiting for the job description is reported with
        ``print`` and the record is returned with whatever could be read,
        rather than aborting the whole scrape. Other driver errors are not
        caught here.
    """
    base_url = 'https://www.jobstreet.com.my'

    job_title_tag = card.find('a', attrs={'data-automation': 'jobTitle'})

    # Every key is filled up front so all records share one schema no matter
    # which pages turn out to be reachable.
    job_details = {
        'JobTitle': job_title_tag.text.strip() if job_title_tag is not None else '',
        'company': _card_text(card, 'a', 'jobCompany'),
        'location': _card_text(card, 'a', 'jobLocation'),
        'salary': _card_text(card, 'span', 'jobSalary', default='undisclosed'),
        'date_posted': _card_text(card, 'span', 'jobListingDate'),
        'ExtractDate': datetime.today().strftime('%Y-%m-%d'),
        'job_url': '',
        'summary': '',
        'review_url': '',
        'overall_rating': '',
        'num_ratings': '',
    }

    # .get() instead of ['href'] so a title link without an href is skipped
    # instead of raising KeyError.
    job_href = job_title_tag.get('href') if job_title_tag is not None else None
    if job_href:
        # urljoin copes with both site-relative and absolute hrefs.
        job_url = urljoin(base_url, job_href)
        job_details['job_url'] = job_url

        # Navigate to the job listing and wait for the description block.
        driver.get(job_url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '[data-automation="jobAdDetails"]')
                )
            )
        except TimeoutException:
            # Keep going: parse whatever did load so one slow page does not
            # discard every job collected so far.
            print('Timed out waiting for job details:', job_url)

        soup = BeautifulSoup(driver.page_source, 'lxml')

        job_sum = soup.find('div', {'data-automation': 'jobAdDetails'})
        if job_sum is not None:
            job_details['summary'] = job_sum.text.strip().lower()

        job_review_tag = soup.find('a', attrs={'data-automation': 'job-header-company-review-link'})
        review_href = job_review_tag.get('href') if job_review_tag is not None else None
        if review_href:
            rev_url = urljoin(base_url, review_href)
            job_details['review_url'] = rev_url

            # Navigate to the company review page.
            driver.get(rev_url)
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # NOTE(review): these class names look auto-generated and may
            # change between site releases — confirm they still match.
            overalltag = soup.find('h1', class_='_1gk5bsz0 xuf42b4y kytf1h0 kytf1hl h4n4qm4 kytf1hm kytf1h21')
            if overalltag is not None:
                job_details['overall_rating'] = overalltag.text.strip()

            numRTag = soup.find('strong', class_='kytf1h3')
            if numRTag is not None:
                job_details['num_ratings'] = numRTag.text.strip()

    # Send the browser back to the search-results page before returning.
    driver.get(url)

    return job_details


def _write_jobs_csv(jobs, output_path):
    """Write the job records to ``output_path`` as UTF-8 CSV with a header row."""
    fieldnames = ['JobTitle', 'company', 'location', 'salary', 'date_posted',
                  'ExtractDate', 'job_url', 'summary', 'review_url',
                  'overall_rating', 'num_ratings']
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(jobs)


def main(start_url='https://www.jobstreet.com.my/internship-for-computer-science-students-jobs',
         max_pages=1,
         output_path='results.csv'):
    """Run the main program routine.

    Loads search-result pages with the module-level ``driver``, collects one
    record per result card via ``collect_job_details`` and saves all records
    to a CSV file.

    Args:
        start_url: First search-results page to load.
        max_pages: Maximum number of result pages to scrape.
        output_path: Destination CSV file.

    Notes:
        Paging stops early when the page has no "Next" link or that link has
        no ``href``. If scraping fails part-way, the records gathered so far
        are written to ``output_path`` before the exception is re-raised.
    """
    jobs = []
    page_count = 0  # Pages scraped so far
    url = start_url

    try:
        # extract the job data
        while url and page_count < max_pages:
            driver.get(url)

            soup = BeautifulSoup(driver.page_source, 'lxml')
            # NOTE(review): this class list looks auto-generated and may
            # change between site releases — confirm it still matches cards.
            cards = soup.find_all('div', 'y735df0 _1akoxc50 _1akoxc56')

            for card in cards:
                job_details = collect_job_details(card, url)
                jobs.append(job_details)
                print(job_details)

            page_count += 1
            print("Page count:", page_count)

            # A missing link and a link without href both end paging; the
            # latter used to raise TypeError on the string concatenation.
            next_link = soup.find('a', {'aria-label': 'Next'})
            next_href = next_link.get('href') if next_link is not None else None
            url = urljoin('https://www.jobstreet.com.my', next_href) if next_href else None
    except BaseException:
        # Keep what was already scraped, but do not overwrite an existing
        # file with an empty one when nothing was collected.
        if jobs:
            _write_jobs_csv(jobs, output_path)
        raise

    # save the job data
    _write_jobs_csv(jobs, output_path)

In [5]:
# Run the scraper with its defaults: at most one results page, saved to results.csv.
# Each collected job record is printed as it is gathered.
main()

{'JobTitle': 'IT Internship', 'company': 'Vision Technology Consulting Sdn Bhd', 'location': 'Kuala Lumpur', 'salary': 'undisclosed', 'date_posted': '2d ago', 'ExtractDate': '2024-04-27', 'job_url': 'https://www.jobstreet.com.my/job/75397128?type=standout&ref=search-standalone&origin=cardTitle#sol=965028df0a49ffefbdca3c615da1a832a0712ef9', 'summary': "responsibilities:analysis & design of enterprise software application, data model, integration and/or server architecturecoordinate, plan and deliver on all tasks related to systems infrastructure design, installation configuration, administration, across all environments.prepare functional, technical, integration and/or server architecture design specificationssoftware configuration, development & customizationperform operational tasks and activities (e.g. application monitoring, performance tuning and migration between environments)implements & govern code migration processes to reduce risks in projectprovide application supportassist w