In [282]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import re
from datetime import datetime, timedelta

pd.set_option('display.max_rows', None)

In [320]:
listing_base_url = 'https://www.seek.com.au{}'
base_url = 'https://www.seek.com.au/data-analyst-jobs/in-All-Sydney-NSW?page={}'

# List to hold job links
job_id_url = []

# Fetch pages from 1 to 2 (you can change the range as needed)
for page_num in range(1, 140):
    url = base_url.format(page_num)
    
    # Send a GET request to fetch the page content
    response = requests.get(url)
    
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all job listings
    job_listings = soup.find_all("div", class_="a3yfdf0 _5xlhbl5h _5xlhbl53")

    # Loop through each job listing
    for job in job_listings:
        # Extract the job title and link
        title_element = job.find("a", {"data-automation": "jobTitle"})
        if title_element:
            job_link = title_element['href']  # Get the relative job link
            full_job_link = listing_base_url.format(job_link)  # Create the full job link
            job_id_url.append(full_job_link)  # Append the job link to the list

In [322]:
job_id_url[:3]

['https://www.seek.com.au/job/79748646?type=promoted&ref=search-standalone&origin=cardTitle',
 'https://www.seek.com.au/job/79715282?type=promoted&ref=search-standalone&origin=cardTitle',
 'https://www.seek.com.au/job/79560131?type=standout&ref=search-standalone&origin=cardTitle']

Scape through each detail postings pages

In [317]:
job_titles = []       # Extract job title
job_titles_set = set()  # Initialize the set for unique job titles
found_job = False     # Flag to track if a job title was found

for link in job_id_url: 
    job_title_element = listing.find("h1", {"data-automation": "job-detail-title"})
    if job_title_element:
        job_title = job_title_element.text.strip()
        job_titles.append(job_title)
        job_titles_set.add(job_title)  # Add the job title directly to the set
        found_job = True

    if not found_job:
        job_titles.append('N/A')


In [316]:
# Initialize lists to hold the scraped data
salaries = []
classifications = []
work_modes = []
locations = []
companies = []

# Loop through the first three job URLs
for link in job_id_url: 
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find detail listings
    detail_listings = soup.find_all("div", class_='a3yfdf0 _5xlhbl5b _5xlhblhf _5xlhbl6r')

    # Initialize flags for data found
    found_salary = False
    found_classification = False
    found_work_mode = False
    found_location = False
    found_job = False
    found_company = False
    company_set = set()
    job_titles_set= set()

    for listing in detail_listings:

        # Extract salary
        salary_element = listing.find("span", {"data-automation": "job-detail-salary"})
        if salary_element:
            salaries.append(salary_element.text.strip())
            found_salary = True

        # Extract classification
        class_element = listing.find("span", {"data-automation": "job-detail-classifications"})
        if class_element:
            classifications.append(class_element.text.strip())
            found_classification = True

        # Extract work mode
        work_element = listing.find("span", {"data-automation": "job-detail-work-type"})
        if work_element:
            work_modes.append(work_element.text.strip())
            found_work_mode = True

        # Extract location
        loc_element = listing.find("span", {"data-automation": "job-detail-location"})
        if loc_element:
            locations.append(loc_element.text.strip())
            found_location = True

        # Extract company
        company_element = listing.find("span", {"data-automation": "advertiser-name"})
        if company_element:
            comp = company_element.text.strip()
            if comp not in company_set:  # Check if the company is already added
                companies.append(comp)
                company_set.add(comp)
                found_company = True

    # Append 'N/A' if data was not found
    if not found_salary:
        salaries.append('N/A')
    if not found_classification:
        classifications.append('N/A')
    if not found_work_mode:
        work_modes.append('N/A')
    if not found_location:
        locations.append('N/A')
    if not found_company:
        companies.append('N/A')



Extract the `keywords` in the `job_description`

In [8]:
keywords = [
    "SQL", "python", "SAS", "Power BI", "Tableau", "Excel",
    "R", "ML", "AI", "Jira", "mathematics", "statistical",
    "Databricks", "Snowflake", "ETL", "Java", "Analytics",
    "AWS", "Matlab", "quantitative"
    ]
pattern = re.compile('|'.join(keywords), re.IGNORECASE)

In [41]:
skills =[]

for link in job_id_url:
    # Fetch the job detail page
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find detail listings
    detail_listings = soup.find_all("div", {"data-automation": "jobAdDetails"})

    # Initialize a list to store matching skills
    matching_skills = []

    # Iterate over each detail listing
    for listing in detail_listings:
        # Extract all text within the listing
        text_content = listing.get_text(strip=True)
        
        # Find all keywords in the text
        found_skills = pattern.findall(text_content)
        
        # Remove duplicates and keep only unique keywords
        unique_skills = list(set(found_skills))
        
        # Append the list of unique skills to the matching_skills array
        skills.append(unique_skills)

skills[:4]

[['SQL', 'Analytics', 'ml', 'ai', 'AI', 'Jira', 'r', 'Python', 'ML'],
 ['SQL', 'Analytics', 'R', 'Java', 'ai', 'r'],
 ['R', 'statistical', 'r', 'ai'],
 ['SQL', 'R', 'Excel', 'ai', 'r']]

Extract the posting date for each posting

In [294]:
posted_dates = []  # List to store formatted posted dates

for link in job_id_url:
    response = requests.get(link)  # Fetch the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all relevant span elements
    spans = soup.find_all('span', class_='a3yfdf0 _5xlhbl4z _2s7o740 _2s7o741 _2s7o7422 _1708b944 _2s7o747')
    found_date = False
    
    for span in spans:
        # Extract the text from the span
        text = span.get_text()  # Get the content of the span
        
        # Check for days ago format
        match_days = re.search(r'(\d+)d ago', text)
        if match_days:
            days_ago = int(match_days.group(1))
            posted_date = datetime.now() - timedelta(days=days_ago)
            formatted_date = posted_date.strftime('%Y-%m-%d')
            posted_dates.append(formatted_date)
            print(f"Posted {days_ago} days ago on: {formatted_date}")
            found_date = True
            break  # Exit loop after finding the first date

        # Check for hours ago format
        match_hours = re.search(r'(\d+)h ago', text)
        if match_hours:
            hours_ago = int(match_hours.group(1))
            posted_date = datetime.now() - timedelta(hours=hours_ago)
            formatted_date = posted_date.strftime('%Y-%m-%d')
            posted_dates.append(formatted_date)
            print(f"Posted {hours_ago} hours ago on: {formatted_date}")
            found_date = True
            break  # Exit loop after finding the first date

    if not found_date:
        posted_dates.append('N/A')  # Append 'N/A' only once per link
        print(f"No date information found for link: {link}")


Posted 14 days ago on: 2024-10-12
Posted 25 days ago on: 2024-10-01
Posted 9 days ago on: 2024-10-17
Posted 1 days ago on: 2024-10-25
Posted 1 days ago on: 2024-10-25
Posted 1 days ago on: 2024-10-25
Posted 2 days ago on: 2024-10-24
Posted 3 days ago on: 2024-10-23
Posted 1 days ago on: 2024-10-25
Posted 20 hours ago on: 2024-10-25
Posted 7 days ago on: 2024-10-19
Posted 8 days ago on: 2024-10-18
Posted 7 days ago on: 2024-10-19
Posted 1 days ago on: 2024-10-25
Posted 9 days ago on: 2024-10-17
Posted 8 days ago on: 2024-10-18
Posted 9 days ago on: 2024-10-17
Posted 2 days ago on: 2024-10-24
Posted 7 days ago on: 2024-10-19
Posted 12 days ago on: 2024-10-14
Posted 18 days ago on: 2024-10-08
Posted 3 days ago on: 2024-10-23
Posted 2 days ago on: 2024-10-24
Posted 21 hours ago on: 2024-10-25
Posted 1 days ago on: 2024-10-25
Posted 1 days ago on: 2024-10-25
Posted 11 days ago on: 2024-10-15
Posted 15 days ago on: 2024-10-11
Posted 1 days ago on: 2024-10-25
Posted 14 days ago on: 2024-10-12

In [310]:
# Check the lengths of each list to easure consistency 
print("Length of company list:", len(companies))
print("Length of salaries list:", len(salaries))
print("Length of locations list:", len(locations))
print("Length of classifications list:", len(classifications))
print("Length of work_modes list:", len(work_modes))
print("Length of skills list:", len(skills))
print("Length of title list:", len(job_titles))
print("Length of posted date:", len(posted_dates))

Length of company list: 550
Length of salaries list: 550
Length of locations list: 550
Length of classifications list: 550
Length of work_modes list: 550
Length of skills list: 550
Length of title list: 550
Length of posted date: 550


Create the DataFrame to store all the valriables

In [313]:
df = pd.DataFrame({
    'job_title': job_titles,
    'company': companies,
    'location': locations,
    'work_type': work_modes,
    'classification': classifications,
    'technical_skills': skills,
    'salary': salaries,
    'posted_date':posted_dates,
    'job_id_url':job_id_url
    })

In [314]:
df.head()

Unnamed: 0,job_title,company,location,work_type,classification,technical_skills,salary,posted_date,job_id_url
0,Senior Technical Data Analyst - AI Platforms,Zurich Financial Services Australia,"North Sydney, Sydney NSW",Full time,Business/Systems Analysts (Information & Communication Technology),"[SQL, Analytics, ml, ai, AI, Jira, r, Python, ML]",,2024-10-12,https://www.seek.com.au/job/79440953?type=promoted&ref=search-standalone&origin=cardTitle
1,BI Analytics Consultant - Risk,Protecht Group,Sydney NSW,Full time,Analysts (Consulting & Strategy),"[SQL, Analytics, R, Java, ai, r]",,2024-10-01,https://www.seek.com.au/job/79174328?type=promoted&ref=search-standalone&origin=cardTitle
2,Data Analyst (APS6),Department of Education,Sydney NSW,Full time,Government - Federal (Government & Defence),"[R, statistical, r, ai]",Attractive Salary + 15.4% Superannuation,2024-10-17,https://www.seek.com.au/job/79560131?type=standout&ref=search-standalone&origin=cardTitle
3,Data and Process Analyst,ISS Facility Services,"Macquarie Park, Sydney NSW",Full time,Business/Systems Analysts (Information & Communication Technology),"[SQL, R, Excel, ai, r]",Flexible work arrangements.,2024-10-25,https://www.seek.com.au/job/79726671?type=standout&ref=search-standalone&origin=cardTitle
4,Reporting Analyst,ALDI Stores Australia,"Minchinbury, Sydney NSW",Full time,Developers/Programmers (Information & Communication Technology),"[Analytics, R, ai, r, analytics, Tableau]","$130,000 – $150,000 per year",2024-10-25,https://www.seek.com.au/job/79724348?type=standard&ref=search-standalone&origin=cardTitle


In [281]:
df.to_csv('job_listing.csv', index = False)