<a href="https://colab.research.google.com/github/YahiaML/Linkedin-Web-scraping-series/blob/main/8_Scrap_in_structured_format.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Let's test how to scrape the Job Details and the Preferred Candidate info in a clean format

In [None]:
# How we were scraping the Job Details before: the whole <dl> is flattened into one text blob
url = 'https://www.bayt.com/en/egypt/jobs/executive-secretary-to-the-chairman-5175578/'  # random page to test on
response = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the cell forever
soup = BeautifulSoup(response.text, 'html.parser')


# Job Details
try:
    job_details = soup.find("dl", {"class": "dlist is-spaced is-fitted t-small"}).text.strip()
except AttributeError:
    # find() returned None: the page has no <dl> with that class
    job_details = np.nan

print(job_details)


In [None]:
# How we were scraping the Preferred Candidate info before: one text blob per <dl>
# (uses the `soup` parsed in the previous cell)
try:
    preferred_candidate = [value.text.strip() for value in soup.find("h2", {"class": "h5"}, string="Preferred Candidate").find_next_siblings("dl")]
except AttributeError:
    # find() returned None: the page has no "Preferred Candidate" heading
    preferred_candidate = np.nan

print(preferred_candidate)

### This is not the neatest way to do it, so let's try another method

In [None]:
# Fetch and parse the test page
url = 'https://www.bayt.com/en/egypt/jobs/executive-secretary-to-the-chairman-5175578/'  # random page to test on
response = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the cell forever
soup = BeautifulSoup(response.text, 'html.parser')

# Job Details as a {label: value} dict: every <div> under the <dl> that follows
# the "Job Details" heading contributes its <dt> text as key and <dd> text as value.
try:
    job_details_list = soup.find("h2", {"class": "h5"}, string="Job Details").find_next_sibling("dl").find_all("div")
except AttributeError:
    # The heading or the <dl> after it is missing on this page
    job_details = np.nan
else:
    job_details = {}
    for detail in job_details_list:
        label, value = detail.find("dt"), detail.find("dd")
        if label is None or value is None:
            # Skip a <div> without a full dt/dd pair instead of throwing away every other pair
            continue
        # Strip surrounding whitespace, like every other field scraped in this notebook
        job_details[label.text.strip()] = value.text.strip()
job_details

In [None]:
# Preferred Candidate as a {label: value} dict: each <dl> sibling that follows the
# "Preferred Candidate" heading contributes its <dt> text as key and <dd> text as value.
# NOTE(review): only the first <dt>/<dd> of each <dl> is read -- confirm every sibling <dl> holds a single pair.
try:
    preferred_candidate_requirements = soup.find("h2", {"class": "h5"}, string="Preferred Candidate").find_next_siblings("dl")
except AttributeError:
    # The heading is missing on this page
    preferred_candidate = np.nan
else:
    preferred_candidate = {}
    for requirement in preferred_candidate_requirements:
        label, value = requirement.find("dt"), requirement.find("dd")
        if label is None or value is None:
            # Skip a <dl> without a full dt/dd pair instead of throwing away every other pair
            continue
        # Strip surrounding whitespace, like every other field scraped in this notebook
        preferred_candidate[label.text.strip()] = value.text.strip()
preferred_candidate

# Now, let's combine these into the main script

# Step 0:  Determine the number of pages

In [4]:
# Determine the number of pages
JOBS_PER_PAGE = 20  # number of listings per results page assumed by this notebook

url = 'https://www.bayt.com/en/egypt/jobs/data-analysis-jobs'
response = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the cell forever
response.raise_for_status()  # an HTTP error page would not contain the counter anyway; fail with the real cause
soup = BeautifulSoup(response.text, 'html.parser')

jobs_found_tag = soup.find("b", {"data-automation-id": "XJobsFound"})
if jobs_found_tag is None:
    raise RuntimeError(f"Could not find the jobs-found counter on {url}")

# Reduce the counter text ("... jobs found") to the bare number; also tolerate
# the singular wording and thousands separators, which int() would otherwise reject.
jobs_found_text = jobs_found_tag.text.replace("jobs found", "").replace("job found", "").replace(",", "").strip()
number_of_jobs = int(jobs_found_text)
number_of_pages = int(np.ceil(number_of_jobs / JOBS_PER_PAGE))

# Step 1: Scrape all main pages and collect the sub-page links

In [None]:

# Lists to store data (parallel lists: one entry per job card, in page order)
job_titles, job_links, company_names, locations = [], [], [], []
posted_from_list, job_types, experience_list, other_info_list = [], [], [], []


def _extract_or_nan(extract):
    """Return ``extract()``, or ``np.nan`` when the element it looks for is missing.

    Only AttributeError (a ``find()`` returned None) and TypeError (e.g. a
    missing ``href`` being concatenated) are treated as "field not present";
    any other exception, including KeyboardInterrupt, propagates.
    """
    try:
        return extract()
    except (AttributeError, TypeError):
        return np.nan


# Step 1: Walk through every results page (the page number is a URL query parameter)
for page_number in range(1, number_of_pages + 1):
    url = f'https://www.bayt.com/en/egypt/jobs/data-analysis-jobs/?page={page_number}'
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # Keep what has been collected so far instead of aborting the whole run
        print(f"Skipping page {page_number}: {error}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 2: Use find_all to get the containers that hold all job info
    job_containers = soup.find_all('li', {'class': 'has-pointer-d'})

    # Step 3: Iterate over each container and extract relevant data
    for container in job_containers:

        # Job title
        job_title = _extract_or_nan(lambda: container.find('h2').text.strip())

        # Job link (the href is site-relative, so prefix the host to make it a full URL)
        job_link = _extract_or_nan(lambda: "https://www.bayt.com" + container.find('h2').find('a').get('href'))

        # Company name
        company_name = _extract_or_nan(lambda: container.find('b').text.strip())

        # Location
        location = _extract_or_nan(lambda: container.find('div', {'class': 't-mute'}).text.strip())

        # Posted from
        posted_from = _extract_or_nan(lambda: container.find('div', {'data-automation-id': 'job-active-date'}).text.strip())

        # Job type (Remote/On-site)
        job_type = _extract_or_nan(lambda: container.find('li', {'class': 'jb-label-remote'}).text.strip())

        # Experience level and years of experience
        experience = _extract_or_nan(lambda: container.find('li', {'class': 'jb-label-careerlevel'}).text.strip())

        # Additional info (if any)
        other_info = _extract_or_nan(lambda: container.find('div', {'class': 'm10t t-small'}).text.strip())

        # Append info to relevant lists
        job_titles.append(job_title)
        job_links.append(job_link)
        company_names.append(company_name)
        locations.append(location)
        posted_from_list.append(posted_from)
        job_types.append(job_type)
        experience_list.append(experience)
        other_info_list.append(other_info)

# Create a DataFrame
jobs_df = pd.DataFrame({
    'Job Title': job_titles,
    'Job Link': job_links,
    'Company Name': company_names,
    'Location': locations,
    'Posted From': posted_from_list,
    'Job Type': job_types,
    'Experience': experience_list,
    "Additional Info": other_info_list,
})

jobs_df


# Step 2: Scrape the data from every job page

In [None]:

# Lists to store the data (parallel lists: one entry per row of jobs_df, in the same order)
job_titles, companies, company_pages, company_locations = [], [], [], []
posting_dates, will_be_closed_ats, salary_ranges = [], [], []
full_descriptions, full_description_htmls = [], []
skills_list, skills_htmls = [], []
jobs_details_list, preferred_candidates = [], []
# Declared by the original cell but never populated here; kept in case other cells reference them
job_details_htmls, preferred_candidates_htmls = [], []


def _field_or_nan(extract):
    """Return ``extract()``, or ``np.nan`` when the element it looks for is missing.

    AttributeError (a ``find()`` returned None), TypeError (e.g. a missing
    ``href`` being concatenated) and a plain ``None`` result all mean "field
    not present", so every missing value ends up as NaN in the DataFrame.
    Any other exception, including KeyboardInterrupt, propagates.
    """
    try:
        value = extract()
    except (AttributeError, TypeError):
        return np.nan
    return np.nan if value is None else value


def _label_value_pairs(rows):
    """Build a ``{<dt> text: <dd> text}`` dict from elements that each hold one dt/dd pair.

    Rows missing either tag are skipped so one malformed row does not discard
    the rest; labels and values are whitespace-stripped.
    """
    pairs = {}
    for row in rows:
        label, value = row.find("dt"), row.find("dd")
        if label is None or value is None:
            continue
        pairs[label.text.strip()] = value.text.strip()
    return pairs


for url in jobs_df['Job Link']:
    # Step 1 stores NaN when a card had no link, and requests.get() raises on a
    # non-URL value. For such rows (and for network failures) parse an empty
    # document so every field below becomes NaN and the row count still matches jobs_df.
    page_html = ""
    if isinstance(url, str):
        try:
            page_html = requests.get(url, timeout=30).text
        except requests.RequestException as error:
            print(f"Could not fetch {url}: {error}")
    soup = BeautifulSoup(page_html, 'html.parser')

    # Job Title
    job_title = _field_or_nan(lambda: soup.find("h1").text.strip())

    # Company
    company = _field_or_nan(lambda: soup.find("a", {"class": "is-black t-bold"}).text.strip())

    # Company Page (the href is site-relative, so prefix the host)
    company_page = _field_or_nan(lambda: "https://www.bayt.com" + soup.find("a", {"class": "is-black t-bold"}).get("href"))

    # Company Location
    company_location = _field_or_nan(lambda: soup.find("span", {"class": "t-mute"}).text.strip())

    # Posting Date
    posting_date = _field_or_nan(lambda: soup.find("div", {"class": "t-small"}).find("span").text.strip())

    # Will Be Closed At
    will_be_closed_at = _field_or_nan(lambda: soup.find("div", {"class": "t-small"}).find("span", {"class": "u-none"}).text.strip())

    # Salary Range
    salary_range = _field_or_nan(lambda: soup.find("b", {"class": "t-small"}).text.strip())

    # Full Description
    full_description = _field_or_nan(lambda: soup.find("div", {"class": "card-content p20t is-spaced"}).find("div", {"class": "t-break"}).text.strip())

    # Full Description HTML (kept as the parsed element, not converted to text)
    full_description_html = _field_or_nan(lambda: soup.find("div", {"class": "card-content p20t is-spaced"}).find("div", {"class": "t-break"}))

    # Skills
    skills = _field_or_nan(lambda: soup.find("div", {"class": "card-content is-spaced t-break print-break-before p20t"}).text.strip())

    # Skills HTML (kept as the parsed element, not converted to text)
    skills_html = _field_or_nan(lambda: soup.find("div", {"class": "card-content is-spaced t-break print-break-before p20t"}))

    # Job Details: one dt/dd pair per <div> inside the <dl> that follows the heading
    job_details = _field_or_nan(lambda: _label_value_pairs(
        soup.find("h2", {"class": "h5"}, string="Job Details").find_next_sibling("dl").find_all("div")
    ))

    # Preferred Candidate: one dt/dd pair per <dl> sibling that follows the heading
    # NOTE(review): only the first <dt>/<dd> of each <dl> is read -- confirm every sibling <dl> holds a single pair.
    preferred_candidate = _field_or_nan(lambda: _label_value_pairs(
        soup.find("h2", {"class": "h5"}, string="Preferred Candidate").find_next_siblings("dl")
    ))

    # append
    job_titles.append(job_title)
    companies.append(company)
    company_pages.append(company_page)
    company_locations.append(company_location)
    posting_dates.append(posting_date)
    will_be_closed_ats.append(will_be_closed_at)
    salary_ranges.append(salary_range)
    full_descriptions.append(full_description)
    full_description_htmls.append(full_description_html)
    skills_list.append(skills)
    skills_htmls.append(skills_html)
    jobs_details_list.append(job_details)
    preferred_candidates.append(preferred_candidate)

# Create DataFrame
full_df = pd.DataFrame({
    "Job Title": job_titles,
    "Company": companies,
    "Company Page": company_pages,
    "Company Location": company_locations,
    "Posting Date": posting_dates,
    "Will Be Closed At": will_be_closed_ats,
    "Salary Range": salary_ranges,
    "Full Description": full_descriptions,
    "Full Description HTML": full_description_htmls,
    "Skills": skills_list,
    "Skills HTML": skills_htmls,
    "Job Details": jobs_details_list,
    "Preferred Candidate": preferred_candidates,
})

# Print the DataFrame
full_df