<a href="https://colab.research.google.com/github/YahiaML/Linkedin-Web-scraping-series/blob/main/3_Info_containers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Get the webpage HTML
url = 'https://www.bayt.com/en/egypt/jobs/data-analysis-jobs/'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# Step 2: Use find_all to get the containers that hold all job info
job_containers = soup.find_all('li', {'class': 'has-pointer-d'})


def _text_or_nan(tag):
    """Return the stripped text of *tag*, or NaN when *tag* is None.

    ``find`` returns None when nothing matches, so a field that is missing
    from a container becomes NaN instead of raising.
    """
    if tag is None:
        return np.nan
    return tag.text.strip()


def _job_link_or_nan(container):
    """Return the full job URL taken from the container's <h2><a href>, or NaN.

    NaN is returned when the heading, the anchor, or the href is missing.
    """
    heading = container.find('h2')
    anchor = heading.find('a') if heading is not None else None
    href = anchor.get('href') if anchor is not None else None
    if href is None:
        return np.nan
    return "https://www.bayt.com" + href  # Making it a full URL


# Lists to store data (one entry per container, so all lists stay aligned)
job_titles, job_links, company_names, locations = [], [], [], []
posted_from_list, job_types, experience_list, other_info_list = [], [], [], []

# Step 3: Iterate over each container and extract relevant data.
# Every field is looked up inside the container, and a missing field is
# recorded as NaN, so each job contributes exactly one value to every list.
for container in job_containers:
    # Job title
    job_titles.append(_text_or_nan(container.find('h2')))

    # Job link
    job_links.append(_job_link_or_nan(container))

    # Company name
    company_names.append(_text_or_nan(container.find('b')))

    # Location
    locations.append(_text_or_nan(container.find('div', {'class': 't-mute'})))

    # Posted from
    posted_from_list.append(
        _text_or_nan(container.find('div', {'data-automation-id': 'job-active-date'}))
    )

    # Job type (Remote/On-site)
    job_types.append(_text_or_nan(container.find('li', {'class': 'jb-label-remote'})))

    # Experience level and years of experience
    experience_list.append(
        _text_or_nan(container.find('li', {'class': 'jb-label-careerlevel'}))
    )

    # Additional info (if any)
    other_info_list.append(_text_or_nan(container.find('div', {'class': 'm10t t-small'})))

# Create a DataFrame
jobs_df = pd.DataFrame({
    'Job Title': job_titles,
    'Job Link': job_links,
    'Company Name': company_names,
    'Location': locations,
    'Posted From': posted_from_list,
    'Job Type': job_types,
    'Experience': experience_list,
    "Additional Info": other_info_list,
})

# Last expression of the cell: the notebook renders the DataFrame.
jobs_df


In [None]:
# === What will happen if you rely only on find_all() and don't follow the 2 mentioned steps === #
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Get the webpage HTML
url = 'https://www.bayt.com/en/egypt/jobs/data-analysis-jobs/'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# Use find_all() on the whole page for direct extraction of each field,
# without handling missing data. Each list is collected independently, so
# nothing ties an entry in one list to the same job in another list.
headings = soup.find_all('h2')
job_titles = [job.text.strip() for job in headings]
# NOTE: this raises if any <h2> on the page has no <a> inside it.
job_links = ["https://www.bayt.com" + job.find('a').get('href') for job in headings]
company_names = [company.text.strip() for company in soup.find_all('b')]
locations = [location.text.strip() for location in soup.find_all('div', {'class': 't-mute'})]
posted_from_list = [posted.text.strip() for posted in soup.find_all('div', {'data-automation-id': 'job-active-date'})]
job_types = [job_type.text.strip() for job_type in soup.find_all('li', {'class': 'jb-label-remote'})]
experience_list = [experience.text.strip() for experience in soup.find_all('li', {'class': 'jb-label-careerlevel'})]
other_info_list = [other_info.text.strip() for other_info in soup.find_all('div', {'class': 'm10t t-small'})]

# Attempt to create a DataFrame. pandas raises ValueError when the columns
# do not all have the same length.
try:
    jobs_df = pd.DataFrame({
        'Job Title': job_titles,
        'Job Link': job_links,
        'Company Name': company_names,
        'Location': locations,
        'Posted From': posted_from_list,
        'Job Type': job_types,
        'Experience': experience_list,
        "Additional Info": other_info_list,
    })
except ValueError as e:
    print("Error creating DataFrame:")
    print(e)
    print("-------------------------------")
else:
    # A bare expression inside try/except is not displayed by the notebook,
    # so print the result explicitly.
    print(jobs_df)

# The error is caused by the lists having different lengths; show each count.
collected_counts = [
    ("number of collected job_titles:", job_titles),
    ("number of collected job_links:", job_links),
    ("number of collected company_names:", company_names),
    ("number of collected locations:", locations),
    ("number of collected posted_from_list:", posted_from_list),
    ("number of collected job_types:", job_types),
    ("number of collected experience_list:", experience_list),
    ("number of collected other_info_list:", other_info_list),
]
for label, values in collected_counts:
    print(label, len(values))
