# Practice Webscraping with requests and BeautifulSoup

In [2]:
import requests
from bs4 import BeautifulSoup

In [29]:
URL = "https://realpython.github.io/fake-jobs/"
page = requests.get(URL)

soup = BeautifulSoup(page.content, 'html.parser')

In [27]:
# Finding by attribute id within 'div' tag
results = soup.find(id='ResultsContainer')

In [34]:
# Uncomment below to view the results
# print(results.prettify())

In [38]:
# Finding job elements by class_ (need the underscore because class is already used in python)
job_elements = results.find_all('div', class_='card-content')

In [50]:
# Printing only html code for the first job posting
for job_element in job_elements[:1]:
    print(job_element, end='\n'*2)

<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>
<div class="content">
<p class="location">
        Stewartbury, AA
      </p>
<p class="is-small has-text-grey">
<time datetime="2021-04-08">2021-04-08</time>
</p>
</div>
<footer class="card-footer">
<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
</footer>
</div>



In [45]:
# Printing title, company, and location for only the first 5 postings
for job_element in job_elements[:5]:
    title_element = job_element.find("h2", class_="title")
    company_element = job_element.find("h3", class_="company")
    location_element = job_element.find("p", class_="location")
    print(title_element.text.strip())
    print(company_element.text.strip())
    print(location_element.text.strip())
    print()

Senior Python Developer
Payne, Roberts and Davis
Stewartbury, AA

Energy engineer
Vasquez-Davidson
Christopherville, AA

Legal executive
Jackson, Chambers and Levy
Port Ericaburgh, AA

Fitness centre manager
Savage-Bradley
East Seanview, AP

Product manager
Ramirez Inc
North Jamieview, AP



In [46]:
# Searching for python jobs
# Making sure to match any lower/upper case, I use lambda function to make sure we capture python jobs
python_jobs = results.find_all('h2', string=lambda text: 'python' in text.lower())
print(len(python_jobs))

10


In [54]:
# Searching for the parent of the python job, so we can see the rest of the job posting information
python_job_elements = [h2_element.parent.parent.parent for h2_element in python_jobs]
print(python_job_elements[:1])

[<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>
<div class="content">
<p class="location">
        Stewartbury, AA
      </p>
<p class="is-small has-text-grey">
<time datetime="2021-04-08">2021-04-08</time>
</p>
</div>
<footer class="card-footer">
<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
</footer>
</div>]


In [55]:
# Now print out text content for top 5 python jobs
for element in python_job_elements[:5]:
    title = element.find('h2', class_='title')
    company = element.find('h3', class_='company')
    location = element.find('p', class_='location')
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip(), end='\n'*2)

Senior Python Developer
Payne, Roberts and Davis
Stewartbury, AA

Software Engineer (Python)
Garcia PLC
Ericberg, AE

Python Programmer (Entry-Level)
Moss, Duncan and Allen
Port Sara, AE

Python Programmer (Entry-Level)
Cooper and Sons
West Victor, AE

Software Developer (Python)
Adams-Brewer
Brockburgh, AE



In [57]:
# Extract links for python jobs
for element in python_job_elements:
    links = element.find_all('a')
    for link in links:
        link_url = link['href']
        print(f"Apply here: {link_url}\n")

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-20.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-30.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-40.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/python-developer-50.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/back-end-web-developer-python-django-60.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github

In [58]:
# We only want the second link for the job application page.
# How do we change the above code to make it collect only the second URL?
for element in python_job_elements:
    links = element.find_all('a')[1]['href']
    print(f"Apply here: {link_url}\n")

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html

