In [126]:
import pandas as pd

import requests
import bs4

## Extracting News Articles
(https://web-scraping-demo.zgulde.net/news)

In [7]:
# make the http request and turn the response into a beautiful soup object
response = requests.get('https://web-scraping-demo.zgulde.net/news')
html = response.text
soup = bs4.BeautifulSoup(html)

1. Find the container for the information we want `article_container`
1. Within the container, identify the entities that we want to produce a list
1. Process each individual entity (identify the pieces that we want and extract them)

In [12]:
article_container = soup.select('.grid.gap-y-12')[0]

In [19]:
articles = article_container.select('.grid.grid-cols-4.gap-x-4.border')

In [24]:
article = articles[0]
# get a pretty printed representation of our element
print(article.prettify())

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
  <h2 class="text-2xl text-green-900">
   help human fly
  </h2>
  <div class="grid grid-cols-2 italic">
   <p>
    1989-11-04
   </p>
   <p class="text-right">
    By Stephanie Mendoza
   </p>
  </div>
  <p>
   Song learn day party. Nor again between knowledge.
Reach exist huge. Role each process receive role inside around safe. Simple these understand main specific guy.
  </p>
 </div>
</div>



`.select` vs `.find` or `.find_all`

- `.select` always gives back a list, sometimes the list is empty, sometimes it has a single element, sometimes it has multiple elements
- `.find` and `.find_all` accept a *tag name*
- `.find` returns a single element
- `.find_all` returns a list of elements
- With `.select` multiple class names each have a `.` in front of them
- With `.find` or `.find_all` you can use a `class_` keyword argument, but the classes must match exactly

In [53]:
article.find('div', class_='grid grid-cols-2 italic')

<div class="grid grid-cols-2 italic">
<p> 1989-11-04 </p>
<p class="text-right">By Stephanie Mendoza </p>
</div>

In [41]:
def process_article(article):
    title = article.find('h2').text
    date_and_byline_div = article.select('.grid.grid-cols-2.italic')[0]
    date_p, by_p = date_and_byline_div.find_all('p')
    summary = article.find_all('p')[-1].text
    
    return {
        "title": title,
        "date": date_p.text,
        "by": by_p.text,
        "summary": summary
    }

In [46]:
process_article(articles[3])

{'title': 'energy plan go',
 'date': ' 1973-10-24 ',
 'by': 'By James Serrano ',
 'summary': 'Back certain democratic still. Ready power begin medical security energy.\nPrepare nature hotel Republican see onto statement. Artist foreign space dinner.'}

In [49]:
pd.DataFrame([process_article(article) for article in articles])

Unnamed: 0,title,date,by,summary
0,help human fly,1989-11-04,By Stephanie Mendoza,Song learn day party. Nor again between knowle...
1,serious generation vote,2016-08-08,By Amy Collier,Upon watch attention first could not. Religiou...
2,couple hold these,1985-09-04,By Cody Davis,Significant card should whole stage. Part cont...
3,energy plan go,1973-10-24,By James Serrano,Back certain democratic still. Ready power beg...
4,wish may final,2020-10-06,By Ryan Baker,High hard quite approach threat. Feel nice sur...
5,sense plant tend,1979-04-18,By Brian Pugh,Back do team. Sell good strategy third includi...
6,fire down report,2015-10-04,By Andrew Gould,World author popular laugh. Wind message whole...
7,tough history can,1983-04-13,By Stephanie Andrews,Will summer huge blue statement. Reason later ...
8,he start time,1991-12-13,By Alicia Clark,Trip tonight skill garden even mention price. ...
9,matter low including,2016-06-14,By Brian Hodges,Small more rock candidate five without these. ...


## Extracting People Information
(https://web-scraping-demo.zgulde.net/people)

In [6]:
response = requests.get('https://web-scraping-demo.zgulde.net/people')

html = response.text

soup = bs4.BeautifulSoup(html)


In [9]:
people_container = soup.select('.grid.grid-cols-2.gap-x-12.gap-y-16')[0]

In [65]:
people = people_container.select('.person.border.rounded.px-3.py-5.grid.grid-cols-2.gap-x-3')

In [99]:
person = people[0]
person

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Curtis Jackson</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Integrated contextually-based analyzer"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">jeremiahharris@robertson.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">+1-054-107-2461</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                39971 Johnson Mission <br/>
                West Alexander, WV 24725
            </p>
</div>
</div>

In [100]:
people[0].text

'\nCurtis Jackson\n\n            "Integrated contextually-based analyzer"\n        \n\n\njeremiahharris@robertson.com\n\n+1-054-107-2461\n\n\n\n\n                39971 Johnson Mission \n                West Alexander, WV 24725\n            \n\n'

In [101]:
print(people[0].text)


Curtis Jackson

            "Integrated contextually-based analyzer"
        


jeremiahharris@robertson.com

+1-054-107-2461




                39971 Johnson Mission 
                West Alexander, WV 24725
            




In [102]:
name = person.find('h2').text
name

'Curtis Jackson'

In [103]:
quote = person.find('p').text.strip()
quote

'"Integrated contextually-based analyzer"'

In [104]:
info_div = person.select('.grid.grid-cols-9')[0]
email, phone = info_div.find_all('p')

In [105]:
email.text

'jeremiahharris@robertson.com'

In [106]:
phone.text

'+1-054-107-2461'

In [107]:
address_div = person.select('.address.grid.grid-cols-9')[0]
address = address_div.find('p').text.strip()

In [108]:
address

'39971 Johnson Mission \n                West Alexander, WV 24725'

In [109]:
{
        "name": name,
        "quote": quote,
        "email": email,
        "phone": phone,
        "address": address
    }

{'name': 'Curtis Jackson',
 'quote': '"Integrated contextually-based analyzer"',
 'email': <p class="email col-span-8">jeremiahharris@robertson.com</p>,
 'phone': <p class="phone col-span-8">+1-054-107-2461</p>,
 'address': '39971 Johnson Mission \n                West Alexander, WV 24725'}

In [115]:
def process_person(person):
    name = person.find('h2').text
    quote = person.find('p').text.strip()
    info_div = person.select('.grid.grid-cols-9')[0]
    email, phone = info_div.find_all('p')
    address_div = person.select('.address.grid.grid-cols-9')[0]
    address = address_div.find('p').text.strip()
    
    return {
        "name": name,
        "quote": quote,
        "email": email,
        "phone": phone,
        "address": address
    }

In [119]:
process_person(people[0])

{'name': 'Curtis Jackson',
 'quote': '"Integrated contextually-based analyzer"',
 'email': <p class="email col-span-8">jeremiahharris@robertson.com</p>,
 'phone': <p class="phone col-span-8">+1-054-107-2461</p>,
 'address': '39971 Johnson Mission \n                West Alexander, WV 24725'}

In [120]:
#solution from walkthrough
response = requests.get('https://web-scraping-demo.zgulde.net/people')
soup = bs4.BeautifulSoup(response.text)
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Dennis Mcmahon<

In [121]:
people_div = soup.select('#people')[0]
people = people_div.select('.person')

In [122]:
person = people[0]
print(person.prettify())

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">
  Dennis Mcmahon
 </h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
  "Right-sized systemic definition"
 </p>
 <div class="grid grid-cols-9">
  <i class="bi bi-envelope-fill text-purple-800">
  </i>
  <p class="email col-span-8">
   kelly14@hotmail.com
  </p>
  <i class="bi bi-telephone-fill text-purple-800">
  </i>
  <p class="phone col-span-8">
   151.446.9246
  </p>
 </div>
 <div class="address grid grid-cols-9">
  <i class="bi bi-geo-fill text-purple-800">
  </i>
  <p class="col-span-8">
   888 Andrea Junctions
   <br/>
   North Hayleytown, IA 47016
  </p>
 </div>
</div>



In [123]:
def process_person(person):
    return {
        'name': person.find(class_='name').text,
        'quote': person.find(class_='quote').text.strip(),
        'email': person.find(class_='email').text,
        'phone': person.find(class_='phone').text,
        'address': person.find(class_='address').text.strip(),
    }

In [124]:
process_person(people[3])

{'name': 'Nicholas Medina',
 'quote': '"Re-engineered radical open architecture"',
 'email': 'madeline29@marquez.com',
 'phone': '753.274.5992x35738',
 'address': '5035 Maria Pine \n                Lake Oliviachester, MO 94597'}

In [127]:
pd.DataFrame([process_person(person) for person in people])

Unnamed: 0,name,quote,email,phone,address
0,Dennis Mcmahon,"""Right-sized systemic definition""",kelly14@hotmail.com,151.446.9246,888 Andrea Junctions \n North H...
1,Kayla Grant,"""Persistent value-added focus group""",jacksonmark@hotmail.com,043.171.9253x053,60177 Gloria Harbor \n Ayersmou...
2,Joseph Kim,"""Grass-roots reciprocal intranet""",tinamack@wallace.com,001-040-827-7230x64133,1145 Brown Mills Apt. 283 \n Je...
3,Nicholas Medina,"""Re-engineered radical open architecture""",madeline29@marquez.com,753.274.5992x35738,5035 Maria Pine \n Lake Oliviac...
4,Ann Kelly,"""Automated executive help-desk""",grogers@hotmail.com,1156385171,997 Kelly Crossing Apt. 233 \n ...
5,Suzanne Nunez,"""Customizable next generation customer loyalty""",pthompson@gmail.com,(606)148-4385,7152 Moore Run \n North Marcust...
6,Christopher Martin,"""Exclusive systemic toolset""",alejandrostein@yahoo.com,2895916944,18842 Sheila Path Apt. 818 \n S...
7,Paula Jimenez,"""Expanded clear-thinking moderator""",regina56@rhodes-lyons.info,001-702-073-5056,0460 Sarah Locks \n West Stepha...
8,April West,"""Grass-roots encompassing process improvement""",jamie47@young-golden.com,7959438803,86310 Powers Mount \n Lopezberg...
9,Ashley Holmes,"""Organized zero tolerance paradigm""",espinozamargaret@burnett.com,899-676-0828,5143 Pratt Mills \n North Charl...
