In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [3]:
# Page we are going to scrape: a small static sample site.
url = 'https://rldaggie.github.io/sample-html/'

# Always pass a timeout: without one, requests will wait forever if the
# server accepts the connection but never answers, hanging the notebook.
res = requests.get(url, timeout=10)

### Status Codes

In [4]:
# HTTP status code returned by the server; 200 means the request succeeded.
res.status_code

200

### Creating a `BeautifulSoup` object

In [5]:
# Raw response body as bytes (hence the b'...' prefix in the displayed value).
res.content

b'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <title>The title</title>\n\n    <style media="screen">\n      tbody tr {\n        color: red;\n      }\n    </style>\n  </head>\n  <body>\n    <h1 class="foobar" id="title">This is an h1</h1>\n\n    <div>\n      <h1 class="foobar">This is yet another heading.</h1>\n\n      Something inside the div\n    </div>\n\n    <h3>Todo List</h3>\n    <ol class="todo">\n      <li class="foobar">Take out trash</li>\n      <li>Pay billz</li>\n      <li class="foobar">Feed dog</li>\n    </ol>\n\n    <h3>Completed</h3>\n    <ol class=\'done\'>\n      <li>Mow lawn</li>\n      <li class="foobar"><span>Take out compost</span></li>\n      <li><span>Create scraping lecture</span></li>\n    </ol>\n\n    <p class=\'foobar\'>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commod

In [6]:
# Parse the raw HTML bytes into a searchable tree, using lxml as the parser.
soup = BeautifulSoup(res.content, 'lxml')

# `soup.find()`

Returns either:

1. A `Tag` object for the first element that matches
2. `None` if nothing matches

In [7]:
# Display the parsed document so we can see the structure we will be searching.
soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>The title</title>
<style media="screen">
      tbody tr {
        color: red;
      }
    </style>
</head>
<body>
<h1 class="foobar" id="title">This is an h1</h1>
<div>
<h1 class="foobar">This is yet another heading.</h1>

      Something inside the div
    </div>
<h3>Todo List</h3>
<ol class="todo">
<li class="foobar">Take out trash</li>
<li>Pay billz</li>
<li class="foobar">Feed dog</li>
</ol>
<h3>Completed</h3>
<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>
<p class="foobar">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. <span>Duis aute irure dolor</span> in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. <em>Excepteu

# `soup.find_all()`

Returns a **_LIST_** of soup objects that match your query (an empty list if nothing matches)

In [11]:
# find() gives back only the first <h1> in the document.
h1 = soup.find('h1')

In [12]:
# A single match is a bs4 Tag object, not a plain string.
type(h1)

bs4.element.Tag

In [15]:
# Pull the text from the tag -- this is the text that the user sees on the page.
h1.text

'This is an h1'

In [18]:
# The tag's HTML attributes as a dict; note that `class` comes back as a list.
h1.attrs

{'class': ['foobar'], 'id': 'title'}

In [19]:
# find_all() returns every matching <h1>, in document order.
soup.find_all('h1')

[<h1 class="foobar" id="title">This is an h1</h1>,
 <h1 class="foobar">This is yet another heading.</h1>]

In [22]:
# The result can be indexed like a list: [1] is the second <h1> on the page.
soup.find_all('h1')[1].text

'This is yet another heading.'

In [21]:
# Use a list comprehension to collect the visible text of every <h1> at once.

[heading.text for heading in soup.find_all('h1')]

['This is an h1', 'This is yet another heading.']

# Creating a `pandas` DataFrame from a scrape

### Todo List

In [23]:
# With no attribute filter, find() returns the first <ol> in the document
# (the one with class "todo").
ol = soup.find('ol')

### GA Directory

In [24]:
# Narrow the search to the <ol> whose class is "done" by passing an
# attribute filter as the second argument, then display what we found.
ol = soup.find('ol', {'class': 'done'})
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [26]:
# Scaffolding: turn each <li> of the selected list into one record.

jobs = []

for item in ol.find_all('li'):
    task_text = item.text
    print(task_text)

    # One dictionary per list item; its keys become the DataFrame columns.
    jobs.append({'task': task_text})

pd.DataFrame(jobs)

Mow lawn
Take out compost
Create scraping lecture


Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


In [28]:
# Locate the directory table by its id attribute.
table = soup.find('table', id='directory')

In [44]:
# Build one record per row of the directory table, then assemble a DataFrame.
people = []

# The first <tr> is the header row, so start from the second one.
for row in table.find_all('tr')[1:]:
    # Look the anchor up once: it carries both the person's name (its text)
    # and their email address (its href).
    link = row.find('a')

    # Split "mailto:someone@example.com" at the scheme separator instead of
    # slicing off a fixed 7 characters, which would silently mangle any href
    # that does not start with "mailto:".
    href = link.attrs['href']
    scheme, sep, address = href.partition(':')
    email = address if sep and scheme.lower() == 'mailto' else href

    person = {
        'name': link.text.strip(),
        # Text of the first <td> in the row -- presumably the role column;
        # confirm against the page markup.
        'role': row.find('td').text.strip(),
        'email': email,
    }
    people.append(person)

pd.DataFrame(people)

Unnamed: 0,name,role,email
0,Praveen,Student,praveen@ga.co
1,Fred,Student,fred@ga.co
2,Homer,Student,homer@ga.co
3,Kyle,Student,kyle@ga.co
4,Sam,Student,sam@ga.co
5,Javier,Student,javier@ga.co
6,Nengkuan,Student,nengkuan@ga.co
7,Kieth,Student,kieth@ga.co
8,Bola,Student,bola@ga.co
9,Steve,Student,steve@ga.co


### Basketball Reference

In [45]:
url_b = 'https://www.basketball-reference.com/'

# Always pass a timeout: without one, requests will wait forever if the
# server accepts the connection but never answers, hanging the notebook.
res_1 = requests.get(url_b, timeout=10)

# Check that the request succeeded (200) before trying to parse anything.
res_1.status_code

200

In [47]:
# Parse the Basketball Reference response body with the lxml parser.
soup_bb = BeautifulSoup(res_1.content, 'lxml')

In [54]:
# Grab the table with id "confs_standings_E"; it is the parent element
# whose rows we will loop over.
east = soup_bb.find('table', id='confs_standings_E')

1. Find the parent element (the table)
2. Start our list
3. Loop over the rows
4. Create a dictionary for each row
5. Append it to the list
6. Then look at the result

In [59]:
# Scaffolding for the BB table: one record per team row, then a DataFrame.

teams = []

# The first <tr> is the header row, so skip it.
for row in east.find_all('tr')[1:]:
    # Look the anchor up once; it supplies both the abbreviation and the name.
    link = row.find('a')

    team = {
        # The link text is the team abbreviation (e.g. "PHI").
        'abbreviation': link.text,
        # The link's title attribute holds the full team name.
        'name': link.attrs['title'],
        # Cell text is always a string; convert to int so the columns are
        # numeric and can be sorted, summed, or used in calculations.
        'wins': int(row.find('td', {'data-stat': 'wins'}).text),
        'losses': int(row.find('td', {'data-stat': 'losses'}).text),
    }
    teams.append(team)

pd.DataFrame(teams)


Unnamed: 0,abbreviation,name,wins,losses
0,PHI,Philadelphia 76ers,24,12
1,BRK,Brooklyn Nets,24,13
2,MIL,Milwaukee Bucks,22,14
3,BOS,Boston Celtics,19,17
4,NYK,New York Knicks,19,18
5,MIA,Miami Heat,18,18
6,CHO,Charlotte Hornets,17,18
7,TOR,Toronto Raptors,17,19
8,CHI,Chicago Bulls,16,18
9,IND,Indiana Pacers,16,19
