**BeautifulSoup Example 3**

1. Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup

2. Set the URL

In [2]:
# Address of the syllabus page that the rest of the notebook scrapes.
url = "https://avinashjairam.github.io/syllabus.html"

3. Store contents of page in a variable and check status code

In [3]:
# Download the page. Without an explicit timeout, requests waits indefinitely
# on an unresponsive server, which would leave this cell hanging.
page = requests.get(url, timeout=10)
# Displayed as the cell output; 200 means the request succeeded.
page.status_code

200

4. Raw HTML

In [4]:
# Show the raw response body exactly as received: a bytes object (note the
# b'...' prefix in the output) with newlines and tabs shown as \n and \t.
page.content

b'<!DOCTYPE html>\n\n<html>\n     <head>\n\t<title>CIS 3120 SYLLABUS></title>\n     </head>\n\n\n\n      <body>\n\t<h4>Instructor:</h4> \n\t<p id="instructor"> Mr. Avinash Jairam </p>\n\n\t<h4>Class Time: </h4> \n\t<p id ="time"> Saturday: 11:10AM - 2:05PM - ONLINE via BlackBoard Collaborate </p>\n\n\t<h4>Office hours:</h4> \n\t<p id = "office_hours"> Saturday: 1:00PM - 3:00PM </p>\n\n\t<h4>Email:</h4>\n        <p id="email"> avinash.jairam@baruch.cuny.edu </p>\n\n\t<h4>Course Website :</h4> \n\t<p id="website"> Blackboard</p>\n\n\t<h4>Course Description:</h4>\n\t<p id="description">\n\t This course introduces the aspects of programming that can\n\tsupport business analytics. The course covers hands-on issues in programming for\n\tanalytics which include accessing data, \n\tcreating informative data graphics, writing functions, \n\tdebugging, and organizing and commenting code.</p>\n\n      </body>\n\n\n</html>\n'

5. Import raw HTML into BeautifulSoup

In [5]:
# Parse the downloaded bytes into a navigable tree using Python's built-in
# HTML parser (no extra parser package required).
soup = BeautifulSoup(markup=page.content, features='html.parser')

6. Get formatted HTML

In [6]:
# prettify() returns the document as one indented string. Leaving it as a bare
# cell expression would display the string's repr (a single line full of \n
# escapes), so it is passed to print() to render real line breaks.
formatted_html = soup.prettify()
print(formatted_html)

<!DOCTYPE html>
<html>
 <head>
  <title>
   CIS 3120 SYLLABUS&gt;
  </title>
 </head>
 <body>
  <h4>
   Instructor:
  </h4>
  <p id="instructor">
   Mr. Avinash Jairam
  </p>
  <h4>
   Class Time:
  </h4>
  <p id="time">
   Saturday: 11:10AM - 2:05PM - ONLINE via BlackBoard Collaborate
  </p>
  <h4>
   Office hours:
  </h4>
  <p id="office_hours">
   Saturday: 1:00PM - 3:00PM
  </p>
  <h4>
   Email:
  </h4>
  <p id="email">
   avinash.jairam@baruch.cuny.edu
  </p>
  <h4>
   Course Website :
  </h4>
  <p id="website">
   Blackboard
  </p>
  <h4>
   Course Description:
  </h4>
  <p id="description">
   This course introduces the aspects of programming that can
	support business analytics. The course covers hands-on issues in programming for
	analytics which include accessing data, 
	creating informative data graphics, writing functions, 
	debugging, and organizing and commenting code.
  </p>
 </body>
</html>



7. Scrape for paragraph tags

In [7]:
# find() stops at the first matching tag in the document.
p = soup.find(name='p')
# find_all() collects every matching tag into a ResultSet, which behaves like
# a Python list (indexable, iterable, has a length).
all_p_tags = soup.find_all(name='p')

7a. Print all the content in paragraph tags

In [8]:
# Walk every <p> element and print only its text, with the markup stripped.
# Surrounding whitespace from the HTML source is kept as-is.
for paragraph in all_p_tags:
    paragraph_text = paragraph.get_text()
    print(paragraph_text)

 Mr. Avinash Jairam 
 Saturday: 11:10AM - 2:05PM - ONLINE via BlackBoard Collaborate 
 Saturday: 1:00PM - 3:00PM 
 avinash.jairam@baruch.cuny.edu 
 Blackboard

	 This course introduces the aspects of programming that can
	support business analytics. The course covers hands-on issues in programming for
	analytics which include accessing data, 
	creating informative data graphics, writing functions, 
	debugging, and organizing and commenting code.


8. Scrape for email

In [10]:
# Locate the <p> whose id attribute is "email", then print just its text.
email = soup.find('p', id='email')
email_text = email.get_text()
print(email_text)

 avinash.jairam@baruch.cuny.edu 


9. Scrape for office hours

In [12]:
# Method 1
# find_all() returns a list-like result even for a single match, so take
# element [0] before extracting the text.
office_hours = soup.find_all('p', id='office_hours')[0].get_text()
print(office_hours)

# Method 2
# An id is meant to be unique within a page, so find() (first match only)
# reaches the same element without the indexing step.
office_hours = soup.find('p', id='office_hours').get_text()
print(office_hours)

 Saturday: 1:00PM - 3:00PM 
 Saturday: 1:00PM - 3:00PM 


10. Scrape for website

In [14]:
# Method 1
# find_all() returns a list-like result even for a single match, so take
# element [0] before extracting the text.
website = soup.find_all('p', id='website')[0].get_text()
print(website)

# Method 2
# An id is meant to be unique within a page, so find() (first match only)
# reaches the same element without the indexing step.
website = soup.find('p', id='website').get_text()
print(website)

 Blackboard
 Blackboard


11. Scrape for course description

In [15]:
# Method 1
# find_all() returns a list-like result even for a single match, so take
# element [0] before extracting the text.
course_description = soup.find_all('p', id='description')[0].get_text()
print(course_description)

# Method 2
# An id is meant to be unique within a page, so find() (first match only)
# reaches the same element without the indexing step.
course_description = soup.find('p', id='description').get_text()
print(course_description)



	 This course introduces the aspects of programming that can
	support business analytics. The course covers hands-on issues in programming for
	analytics which include accessing data, 
	creating informative data graphics, writing functions, 
	debugging, and organizing and commenting code.
