# Introduction to BeautifulSoup and requests

In [6]:
import requests
from bs4 import BeautifulSoup

`requests.get(website)` is used to access the particular website provided as an argument.<br>
`verify=False` tells the method not to verify the server's SSL certificate (convenient for a demo, but insecure in real code).

The `.status_code` attribute of the response object returned by `.get()` holds the HTTP status code of the request.<br>
200 means OK.<br>
404 means Not Found.

In [8]:
# verify=False skips TLS certificate verification (demo only; insecure in real code).
# timeout bounds the wait so the cell cannot hang forever if the server never responds.
result = requests.get("https://www.google.com/", verify=False, timeout=30)
print(result.status_code)



200


The `.headers` attribute of the response object returned by `.get()` holds the HTTP headers sent by the website being accessed.

In [10]:
# Show the HTTP response headers of the request made above.
print(result.headers)

{'Date': 'Mon, 05 Sep 2022 06:05:06 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2022-09-05-06; expires=Wed, 05-Oct-2022 06:05:06 GMT; path=/; domain=.google.com; Secure, AEC=AakniGNrAnzB8ns6ArXw5TipwYPppRnaOtY9VY0-8MsO-vA1uyFWl3pJsg; expires=Sat, 04-Mar-2023 06:05:06 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax, NID=511=RswgjlzNHVsbD_NdyXgzX2gv3kndQe9BQB3Ih7qM2Lvn3qopDCoEbZuUfjifUbKF05UcNmvktjL3X_2mk8Mt8wJCN5EZQ5691mTk0uxAdyWYt582sVG-7UlpaErbclFk-1yVIir8SRg9bbAQ-1B4A7t6RCoClPuJQFcBEOCngGM; expires=Tue, 07-Mar-2023 06:05:06 GMT; path=/; domain=.google.com; HttpOnly', 'Transfer-Encoding': 'chunked'}


The `.content` attribute returns the source of that page as bytes. We use it to extract the content of that particular page.

In [11]:
# Keep the raw response body (bytes) for parsing.
src =  result.content

In [12]:
# Display the raw response body (a bytes object, note the b'...' prefix in the output).
src

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en-IN"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="LXi2qQoHMmSpmoor9McBYg">(function(){window.google={kEI:\'kpEVY7f_J-bE5OUPns6K6Ag\',kEXPI:\'0,1302536,56873,6058,207,4804,2316,383,246,5,5367,1123753,1197787,614,380090,16114,17444,11238,1109,16465,4858,1362,9291,3021,17587,4998,13228,3847,10622,22741,5081,1593,1279,2742,149,1103,840,6297,109,3405,606,2023,1777,520,14670,3227,2845,7,5599,27619,552,1851,2614,13142,3,576,1014,1,5444,149,11323,2652,4,1528,2304,7039,22023,3050,2658,7357,13658,21223,5800,2557,4094,4052,3,3541,1,39047,2,3105,2,14022,2715,11401,11623,6700,2380,28741,4568,6253,23424,1252,5835,14968,4332,8,7476,445,2,2,1,26632,5665,2490,6582,799,2,14678,1290,872,9116,10518,7,1922,9779,24,6518,12588,1510,3782,6900,4832,5763,1210,8991,68,790,193,1

This returns the HTML content of the webpage. Next we use BeautifulSoup to parse it so that the information in it can be used.

Now we pass the `src` object to the `BeautifulSoup` class, which creates a BeautifulSoup object to parse and process the source, so that we can extract certain types of information from `src`.

In [13]:
# Parse the downloaded bytes with the lxml parser.
soup = BeautifulSoup(src, 'lxml')

In [17]:
# Print the parsed document re-indented so the tag nesting is visible.
print(soup.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-IN">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="LXi2qQoHMmSpmoor9McBYg">
   (function(){window.google={kEI:'kpEVY7f_J-bE5OUPns6K6Ag',kEXPI:'0,1302536,56873,6058,207,4804,2316,383,246,5,5367,1123753,1197787,614,380090,16114,17444,11238,1109,16465,4858,1362,9291,3021,17587,4998,13228,3847,10622,22741,5081,1593,1279,2742,149,1103,840,6297,109,3405,606,2023,1777,520,14670,3227,2845,7,5599,27619,552,1851,2614,13142,3,576,1014,1,5444,149,11323,2652,4,1528,2304,7039,22023,3050,2658,7357,13658,21223,5800,2557,4094,4052,3,3541,1,39047,2,3105,2,14022,2715,11401,11623,6700,2380,28741,4568,6253,23424,1252,5835,14968,4332,8,7476,445,2,2,1,26632,5665,2490,6582,799,2,14678,1290,872,9116,10518,7,1922,9779,24,6518,12588,1510,3782,6900,4832,5763

The `find_all()` method of the BeautifulSoup object is used to find particular tags in the source parsed above.<br>
Here the `find_all()` method is used to find the links, i.e. the `a` tags.<br>
It returns a list.

In [14]:
# Collect every <a> tag in the parsed page.
links = soup.find_all("a")

In [15]:
# Display the <a> tags that were found.
links

[<a class="gb1" href="https://www.google.co.in/imghp?hl=en&amp;tab=wi">Images</a>,
 <a class="gb1" href="https://maps.google.co.in/maps?hl=en&amp;tab=wl">Maps</a>,
 <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>,
 <a class="gb1" href="https://www.youtube.com/?tab=w1">YouTube</a>,
 <a class="gb1" href="https://news.google.com/?tab=wn">News</a>,
 <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>,
 <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>,
 <a class="gb1" href="https://www.google.co.in/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a>,
 <a class="gb4" href="http://www.google.co.in/history/optout?hl=en">Web History</a>,
 <a class="gb4" href="/preferences?hl=en">Settings</a>,
 <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&amp;passive=true&amp;continue=https://www.google.com/&amp;ec=GAZAAQ" id="gb_70" target="_top">Sign in</a>,
 <a href="/advanced_search?hl=en-IN&amp;authuse

In [18]:
# Print the href of each anchor, one per line.
for anchor in links:
    target = anchor.get('href')
    print(target)

https://www.google.co.in/imghp?hl=en&tab=wi
https://maps.google.co.in/maps?hl=en&tab=wl
https://play.google.com/?hl=en&tab=w8
https://www.youtube.com/?tab=w1
https://news.google.com/?tab=wn
https://mail.google.com/mail/?tab=wm
https://drive.google.com/?tab=wo
https://www.google.co.in/intl/en/about/products?tab=wh
http://www.google.co.in/history/optout?hl=en
/preferences?hl=en
https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ
/advanced_search?hl=en-IN&authuser=0
https://www.google.com/url?q=https://beinternetawesome.withgoogle.com/en_in/%3Futm_source%3Dgoogle%26utm_medium%3Dhpp%26utm_campaign%3Dbts2022&source=hpp&id=19031019&ct=3&usg=AOvVaw2zHZgNScDFvf5Hz_-9ei_z&sa=X&ved=0ahUKEwi3kJDk_fz5AhVmIrkGHR6nAo0Q8IcBCAU
https://www.google.com/setprefs?sig=0_8Ut2Bvn_X1UZp8KuotRUw9Zm2Zs%3D&hl=hi&source=homepage&sa=X&ved=0ahUKEwi3kJDk_fz5AhVmIrkGHR6nAo0Q2ZgBCAc
https://www.google.com/setprefs?sig=0_8Ut2Bvn_X1UZp8KuotRUw9Zm2Zs%3D&hl=bn&source=home

In [22]:
# Show every anchor whose text contains "About", followed by its link target.
about_links = [anchor for anchor in links if "About" in anchor.text]
for anchor in about_links:
    print(anchor)
    print(anchor.attrs['href'])

<a href="/intl/en/about.html">About Google</a>
/intl/en/about.html


In [20]:
# Check the type of one element of the find_all() result.
type(links[1])

bs4.element.Tag

Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. The four common kinds of objects are:
>Tag, <br>
NavigableString, <br>
BeautifulSoup, <br>
and Comment.

In [24]:
# Pick a single <a> tag (index 2 of the list) to inspect.
links[2]

<a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>

In [25]:
# .text gives the text contained in the tag.
links[2].text

'Play'

In [29]:
# .attrs maps attribute names to values; look up the link target.
links[2].attrs['href']

'https://play.google.com/?hl=en&tab=w8'

In [30]:
# The class attribute comes back as a list of class names.
links[2].attrs['class']

['gb1']

In [31]:
# Attribute-style access returns the first <a> tag in the document.
soup.a

<a class="gb1" href="https://www.google.co.in/imghp?hl=en&amp;tab=wi">Images</a>

In [33]:
# Fetch an item page from archive.library.iitb.ac.in.
# verify=False skips TLS certificate verification (insecure outside a demo);
# timeout bounds the wait so the cell cannot hang forever on an unresponsive server.
convo = requests.get("https://archive.library.iitb.ac.in/items/show/5124", verify=False, timeout=30)



In [34]:
# Raw response body of the archive page.
convo_src = convo.content

In [35]:
# Parse the archive page with the lxml parser.
convo_soup = BeautifulSoup(convo_src, 'lxml')

In [37]:
# Every <a> tag on the archive page.
convo_soup_a = convo_soup.find_all('a')

In [73]:
# Attributes of the <img> nested inside the anchor at index 100.
# NOTE(review): 100 is a hard-coded position, and find('img') returns None for
# anchors without an image; confirm the index still holds if the page changes.
convo_soup_a[100].find('img').attrs

{'class': ['full'],
 'src': 'https://archive.library.iitb.ac.in/files/fullsize/14cffcc2b4e7704731f4bce352e148d8.jpg',
 'alt': 'MEDALIST (94).JPG',
 'title': 'MEDALIST (94).JPG'}

In [65]:
# First six characters of that image's alt text.
convo_soup_a[100].find('img').attrs['alt'][:6]

'MEDALI'

In [116]:
urls = []
for link in convo_soup_a:
    

KeyError: 'class'

In [114]:
# Attributes of the image at index 10 among all <img> tags on the archive page.
convo_soup.find_all("img")[10].attrs

{'class': ['full'],
 'src': 'https://archive.library.iitb.ac.in/files/fullsize/0c38a45c874abf9c397f3227fc3f4886.jpg',
 'alt': 'MEDALIST (10).JPG',
 'title': 'MEDALIST (10).JPG'}

In [107]:
# Download the White House briefing-room page and parse it with lxml.
# NOTE: this rebinds result, src and soup, replacing the Google objects created earlier.
# timeout bounds the wait so the cell cannot hang forever on an unresponsive server.
result = requests.get("https://www.whitehouse.gov/briefing-room/", timeout=30)
src = result.content
soup = BeautifulSoup(src, 'lxml')

In [109]:
# Collect the link target of the first <a> inside every <h2> heading.
urls = []
for h2_tag in soup.find_all("h2"):
    a_tag = h2_tag.find("a")
    if a_tag is None:
        continue  # heading without a link
    # .get() returns None instead of raising KeyError when the anchor has no href.
    href = a_tag.get("href")
    if href is not None:
        urls.append(href)

#TODO

## Objects in BeautifulSoup