# Load necessary libraries

In [1]:
import requests #To retrieve the html content of a page
from bs4 import BeautifulSoup as bs #To convert the request object to managable information

# Load the first page

In [2]:
r = requests.get("http://keithgalli.github.io/web-scraping/example.html")

#Convert to a beautiful soup object
soup = bs(r.content)
print(soup.prettify()) # El prettify es para que lo ponga con las indentaciones

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start scrapping with the Beautiful Soup library

## Using find/find_all
find returns the first element encountered and returns a single tag object
find_all returns all the elements that meet the criteria and returns a list-like object with tag objects inside. It is iterable

In [3]:
#Find the first header. THIS RETURNS A SINGLE TAG
first_header = soup.find('h2') #'h2' es el nombre en HTML para headers de tamaño 2
print(first_header)

#Find all the headers. THIS RETURNS A LIST OF TAGS
headers = soup.find_all('h2')
headers

<h2>A Header</h2>


[<h2>A Header</h2>, <h2>Another header</h2>]

In [4]:
#Pass in a list of things to look for
first_header = soup.find(['h2','h1']) #Como solo es find, devuelve el primer elemento que encuentre de la lista
print(first_header)

headers = soup.find_all(['h2','h1'])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [5]:
#Pass attributes to filter only the elements we want
paragraph = soup.find_all('p', attrs={'id':'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [6]:
#We can nest find/find_all calls
body = soup.find('body')
div = body.find_all('div')
paragraph = div[0].find_all('p') #Como find_all devuelve una lista, debo seleccionar el elemento [0]
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [7]:
#Search for SPECIFIC strings within the HTML sections
paragraph = soup.find('p')
my_string = soup.find('p', string='Some italicized text') #encuentra el primer elemento que sea p Y que tenga el string completo "Some italicized text"
print(my_string)

<p><i>Some italicized text</i></p>


In [40]:
#To search for WORDS in one or many parts of the HTML code, we can use the regex library
import re

my_word_matches = soup.find_all('p', string=re.compile('Some', re.IGNORECASE))#Esto solo hace que sea true cuando el string incluya 'Some'
                                                         #Para que no haga caso de mayúsculas y minúsculas
                                                         #re.compile(r'sOMe', flags=re.IGNORECASE)
print(my_word_matches)

headers = soup.find_all('h2', string = re.compile('(h|H)eader')) #with this syntax it looks for both 'header' and 'Header'
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


## Select (CSS selector)
This retutns a python list with tag objects inside

In [9]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [10]:
content = soup.select('p i') #Devuelve los elementos i que estén en un p
content

[<i>Some italicized text</i>]

In [11]:
content = soup.select('div p') #Devuelve los elementos p que estén dentro de un div
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [12]:
paragraphs = soup.select('h2 ~ p' ) #Devueve los paragraphs que estén justo después (y en el mismo nivel) que un h2
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [13]:
bold_element = soup.select('p#paragraph-id b') # El hashtag es para indicar el id que busco de los paragraphs
bold_element

[<b>Some bold text</b>]

In [14]:
paragraphs = soup.select ('body > p')#Devuelve los paragraphs que sean descendientes DIRECTOS del body , o sea hijos, no nietos.
print(paragraphs)

#Este loop selecciona los elementos i de cada uno de los dos elementos de paragraphs
for paragraph in paragraphs:
  print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [15]:
# Grab by element with specific property
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of the HTML

In [16]:
header = soup.find('h2')
header.string #Extraigo el string del elemento y no todo el tag

'A Header'

In [17]:
div = soup.find('div')
print(div.prettify())

#Si hay más de un elemento hijo, usar get_text()
div_string = div.get_text()  #Si uso la propiedad string no me arroja nada por que hay más de un elemento al cual sacarle el string
print(div_string)

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [18]:
#Get a specific property from an element
link = soup.select('a')
print(link)
print(link[0]['href'])

#Lo mismo pero usando find
link_1=soup.find('a')
print(link_1)
print(link_1['href'])

#Get id
paragraph = soup.select('p#paragraph-id')
print(paragraph)
print(paragraph[0]['id'])

[<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]
https://keithgalli.github.io/web-scraping/webpage.html
<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>
https://keithgalli.github.io/web-scraping/webpage.html
[<p id="paragraph-id"><b>Some bold text</b></p>]
paragraph-id


## Code Navigation

In [19]:
# Path syntax
print(soup.body.div.p.a['href']) #voy accediendo a hijos con cada punto

https://keithgalli.github.io/web-scraping/webpage.html


In [20]:
#Get parent, child or sibling
siblings = soup.body.select('div')[0].find_next_siblings()
print(siblings)

#Lo mismo pero con find
siblings = soup.body.find('div').find_next_siblings()
print(siblings)

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


# Exercises 

Go to https://keithgalli.github.io/web-scraping/webpage.html

## Load the page

In [21]:
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

#Convert to a beautiful soup object
webpage = bs(r.content)
print(webpage.prettify()) # El prettify es para que lo ponga con las indentaciones

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Grab all the social media links from the webpage

Do it in 3 different ways

In [22]:
#Way 1
social_elements = webpage.find_all('ul', attrs={'class':'socials'})
social_links = social_elements[0].find_all('a')
social_links_href=[ i['href'] for i in social_links]
print(social_links_href)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [23]:
#Way 2
social_elements = webpage.select('h2 ~ ul')[1]
social_links = social_elements.find_all('a')
social_links_href=[ i['href'] for i in social_links]
print(social_links_href)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [24]:
#Way 3
social_elements = webpage.find_all('li', attrs={'class':re.compile('social')})
social_links_href=[ i.find('a')['href'] for i in social_elements]
print(social_links_href)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [25]:
#Way 4, coz im a badass
social_elements = webpage.select('ul.socials a')
social_links_href=[i['href'] for i in social_elements]
print(social_links_href)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


## Scrape the table

In [26]:
#import pandas to store table as DataFrame
import pandas as pd

In [27]:
#Using pandas builtin function to scrape a TABLE from html code
table = webpage.find('table', attrs={'class':'hockey-stats'})
df = pd.read_html(str(table))[0]
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 9,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [28]:
#Scrape the table using find/findall
l = []
table = webpage.find_all('table', attrs={'class':'hockey-stats'})[0]
table_rows = table.find('tbody').find_all('tr')
table_columns = table.find('thead').find_all('th')
column_names=[column.string for column in table_columns]
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip(' \n\t\r') for tr in td]
    l.append(row)
#print(l)
df_table=pd.DataFrame(l, columns=column_names, )

In [29]:
df_table

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Messing around with the new DataFrame

In [30]:
import numpy as np
df_table_nan=df_table.apply(lambda row: row.apply(lambda x: np.nan if x=='' else x)) #Nested apply functions para aplicar una función a acada elemento de cada row de un DataFrame
df_table_nan

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [31]:
df_table_2=df_table.replace('',np.nan)
df_table_2

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [32]:
a=df_table_2.loc[:,'GP'].iloc[:,0]
df_table_2.loc[(pd.to_numeric(a)) >= 12]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,


## Find all the sentences that contain the word 'is' in the fun facts

In [33]:
fun_facts = webpage.find_all('ul', attrs={'class':'fun-facts'})[0].select('li')
#fun_facts = webpage.select('ul.fun-facts')[0].find_all('li')
fun_facts

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>,
 <li>Middle name is Ronald</li>,
 <li>Never had been on a plane until college</li>,
 <li>Dunkin Donuts coffee is better than Starbucks</li>,
 <li>A favorite book series of mine is <i>Ender's Game</i></li>,
 <li>Current video game of choice is <i>Rocket League</i></li>,
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>]

In [34]:
sentences_with_is = [sentence.get_text() for sentence in fun_facts if 'is ' in sentence.get_text()] 
sentences_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

## Download images from the webpage

In [36]:
from os.path  import basename #La función basename devuelve la última parte de un url (en este caso el nombre de la imagen)
from google.colab import drive
drive.mount('/content/gdrive') #Ahuevo se debe montar el drive con este path y guardar todo en "My Drive" (mind the capitalization)

ModuleNotFoundError: No module named 'google'

In [None]:
url = "http://keithgalli.github.io/web-scraping" #URL madre donde están las imágenes
for link in webpage.select("img"): 
  lnk = link["src"]
  lnk = lnk.replace('./', '') #Esto es solo porque en una de las fotos el nombre lleva ese './' y se lo quito
  full_url = url + '/'+ lnk
  with open('/content/gdrive/My Drive/'+ basename(full_url),"wb") as f:
    f.write(requests.get(full_url).content)

## Scrape some links to get a secret message from other webpages contained in the links

In [None]:
links_of_files = webpage.select('div.block a')
links_of_files = [link['href'] for link in links_of_files]
secret_sentence = []
url = "http://keithgalli.github.io/web-scraping" #URL madre donde están los links a las otras páginas
for link in links_of_files:
  full_url = url+'/'+link
  r = requests.get(full_url)
  secret_soup = bs(r.content)
  secret_word = secret_soup.find('p', attrs={'id':'secret-word'}) 
  secret_sentence.append(secret_word.string)
secret_sentence_string = ' '.join(secret_sentence)
print(secret_sentence_string)