In [185]:
import requests
from bs4 import BeautifulSoup

# Using scraping and BeautifulSoup with normal local files

In [21]:
# Parse the local HTML file and report the price of each course card.
# The encoding is given explicitly so the result does not depend on the OS locale.
with open("home.html", "r", encoding="utf-8") as file:
    content = file.read()

# Parsing happens after the file is closed; only the text is needed from here on.
soup = BeautifulSoup(content, features='lxml')
course_cards = soup.find_all("div", class_="card")
for course in course_cards:
    course_name = course.h5.text
    # The price is the last whitespace-separated token of the card's link text.
    course_price = course.a.text.split()[-1]

    print(f"{course_name} costs {course_price}")
    

Python for beginners costs 20$
Python web development costs 110$
Python data science costs 50$


# Real case: scraping a real website

In [38]:
# Fetch the Python job search page and print the locality of the first result card.
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get('https://www.jobs.cz/prace/?q%5B%5D=python', timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')
job = soup.find('article', class_='SearchResultCard')
company_name = job.find('h2', class_='SearchResultCard__title').text.strip()
company_city = job.find('li', {"data-test": "serp-locality", "class": "SearchResultCard__footerItem"}).text.strip()
print(company_city)


Praha – Karlín


### The .select method picks elements using CSS selectors

### .get_text returns all the text inside a found tag, which is useful when the tag contains multiple elements on the same level
#### it returns the bare text of every element the tag contains (best verified in practice)

### .string returns the text of a found tag, but only when that tag has a single child inside

### ['attr name'] gets a specific attribute from an element
### example:    test = soup.find_all("a")
###             test[0]['href'] - returns the bare link from the first <a> element found

### inside find_all we can pass attrs={attr_name: attr_value, ...}, a dictionary of attributes, to match only elements that have all of them

In [156]:
# Download the tutorial page once; `base_url` and `soup` are reused by the cells below.
base_url = "https://keithgalli.github.io/web-scraping"
# A timeout keeps the cell from hanging forever if the server does not answer.
r = requests.get(base_url + "/webpage.html", timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, 'lxml')

print(soup.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

### grab all social links from the webpage

In [51]:
# 1st method - find
# find() returns a single Tag object, so we can work with that tag directly
socials = soup.body.find('ul', class_='socials')
links = socials.get_text()
print(links)



Instagram: https://www.instagram.com/keithgalli/
Twitter: https://twitter.com/keithgalli
LinkedIn: https://www.linkedin.com/in/keithgalli/
TikTok: https://www.tiktok.com/@keithgalli



In [53]:
# 2nd method - select
# select() returns a list of matching elements, so we iterate over the result
socials = soup.select('h2 ~ ul.socials > li')
links = [item.get_text() for item in socials]
links


['Instagram: https://www.instagram.com/keithgalli/',
 'Twitter: https://twitter.com/keithgalli',
 'LinkedIn: https://www.linkedin.com/in/keithgalli/',
 'TikTok: https://www.tiktok.com/@keithgalli']

In [57]:
# 3rd method - select again, this time targeting the anchors inside the list

links = soup.select('ul.socials a')
actual_links = []
for anchor in links:
    # Indexing a tag by attribute name returns that attribute's value.
    actual_links.append(anchor['href'])
print(actual_links)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [59]:
# 4th method - a more specific CSS selector
# every social entry carries the shared .social class (followed by its own class),
# so the anchors can be selected through that shared class:

links = soup.select('li.social a')
actual_links = []
for anchor in links:
    actual_links.append(anchor['href'])
print(actual_links)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


### scrape the table

In [122]:
import pandas as pd

# Build a DataFrame from the hockey stats table.
# Header and rows are both looked up inside the same table element, so the column
# names cannot accidentally come from a different table on the page.
table = soup.select_one("table.hockey-stats")

# get_text(strip=True) also works for header cells with nested markup (where
# .string would be None) and trims whitespace the same way the data cells are trimmed.
col_names = [th.get_text(strip=True) for th in table.select("thead th")]

# One list of stripped cell texts per body row.
rows = [
    [cell.text.strip() for cell in tr.find_all('td')]
    for tr in table.select("tbody tr")
]

df = pd.DataFrame(rows, columns=col_names)
df


Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Grab all fun facts that use the word "is"

In [150]:
import re

# Match "is" only as a whole word, so words that merely contain "is"
# (for example "this") do not count. Compiled once and reused for every fact.
pattern = re.compile(r'\bis\b')
facts = soup.find('ul', class_='fun-facts').find_all('li')

facts_with_is = [
    tag.get_text() for tag in facts if pattern.search(tag.get_text())
]


# Other method by using find + re

# facts_with_is1 = [fact.get_text() for fact in facts if fact.find(string=re.compile('is'))]


# 1st easy method by using is with if-condition

# for fact in facts:
#     if "is" in fact.get_text():
#         facts_with_is.append(fact.get_text())

# for fact in facts:
#     print(fact.get_text())

# facts = [fact.get_text() for fact in facts]
# print(facts)

['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]
['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]


### HTML has the nbsp (non-breaking space) entity, which keeps the parts of a word, price or similar together so they are not split across lines. After parsing it shows up as \xa0, so it is useful to replace every \xa0 with a normal space.
### Example: 24&nbsp;000 Kc keeps the whole price (24 000) as one block, which prevents a break such as 24\n 000 onto the next line

# Downloading the Image

In [172]:
from pathlib import Path
from urllib.parse import urlparse

# Folder inside the current working directory that receives the downloaded images.
curr_dir = Path().resolve()
italy_imgs_dir = curr_dir / "italy_imgs"
italy_imgs_dir.mkdir(exist_ok=True)


all_images_urls = [base_url + "/" + image['src'] for image in soup.select("div.row div.column img")]
# Take each extension from the URL path instead of slicing the last four characters,
# so extensions of any length (.jpeg, .webp, ...) are kept intact.
urls_extensions = [Path(urlparse(url).path).suffix for url in all_images_urls]

for idx, (image_url, extension) in enumerate(zip(all_images_urls, urls_extensions), start=1):
    response = requests.get(image_url, timeout=10)
    # Do not save an HTTP error body as if it were an image.
    response.raise_for_status()
    # Images are binary data, so the raw response bytes are written unchanged.
    (italy_imgs_dir / f"image_{idx}{extension}").write_bytes(response.content)


### Mystery Message Challenge
### Scrape links and grab all 'p' tags with id='secret-word'

In [196]:
# Follow every link in the challenge blocks and collect the text of the
# <p id="secret-word"> element found on each linked page.
# The f-string uses double quotes outside and single quotes inside: reusing the same
# quote type inside a replacement field is a SyntaxError before Python 3.12.
all_links = [f"{base_url}/{link['href']}" for link in soup.select('div div.block a')]
secret_words = []
for link in all_links:
    r = requests.get(link, timeout=10)
    # Stop on an HTTP error instead of searching an error page for the secret word.
    r.raise_for_status()
    webpage = BeautifulSoup(r.content, 'lxml')
    secret_word = webpage.find('p', attrs={"id": "secret-word"}).string
    secret_words.append(secret_word)
secret_letter = " ".join(secret_words)
print(secret_letter)

Make sure to smash that like button and subscribe !!!
