## 1. Import requests and BeautifulSoup

In [1]:
# Install Beautiful Soup by running 'pip3 install beautifulsoup4' in a command prompt or terminal
# Install requests by running 'pip3 install requests'
# Beautiful Soup parses the HTML so we can pull out the data we need for our goal
# The requests module lets us download the HTML

import requests
from bs4 import BeautifulSoup

## 2. Fetching the HTML content

In [2]:
#Create a response variable and use a get request to get the page
# A status code of 200 (<Response [200]>) means that the request was successful.
res = requests.get('https://news.ycombinator.com/')    
print(res)

<Response [200]>


In [3]:
#View the entire html file from the site we are scraping
#res.text: This returns the content of the response as a string. 
#It's suitable for text-based content like HTML, XML, JSON, etc. 


print(res.text)     # res.text contains the HTML content of the web page

<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?IapwXzfSddCMYOJKJMFC">
        <link rel="icon" href="y18.svg">
                  <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.svg" width="18" height="18" style="border:1px white solid; display:block"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
                            <a href="newest">new</a> | <a href="fro

## 3. Parsing with Beautiful Soup

In [4]:
#Now we have the HTML content as a string, 
#Let's pass it to Beautiful Soup. Beautiful Soup then parses the HTML and creates a parse tree
#This represents the structure of the HTML document.

soup = BeautifulSoup(res.text, 'html.parser') # Create a BeautifulSoup object named 'soup' by parsing the HTML content
print(soup)

<html lang="en" op="news"><head><meta content="origin" name="referrer"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><link href="news.css?IapwXzfSddCMYOJKJMFC" rel="stylesheet" type="text/css"/>
<link href="y18.svg" rel="icon"/>
<link href="rss" rel="alternate" title="RSS" type="application/rss+xml"/>
<title>Hacker News</title></head><body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a hre

## 4. Navigating and extracting data

In [5]:
print(soup.body)  # Print the <body> tag and its contents of the parsed HTML

<body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit" rel="nofollow">submit</a> </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
<a href="login?goto=news">login</a>
</span></td>
</tr></table></td></tr>
<tr id="pagespace" style="height:10px" title=""></tr><tr><td><table border="0" cellpadding="0" cellspacing="0">
<tr c

In [6]:
# Print the contents of the <body> tag
# The .contents attribute returns a list of the children of the <body> tag

print(soup.body.contents)

[<center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit" rel="nofollow">submit</a> </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
<a href="login?goto=news">login</a>
</span></td>
</tr></table></td></tr>
<tr id="pagespace" style="height:10px" title=""></tr><tr><td><table border="0" cellpadding="0" cellspacing="0">
<tr class=

In [7]:
# Let's find all the div objects
#a <div> is a fundamental HTML element used to create divisions or sections within a web page

print(soup.find_all('div'))

[<div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upvote"></div>, <div class="votearrow" title="upv

In [8]:
# Let's get all the 'a' tags (all the links in the page)
#<a> tags are used to create hyperlinks, also known as anchor links. The term "a" stands for "anchor."

print(soup.find_all('a'))

[<a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a>, <a href="news">Hacker News</a>, <a href="newest">new</a>, <a href="front">past</a>, <a href="newcomments">comments</a>, <a href="ask">ask</a>, <a href="show">show</a>, <a href="jobs">jobs</a>, <a href="submit" rel="nofollow">submit</a>, <a href="login?goto=news">login</a>, <a href="vote?id=40298927&amp;how=up&amp;goto=news" id="up_40298927"><div class="votearrow" title="upvote"></div></a>, <a href="https://blog.google/technology/ai/google-deepmind-isomorphic-alphafold-3-ai-model/">AlphaFold 3 predicts the structure and interactions of life's molecules</a>, <a href="from?site=blog.google"><span class="sitestr">blog.google</span></a>, <a class="hnuser" href="user?id=zerojames">zerojames</a>, <a href="item?id=40298927">17 hours ago</a>, <a href="hide?id=40298927&amp;goto=news">hide</a>, <a href="item?id=40298927">397 comments</a>, <a href="vote?id=40302201

In [9]:
#Get the title tag
print(soup.title)

<title>Hacker News</title>


In [10]:
#Get the first <a> tag that comes up
print(soup.a)

<a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a>


In [11]:
# find('a') returns the first <a> tag on the page, the same element that soup.a returns

print(soup.find('a'))

<a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a>


In [12]:
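# Go to the first story on the webpage being scraped, right-click its score and choose Inspect
# Copy the id attribute of the score <span> and pass it to find() to look that element up
# The ids change as the front page updates; find() returns None when the id is not on the page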

print(soup.find(id="score_40287020"))

None


In [13]:
#Grab data using a CSS selector
#Let's grab span tags with scores

print(soup.select('.score'))

[<span class="score" id="score_40298927">910 points</span>, <span class="score" id="score_40302201">343 points</span>, <span class="score" id="score_40287341">54 points</span>, <span class="score" id="score_40297423">38 points</span>, <span class="score" id="score_40290735">59 points</span>, <span class="score" id="score_40283954">131 points</span>, <span class="score" id="score_40303661">106 points</span>, <span class="score" id="score_40302200">115 points</span>, <span class="score" id="score_40297946">250 points</span>, <span class="score" id="score_40272514">176 points</span>, <span class="score" id="score_40304453">261 points</span>, <span class="score" id="score_40300454">525 points</span>, <span class="score" id="score_40272339">242 points</span>, <span class="score" id="score_40303425">17 points</span>, <span class="score" id="score_40303338">39 points</span>, <span class="score" id="score_40299761">53 points</span>, <span class="score" id="score_40300126">335 points</span>, <s

In [14]:
print(soup.select('#score_40287020'))

[]


In [15]:
# Select and print all elements with the CSS class "titleline" from the parsed HTML document

print(soup.select('.titleline'))

[<span class="titleline"><a href="https://blog.google/technology/ai/google-deepmind-isomorphic-alphafold-3-ai-model/">AlphaFold 3 predicts the structure and interactions of life's molecules</a><span class="sitebit comhead"> (<a href="from?site=blog.google"><span class="sitestr">blog.google</span></a>)</span></span>, <span class="titleline"><a href="https://hao-ai-lab.github.io/blogs/cllm/">Consistency LLM: converting LLMs to parallel decoders accelerates inference 3.5x</a><span class="sitebit comhead"> (<a href="from?site=hao-ai-lab.github.io"><span class="sitestr">hao-ai-lab.github.io</span></a>)</span></span>, <span class="titleline"><a href="https://temporal.io/blog/durable-distributed-asyncio-event-loop">Temporal Python – A durable, distributed asyncio event loop (2023)</a><span class="sitebit comhead"> (<a href="from?site=temporal.io"><span class="sitestr">temporal.io</span></a>)</span></span>, <span class="titleline"><a href="https://symbolica.io/">Symbolica Computer Algebra Syst

In [16]:
#We have a list, let's grab the first item
# Print the first element with class 'titleline'
# soup.select('.titleline') selects all elements with class 'titleline'
# [0] selects the first element from the list of elements with class 'titleline'

print(soup.select('.titleline')[0])

<span class="titleline"><a href="https://blog.google/technology/ai/google-deepmind-isomorphic-alphafold-3-ai-model/">AlphaFold 3 predicts the structure and interactions of life's molecules</a><span class="sitebit comhead"> (<a href="from?site=blog.google"><span class="sitestr">blog.google</span></a>)</span></span>


In [17]:
# Select all <a> tags that are direct children of elements with class 'titleline'
# The .select() method returns a list of elements that match the CSS selector '.titleline > a'

links = soup.select('.titleline > a')

In [18]:
# Select all elements with class 'score'
# The .select() method returns a list of elements that match the CSS selector '.score'
votes = soup.select('.score')

# Print the first element from the list of elements with class 'score'
# This will print the first element with class 'score' from the parsed HTML document
print(votes[0])


<span class="score" id="score_40298927">910 points</span>


## Customized Hacker News

In [19]:
def create_custom_hn(links, votes):
    """Combine story links and score elements into a list of story dicts.

    `links` and `votes` are indexed in parallel: the element at position i of
    `votes` is treated as the score of the link at position i. Each score's
    text has ' points' stripped and is converted to an int, which is also
    printed as it is read.

    Returns a list of {'title', 'link', 'points'} dicts, one per link.
    Raises IndexError when `votes` has fewer elements than `links`.
    """
    stories = []

    # Walk the links by position so the same position can be used for votes
    for position, _ in enumerate(links):
        anchor = links[position]

        # Title text and target URL (None when the tag has no href)
        story = {
            'title': anchor.getText(),
            'link': anchor.get('href', None),
        }

        # Score text such as '910 points' -> 910
        score_text = votes[position].getText()
        story['points'] = int(score_text.replace(' points', ''))

        # Debug output: show each score as it is parsed
        print(story['points'])

        stories.append(story)

    return stories


In [20]:
print(create_custom_hn(links,votes))

910
343
54
38
59
131
106
115
250
176
261
525
242
17
39
53
335
73
42
112
51
44
71
96
90
147
469
1507
110


IndexError: list index out of range

In [None]:
# Let's make a few changes because the above code can give an error for stories with no scores/votes
# Fetch and parse the front page again so links and subtext come from the same response
res = requests.get('https://news.ycombinator.com/')
soup = BeautifulSoup(res.text, 'html.parser')
# Story-title <a> tags: direct children of elements with class 'titleline'
links = soup.select('.titleline > a')
# All elements with class 'subtext'; the score is looked up inside each of these
# rather than in one page-wide list of '.score' elements
subtext = soup.select('.subtext')

In [None]:
def create_custom_hn(links, subtext):
    """Build a list of the stories that have more than 99 points.

    Args:
        links: Story-title elements; each must provide getText() and
            get('href', default).
        subtext: Elements paired by position with `links`; each must provide
            select('.score'), returning a (possibly empty) list of score
            elements.

    Returns:
        A list of {'title', 'link', 'votes'} dicts in the original order, one
        for each story whose score is above 99. Stories whose subtext has no
        score element are skipped.
    """
    hn = []
    # zip pairs each title with its own subtext entry (stops at the shorter list)
    for link, sub in zip(links, subtext):
        # Look for the score inside this story's own subtext, so a story
        # without a score cannot shift the scores of the stories after it
        vote = sub.select('.score')
        if not vote:
            continue
        # Score text looks like '910 points'; the number is the first token,
        # which also copes with a singular '1 point' label
        points = int(vote[0].getText().split()[0])
        if points > 99:
            hn.append({
                'title': link.getText(),
                'link': link.get('href', None),
                'votes': points,
            })
    return hn

print(create_custom_hn(links,subtext))

In [None]:
from pprint import pprint

In [None]:
pprint(create_custom_hn(links,subtext))

# Final app

In [None]:
# Import the necessary modules
import requests
from bs4 import BeautifulSoup
from pprint import pprint

# Send a GET request to the Hacker News website
# The timeout (in seconds) stops the cell from hanging if the site never responds
res = requests.get('https://news.ycombinator.com/', timeout=10)

# Raise an HTTPError on a 4xx/5xx response instead of parsing an error page
res.raise_for_status()

# Parse the HTML content of the response using BeautifulSoup
soup = BeautifulSoup(res.text, 'html.parser')

# Select the story-title <a> tags: direct children of elements with class 'titleline'
links = soup.select('.titleline > a')

# Select all elements with class 'subtext'
subtext = soup.select('.subtext')

# Define a function to sort stories by their votes
def sort_stories_by_votes(hnlist):
    """Return a new list of the story dicts, highest 'votes' value first.

    The input is not modified. Stories with equal votes keep their
    original relative order.
    """
    def vote_count(story):
        return story['votes']

    ranked = list(hnlist)
    ranked.sort(key=vote_count, reverse=True)
    return ranked

# Define a function to create a custom Hacker News list
def create_custom_hn(links, subtext):
    """Build the list of stories with more than 99 points, most-voted first.

    Args:
        links: Story-title elements; each must provide getText() and
            get('href', default).
        subtext: Elements paired by position with `links`; each must provide
            select('.score'), returning a (possibly empty) list of score
            elements.

    Returns:
        A list of {'title', 'link', 'votes'} dicts for the stories whose score
        is above 99, ordered by sort_stories_by_votes. Stories whose subtext
        has no score element are skipped.
    """
    hn = []
    # zip pairs each title with its own subtext entry (stops at the shorter list)
    for link, sub in zip(links, subtext):
        # Get the score element(s) from this story's own subtext
        scores = sub.select('.score')
        # Skip stories that have no score
        if not scores:
            continue
        # Score text looks like '910 points'; the number is the first token,
        # which also copes with a singular '1 point' label
        points = int(scores[0].getText().split()[0])
        # Keep only stories with more than 99 points
        if points > 99:
            hn.append({
                'title': link.getText(),
                'link': link.get('href', None),
                'votes': points,
            })
    # Return the stories sorted by votes
    return sort_stories_by_votes(hn)

# Print the custom Hacker News list
pprint(create_custom_hn(links, subtext))
