## 1. Import requests and BeautifulSoup

In [None]:
# Install Beautiful Soup by running 'pip3 install beautifulsoup4' in a command prompt or terminal.
# Install requests by running 'pip3 install requests' in the same way.
# Beautiful Soup lets us parse the HTML and pull out whatever data we need for our goal.
# The requests module lets us download the HTML of a page.

import requests
from bs4 import BeautifulSoup

## 2. Fetching the HTML content

In [None]:
# Create a response variable by sending a GET request for the page.
# A status code of 200 (<Response [200]>) means that the request was successful.
# The timeout makes the call fail with an error instead of waiting forever
# if the server never answers.
res = requests.get('https://news.ycombinator.com/', timeout=10)
print(res)

In [None]:
# View the entire HTML document of the site we are scraping.
# res.text returns the content of the response as a string,
# which suits text-based content such as HTML, XML or JSON.


print(res.text)     # res.text holds the HTML content of the web page

## 3. Parsing with Beautiful Soup

In [None]:
# Now that we have the HTML content as a string, pass it to Beautiful Soup.
# Beautiful Soup parses the HTML and builds a parse tree that
# represents the structure of the HTML document.

soup = BeautifulSoup(res.text, 'html.parser')  # Parse the HTML string into a BeautifulSoup object named 'soup'
print(soup)

## 4. Navigating and extracting data

In [None]:
print(soup.body)  # Print the <body> tag of the parsed HTML together with everything inside it

In [None]:
# Print the contents of the <body> tag.
# The .contents attribute returns the children of the <body> tag as a list.

print(soup.body.contents)

In [None]:
# Find every <div> element in the document.
# A <div> is a fundamental HTML element used to create divisions or sections within a web page.

print(soup.find_all('div'))

In [None]:
# Get all the <a> tags, i.e. all the links in the page.
# <a> tags create hyperlinks, also known as anchor links; the "a" stands for "anchor".

print(soup.find_all('a'))

In [None]:
# Get the <title> tag of the document
print(soup.title)

In [None]:
# Attribute access with a tag name returns the first matching tag, here the first <a>
print(soup.a)

In [None]:
# find() returns only the first match, so this prints the first <a> tag in the document

print(soup.find('a'))

In [None]:
# Go to the first story on the webpage being scraped, right-click it and inspect it.
# Take the id attribute of its score element and pass it to find() to fetch that element.
# NOTE: the id below was presumably copied from the page when this was written;
# replace it with the id you see in your own inspection.

print(soup.find(id="score_40287020"))

In [None]:
# Grab data using a CSS selector.
# '.score' matches elements with class 'score' (the span tags holding the scores).

print(soup.select('.score'))

In [None]:
# Look up an element by id with a CSS selector: '#' followed by the id value.
# NOTE: use the id found when inspecting the page yourself, as in the find(id=...) example.
print(soup.select('#score_40287020'))

In [None]:
# Select and print all elements with the CSS class 'titleline' from the parsed HTML document

print(soup.select('.titleline'))

In [None]:
# select() gave us a list, so let's grab its first item.
# soup.select('.titleline') selects all elements with class 'titleline';
# [0] then picks the first element of that list.
# (Indexing with [0] assumes at least one element matched.)

print(soup.select('.titleline')[0])

In [None]:
# Select all <a> tags that are direct children of elements with class 'titleline'.
# The .select() method returns a list of elements that match the CSS selector '.titleline > a'.

links = soup.select('.titleline > a')

In [None]:
# Select all elements with class 'score'.
# The .select() method returns a list of elements that match the CSS selector '.score'.
votes = soup.select('.score')

# Print the first element of that list, i.e. the first element with
# class 'score' in the parsed HTML document.
print(votes[0])


## Customized Hacker News

In [None]:
def create_custom_hn(links, votes):
    """Build a list of story dictionaries from parallel link and score elements.

    Args:
        links: Sequence of story link elements; each must provide
            ``getText()`` and ``get()``.
        votes: Sequence of score elements; each must provide ``getText()``
            returning text such as ``"123 points"`` or ``"1 point"``.
            ``votes[i]`` is paired with ``links[i]`` purely by position.

    Returns:
        A list with one ``{'title': ..., 'link': ..., 'points': ...}``
        dictionary per link, in the order of ``links``.

    Raises:
        IndexError: If ``votes`` has fewer entries than ``links``.
        ValueError: If the text before ``' point'`` is not an integer.
    """
    # Initialize an empty list to store the custom Hacker News data
    hn = []

    # Walk the links; the index is needed to look up the matching score
    for idx, link in enumerate(links):
        # Get the title of the story from the link
        title = link.getText()
        # Get the URL of the story from the link (None when there is no href)
        href = link.get('href', None)
        # Keep only what precedes ' point', so both the plural ' points'
        # and the singular ' point' suffix are accepted
        points = int(votes[idx].getText().partition(' point')[0])

        # Print the points for debugging purposes
        print(points)

        # Append a dictionary containing title, link, and points to the hn list
        hn.append({'title': title, 'link': href, 'points': points})

    # Return the list of custom Hacker News data
    return hn


In [None]:
# Build the story list from the links and scores selected earlier and print it
print(create_custom_hn(links,votes))

In [None]:
# Let's make a few changes because the above code can give an error for stories with no scores/votes.
# Fetch the page again; the timeout keeps the cell from hanging on an unresponsive
# server, and raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get('https://news.ycombinator.com/', timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
# Story links: <a> tags that are direct children of elements with class 'titleline'
links = soup.select('.titleline > a')
# Elements with class 'subtext'; the score, when present, is looked up inside each of them
subtext = soup.select('.subtext')

In [None]:
def create_custom_hn(links, subtext):
    """Collect the stories that have more than 99 points.

    Args:
        links: Story link elements; each must provide ``getText()`` and
            ``get()``.
        subtext: Elements paired with ``links`` by position; each must
            provide ``select()``, which is used to find its ``.score``
            element.

    Returns:
        A list of ``{'title': ..., 'link': ..., 'votes': ...}`` dictionaries
        in input order. Entries whose subtext has no ``.score`` element, or
        whose score is 99 or lower, are left out.

    Raises:
        ValueError: If the text before ``' point'`` is not an integer.
    """
    hn = []
    # zip pairs each link with its own subtext; surplus items on either side are ignored
    for link, sub in zip(links, subtext):
        vote = sub.select('.score')
        if not vote:
            # No score inside this subtext, so there is nothing to compare
            continue
        # Read the score found inside this story's own subtext. Keeping only
        # what precedes ' point' accepts both ' points' and ' point'.
        points = int(vote[0].getText().partition(' point')[0])
        if points > 99:
            hn.append({
                'title': link.getText(),
                'link': link.get('href', None),
                'votes': points,
            })
    return hn

# Build the filtered story list from the links and subtext elements and print it
print(create_custom_hn(links,subtext))

In [None]:
from pprint import pprint

In [None]:
# Same result as before, shown with pprint instead of print
pprint(create_custom_hn(links,subtext))

# Final app

In [None]:
# Import the necessary modules
import requests
from bs4 import BeautifulSoup
from pprint import pprint

# Send a GET request to the Hacker News website.
# The timeout keeps the script from hanging if the server never answers.
res = requests.get('https://news.ycombinator.com/', timeout=10)

# Stop here on an HTTP error status instead of parsing an error page
res.raise_for_status()

# Parse the HTML content of the response using BeautifulSoup
soup = BeautifulSoup(res.text, 'html.parser')

# Select all <a> tags that are direct children of elements with class 'titleline'
links = soup.select('.titleline > a')

# Select all elements with class 'subtext'
subtext = soup.select('.subtext')

# Define a function to sort stories by their votes
def sort_stories_by_votes(hnlist):
    """Return a new list of the story dictionaries, highest 'votes' first.

    The input is left untouched; entries with equal votes keep their
    original relative order.
    """
    ranked = list(hnlist)
    ranked.sort(key=lambda story: story['votes'], reverse=True)
    return ranked

# Define a function to create a custom Hacker News list
def create_custom_hn(links, subtext):
    """Collect the stories with more than 99 points, sorted by votes.

    Args:
        links: Story link elements; each must provide ``getText()`` and
            ``get()``.
        subtext: Elements paired with ``links`` by position; each must
            provide ``select()``, which is used to find its ``.score``
            element.

    Returns:
        The result of ``sort_stories_by_votes`` applied to a list of
        ``{'title': ..., 'link': ..., 'votes': ...}`` dictionaries. Entries
        whose subtext has no ``.score`` element, or whose score is 99 or
        lower, are left out.

    Raises:
        ValueError: If the text before ``' point'`` is not an integer.
    """
    hn = []
    # zip pairs each link with its own subtext; surplus items on either side are ignored
    for link, sub in zip(links, subtext):
        # Look for the score element inside this story's subtext
        score = sub.select('.score')
        if not score:
            # No score inside this subtext, so there is nothing to compare
            continue
        # Keep only what precedes ' point', so both the plural ' points'
        # and the singular ' point' suffix are accepted
        points = int(score[0].getText().partition(' point')[0])
        # Keep only stories with more than 99 points
        if points > 99:
            hn.append({
                'title': link.getText(),
                'link': link.get('href', None),
                'votes': points,
            })
    # Return the list of stories sorted by votes
    return sort_stories_by_votes(hn)

# Pretty-print the custom Hacker News list built from the selected links and subtext
pprint(create_custom_hn(links, subtext))
