# Practical scraping demo

Scraping IGIHE.com to get, for each news article, its category, likes and dislikes, author and full article text.

In [1]:
# Root URL of igihe.com (note the trailing slash). The home page at this
# address is the page that gets fetched to discover the list of articles.
BASE_URL = 'http://www.igihe.com/'

In [2]:
# Create a function scrape_article that we can use to scrape information
# from individual article pages.

def scrape_article(article_url, timeout=30):
    '''
    Scrape an article from igihe.com and get the author, date, full text and thumbs up and down

    Args:
        article_url (str): The full URL to the article on igihe.com
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests can wait forever on a stalled server.

    Returns:
        list: A list containing strings for the author, date, thumbs up, thumbs down and
              full text of the article

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
        AttributeError: If one of the expected elements is missing from the page.
        ValueError: If the 'overview-article' element does not contain exactly
            two non-empty lines (author and date).
    '''

    # Fetch the page and fail loudly on 4xx/5xx answers so that an error page
    # is never parsed as if it were an article.
    response = requests.get(article_url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
    # happens to be installed, which can change results between machines.
    soup = BeautifulSoup(response.text, 'html.parser')

    # The 'overview-article' element holds the author and the date on separate
    # lines. Drop blank lines and surrounding whitespace so that incidental
    # padding in the markup does not break the two-way unpacking.
    overview_text = soup.find(class_='overview-article').get_text()
    author, date = [line.strip() for line in overview_text.split('\n') if line.strip()]

    # Extract the thumbs up and thumbs down using the classes th-ok and th-no respectively
    thumbs_up = soup.find(class_='th-ok').get_text().strip()
    thumbs_down = soup.find(class_='th-no').get_text().strip()

    # Extract the full text of the article using the class fulltext
    full_text = soup.find(class_='fulltext').get_text()

    # Return a list of author, date, thumbs up, thumbs down and the full article text.
    return [author, date, thumbs_up, thumbs_down, full_text]

In [None]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape the igihe.com home page: collect the category, headline, media counts
# and URL of every listed article, then visit each article page for its author,
# date, thumbs up / down and full text, and put everything in a dataframe.


def _media_count(article_html, class_name):
    # Return the stripped text of the element with the given class inside the
    # article, or '0' when that text is blank.
    return article_html.find(class_=class_name).get_text().strip() or '0'


# Get the raw HTML of the home page. The timeout stops a stalled connection
# from hanging the notebook, and raise_for_status stops us from parsing an
# HTTP error page as if it were the home page.
raw_html = requests.get(BASE_URL, timeout=30)
raw_html.raise_for_status()

# Convert the raw HTML string into a Beautiful Soup object, naming the parser
# explicitly so the result does not depend on which parsers are installed.
soup = BeautifulSoup(raw_html.text, 'html.parser')

# Use the Beautiful soup object to find all of the articles
# as all articles have the class 'article-wrap'
articles_html = soup.find_all(class_='article-wrap')

# Set up an empty list to store our scraped data
rows = []

# Print out the total number of articles found on the BASE_URL page.
print('Number of articles found: ' + str(len(articles_html)))

# Loop over every article
for article_html in articles_html:

    # The article category is stored in a <h6> tag within the 'article-wrap' class
    # scrape the text using get_text and remove excess whitespace using strip
    category = article_html.find('h6').get_text().strip()

    # Normally formatted articles keep their headline in the 'homenews-title'
    # class. If that class is missing, the article is formatted differently:
    # its headline is in 'homenews-title2' and it shows no photo, video or
    # audio counts.
    headline_html = article_html.find(class_='homenews-title')
    has_media_counts = headline_html is not None
    if not has_media_counts:
        headline_html = article_html.find(class_='homenews-title2')

    # Scrape the headline text, stripping out any excess whitespace
    headline = headline_html.get_text().strip()

    # Get the article URL from the headline, as the headline is the link
    # to the article, this is stored in the href attribute of an a tag
    article_url = headline_html.find('a')['href']

    # Scraping an individual article page is best effort: on any error, report
    # it and store blanks for that article. Catching Exception rather than
    # using a bare except keeps Ctrl-C / kernel interrupts working.
    try:
        # urljoin resolves the href against the site root, so relative,
        # root-relative and absolute links all produce a valid URL.
        article_info = scrape_article(urljoin(BASE_URL, article_url))
    except Exception as error:
        print('Could not scrape {}: {!r}'.format(article_url, error))
        article_info = ['', '', '', '', '']

    if has_media_counts:
        # The counts are stored in the 'article_photos', 'article_videos' and
        # 'article_audios' classes; a blank value is recorded as '0'.
        photos = _media_count(article_html, 'article_photos')
        videos = _media_count(article_html, 'article_videos')
        audios = _media_count(article_html, 'article_audios')
    else:
        # Set the numbers of photos, videos and audios as 0 as these headlines
        # do not show the number of photos, videos or audios.
        photos = '0'
        videos = '0'
        audios = '0'

    # Add all the scraped information into a list, and add that list to a rows object
    rows.append([category, headline, photos, videos, audios, article_url] + article_info)


# Convert our list of lists into a pandas dataframe and add on the column names
df = pd.DataFrame(rows, columns=['category', 'headline', 'photos', 'videos', 'audios', 'article_url',
                                 'author', 'date', 'thumbs_up', 'thumbs_down', 'full_text'])

df