In [1]:
# Import libraries

import numpy as np
import pandas as pd

from requests import get
import re
from bs4 import BeautifulSoup

import os

### Exercises

By the end of this exercise, you should have a file named **acquire.py** that contains the specified functions. If you wish, you may break your work into separate files for each website (e.g. acquire_codeup_blog.py and acquire_news_articles.py), but the end function should be present in acquire.py (that is, acquire.py should import get_blog_articles from the acquire_codeup_blog module.)

### 1. Codeup Blog Articles

Scrape the article text from the following pages:

- https://codeup.com/codeups-data-science-career-accelerator-is-here/
- https://codeup.com/data-science-myths/
- https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
- https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
- https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

In [2]:
# Create a list of urls

urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 
        'https://codeup.com/data-science-myths/', 
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/', 
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/', 
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

# Print the length
len(urls)

5

In [4]:
# Create an empty list
blog_articles = []

# For Loop the urls to subtract the title and content

for url in urls:
    headers = {'User-Agent': 'Codeup Data Science'}
    
    # Use response.content to make the soup object
    response = get(url, headers=headers)
    
    # Create the soup object by passing the HTML string and choice of parser.
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # The h1 element holds the title
    title = soup.find('h1', class_='jupiterx-post-title')
    
    # Grab the text from page
    content = soup.find('div', class_='jupiterx-post-content')
    
    # Store the title and text in a dictionary
    d = {'title': title.text, 'content': content.text}
    
    # Append the dictionary to the list
    blog_articles.append(d)
    
    # Convert the list of dicts to a dataframe
    df = pd.DataFrame(blog_articles)
    
    # Write the df to a json file for faster access
    df.to_json('codeup_blogs.json')

blog_articles

[{'title': 'Codeup’s Data Science Career Accelerator is Here!',
  'content': 'The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.\nOur program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspac

#### Build the Helper Functions

In [3]:
# Create a helper functioin that requests and parse HTML returning a soup object

def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object
    '''
    headers = {'User-Agent': 'Codeup Data Science'}
    response = get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    return soup

In [4]:
def acquire_codeup_blogs(urls, cached=False):
    '''
    This function takes in a list of Codeup Blog urls and a parameter with default cashed == False.
    It scrapes the title and text for each url, creates a list of dictionaries with title and tex for each blog,
    creates a list of dictionaries, converts list to df, and returns df
    If cached == True, the function returns a dataframe from a json file.     
    '''
    if cached == True:
        df = pd.read_json('codeup_blogs.json') # cached == False completes a fresh scrape for df. 
    else:
        
        blog_articles = []
        
        for url in urls:
            soup = make_soup(url)
            title = soup.find('h1', class_='jupiterx-post-title')
            content = soup.find('div', class_='jupiterx-post-content')
            d = {'title': title.text, 'content': content.text}
            blog_articles.append(d)
        
        df = pd.DataFrame(blog_articles)
        df.to_json('codeup_blogs.json')
    
    return df

In [7]:
# Test the functions

codeup_blogs = acquire_codeup_blogs(urls)
codeup_blogs

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### Bonus URL Scrape

In [9]:
# Hit codeup's main blog page to scrape the urls. 

url = 'https://codeup.com/resources/#blog'
soup = make_soup(url)
type(soup)

bs4.BeautifulSoup

In [11]:
# Filter my soup to return a list of all anchor elements from my HTML
urls_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')

# Print the type of the element
print(type(urls_list[0]))

# Take a peek at the urls_list
urls_list[0]

<class 'bs4.element.Tag'>


<a class="jet-listing-dynamic-link__link" href="https://codeup.com/introducing-salary-refund-guarantee/"><span class="jet-listing-dynamic-link__label">Introducing Our Salary Refund Guarantee</span></a>

In [14]:
# Filter the href attribute value for each anchor element in my list
# 40 urls are scraped. 
# Duplicates exit

urls = [url.get('href') for url in urls_list]
len(urls)

40

In [15]:
# Use the set comprehension to return only unique urls

urls = {url.get('href') for url in urls_list}
len(urls)

20

In [16]:
# Convert the set to a list

urls = list(urls)
print(f'There are {len(urls)} unique urls in the list)')
urls

There are 20 unique urls in the list)


['https://codeup.com/journey-into-web-development/',
 'https://codeup.com/codeup-wins-civtech-datathon/',
 'https://codeup.com/new-scholarship/',
 'https://codeup.com/codeup-alumni-make-water/',
 'https://codeup.com/introducing-salary-refund-guarantee/',
 'https://codeup.com/codeup-inc-5000/',
 'https://codeup.com/succeed-in-a-coding-bootcamp/',
 'https://codeup.com/how-were-celebrating-world-mental-health-day-from-home/',
 'https://codeup.com/what-data-science-career-is-for-you/',
 'https://codeup.com/transition-into-data-science/',
 'https://codeup.com/codeup-in-houston/',
 'https://codeup.com/from-slacker-to-data-scientist/',
 'https://codeup.com/covid-19-data-challenge/',
 'https://codeup.com/what-is-python/',
 'https://codeup.com/codeups-application-process/',
 'https://codeup.com/what-is-machine-learning/',
 'https://codeup.com/math-in-data-science/',
 'https://codeup.com/what-to-expect-at-codeup/',
 'https://codeup.com/build-your-career-in-tech/',
 'https://codeup.com/education-

#### Build the Helper Function

In [17]:
def get_blog_urls():
    '''
    This function scrapes all of the Codeup blog urls from the main Codeup blog page
    Returning a list of urls
    '''
    base_url = 'https://codeup.com/resources/#blog' 
    soup = make_soup(base_url)
    urls_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
    urls = {url.get('href') for url in urls_list}
    urls = list(urls)
    
    return urls

In [18]:
# Now test the function
# cached == False does a fresh scrape.

all_blogs = acquire_codeup_blogs(urls=get_blog_urls())

# Print the shape
all_blogs.shape

(20, 2)

In [19]:
# Take a peek at the df
all_blogs.head()

Unnamed: 0,title,content
0,Alumni Share their Journey into Web Development,Everyone starts somewhere. Many developers out...
1,Codeup Grads Win CivTech Datathon,Many Codeup alumni enjoy competing in hackatho...
2,Announcing: The Annie Easley Scholarship to Su...,We have an exciting announcement! We’re launch...
3,How Codeup Alumni are Helping to Make Water,Imagine having a kit mailed to you with all th...
4,Introducing Our Salary Refund Guarantee,"Here at Codeup, we believe it’s time to revolu..."


In [22]:
# cached == True reads in a dataframe from 'codeup_blogs.json'

all_blogs = acquire_codeup_blogs(urls=get_blog_urls(), cached=True)
all_blogs.head()

Unnamed: 0,title,content
0,Alumni Share their Journey into Web Development,Everyone starts somewhere. Many developers out...
1,Codeup Grads Win CivTech Datathon,Many Codeup alumni enjoy competing in hackatho...
2,Announcing: The Annie Easley Scholarship to Su...,We have an exciting announcement! We’re launch...
3,How Codeup Alumni are Helping to Make Water,Imagine having a kit mailed to you with all th...
4,Introducing Our Salary Refund Guarantee,"Here at Codeup, we believe it’s time to revolu..."


### 2. News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

- Business
- Sports
- Technology
- Entertainment

The end product of this should be a function named **get_news_articles that returns a list of dictionaries**, where each dictionary has this shape:

In [10]:
# Make the soup object using helper function

url = 'https://inshorts.com/en/read/entertainment'
soup = make_soup(url)

# Print the dtype
type(soup)

bs4.BeautifulSoup

### Scrape news cards from entertainment main page

In [12]:
# Scrape a ResultSet of all the news cards on the page and inspect the elements on the first card. 

cards = soup.find_all('div', class_='news-card')

# Print how many cards on the main page

print(f'There are {len(cards)} news cards on this page.')


# Inspect the first card
cards[0]

There are 24 news cards on this page.


<div class="news-card z-depth-1" itemscope="" itemtype="http://schema.org/NewsArticle">
<span content="" itemid="https://inshorts.com/en/news/twinkle-posts-pic-of-mela-villains-poster-says-movie-left-mark-or-scar-on-me-1605594707888" itemprop="mainEntityOfPage" itemscope="" itemtype="https://schema.org/WebPage"></span>
<span itemprop="author" itemscope="itemscope" itemtype="https://schema.org/Person">
<span content="Ankush Verma" itemprop="name"></span>
</span>
<span content="Twinkle posts pic of 'Mela' villain's poster, says movie 'left mark or scar on me'" itemprop="description"></span>
<span itemprop="image" itemscope="" itemtype="https://schema.org/ImageObject">
<meta content="https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2020/11_nov/17_tue/img_1605592871184_3.jpg?" itemprop="url"/>
<meta content="864" itemprop="width"/>
<meta content="483" itemprop="height"/>
</span>
<span itemprop="publisher" itemscope="itemscope" itemtype="https://schema.org/Organization">
<span 

### Scrape the title from each news card

In [13]:
# Print the dtype of the element in card
type(cards[0])

bs4.element.Tag

In [14]:
# Create a list of titles using the span element and itemprop attribute with text method

titles = [card.find('span', itemprop='headline').text for card in cards]

# Inspect the first 5 titles

titles[:5]

["Twinkle posts pic of 'Mela' villain's poster, says movie 'left mark or scar on me'",
 'Milind Soman picks up garbage on trek to a temple; shares pic',
 "I was replaced in a film as hero's wife didn't want me to be part of it: Taapsee",
 "Tamil actor's video amid cancer treatment surfaces, fans say 'he looks skinny'",
 'Tamil TV series actor hacked to death, CCTV footage shows argument with gang']

### Scrape the author from news card

In [15]:
# Create a list of authors using the span elememnt and class attribute with text method.

authors = [card.find('span', class_='author').text for card in cards]
authors[:5]

['Ankush Verma',
 'Pragya Swastik',
 'Anmol Sharma',
 'Ankush Verma',
 'Daisy Mowke']

### Scrape the text from news card

In [16]:
# Create a list of content strings using the div element and itemprop attribute with text method.

contents = [card.find('div', itemprop='articleBody').text for card in cards]
contents[:5]

['Twinkle Khanna shared a picture of a poster of Tinu Verma, who played the villain in \'Mela\', on the back of a truck. "What can I say except Mela has certainly left a mark or a scar...on me and the rest of the nation," she said. "Certain things...are timeless! This popped up in my messages today," she added.',
 'Actor Milind Soman picked garbage on a trek to a Shiva temple at the top of a hill. He was told there were no dustbins because monkeys kept throwing garbage out of the bins and the garbage is eventually burnt in the forest. "I think the time has come for us to be smarter than monkeys," Soman wrote on Instagram.',
 'Talking about "negativity and misogyny" she faced early in her career, actress Taapsee Pannu said, "I was once replaced in a film because...hero\'s wife didn\'t want me to be part of it." Recalling another incident, she said, "I was dubbing for [a film] and I was told that the hero didn’t like my dialogue so I should change it."',
 'A video showing Tamil actor Tha

In [17]:
# Create an empty list, articles, to hold the dictionaries for each article
articles = []

# Loop through each news card on the page and get what we want

for card in cards:
    title = card.find('span', itemprop = 'headline').text
    author = card.find('span', class_ = 'author').text
    content = card.find('div', itemprop = 'articleBody').text
    
    # Create a dictonary, article, for each news card
    article = {'title': title, 'author': author, 'content': content}
    
    # Add the dictionary, article, to our list of dictionaries, articles
    articles.append(article)
    
# Check the length of the articles: should be 24
print(len(articles))

# Inspect the first member in the list
articles[0]

24


{'title': "Twinkle posts pic of 'Mela' villain's poster, says movie 'left mark or scar on me'",
 'author': 'Ankush Verma',
 'content': 'Twinkle Khanna shared a picture of a poster of Tinu Verma, who played the villain in \'Mela\', on the back of a truck. "What can I say except Mela has certainly left a mark or a scar...on me and the rest of the nation," she said. "Certain things...are timeless! This popped up in my messages today," she added.'}

### Build Helper Function

In [18]:
def get_news_articles(cached=False):
    '''
    This function with default cached == False does 
    '''
    # option to read in a json file instead of scrape for df
    if cached == True:
        df = pd.read_json('inhorts_articles.json')
        
    # option to create a fresh scrape for df
    else:
        
        # Set base_url that will be used in get request
        base_url = 'https://inshorts.com/en/read/'
        
        # List of topics to scrape
        topics = ['business', 'sports', 'technology', 'entertainment']
        
        # Create an empty list, articles, to hold our dictionaries
        articles = []
        
        # For Loop through each topic
        for topic in topics:
            
            # Create the url for the topic
            topic_url = base_url + topic
            
            # Create the soup object by the helper function
            soup = make_soup(topic_url)
            
            # Scrape a ResultSet of all the news cards on the page and inspect the elements on the first card. 
            cards = soup.find_all('div', class_='news-card')
            
            # Loop through each news card on the page and get what we want
            for card in cards:
                title = card.find('span', itemprop = 'headline').text
                author = card.find('span', class_ = 'author').text
                content = card.find('div', itemprop = 'articleBody').text
                
                # Create a dictonary, article, for each news card
                article = ({'topic': topic, 
                            'title': title, 
                            'author': author, 
                            'content': content})
                
                # Add the dictionary, article, to our list of dictionaries, articles
                articles.append(article)
        
        # Conver the articles to a dataframe
        df = pd.DataFrame(articles)
        
        # Write the df to json file for fast access
        df.to_json('inhorts_articles.json')
        
    return df

In [19]:
# Test the function with cashed == False to do a fresh scrape and create 'articles.json' file

df = get_news_articles(cached=False)
df.head()

Unnamed: 0,topic,title,author,content
0,business,"Lakshmi Vilas Bank withdrawals capped at ₹25,0...",Pragya Swastik,The Centre has imposed a 30-day moratorium on ...
1,business,How does Moderna's COVID-19 vaccine candidate ...,Pragya Swastik,Moderna's initial results of late-stage trial ...
2,business,Shutting Delhi markets may prove counterproduc...,Sakshita Khosla,Traders' body CAIT on Tuesday said a proposal ...
3,business,Pfizer shares drop 4.5% as Moderna says its va...,Krishna Veera Vanamali,Pfizer’s shares fell as much as 4.5% on Monday...
4,business,"Musk gets $15bn richer in 2 hours, becomes wor...",Krishna Veera Vanamali,Billionaire Elon Musk added $15 billion to his...


In [20]:
# Quick summarize the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    99 non-null     object
 1   title    99 non-null     object
 2   author   99 non-null     object
 3   content  99 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


In [21]:
# Count the articles in each topic 
df.topic.value_counts()

technology       25
sports           25
business         25
entertainment    24
Name: topic, dtype: int64

In [22]:
# Test the function to read in the df from 'articles.csv'

df = get_news_articles(cached=True)
df.head()

Unnamed: 0,topic,title,author,content
0,business,"Lakshmi Vilas Bank withdrawals capped at ₹25,0...",Pragya Swastik,The Centre has imposed a 30-day moratorium on ...
1,business,How does Moderna's COVID-19 vaccine candidate ...,Pragya Swastik,Moderna's initial results of late-stage trial ...
2,business,Shutting Delhi markets may prove counterproduc...,Sakshita Khosla,Traders' body CAIT on Tuesday said a proposal ...
3,business,Pfizer shares drop 4.5% as Moderna says its va...,Krishna Veera Vanamali,Pfizer’s shares fell as much as 4.5% on Monday...
4,business,"Musk gets $15bn richer in 2 hours, becomes wor...",Krishna Veera Vanamali,Billionaire Elon Musk added $15 billion to his...


In [23]:
# Quick summarize the df
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    99 non-null     object
 1   title    99 non-null     object
 2   author   99 non-null     object
 3   content  99 non-null     object
dtypes: object(4)
memory usage: 3.9+ KB
