In [None]:
### CBC SCRAPING CODE
### Authors: JONATHAN CHAN and PANDRAMISHI NAGA SIRISHA

###MOST RECENT UPDATE:  
##2020 june 9, 9:19PM
#edited scrape_urls to handle server error on 1000th API call
#wrote docstrings, cleaned code

#TO DO:
#confirm cleanup of code with Sirisha

In [1]:
import urllib.request
import json 
from bs4 import BeautifulSoup
#from datetime import date
import requests
import json
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from datetime import datetime, timedelta
import datetime
import pytz
import dateutil.parser


In [2]:

def get_initial_url(search_term):
    """Build the URL of the first-page CBC search API call for a search string.

    The search string is split on whitespace, re-joined with single spaces and
    percent-encoded, so reserved characters in the query (e.g. "&", "#", "=")
    can no longer corrupt the query string or inject extra parameters.

    input:
    search_term: search string (words separated by whitespace)

    returns:
    URL string requesting page 1 of the results, sorted by relevance
    """
    # function-scope import keeps this cell runnable on its own
    from urllib.parse import quote

    words = search_term.split()
    url_prefix = "https://www.cbc.ca/search_api/v1/search?"
    # safe="" also escapes "/"; a single space still becomes "%20" as before
    query = "q=" + quote(" ".join(words), safe="")
    url_suffix = "&sortOrder=relevance&page=1&fields=feed"
    return url_prefix + query + url_suffix
    
# Sanity check: display the first-page API URL built for a sample multi-word query
get_initial_url("interest rate index")

'https://www.cbc.ca/search_api/v1/search?q=interest%20rate%20index&sortOrder=relevance&page=1&fields=feed'

In [4]:
def scrape_urls(initial_url):
    """Collect the URLs of search results published within the past 365 days.

    Starting from the first-page query URL, result pages are requested one by
    one until a page comes back empty, a request fails, or the page limit is
    reached. Every result whose 'publishtime' is newer than one year ago
    contributes its 'url' field to the returned list.

    input:
    initial_url: URL of first call for CBC search API (expected to contain
                 "page=1", as produced by get_initial_url)

    returns:
    list of article URL strings, in the order the API returned them
    """
    # original author's note: the API gives an internal server error after page 1000
    max_page = 1000
    # NOTE(review): comparing against an aware UTC datetime assumes the API's
    # 'publishtime' strings carry a timezone offset - confirm against the API.
    last_year_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=365)

    # Cut at the LAST "page=" so a search term that itself contains the word
    # "page" (e.g. "front page") does not truncate the query string.
    base_url = initial_url.rsplit("page=", 1)[0]

    url_list = []
    page_num = 1
    while page_num < max_page:
        if page_num % 20 == 0:
            print("API CALLS SCRAPED: ", str(page_num))
            print("ARTICLES IN URL LIST: ", len(url_list))

        if page_num == 1:
            page_url = initial_url
        else:
            page_url = base_url + "page=" + str(page_num) + "&fields=feed"

        response = requests.get(page_url)
        if not response.ok:
            # keep what has been collected instead of crashing in .json()
            print("REQUEST FAILED WITH STATUS", response.status_code, "ON PAGE", page_num)
            break

        info = response.json()
        if len(info) == 0:  # an empty page means there are no more results
            break

        for json_dict in info:
            if dateutil.parser.parse(json_dict['publishtime']) > last_year_date:
                url_list.append(json_dict["url"])
        page_num += 1

    print("FINAL URLS FROM THE PAST YEAR: ", len(url_list))

    return url_list
    

In [6]:
def get_author(soup):
    """Look up the author's name in a parsed (BeautifulSoup) article.

    The name is read from the first span tag with class "authorText".
    Returns the tag's text, or None when no such tag is present.
    """
    author_tag = soup.find("span", {"class": "authorText"})
    return author_tag.text if author_tag else None

In [7]:
def get_title(soup):
    """Look up the headline of a parsed (BeautifulSoup) article.

    The headline is read from the first h1 tag with class "detailHeadline".
    Returns the tag's text, or None when no such tag is present.
    """
    headline = soup.find("h1", {"class": "detailHeadline"})
    if not headline:
        return None
    return headline.text

In [8]:
def get_desc(soup):
    """Look up the description (sub-headline) of a parsed (BeautifulSoup) article.

    The description is read from the first h2 tag with class "deck".
    Returns the tag's text, or None when no such tag is present.
    """
    deck = soup.find("h2", {"class": "deck"})
    if deck:
        return deck.text
    return None
    

In [9]:
def get_url_to_image(soup):
    """returns the url to the header image of a CBC article (BeautifulSoup) if it exists, None if not

    The image is looked up as the first img tag inside the figure tag with
    class "imageMedia leadmedia-story full"; its src attribute is the url.

    None is returned when the figure is missing, when the figure contains no
    img tag, or when the img tag has no src attribute (the last two cases
    previously raised AttributeError / KeyError).
    """
    figure_tag = soup.find("figure", {"class": "imageMedia leadmedia-story full"})
    if not figure_tag:
        return None

    img_tag = figure_tag.find("img")
    if not img_tag:
        # lead figure without an <img> child
        return None

    # .get() yields None rather than raising when src is absent
    return img_tag.attrs.get("src")
        

In [10]:
def get_publish_time(soup):
    """Look up the publish time of a parsed (BeautifulSoup) article.

    The time is read from the first time tag with class "timeStamp".

    Returns a tuple (display_text, datetime_string), or None when no such tag
    is present:
      display_text    - the tag text before the first "|", with "Posted: "
                        removed and surrounding whitespace stripped
      datetime_string - the raw value of the tag's datetime attribute
    """
    stamp_tag = soup.find("time", {"class": "timeStamp"})
    if not stamp_tag:
        return None

    # Kept as a plain string: per the original author's note, a datetime object
    # fails when written to JSON, so conversion is left to later pipeline steps.
    raw_datetime = stamp_tag.attrs["datetime"]

    first_segment = stamp_tag.text.split("|")[0]
    display_text = first_segment.replace("Posted: ", "").strip()
    return (display_text, raw_datetime)


In [11]:
def get_source(soup, specify_source_type=True):
    """Determine the news source of a parsed (BeautifulSoup) article.

    specify_source_type=False: the fixed string "CBC" is returned and the
    article is not inspected.

    specify_source_type=True: every span tag with class "bullet" is examined
    and the string form of the node right before it is taken as the source
    when it starts with "CBC" (Ex: "CBC news", "CBC radio"). When several
    bullets qualify, the last one wins; None is returned when none qualifies.
    """
    if not specify_source_type:
        return "CBC"

    # Original author's note: the source sits just before a
    # <span class="bullet"> · </span>; there are two bullet tags when an author
    # is attached and one when there is none.
    found = None
    for bullet in soup.find_all("span", {"class": "bullet"}):
        preceding_text = str(bullet.previous_sibling)
        if preceding_text.startswith("CBC"):
            found = preceding_text
    return found
    

In [12]:

def get_content(soup, as_string=True):
    """Extract the body text of a parsed (BeautifulSoup) CBC article.

    The text of every p tag inside the first div tag with class "story" is
    collected, each paragraph terminated with a newline.

    Input: BeautifulSoup object, boolean

    Returns:
      as_string=True  - all paragraphs concatenated into one string
      as_string=False - list of paragraph strings
      None            - when the article has no "story" div
    """
    story_div = soup.find("div", {"class": "story"})
    if not story_div:
        return None

    paragraphs = [paragraph.text + "\n" for paragraph in story_div.find_all("p")]
    return "".join(paragraphs) if as_string else paragraphs


In [13]:

def extract_json_items(url, specify_source_type=True):
    """Download a CBC article and return a JSON-serializable dict of its fields:
        author: author (note that some articles do not specify author)
        title: the title of the article
        description: subheader of the article
        url: the url of the article (with "http:" prepended)
        urlToImage: the url of the header image
        publishedAt: tuple of (date_string, datetime_string)
        source: CBC if specify_source_type == False, subdivision of CBC if True (ex: "CBC radio")
        content: the article text as one string

        Any field that cannot be found in the page is None.

        input:
        url: url returned from CBC API in "url" field (missing "http:" as part of URL)
        specify_source_type: forwarded to get_source (previously accepted but ignored)

        returns:
        dict as described above, or None if the page could not be fetched
        (an HTTPError / URLError is reported with print and swallowed)
    """
    article_url = "http:" + url

    #get HTML from article URL into BeautifulSoup
    try:
        response = urllib.request.urlopen(article_url)
    except HTTPError as e:  # must come first: HTTPError is a subclass of URLError
        print('Error code: ', e.code)
        return None
    except URLError as e:
        print('Reason: ', e.reason)
        return None

    # the context manager closes the connection even if read/decode raises
    with response:
        html = response.read().decode("utf8")

    soup = BeautifulSoup(html, 'html.parser')

    # key order matches the original output so written JSON files look the same
    return {
        "author": get_author(soup),
        "title": get_title(soup),
        "description": get_desc(soup),
        "url": article_url,
        "urlToImage": get_url_to_image(soup),
        "publishedAt": get_publish_time(soup),
        "source": get_source(soup, specify_source_type),
        "content": get_content(soup, True),
    }


In [20]:
#MAIN FUNCTION CREATED BY SIRISHA
def main(query):
    """
    Run the whole scrape for one search query.

    The CBC search API is queried for the given string, every article URL
    from the past year is downloaded and parsed, and the resulting list of
    article dicts is written to "<query words joined by _>_CBC_article.json"
    in the current working directory.

    input: query string

    returns: the list of article dicts that was written to the file
    """
    article_urls = scrape_urls(get_initial_url(query))

    articles = []
    for article_url in article_urls:
        article = extract_json_items(article_url)
        if article is None:
            # page could not be fetched; skip it
            continue
        articles.append(article)
        # progress report after every 20 successfully parsed articles
        if len(articles) % 20 == 0:
            print("JSONS ADDED:", len(articles))

    file_name_prefix = "_".join(query.split(" "))
    print(file_name_prefix)

    output_path = file_name_prefix + "_CBC_article.json"
    with open(output_path, 'w') as output_file:
        json.dump(articles, output_file)

    return articles



## Mortgage Rates

In [21]:
#commented out - Sirisha collected these samples


# cbc_mr_article = main("mortgage rates")
# print(len(cbc_mr_article))
# print(cbc_mr_article[0])

## Interest rates

In [22]:
#commented out - Sirisha collected these samples

# cbc_ir_article =  main("interest rate")
# print(len(cbc_ir_article))
# print(cbc_ir_article[0])

## Housing price

In [23]:
#commented out - Sirisha collected these samples


# cbc_hp_article = main('housing price')
# print(len(cbc_hp_article))
# print(cbc_hp_article[0])

## Employment

In [24]:
# Scrape the past year's CBC articles for the query 'employment';
# main() also writes the results to employment_CBC_article.json
cbc_e_article = main('employment')
# number of articles collected, then the first article dict as a spot check
print(len(cbc_e_article))
print(cbc_e_article[0])

API CALLS SCRAPED:  20
ARTICLES IN URL LIST:  105
API CALLS SCRAPED:  40
ARTICLES IN URL LIST:  111
API CALLS SCRAPED:  60
ARTICLES IN URL LIST:  234
API CALLS SCRAPED:  80
ARTICLES IN URL LIST:  428
API CALLS SCRAPED:  100
ARTICLES IN URL LIST:  591
API CALLS SCRAPED:  120
ARTICLES IN URL LIST:  730
API CALLS SCRAPED:  140
ARTICLES IN URL LIST:  829
API CALLS SCRAPED:  160
ARTICLES IN URL LIST:  913
API CALLS SCRAPED:  180
ARTICLES IN URL LIST:  1007
API CALLS SCRAPED:  200
ARTICLES IN URL LIST:  1077
API CALLS SCRAPED:  220
ARTICLES IN URL LIST:  1129
API CALLS SCRAPED:  240
ARTICLES IN URL LIST:  1185
API CALLS SCRAPED:  260
ARTICLES IN URL LIST:  1227
API CALLS SCRAPED:  280
ARTICLES IN URL LIST:  1290
API CALLS SCRAPED:  300
ARTICLES IN URL LIST:  1327
API CALLS SCRAPED:  320
ARTICLES IN URL LIST:  1392
API CALLS SCRAPED:  340
ARTICLES IN URL LIST:  1459
API CALLS SCRAPED:  360
ARTICLES IN URL LIST:  1502
API CALLS SCRAPED:  380
ARTICLES IN URL LIST:  1536
API CALLS SCRAPED:  400


## GDP

In [26]:
# Scrape the past year's CBC articles for the query 'GDP';
# main() also writes the results to GDP_CBC_article.json
cbc_gdp_article = main('GDP')
# number of articles collected, then the first article dict as a spot check
print(len(cbc_gdp_article))
print(cbc_gdp_article[0])

API CALLS SCRAPED:  20
ARTICLES IN URL LIST:  80
API CALLS SCRAPED:  40
ARTICLES IN URL LIST:  136
API CALLS SCRAPED:  60
ARTICLES IN URL LIST:  179
API CALLS SCRAPED:  80
ARTICLES IN URL LIST:  218
API CALLS SCRAPED:  100
ARTICLES IN URL LIST:  254
API CALLS SCRAPED:  120
ARTICLES IN URL LIST:  267
API CALLS SCRAPED:  140
ARTICLES IN URL LIST:  296
API CALLS SCRAPED:  160
ARTICLES IN URL LIST:  314
API CALLS SCRAPED:  180
ARTICLES IN URL LIST:  320
API CALLS SCRAPED:  200
ARTICLES IN URL LIST:  326
API CALLS SCRAPED:  220
ARTICLES IN URL LIST:  329
FINAL URLS FROM THE PAST YEAR:  334
JSONS ADDED: 20
JSONS ADDED: 40
JSONS ADDED: 60
JSONS ADDED: 80
JSONS ADDED: 100
JSONS ADDED: 120
JSONS ADDED: 140
JSONS ADDED: 160
JSONS ADDED: 180
JSONS ADDED: 200
JSONS ADDED: 220
JSONS ADDED: 240
JSONS ADDED: 260
JSONS ADDED: 280
JSONS ADDED: 300
JSONS ADDED: 320
GDP
334
{'author': 'Pete Evans', 'title': "Canada's economy shrank at 8% pace in the first three months of 2020, worst since 2009", 'descrip

## Stock Market

In [27]:
# Scrape the past year's CBC articles for the query 'stock market';
# main() also writes the results to stock_market_CBC_article.json
cbc_tsx_article = main('stock market')
# number of articles collected, then the first article dict as a spot check
print(len(cbc_tsx_article))
print(cbc_tsx_article[0])

API CALLS SCRAPED:  20
ARTICLES IN URL LIST:  67
API CALLS SCRAPED:  40
ARTICLES IN URL LIST:  226
API CALLS SCRAPED:  60
ARTICLES IN URL LIST:  341
API CALLS SCRAPED:  80
ARTICLES IN URL LIST:  403
API CALLS SCRAPED:  100
ARTICLES IN URL LIST:  464
API CALLS SCRAPED:  120
ARTICLES IN URL LIST:  519
API CALLS SCRAPED:  140
ARTICLES IN URL LIST:  562
API CALLS SCRAPED:  160
ARTICLES IN URL LIST:  590
API CALLS SCRAPED:  180
ARTICLES IN URL LIST:  619
API CALLS SCRAPED:  200
ARTICLES IN URL LIST:  638
API CALLS SCRAPED:  220
ARTICLES IN URL LIST:  645
API CALLS SCRAPED:  240
ARTICLES IN URL LIST:  656
API CALLS SCRAPED:  260
ARTICLES IN URL LIST:  677
API CALLS SCRAPED:  280
ARTICLES IN URL LIST:  694
API CALLS SCRAPED:  300
ARTICLES IN URL LIST:  716
API CALLS SCRAPED:  320
ARTICLES IN URL LIST:  730
API CALLS SCRAPED:  340
ARTICLES IN URL LIST:  746
API CALLS SCRAPED:  360
ARTICLES IN URL LIST:  759
API CALLS SCRAPED:  380
ARTICLES IN URL LIST:  765
API CALLS SCRAPED:  400
ARTICLES IN 

In [None]:
#TO DELETE - tests


# #TEST ON DIFFERENT CBC LINKS RETURNED BY CBC API

# standard_url = "//www.cbc.ca/news/politics/federal-deficit-higher-than-252-billion-1.5566768"
# radio_url = "//www.cbc.ca/radio/costofliving/slashed-interest-rates-getting-a-piece-of-the-electric-car-pie-and-a-happy-jobs-friday-to-all-1.5486253"
# media_url = "//www.cbc.ca/player/play/1707317315674"
# noauthor_url = "//www.cbc.ca/news/canada/coronavirus-covid19-world-canada-may12-1.5564261"



# extract_json_items(standard_url)

# #note: doesn't work for 'player' URLS ("//www.cbc.ca/player/play/1707317315674")
# #will run, but will return null for most values - player articles have different setup
