In [264]:
### CBC SCRAPING CODE
### Authors: JONATHAN CHAN and PANDRAMISHI NAGA SIRISHA

###MOST RECENT UPDATE:  
##2020 MAY 15, 11:52AM
#wrote separate functions for each JSON element to be collected
#extract_json_items() will run for all articles, and will return null if not in proper format

#TO DO:
#write code for collecting JSON items from all articles returned in CBC API (code cell 4)
#write code to iterate through multiple pages of API call 
#done
#write code to store JSON in a JSON or text document
#Final run: collect 50-100 articles for each of 6 indicators

In [1]:
import urllib.request
import json 
from bs4 import BeautifulSoup
#from datetime import date
import requests
import json
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from datetime import datetime, timedelta
import datetime
import pytz
import dateutil.parser


In [2]:
#https://www.cbc.ca/search_api/v1/search?q=mortgage%20rate&sortOrder=relevance&page=100&fields=feed
def get_initial_url(search_term):
    """returns the URL of the first page API call given a search string"""
    
    words = search_term.split()
    url_prefix = "https://www.cbc.ca/search_api/v1/search?"
    query = "q=" + "%20".join(words)
    url_suffix = "&sortOrder=relevance&page=1&fields=feed"
    first_url = url_prefix + query + url_suffix
    print("FIRST URL API CALL: ", first_url)
    return first_url
    
get_initial_url("interest rate index")

FIRST URL API CALL:  https://www.cbc.ca/search_api/v1/search?q=interest%20rate%20index&sortOrder=relevance&page=1&fields=feed


'https://www.cbc.ca/search_api/v1/search?q=interest%20rate%20index&sortOrder=relevance&page=1&fields=feed'

In [3]:
# yourdate = dateutil.parser.parse('2020-05-13T14:48:05.000Z')
# print(yourdate)
# yourdate > past

In [27]:
def scrape_urls(url):
    """This function takes in the first query url and scrapes all other articles from past 1 year and returns 
    the urls of such articles"""
    count = 0
    url_list = []
    main_url = url
    r = requests.get(url)
    info = r.json()
    last_retrieved_items_count= len(info)
    
    for i in info:
        url_list.append(i['url'])
        count += 1
        
    page_number = 2
    

    while page_number < 100 :
        split_url = main_url.split('page')
        new_url = split_url[0] + "page=" + str(page_number) + "&fields=feed" 
        r = requests.get(new_url)
        info = r.json()
        
        for i in info:       
            url_list.append(i['url'])
            count += 1
                
        page_number += 1   
        print(page_number)
    return url_list

# first_url = get_initial_url("supernova")
# all_urls = scrape_urls(first_url)

In [28]:
def get_author(soup):
    """returns the author of a BeautifulSoup article if it exists, None if cannot be found
    
    Assume author info is contained within span tag (class: authorText)
    """
    author_span = soup.find("span", {"class": "authorText"})
    
    if author_span:
        return author_span.text
    else:
        #print("No author found in article!")
        return None

In [29]:
def get_title(soup):
    """returns the title of a BeautifulSoup article if it exists, None if cannot be found
    
    Assume title info is contained within h1 tag (class: detailHeadline)
    """
    title_tag = soup.find("h1", {"class": "detailHeadline"})
    
    if title_tag:
        title_text = title_tag.text
        return title_text
    else:
        #print("no title found in article!")
        return None

In [30]:
def get_desc(soup):
    """returns the description of a BeautifulSoup article if it exists, None if not
    
    Assume description is contained within h2 tag (class: deck)
    """
    desc_tag = soup.find("h2", {"class": "deck"})
    
    if desc_tag:
        desc_text = desc_tag.text
        return desc_text
    else:
        #print("No description found in article!")
        return None
    

In [31]:
def get_url_to_image(soup):
    """returns the url to the header image of a CBC article (BeautifulSoup) if it exists, None if not
    
    Assume image url is contained within src attribute of img tag 
    """
    main_image_tag = soup.find("figure", {"class": "imageMedia leadmedia-story full"})
    
    if main_image_tag:
        main_image_url = main_image_tag.find("img").attrs["src"]
        return main_image_url
    else:
        #print("No main header image found in article!")
        return None
        

In [32]:
def get_publish_time(soup):
    """returns a tuple of publish time string and datetime string if found in article, None if not
    
    Assume time is contained within time tag (class: timestamp)
    """
    time_tag = soup.find("time", {"class": "timeStamp"})
    if time_tag:
        datetime_str = time_tag.attrs["datetime"]
        
        #NOTE: if we want to return a datetime object, error when writing to JSON
        #datetime_obj = parser.isoparse(datetime_str)
        #SOLUTION: return as string for now, convert to datetime object later in pipeline
        
        #format of time_tag.text: 
        timetext_str = time_tag.text.split("|")[0].replace("Posted: ", "").strip()
        return (timetext_str, datetime_str)
    else:
        #print("No time information found in article!")
        return None


In [33]:
def get_source(soup, specify_source_type=True):
    """Returns the source of the article if it exists
    if specify_source_type, subdivision of CBC will be returned
    if not, "CBC" will be returned as the source
    
    
    Assume that source always starts with "CBC" (Ex: "CBC news", "CBC radio")
    Assume that source comes before span tag (class: bullet)
    """
    
    #source appears before <span class="bullet"> · </span>
    #if author is attached, there are two bullet tags
    #if no author attached, there is one bullet tag
    source = None
    
    if specify_source_type:
        bullet_spans = soup.find_all("span", {"class": "bullet"})
        for bullet_span in bullet_spans:
            previous_str = str(bullet_span.previous_sibling)
            if previous_str.startswith("CBC"):
                source = previous_str
    else:
        
        source = "CBC"
    
    if source:
        return source
    else:
        #print("no source found in article!")
        return None
    

In [34]:

def get_content(soup, as_string=True):
    """Returns the text content from a CBC article (as BeautifulSoup object)
    if as_string is True, return content as one string,
    if as_string is False, return content as list of paragraph strings
    
    Input: BeautifulSoup object, boolean
    
    """
    
    story_tag = soup.find("div", {"class": "story"}) 
    content_list = []
    
    if story_tag:
        for p_tag in story_tag.find_all("p"):
            p_text = p_tag.text + "\n"
            content_list.append(p_text)

        if as_string:
            final_content = "".join(content_list)
        else:
            final_content = content_list #return content as list of paragraph strings

        return final_content
    else:
        #print("no content found in article!")
        return None
    
# get_content(soup)

In [35]:
#NEW - USING NEW FUNCTIONS: 
def extract_json_items(url, specify_source_type=True):
    """Returns a json containing the following items from a CBC article:
        url: the url of the article
        urlToImage: the url of the header image
        title: the title of the article 
        description: subheader of the article
        author: author (note that some articles do not specify author)
        source: CBC if specify_source_type == False, subdivision of CBC if True (ex: "CBC radio")
        publishedAt: tuple of (date_string, datetime object)
        
        input: url returned from CBC API in "url" field (missing "http:" as part of URL)
    """
    json_dict = {}
#     output_list = [] 
    article_url = "http:" + url
    
    #get HTML from article URL into BeautifulSoup
    try:
        html_bytes = urllib.request.urlopen(article_url)
  
    except HTTPError as e:
        print('Error code: ', e.code)
        return None
    except URLError as e:
        print('Reason: ', e.reason)
        return None

    else:    
        mybytes = html_bytes.read()
        html = mybytes.decode("utf8")
        html_bytes.close()
        soup = BeautifulSoup(html, 'html.parser')
        author_name = get_author(soup)
        title_text = get_title(soup)
        desc_text = get_desc(soup)
        image_url = get_url_to_image(soup)
        publish_time = get_publish_time(soup)
        news_source = get_source(soup)
        content = get_content(soup, True)
        
        json_dict["author"] = author_name
        json_dict["title"] = title_text 
        json_dict["description"] = desc_text
        json_dict["url"] = article_url
        json_dict["urlToImage"] = image_url
        json_dict["publishedAt"] = publish_time
        json_dict["source"] = news_source
        json_dict["content"] = content
#         print("JSON KEYS AND VALUES: ")
#         for key, item in json_dict.items():
#             print(key + ": " + str(item))
#         print("-----")
        
        final_json = json.dumps(json_dict)
        return json_dict


In [13]:
extract_json_items('//www.cbc.ca/news/business/powel-trump-negative-rates-1.5567512')

{'author': 'Pete Evans',
 'title': 'U.S. Fed chair rules out negative interest rates even as Trump trumpets them',
 'description': 'U.S. president goes negative on Jerome Powell for rejection of below-zero interest rates',
 'url': 'http://www.cbc.ca/news/business/powel-trump-negative-rates-1.5567512',
 'urlToImage': 'https://i.cbc.ca/1.5258204.1566589338!/fileImage/httpImage/image.jpg_gen/derivatives/16x9_780/869482910.jpg',
 'publishedAt': ('May 13, 2020 10:48 AM ET', '2020-05-13T15:17:41.088Z'),
 'source': 'CBC News',
 'content': 'The head of the Federal Reserve said Wednesday the U.S. central bank is not considering using negative interest rates, despite President Donald Trump seemingly pushing for them.\nIn a tweet Tuesday night, the president said other countries are enjoying the advantages of negative interest rates, and he urged his own central bank to accept the "gift" they would bestow on the U.S. economy.\nTypically central banks tinker with lending rates in an attempt to con

In [36]:
def main(query):
    first_url = get_initial_url(query)
    all_urls = scrape_urls(first_url)
    json_list = []
    
    for each_url in all_urls:
        retrieved_json  = extract_json_items(each_url)
        if retrieved_json is not None:
            print(each_url)
            json_list.append(retrieved_json)
    
    full_query = query.split(" ")
    file_name_prefix = "_".join(full_query)
    print(file_name_prefix)
    
    with open( file_name_prefix + '_' +'CBC_article' + '.json', 'w') as json_file:
        json.dump(json_list, json_file)
    
    return json_list

## Mortgage Rates

In [20]:
# cbc_mr_article = main("mortgage rates")
# print(len(cbc_mr_article))
# print(cbc_mr_article[0])

In [388]:
print(len(cbc_mr_article))

31


## Interest rates

In [16]:
# cbc_hp_article =  main("interest rates")
# print(len(cbc_hp_article))
# print(cbc_hp_article[0])

## Housing price

In [17]:
# cbc_hp_article = main('housing price')
# print(len(cbc_hp_article))
# print(cbc_hp_article[0])

## Employment

In [18]:
# cbc_e_article = main('employment')
# print(len(cbc_e_article))
# print(cbc_e_article[0])

## GDP

In [19]:
# cbc_gdp_article = main('GDP')
# print(len(cbc_gdp_article))
# print(cbc_gdp_article[0])

## Stock Market

In [37]:
cbc_tsx_article = main('TSX')
print(len(cbc_tsx_article))
print(cbc_tsx_article[0])

FIRST URL API CALL:  https://www.cbc.ca/search_api/v1/search?q=TSX&sortOrder=relevance&page=1&fields=feed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
//www.cbc.ca/player/play/1710654531523
//www.cbc.ca/news/business/markets-dollar-oil-friday-1.5529047
//www.cbc.ca/news/business/markets-dollar-oil-thursday-1.5502601
//www.cbc.ca/news/business/tsx-takes-holiday-break-near-2009-high-1.848940
//www.cbc.ca/news/business/markets-dollar-friday-1.5504183
//www.cbc.ca/news/thenational/tsx-falls-by-12-in-biggest-one-day-decline-1.5496250
//www.cbc.ca/news/business/tsx-edges-up-loonie-closes-at-10-year-high-1.397492
//www.cbc.ca/news/business/markets-coronavirus-tuesday-1.5500078
//www.cbc.ca/news/business/markets-monday-1.5498872
//www.cbc.ca/news/business

//www.cbc.ca/news/business/bento-sushi-ipo-1.4137357
//www.cbc.ca/news/business/markets-dollar-oil-monday-1.4126601
//www.cbc.ca/news/business/bay-street-firms-pitch-tsx-rival-stock-market-1.1382786
//www.cbc.ca/news/business/roots-ipo-1.4289154
//www.cbc.ca/news/business/maricann-marijuana-trading-1.4555363
//www.cbc.ca/news/business/dollar-stocks-market-1.4270793
//www.cbc.ca/news/business/tsx-miners-gained-142b-in-2010-1.1088273
//www.cbc.ca/news/business/tsx-loonie-close-higher-before-holiday-break-1.932394
//www.cbc.ca/news/business/dollar-markets-monday-1.4237798
//www.cbc.ca/news/business/canada-goose-ipo-filing-1.3984837
//www.cbc.ca/news/business/gm-closes-at-34-01-on-tsx-1.898152
//www.cbc.ca/news/business/tsx-seesaws-300-points-on-europe-fears-1.1162642
//www.cbc.ca/news/business/roots-ipo-trading-1.4370959
//www.cbc.ca/news/honda-recalls-acura-tsx-cars-in-u-s-1.932006
//www.cbc.ca/news/business/tsx-closes-lower-on-greece-china-worries-1.1260435
//www.cbc.ca/news/business/st

//www.cbc.ca/player/play/2409809241
//www.cbc.ca/radio/asithappens/jacques-hammer-1.5299156
//www.cbc.ca/news/canada/calgary/programs/eyeopener/deborah-yedlin-economic-woes-hit-some-alberta-energy-companies-1.5296754
//www.cbc.ca/news/business/tsx-dollar-markets-1.5305962
//www.cbc.ca/player/play/1704235587782
//www.cbc.ca/news/business/markets-dollar-oil-1.5506470
//www.cbc.ca/player/play/1609494083705
//www.cbc.ca/news/business/markets-dollar-oil-1.5506470
//www.cbc.ca/news/world/equity-markets-swoon-1.428435
//www.cbc.ca/player/play/1647317059958
//www.cbc.ca/news/business/trading-resumption-1.4642248
//www.cbc.ca/news/technology/blackberry-enstream-announce-mobile-payment-deal-1.2673315
//www.cbc.ca/news/canada/saskatchewan/oil-price-sask-government-1.5491175
//www.cbc.ca/player/play/1608627267793
//www.cbc.ca/news/business/stock-options-taxes-budget-1.5063363
//www.cbc.ca/news/business/stock-markets-tuesday-1.5475633
//www.cbc.ca/radio/costofliving/bourse-anarchy-what-fear-of-the-

//www.cbc.ca/news/canada/toronto/hydro-one-ipo-tsx-1.3304644
//www.cbc.ca/news/technology/rbc-voice-biometrics-1.3256961
//www.cbc.ca/news/business/osc-to-accept-no-contest-settlement-agreements-1.2568527
//www.cbc.ca/news/business/dollar-loonie-1.3531753
//www.cbc.ca/news/business/tmx-group-tsx-cse-marijuana-stocks-1.4252687
//www.cbc.ca/news/business/david-hodges-1.4469834
//www.cbc.ca/news/business/osc-to-accept-no-contest-settlement-agreements-1.2568527
//www.cbc.ca/news/business/auxly-cannabis-deal-with-tobacco-giant-imperial-brands-1.5224955
//www.cbc.ca/news/business/stock-market-economy-different-1.3713961
//www.cbc.ca/news/business/large-one-day-swings-1.900795
//www.cbc.ca/news/business/stock-market-volatility-1.3412295
//www.cbc.ca/news/business/bell-canada-shomi-crave-tv-1.3781448
//www.cbc.ca/news/business/financial-markets-oct-11-1.4859417
//www.cbc.ca/news/business/sleep-country-canada-planning-to-go-public-again-in-ipo-1.3114006
//www.cbc.ca/news/business/investors-rush

//www.cbc.ca/news/business/trump-dakota-access-1.3881845
//www.cbc.ca/news/business/oil-output-iea-opec-1.3893920
//www.cbc.ca/news/business/trump-dakota-access-1.3881845
//www.cbc.ca/news/business/prairiesky-jumps-30-after-ipo-1.2658135
//www.cbc.ca/news/business/td-united-states-tax-cut-1.4477730
//www.cbc.ca/news/business/republican-keystone-xl-trump-1.3847672
//www.cbc.ca/news/business/the-roundup-1.4478382
//www.cbc.ca/news/business/cyber-risks-1.4475656
//www.cbc.ca/news/business/the-roundup-1.4473832
//www.cbc.ca/news/business/aphria-hostile-takeover-rejected-green-growth-1.5007539
//www.cbc.ca/news/business/tesla-ride-sharing-1.3815571
//www.cbc.ca/news/business/tesla-ride-sharing-1.3815571
//www.cbc.ca/news/business/economic-update-morneau-1.3814271
//www.cbc.ca/news/business/aphria-hostile-takeover-rejected-green-growth-1.5007539
//www.cbc.ca/news/business/arc-industries-government-1.3801702
//www.cbc.ca/news/business/clearly-contacts-bought-by-essilor-for-445m-1.2553349
//ww

//www.cbc.ca/news/canada/manitoba/canola-growers-complain-to-transport-agency-about-dismal-rail-service-1.2654714
//www.cbc.ca/news/business/magindustries-probed-by-rcmp-over-bribery-allegations-in-congo-1.3091035
//www.cbc.ca/news/business/loblaw-hikes-dividend-as-profit-up-21-in-first-quarter-1.3063024
//www.cbc.ca/news/business/going-but-not-forgotten-nortel-s-many-legacies-1.825421
//www.cbc.ca/news/business/wajax-plans-conversion-to-income-trust-1.551332
//www.cbc.ca/news/business/biovail-suing-u-s-drug-company-over-generic-wellbutrin-xl-1.543306
//www.cbc.ca/news/business/labatt-to-buy-lakeport-beer-maker-in-201m-deal-1.637631
//www.cbc.ca/news/business/pbb-global-reaches-takeover-deal-with-livingston-1.535925
//www.cbc.ca/news/business/going-but-not-forgotten-nortel-s-many-legacies-1.825421
//www.cbc.ca/news/business/magindustries-probed-by-rcmp-over-bribery-allegations-in-congo-1.3091035
//www.cbc.ca/news/business/forbes-billionaires-10-people-who-made-this-year-s-list-1.297874

//www.cbc.ca/news/business/medisys-turns-down-cml-takeover-1.673446
//www.cbc.ca/news/canada/montreal/snc-lavalin-fraud-case-with-links-to-libya-put-off-until-february-1.3275009
//www.cbc.ca/news/business/thomson-selling-higher-learning-division-for-7-75b-us-1.655998
//www.cbc.ca/news/canada/saskatoon/cannimed-therapeutics-lawsuit-aurora-1.4487505
//www.cbc.ca/news/canada/toronto/masonic-temple-in-toronto-sold-to-it-firm-for-12-5m-1.1337979
//www.cbc.ca/news/business/gold-price-on-rebound-after-fed-calms-rate-fears-1.1316452
//www.cbc.ca/news/canada/newfoundland-labrador/husky-energy-reports-major-gas-leak-c-nlopb-says-1.3212633
//www.cbc.ca/news/business/nucor-makes-1-25b-bid-for-harris-steel-1.665431
//www.cbc.ca/news/business/caisse-invests-in-canadian-and-u-s-wind-farms-1.1402981
//www.cbc.ca/news/business/tmx-takeover-fight-turns-hostile-1.1000404
//www.cbc.ca/news/business/dow-closes-above-12-000-for-first-time-1.616911
//www.cbc.ca/news/business/most-canadian-equity-mutual-funds

//www.cbc.ca/news/business/bg-group-offers-228-million-for-aventura-energy-1.490881
//www.cbc.ca/news/canada/newfoundland-labrador/concerns-about-backup-bell-outage-1.4239064
//www.cbc.ca/news/business/maax-sold-to-three-investment-groups-for-640-million-1.476341
//www.cbc.ca/news/business/nexfor-to-spin-off-paper-and-timber-business-1.494446
//www.cbc.ca/news/business/descartes-systems-cfo-quits-shares-fall-1.505680
//www.cbc.ca/news/business/cannabis-consumer-demand-1.4219753
//www.cbc.ca/news/business/nu-gro-agrees-to-192-million-buyout-1.491074
//www.cbc.ca/news/business/great-west-bids-7-3-billion-for-canada-life-in-friendly-takeover-1.381721
//www.cbc.ca/news/business/abitibi-slashes-capacity-jobs-1.394754
//www.cbc.ca/news/business/talisman-energy-sale-of-sudan-oil-interest-completed-1.357604
//www.cbc.ca/news/business/manulife-won-t-make-richer-bid-for-canada-life-1.412070
//www.cbc.ca/news/canada/calgary/oilpatch-oil-gas-markets-alberta-mergers-acquisitions-1.5407215
//www.cbc

//www.cbc.ca/news/business/ace-aviation-holdings-turns-q2-profit-announces-plans-for-jazz-spinoff-1.532595
//www.cbc.ca/news/business/zimbabwe-unveils-50-million-bank-note-to-cope-with-rampant-inflation-1.767655
//www.cbc.ca/news/business/talisman-energy-q2-profit-jumps-76-1.540137
//www.cbc.ca/news/canada/north/adanac-molybdenum-secures-80m-in-interim-financing-1.761649
//www.cbc.ca/news/business/owners-angling-to-sell-gsw-1.548557
//www.cbc.ca/news/business/inex-pharmaceuticals-slashes-more-staff-as-ceo-departs-1.525609
//www.cbc.ca/news/business/inex-pharmaceuticals-slashes-more-staff-as-ceo-departs-1.525609
//www.cbc.ca/news/business/cineplex-galaxy-buying-famous-players-for-500m-1.543547
//www.cbc.ca/news/business/cerberus-ponders-takeover-bid-for-royal-group-technologies-1.550706
//www.cbc.ca/news/business/oilsands-plant-fire-hits-suncor-bottom-line-1.522720
//www.cbc.ca/news/business/easyhome-offers-to-settle-class-action-suit-for-7-38m-1.526323
//www.cbc.ca/news/business/cae-cu

In [151]:
#TEST ON DIFFERENT CBC LINKS RETURNED BY CBC API

standard_url = "//www.cbc.ca/news/politics/federal-deficit-higher-than-252-billion-1.5566768"
radio_url = "//www.cbc.ca/radio/costofliving/slashed-interest-rates-getting-a-piece-of-the-electric-car-pie-and-a-happy-jobs-friday-to-all-1.5486253"
media_url = "//www.cbc.ca/player/play/1707317315674"
noauthor_url = "//www.cbc.ca/news/canada/coronavirus-covid19-world-canada-may12-1.5564261"
extract_json_items(standard_url)

#note: doesn't work for 'player' URLS ("//www.cbc.ca/player/play/1707317315674")
#will run, but will return null for most values - player articles have different setup


'{"author": "John Paul Tasker", "title": "Federal deficit likely to be higher than $252 billion, parliamentary budget\\u00a0officer says", "description": "PBO says it\'s possible federal debt will hit $1 trillion because of pandemic relief spending", "url": "http://www.cbc.ca/news/politics/federal-deficit-higher-than-252-billion-1.5566768", "urlToImage": "https://i.cbc.ca/1.5138746.1558027091!/fileImage/httpImage/image.jpg_gen/derivatives/16x9_780/phoenix-pay-system.jpg", "publishedAt": ["May 12, 2020 5:34 PM ET", "2020-05-13T17:44:54.995Z"], "source": "CBC News"}'