### Using the Ajax API to scrape BNN Bloomberg articles

In [72]:
import requests
from bs4 import BeautifulSoup
import time
import os
import json
from datetime import datetime, timedelta
import re

In [92]:
def clean_bloomberg_date(date):
    """Normalise an article date to ``'<abbreviated month> <day>, <year>'``.

    The date is either an absolute date string, which is returned
    unchanged, or a relative stamp containing "ago" (e.g. "5 hours ago",
    "30 minutes ago", "an hour ago"), which is converted to the calendar
    date on which the article was posted, relative to the local clock.

    Parameters
    ----------
    date : str or None
        Raw date text scraped from the article page.

    Returns
    -------
    str or None
        ``None`` when ``date`` is ``None``; the input unchanged when it is
        not a relative stamp or when no amount can be read from it;
        otherwise the computed date formatted with ``'%b %d, %Y'``.
    """
    if date is None:
        return None
    if "ago" not in date:
        # Already an absolute date; pass it through untouched.
        return date

    text = date.lower()

    # Amount of elapsed time: an explicit number, or "a"/"an" meaning one.
    amount = re.search(r"\d+", text)
    if amount:
        count = int(amount.group())
    elif re.search(r"\ban?\b", text):
        count = 1
    else:
        # Nothing to compute from (e.g. "moments ago"); keep the raw text
        # instead of raising on a missing match.
        return date

    # Map the unit word onto the matching timedelta keyword. A stamp whose
    # unit is not recognised is treated as hours, as it always has been.
    unit_keywords = {
        "sec": "seconds",
        "min": "minutes",
        "hour": "hours",
        "hr": "hours",
        "day": "days",
        "week": "weeks",
    }
    unit = re.search(r"sec|min|hour|hr|day|week", text)
    keyword = unit_keywords[unit.group()] if unit else "hours"

    created = datetime.now() - timedelta(**{keyword: count})
    return created.strftime('%b %d, %Y')

In [137]:
def _first_text(soup, tag_name, css_class):
    """Return the stripped text of the first matching tag, or None if absent."""
    tag = soup.find(tag_name, class_=css_class)
    return None if tag is None else tag.get_text().strip()


def _extract_body(article_soup):
    """Return ``(content, description)`` for an article page.

    The body is looked up in ``div.article-text`` first and then in
    ``div.article-text-chart``. ``content`` is the text of the container's
    direct ``<p>`` children, each prefixed with a space; ``description`` is
    the first line of the container's stripped text. Both are ``None`` when
    neither container exists.
    """
    for css_class in ('article-text', 'article-text-chart'):
        container = article_soup.find('div', class_=css_class)
        if container is None:
            continue
        # Only direct <p> children count as article paragraphs.
        paragraphs = container.find_all('p', recursive=False)
        content = ''.join(' ' + paragraph.text for paragraph in paragraphs)
        description = container.text.strip().split("\n")[0]
        return content, description
    return None, None


def _extract_image_url(article_soup, url_prefix):
    """Return the prefixed image URL from ``p.image-center``, or None."""
    holder = article_soup.find('p', class_='image-center')
    if holder is None or holder.img is None:
        return None
    src = holder.img.get('src')
    return url_prefix + src if src else None


def bnn_article_scraper(query, timeout=30):
    '''
    Scrape news articles from the BNN Bloomberg website for a search query.

    Every article listed on the search results page is fetched and parsed.
    The collected records are written to
    "<query words joined by _>_<number of articles>_Bloomberg_article.json"
    in the current working directory and are also returned.

    input:
    query: (str) search keyword(s), separated by whitespace
    timeout: (int or float) seconds to wait on each HTTP request before
        giving up, so a stalled server cannot hang the scrape

    output:
    list of dicts with the keys 'source', 'author', 'title', 'description',
    'url', 'urlToImage', 'publishedAt' and 'content'; a field is None when
    the corresponding element is missing from the article page.

    Errors raised by requests (including timeouts) are not caught here.
    '''
    output_list = []
    search_query = 'q=' + '&q='.join(query.split())
    url_prefix = 'https://www.bnnbloomberg.ca'
    url = f'https://www.bnnbloomberg.ca/search/bnn-search-tab-view-7.360399/articles-7.360400?ot=example.AjaxPageLayout.ot&{search_query}&parentPaginationAllowed=false'

    # One session for the search page and every article page, so the
    # connection to the host can be reused across requests.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        api_soup = BeautifulSoup(response.text, 'lxml')

        for article in api_soup.find_all('div', class_='article-content'):
            link = article.a
            if link is None or not link.get('href'):
                # A result entry without a usable link cannot be followed.
                continue

            title = link.text.strip()
            article_url = url_prefix + link.get('href').strip()

            article_response = session.get(article_url, timeout=timeout)
            article_soup = BeautifulSoup(article_response.text, 'lxml')

            date = _first_text(article_soup, 'div', 'date')
            article_text, desc = _extract_body(article_soup)

            output_list.append({
                'source': _first_text(article_soup, 'span', 'source'),
                'author': _first_text(article_soup, 'span', 'author'),
                'title': title,
                'description': desc,
                'url': article_url,
                'urlToImage': _extract_image_url(article_soup, url_prefix),
                'publishedAt': clean_bloomberg_date(date),
                'content': article_text,
            })

    file_name = f"{'_'.join(query.split())}_{len(output_list)}_Bloomberg_article.json"
    with open(file_name, 'w') as json_file:
        json.dump(output_list, json_file)

    return output_list


In [138]:
# article_response = requests.get('https://www.bnnbloomberg.ca/multi-unit-housing-starts-up-in-parts-of-canada-despite-covid-19-1.1433503')
# article_soup = BeautifulSoup(article_response.text, 'lxml')

# article_text_tag = article_soup.find('div', {'class':'article-text'})
# article_text = ''
# if article_text_tag:
    
#     for children in article_text_tag:
#         #print(children)
#         if children.name == 'p':
#             article_text += ' '+ children.text
#     #article_text = article_text_tag.text
#     # desc = article_text_tag.p.get_text()
#     desc = article_text_tag.text.strip().split("\n")[0]
# else:
#     article_text_tag = article_soup.find('div', {'class':'article-text-chart'})
#     if article_text_tag:
#         for children in article_text_tag:
#         #print(children)
#             if children.name == 'p':
#                 article_text += ' '+ children.text

#             #desc = article_text_tag.p.get_text()
#             desc = article_text_tag.text.strip().split("\n")[0] # Amy spotted this bug
#     else:
#         article_text = None
#         desc = None
# print('content:', article_text)

### Mortgage Rates

In [139]:
# Scrape the articles returned for 'mortgage rates'; the call also writes them to a JSON file.
bloomberg_mr_article = bnn_article_scraper('mortgage rates')

In [140]:
# Inspect the first scraped article record.
bloomberg_mr_article[0]

{'source': 'The Canadian Press',
 'author': None,
 'title': 'Multi-unit housing starts up in parts of Canada despite COVID-19',
 'description': 'OTTAWA -- Canada Mortgage and Housing Corp. says construction of multi-unit housing projects remained strong in some provinces last month despite the fight against the COVID-19 pandemic.',
 'url': 'https://www.bnnbloomberg.ca/multi-unit-housing-starts-up-in-parts-of-canada-despite-covid-19-1.1433503',
 'urlToImage': None,
 'publishedAt': 'May 8, 2020',
 'content': " OTTAWA -- Canada Mortgage and Housing Corp. says construction of multi-unit housing projects remained strong in some provinces last month despite the fight against the COVID-19 pandemic. CMHC estimates a 10.8 per cent month-over-month increase in its national seasonally adjusted annual rate last month compared with March, excluding Quebec. The federal agency says there was growth in multi-family starts in Ontario, Saskatchewan and Manitoba in April but Quebec was left out of the mo

### Interest Rates

In [141]:
# Scrape the articles returned for 'interest rates'; the call also writes them to a JSON file.
bloomberg_ir_article = bnn_article_scraper('interest rates')

In [142]:
# Number of articles scraped for this query.
len(bloomberg_ir_article)

100

### Housing price

In [143]:
# Scrape the articles returned for 'housing price'; the call also writes them to a JSON file.
bloomberg_hp_article = bnn_article_scraper('housing price')

In [144]:
# Number of articles scraped for this query.
len(bloomberg_hp_article)

100

### Employment

In [145]:
# Scrape the articles returned for 'employment'; the call also writes them to a JSON file.
bloomberg_e_article = bnn_article_scraper('employment')

In [146]:
# Number of articles scraped for this query.
len(bloomberg_e_article)

95

### GDP

In [147]:
# Scrape the articles returned for 'GDP'; the call also writes them to a JSON file.
bloomberg_gdp_article = bnn_article_scraper('GDP')

In [148]:
# Number of articles scraped for this query.
len(bloomberg_gdp_article)

100

### Stock Market

In [149]:
# Scrape the articles returned for 'stock market'; the call also writes them to a JSON file.
bloomberg_tsx_article = bnn_article_scraper('stock market')

In [150]:
# Number of articles scraped for this query.
len(bloomberg_tsx_article)

100

In [73]:
# response = requests.get(url)
# api_soup = BeautifulSoup(response.text, 'lxml')

In [28]:
# output_list = []
# query = 'mortgage rates'
# search_query = 'q=' + '&q='.join(query.split())
# url_prefix = 'https://www.bnnbloomberg.ca'
# url = f'https://www.bnnbloomberg.ca/search/bnn-search-tab-view-7.360399/articles-7.360400?ot=example.AjaxPageLayout.ot&{search_query}&parentPaginationAllowed=false'

# response = requests.get(url)
# api_soup = BeautifulSoup(response.text, 'lxml')

# for article in api_soup.find_all('div', {'class': 'article-content'}):
#     title = article.a.text.strip()
#     print("article_title:", title)

#     article_url = url_prefix + article.a.get('href').strip()
#     print("article_url:", article_url)



#     article_response = requests.get(article_url)
#     article_soup = BeautifulSoup(article_response.text, 'lxml')
#     # get date
#     date_tag = article_soup.find('div', class_ = "date")
#     if date_tag:
#         date = date_tag.get_text().strip()
#     else:
#         date = None
#     print('date:', date)

#     # get author
#     author_tag = article_soup.find('span', class_ = "author")
#     if author_tag:
#         author = author_tag.get_text().strip()
#     else:
#         author = None 
#     print('author:', author)

#     # get source
#     source_tag = article_soup.find('span', {'class':'source'})
#     if source_tag:
#         source = source_tag.get_text().strip()
#     else:
#         source = None
#     print('source:', source)

#     # get content
#     article_text_tag = article_soup.find('div', {'class':'article-text'})
#     if article_text_tag:
#         article_text = article_text_tag.get_text(' ')
#         desc = article_text_tag.p.get_text()
#     else:
#         article_text_tag = article_soup.find('div', {'class':'article-text-chart'})
#         if article_text_tag:
#             article_text = article_text_tag.get_text(' ')
#             desc = article_text_tag.p.get_text()
#         else:
#             article_text = None
#             desc = None
#     print('content:', article_text)
#     print('desc:', desc)

#     # get image url
#     article_image_tag = article_soup.find('p', {'class':'image-center'})
#     if article_image_tag:
#         image_url = url_prefix + article_image_tag.img['src']
#     else:
#         image_url = None
#     print('image_url:', image_url)
#     print('\n')

#     article_dict = {}

#     article_dict['source'] = source
#     article_dict['author'] = author
#     article_dict['title'] = title
#     article_dict['description'] = None
#     article_dict['url'] = article_url
#     article_dict['urlToImage'] = image_url
#     article_dict['PublishedAt'] = datetime.strptime(date, '%b %d, %Y').date()
#     article_dict['content'] = article_text

#     output_list.append(article_dict)
    
#     break

# # with open('_'.join(query.split()) + '_' + str(len(output_list)) + '_' +'Bloomberg_article' + '.json', 'w') as json_file:
# #     json.dump(output_list, json_file)



In [77]:
# first = api_soup.find('div', {'class': 'article-content'})
# title = first.a.text.strip()
# print('title:', title)
# url = url_prefix + first.a.get('href')
# #print('url:', url)

# article_response = requests.get(url)
# article_soup = BeautifulSoup(article_response.text, 'lxml')
# date = article_soup.find('div', class_ = "date").get_text().strip()
# print('date:', date)

# author = article_soup.find('span', class_ = "author").get_text().strip()
# print('author:', author)

# source = article_soup.find('span', {'class':'source'}).get_text().strip()
# print('source:', source)

# article_text = article_soup.find('div', {'class':'article-text'}).get_text(' ')
# print('content:', article_text)

# article_image = article_soup.find('div', {'class':'article-image'})
# # image_url = url_prefix + article_image['src']
# print(article_image)

