### Using the Ajax API to scrape BNN Bloomberg articles

In [29]:
import requests
from bs4 import BeautifulSoup
import time
import os
import json
from datetime import datetime

In [30]:
_BNN_BASE_URL = 'https://www.bnnbloomberg.ca'
_BNN_SEARCH_URL = _BNN_BASE_URL + '/search/bnn-search-tab-view-7.360399/articles-7.360400'


def _absolute_url(href):
    '''Resolve a (possibly relative) link against the BNN Bloomberg base URL.'''
    # Local import keeps this cell self-contained; urljoin leaves links that
    # are already absolute untouched instead of gluing the prefix in front.
    from urllib.parse import urljoin

    return urljoin(_BNN_BASE_URL, href.strip())


def _tag_text(soup, name, css_class):
    '''Return the stripped text of the first matching tag, or None if absent.'''
    tag = soup.find(name, class_=css_class)
    return tag.get_text().strip() if tag is not None else None


def _article_body(soup):
    '''Return (content, description) from the article text container.

    The description is the text of the first <p> inside the container, or
    None when the container holds no paragraph. Both values are None when
    neither known container class is present on the page.
    '''
    for css_class in ('article-text', 'article-text-chart'):
        container = soup.find('div', class_=css_class)
        if container is None:
            continue
        first_paragraph = container.p
        desc = first_paragraph.get_text() if first_paragraph is not None else None
        return container.get_text(' '), desc
    return None, None


def _image_url(soup):
    '''Return the absolute URL of the centred article image, or None.'''
    holder = soup.find('p', class_='image-center')
    img = holder.img if holder is not None else None
    src = img.get('src') if img is not None else None
    return _absolute_url(src) if src else None


def bnn_article_scraper(query, timeout=30):
    '''
    Scrape the article news from BNN Bloomberg website with a search query.

    Each whitespace-separated word of the query is sent as its own `q`
    parameter to the site's Ajax search endpoint. Every article listed in
    the search result is then fetched and parsed. The collected articles
    are written to `<query words joined by "_">_<count>_Bloomberg_article.json`
    in the current working directory and also returned.

    input:
    query: (str) search keyword(s)
    timeout: (float) seconds to wait for each HTTP request (default 30)

    output:
    list of dicts with the keys source, author, title, description, url,
    urlToImage, publishedAt and content; a field is None when the matching
    element is not found on the article page.

    raises:
    requests.HTTPError if the search request returns an error status;
    other requests exceptions (timeouts, connection errors) propagate.
    '''
    # An empty query still sends a bare "q=" parameter.
    words = query.split() or ['']
    # Passing the values as params lets requests percent-encode them.
    params = [('ot', 'example.AjaxPageLayout.ot')]
    params += [('q', word) for word in words]
    params.append(('parentPaginationAllowed', 'false'))

    output_list = []
    # One session reuses the connection across the search request and the
    # per-article requests that follow.
    with requests.Session() as session:
        response = session.get(_BNN_SEARCH_URL, params=params, timeout=timeout)
        # Fail loudly rather than writing an empty result file for an error page.
        response.raise_for_status()
        api_soup = BeautifulSoup(response.text, 'lxml')

        for article in api_soup.find_all('div', class_='article-content'):
            link = article.a
            href = link.get('href') if link is not None else None
            if not href:
                # A search entry without a link cannot be followed; skip it.
                continue

            article_url = _absolute_url(href)
            article_response = session.get(article_url, timeout=timeout)
            article_soup = BeautifulSoup(article_response.text, 'lxml')

            article_text, desc = _article_body(article_soup)

            output_list.append({
                'source': _tag_text(article_soup, 'span', 'source'),
                'author': _tag_text(article_soup, 'span', 'author'),
                'title': link.text.strip(),
                'description': desc,
                'url': article_url,
                'urlToImage': _image_url(article_soup),
                'publishedAt': _tag_text(article_soup, 'div', 'date'),
                'content': article_text,
            })

    file_name = f"{'_'.join(query.split())}_{len(output_list)}_Bloomberg_article.json"
    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(output_list, json_file)

    return output_list


### Mortgage Rates

In [31]:
# Scrape BNN Bloomberg articles matching "mortgage rates". Besides returning
# the list of article dicts, the call writes them to
# mortgage_rates_<count>_Bloomberg_article.json in the working directory.
bloomberg_mr_article = bnn_article_scraper('mortgage rates')

In [32]:
len(bloomberg_mr_article)

100

In [33]:
bloomberg_mr_article[-1]

{'source': 'Reuters',
 'author': 'Matt Scuffham',
 'title': "Canada's financial watchdog warns lenders against bundled mortgage loans",
 'description': "Canada's financial watchdog has warned regulated mortgage providers against teaming up with unregulated rivals to sidestep rules designed to clamp down on risky lending, a top regulator told Reuters.",
 'url': 'https://www.bnnbloomberg.ca/canada-s-financial-watchdog-warns-lenders-against-bundled-mortgage-loans-1.679621',
 'urlToImage': None,
 'publishedAt': 'Feb 23, 2017',
 'content': 'Canada\'s financial watchdog has warned regulated mortgage providers against teaming up with unregulated rivals to sidestep rules designed to clamp down on risky lending, a top regulator told Reuters. \n Carolyn Rogers, assistant superintendent at the Office of the Superintendent of Financial Institutions, said in an interview that the regulator was taking action to stamp out so-called "bundled" loans, which pair a primary mortgage with a second loan fro

### Interest Rates

In [96]:
# Scrape BNN Bloomberg articles matching "interest rates". Besides returning
# the list of article dicts, the call writes them to
# interest_rates_<count>_Bloomberg_article.json in the working directory.
bloomberg_ir_article = bnn_article_scraper('interest rates')

In [97]:
len(bloomberg_ir_article)

100

### Housing Price

In [98]:
# Scrape BNN Bloomberg articles matching "housing price". Besides returning
# the list of article dicts, the call writes them to
# housing_price_<count>_Bloomberg_article.json in the working directory.
bloomberg_hp_article = bnn_article_scraper('housing price')

In [99]:
len(bloomberg_hp_article)

100

### Employment

In [100]:
# Scrape BNN Bloomberg articles matching "employment". Besides returning
# the list of article dicts, the call writes them to
# employment_<count>_Bloomberg_article.json in the working directory.
bloomberg_e_article = bnn_article_scraper('employment')

In [101]:
len(bloomberg_e_article)

94

### GDP

In [102]:
# Scrape BNN Bloomberg articles matching "GDP". Besides returning
# the list of article dicts, the call writes them to
# GDP_<count>_Bloomberg_article.json in the working directory.
bloomberg_gdp_article = bnn_article_scraper('GDP')

In [103]:
len(bloomberg_gdp_article)

100

### Stock Market

In [104]:
# Scrape BNN Bloomberg articles matching "stock market". Besides returning
# the list of article dicts, the call writes them to
# stock_market_<count>_Bloomberg_article.json in the working directory.
bloomberg_tsx_article = bnn_article_scraper('stock market')

In [105]:
len(bloomberg_tsx_article)

100

In [73]:
# response = requests.get(url)
# api_soup = BeautifulSoup(response.text, 'lxml')

In [28]:
# output_list = []
# query = 'mortgage rates'
# search_query = 'q=' + '&q='.join(query.split())
# url_prefix = 'https://www.bnnbloomberg.ca'
# url = f'https://www.bnnbloomberg.ca/search/bnn-search-tab-view-7.360399/articles-7.360400?ot=example.AjaxPageLayout.ot&{search_query}&parentPaginationAllowed=false'

# response = requests.get(url)
# api_soup = BeautifulSoup(response.text, 'lxml')

# for article in api_soup.find_all('div', {'class': 'article-content'}):
#     title = article.a.text.strip()
#     print("article_title:", title)

#     article_url = url_prefix + article.a.get('href').strip()
#     print("article_url:", article_url)



#     article_response = requests.get(article_url)
#     article_soup = BeautifulSoup(article_response.text, 'lxml')
#     # get date
#     date_tag = article_soup.find('div', class_ = "date")
#     if date_tag:
#         date = date_tag.get_text().strip()
#     else:
#         date = None
#     print('date:', date)

#     # get author
#     author_tag = article_soup.find('span', class_ = "author")
#     if author_tag:
#         author = author_tag.get_text().strip()
#     else:
#         author = None 
#     print('author:', author)

#     # get source
#     source_tag = article_soup.find('span', {'class':'source'})
#     if source_tag:
#         source = source_tag.get_text().strip()
#     else:
#         source = None
#     print('source:', source)

#     # get content
#     article_text_tag = article_soup.find('div', {'class':'article-text'})
#     if article_text_tag:
#         article_text = article_text_tag.get_text(' ')
#         desc = article_text_tag.p.get_text()
#     else:
#         article_text_tag = article_soup.find('div', {'class':'article-text-chart'})
#         if article_text_tag:
#             article_text = article_text_tag.get_text(' ')
#             desc = article_text_tag.p.get_text()
#         else:
#             article_text = None
#             desc = None
#     print('content:', article_text)
#     print('desc:', desc)

#     # get image url
#     article_image_tag = article_soup.find('p', {'class':'image-center'})
#     if article_image_tag:
#         image_url = url_prefix + article_image_tag.img['src']
#     else:
#         image_url = None
#     print('image_url:', image_url)
#     print('\n')

#     article_dict = {}

#     article_dict['source'] = source
#     article_dict['author'] = author
#     article_dict['title'] = title
#     article_dict['description'] = None
#     article_dict['url'] = article_url
#     article_dict['urlToImage'] = image_url
#     article_dict['PublishedAt'] = datetime.strptime(date, '%b %d, %Y').date()
#     article_dict['content'] = article_text

#     output_list.append(article_dict)
    
#     break

# # with open('_'.join(query.split()) + '_' + str(len(output_list)) + '_' +'Bloomberg_article' + '.json', 'w') as json_file:
# #     json.dump(output_list, json_file)



In [20]:
# Probe: parse an abbreviated-month date string ('May 8, 2020') with the
# '%b %d, %Y' format and keep only the date part (no time of day).
date = 'May 8, 2020'
date_object = datetime.strptime(date, '%b %d, %Y').date()

In [21]:
print(date_object)

2020-05-08


In [77]:
# first = api_soup.find('div', {'class': 'article-content'})
# title = first.a.text.strip()
# print('title:', title)
# url = url_prefix + first.a.get('href')
# #print('url:', url)

# article_response = requests.get(url)
# article_soup = BeautifulSoup(article_response.text, 'lxml')
# date = article_soup.find('div', class_ = "date").get_text().strip()
# print('date:', date)

# author = article_soup.find('span', class_ = "author").get_text().strip()
# print('author:', author)

# source = article_soup.find('span', {'class':'source'}).get_text().strip()
# print('source:', source)

# article_text = article_soup.find('div', {'class':'article-text'}).get_text(' ')
# print('content:', article_text)

# article_image = article_soup.find('div', {'class':'article-image'})
# # image_url = url_prefix + article_image['src']
# print(article_image)

