In [1]:
import pandas as pd
import urllib.request
import json 
from bs4 import BeautifulSoup
import requests
import json
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from datetime import datetime, timedelta, date
import pytz
import dateutil.parser
from collections import defaultdict
import random
import json
import csv

In [2]:
def sample_dataframe_by_month(dataframe, sample_size, random_state=None):
    """
    Create a sample of dataframe based on publish date.

    Rows are grouped by the year and month of their 'publishedAt' value, every
    group is shuffled, and the first `sample_size` rows of each group are kept.

    input:

    dataframe is the DataFrame of articles; it must contain a 'publishedAt' column
    sample_size is the maximum number of articles to be extracted from each month
    random_state is an optional seed; when given, a private random generator is
        used so the sample is reproducible and the shared `random` module state
        is left untouched. When None, the shared `random` module state is used.

    output:

    a DataFrame holding the sampled rows (original index labels are kept),
    sorted by 'publishedAt' in descending order. An input without rows yields
    an empty copy of the input.
    """
    # With no rows there is nothing to group; building a frame from an empty
    # list would lose the columns and make the final sort raise a KeyError.
    if dataframe.empty:
        return dataframe.copy()

    shuffler = random if random_state is None else random.Random(random_state)

    # Collect row positions (not whole rows) for every (year, month) bucket so
    # the sampled frame can be taken with .iloc and keeps the original dtypes.
    positions_by_month = defaultdict(list)
    for position, published in enumerate(dataframe['publishedAt']):
        article_date = pd.Timestamp(published)
        positions_by_month[(article_date.year, article_date.month)].append(position)

    sampled_positions = []
    for positions in positions_by_month.values():
        shuffler.shuffle(positions)
        sampled_positions.extend(positions[:sample_size])

    sample_df = dataframe.iloc[sampled_positions]
    return sample_df.sort_values(by='publishedAt', ascending=False)

In [39]:
def combine_fp_bloomberg_then_sample(bloomberg_json, fp_json, keyword, article_num_per_month):
    '''
    Combine financial post articles and bloomberg articles, and create a sample by publish date.

    input:

    bloomberg_json is the file of articles from bloomberg stored in json format
    fp_json is the file of articles from financial post stored in json format
    keyword is the searching query used to collect these articles; it is also
        used as the prefix of the output file name
    article_num_per_month is the number of articles to sample from each month

    output:

    nothing is returned; the sample is written to keyword + '_sample.csv'
    '''
    bloomberg_df = pd.read_json(bloomberg_json)
    # Drop rows whose source is missing, 'The Canadian Press' or 'Reuters'.
    source = bloomberg_df['source']
    keep = source.notnull() & (source != 'The Canadian Press') & (source != 'Reuters')
    # .copy() detaches the filtered rows so the assignment below does not write
    # into a slice of the frame that was read (SettingWithCopyWarning).
    bloomberg_df = bloomberg_df[keep].copy()
    bloomberg_df['publishedAt'] = bloomberg_df['publishedAt'].apply(
        lambda text: datetime.strptime(text, '%b %d, %Y').strftime('%Y-%m-%d'))

    fp_df = pd.read_json(fp_json)
    # Only the first three whitespace-separated tokens of the financial post
    # date are parsed; anything after them is discarded.
    fp_df['publishedAt'] = fp_df['publishedAt'].apply(
        lambda text: datetime.strptime(' '.join(text.split()[:3]), '%B %d, %Y').strftime('%Y-%m-%d'))

    concat_df = pd.concat([bloomberg_df, fp_df]).reset_index(drop=True)

    # Empty columns that are written to the csv next to title/description.
    for annotation_column in ('title_desc_sent_1', 'sent_1_note', 'title_desc_sent_2', 'sent_2_note'):
        concat_df[annotation_column] = None
    concat_df = concat_df[['source', 'author', 'title', 'description', 'title_desc_sent_1', 'sent_1_note', 'title_desc_sent_2', 'sent_2_note', 'publishedAt', 'url', 'urlToImage', 'content']]

    sample_df = sample_dataframe_by_month(concat_df, article_num_per_month)
    sample_df.to_csv(keyword + '_sample.csv')

In [4]:
def combine_fp_bloomberg(bloomberg_json, fp_json):
    '''
    Combine financial post articles and bloomberg articles.

    input:

    bloomberg_json is the file of articles from bloomberg stored in json format
    fp_json is the file of articles from financial post stored in json format

    output:

    a DataFrame with the columns 'source', 'title', 'description' and
    'publishedAt' (formatted as YYYY-MM-DD), sorted by 'publishedAt' in
    descending order. The index labels are the row positions in the combined
    frame (bloomberg rows first, then financial post rows) before sorting.
    '''
    bloomberg_df = pd.read_json(bloomberg_json)
    # Drop rows whose source is missing, 'The Canadian Press' or 'Reuters'.
    source = bloomberg_df['source']
    keep = source.notnull() & (source != 'The Canadian Press') & (source != 'Reuters')
    # .copy() detaches the filtered rows so the assignment below does not write
    # into a slice of the frame that was read (SettingWithCopyWarning).
    bloomberg_df = bloomberg_df[keep].copy()
    bloomberg_df['publishedAt'] = bloomberg_df['publishedAt'].apply(
        lambda text: datetime.strptime(text, '%b %d, %Y').strftime('%Y-%m-%d'))

    fp_df = pd.read_json(fp_json)
    # Only the first three whitespace-separated tokens of the financial post
    # date are parsed; anything after them is discarded.
    fp_df['publishedAt'] = fp_df['publishedAt'].apply(
        lambda text: datetime.strptime(' '.join(text.split()[:3]), '%B %d, %Y').strftime('%Y-%m-%d'))

    # Number the rows before sorting so every article keeps a stable label.
    concat_df = pd.concat([bloomberg_df, fp_df]).reset_index(drop=True)
    concat_df = concat_df[['source', 'title', 'description', 'publishedAt']]
    return concat_df.sort_values(by='publishedAt', ascending=False)

In [5]:
def get_unannotated_data(bloomberg_json, fp_json, annotated_route, indicator):
    '''
    Get unannotated data (remove annotated data from all the articles that are collected).

    input:

    bloomberg_json is the file of articles from bloomberg stored in json format
    fp_json is the file of articles from financial post stored in json format
    annotated_route is the csv file of annotated articles; its 'Unnamed: 0'
        column (or 'Column1' when that one is absent) holds the row labels of
        the annotated articles in the combined frame
    indicator is the name inserted into the output file name

    output:

    the DataFrame of unannotated articles with the columns 'source',
    'title_desc' and 'publishedAt'; the same frame is written to
    'predictions_dataset_' + indicator + '_Bloomberg.csv'

    Raises KeyError when the annotated file has neither label column, or when
    one of its labels is not present in the combined frame.
    '''
    total = combine_fp_bloomberg(bloomberg_json, fp_json)
    annotated = pd.read_csv(annotated_route)

    if 'Unnamed: 0' in annotated.columns:
        label_column = 'Unnamed: 0'
    elif 'Column1' in annotated.columns:
        label_column = 'Column1'
    else:
        raise KeyError(
            "annotated file " + repr(annotated_route)
            + " has neither an 'Unnamed: 0' nor a 'Column1' column")

    # Rows are removed by index label, so the labels must come from the same
    # combined frame that combine_fp_bloomberg builds.
    total = total.drop(annotated[label_column].to_list())
    total['title_desc'] = total['title'] + '. ' + total['description']
    total = total[['source', 'title_desc', 'publishedAt']]
    total = total.drop_duplicates(subset='title_desc', keep='first')
    total.to_csv('predictions_dataset_' + indicator + '_' + 'Bloomberg.csv')
    # Returned as well, so callers that assign the result get the frame
    # instead of None.
    return total

In [7]:
# Write the prediction dataset of not-yet-annotated mortgage rate articles.
mr_ua = get_unannotated_data(
    bloomberg_json='mortgage_rates_100_Bloomberg_article.json',
    fp_json='mortgage_rate_fpbloomberg.json',
    annotated_route='../Annotated_datasets_for_model_finetuning/Bloomberg_mortgagerate_annotated_agreed.csv',
    indicator='mortgagerates',
)


In [151]:
# Sample up to 100 interest rate articles per month into 'interest_combined_sample.csv'.
combine_fp_bloomberg_then_sample(
    bloomberg_json='interest_rates_100_Bloomberg_article.json',
    fp_json='interest_rate_fpbloomberg.json',
    keyword='interest_combined',
    article_num_per_month=100,
)


In [8]:
# Write the prediction dataset of not-yet-annotated interest rate articles.
get_unannotated_data(
    bloomberg_json='interest_rates_100_Bloomberg_article.json',
    fp_json='interest_rate_fpbloomberg.json',
    annotated_route='../Annotated_datasets_for_model_finetuning/Bloomberg_interestrate_annotated_agreed.csv',
    indicator='interestrates',
)


In [152]:
# Sample up to 100 housing articles per month into 'housing_combined_sample.csv'.
combine_fp_bloomberg_then_sample(
    bloomberg_json='housing_price_100_Bloomberg_article.json',
    fp_json='housing_fpbloomberg.json',
    keyword='housing_combined',
    article_num_per_month=100,
)


In [9]:
# Write the prediction dataset of not-yet-annotated housing articles.
get_unannotated_data(
    bloomberg_json='housing_price_100_Bloomberg_article.json',
    fp_json='housing_fpbloomberg.json',
    annotated_route='../Annotated_datasets_for_model_finetuning/Bloomberg_housing_annotated_agreed.csv',
    indicator='housing',
)


In [None]:
# Sample up to 100 GDP articles per month into 'GDP_combined_sample.csv'.
combine_fp_bloomberg_then_sample(
    bloomberg_json='GDP_100_Bloomberg_article.json',
    fp_json='GDP_fpbloomberg.json',
    keyword='GDP_combined',
    article_num_per_month=100,
)


In [10]:
# Write the prediction dataset of not-yet-annotated GDP articles.
get_unannotated_data(
    bloomberg_json='GDP_100_Bloomberg_article.json',
    fp_json='GDP_fpbloomberg.json',
    annotated_route='../Annotated_datasets_for_model_finetuning/Bloomberg_GDP_annotated_agreed.csv',
    indicator='GDP',
)


In [None]:
# Sample up to 100 employment articles per month into 'employement_combined_sample.csv'.
# NOTE(review): 'employement_combined' looks like a misspelling of 'employment';
# it is kept as is because it determines the output file name - confirm before renaming.
combine_fp_bloomberg_then_sample(
    bloomberg_json='employment_95_Bloomberg_article.json',
    fp_json='employment_fpbloomberg.json',
    keyword='employement_combined',
    article_num_per_month=100,
)


In [11]:
# Write the prediction dataset of not-yet-annotated employment articles.
get_unannotated_data(
    bloomberg_json='employment_95_Bloomberg_article.json',
    fp_json='employment_fpbloomberg.json',
    annotated_route='../Annotated_datasets_for_model_finetuning/Bloomberg_employment_annotated_agreed.csv',
    indicator='employment',
)


In [None]:
# Sample up to 100 stock market articles per month into 'stock_market_combined_sample.csv'.
combine_fp_bloomberg_then_sample(
    bloomberg_json='stock_market_100_Bloomberg_article.json',
    fp_json='stock_market_fpbloomberg.json',
    keyword='stock_market_combined',
    article_num_per_month=100,
)


In [12]:
# Write the prediction dataset of not-yet-annotated stock market articles.
get_unannotated_data(
    bloomberg_json='stock_market_100_Bloomberg_article.json',
    fp_json='stock_market_fpbloomberg.json',
    annotated_route='../Annotated_datasets_for_model_finetuning/Bloomberg_TSX_annotated_agreed.csv',
    indicator='stockmarket',
)


In [68]:
# interest_rate_b_df = pd.read_json('interest_rates_100_Bloomberg_article.json')
# interest_rate_b_df['publishedAt'] = interest_rate_b_df['publishedAt'].apply(lambda x: datetime.strptime(x, '%b %d, %Y').strftime('%Y-%m-%d'))

# interest_rate_fpb_article_df = pd.read_json('interest_rate_fpbloomberg.json')
# interest_rate_fpb_article_df['publishedAt'] = interest_rate_fpb_article_df['publishedAt'].apply(lambda x: ' '.join(x.split()[:3]))
# interest_rate_fpb_article_df['publishedAt'] = interest_rate_fpb_article_df['publishedAt'].apply(lambda x: datetime.strptime(x, '%B %d, %Y').strftime('%Y-%m-%d'))

# housing_b_df = pd.read_json('housing_price_100_Bloomberg_article.json')
# housing_b_df['publishedAt'] = housing_b_df['publishedAt'].apply(lambda x: datetime.strptime(x, '%b %d, %Y').date())

# housing_fpb_df = pd.read_json('housing_fpbloomberg.json')
# housing_fpb_df['publishedAt'] = housing_fpb_df['publishedAt'].apply(lambda x: ' '.join(x.split()[:3]))
# housing_fpb_df['publishedAt'] = housing_fpb_df['publishedAt'].apply(lambda x: datetime.strptime(x, '%B %d, %Y').date())


In [69]:
# interest_concat = pd.concat([interest_rate_b_df, interest_rate_fpb_article_df])
# interest_concat.reset_index(drop=True, inplace=True)
# interest_concat.to_json('interst_concat.json', orient='records')

In [70]:
# housing_concat = pd.concat([housing_b_df, housing_fpb_df])
# housing_concat.reset_index(drop=True, inplace=True)
# housing_concat.to_json('housing_concat.json', orient='records')

In [124]:
# interest_list = extract_subset_in_csv('interst_concat.json', 5)
# len(interest_list)

In [108]:
# article_dictionary_by_month = defaultdict(list)
# full_list = []
# for column, row in interest_concat.iterrows():
#     article_date = (dateutil.parser.parse(row['publishedAt']))
#     article_month = article_date.month
#     article_dictionary_by_month[article_month].append(row)
    
# for month_number, list_of_articles in article_dictionary_by_month.items():
#     random.shuffle(list_of_articles)
#     subset_list = list_of_articles[:5]
#     full_list.extend(subset_list)

# sample_df = pd.DataFrame(full_list)