In [1]:
import pandas as pd

In [2]:
def generate_raw_sentiment_score(row):
    '''Turn one prediction row into a signed raw sentiment score.

    row: a mapping (e.g. a DataFrame row) read by key. 'best_label' is always
         read; depending on its value, 'best_confidence' or the trio
         'second_likely', 'second_confidence' and 'least_confidence' is read.

    Returns:
      * best_confidence + 0.5      when best_label is 1
      * -best_confidence - 0.5     when best_label is -1
      * otherwise the gap between the two remaining confidences, positive when
        the runner-up label ('second_likely') is 1 and negative when it is not.
    '''
    top_label = row['best_label']

    # A confident positive / negative call is pushed away from zero by 0.5.
    if top_label == 1:
        return row['best_confidence'] + 0.5
    if top_label == -1:
        return -row['best_confidence'] - 0.5

    # Any other top label: lean toward whichever polarity ranked second.
    # (Original author's note: the three confidences add up to 1.)
    if row['second_likely'] == 1:
        return row['second_confidence'] - row['least_confidence']
    return row['least_confidence'] - row['second_confidence']

In [3]:
def get_raw_sentiment_score(csvpath):
    '''Load a prediction CSV and return its articles with a raw sentiment score.

    csvpath: location passed straight to pd.read_csv; the 'publishedAt' column
             is parsed as dates.

    Returns a dataframe holding only 'source', 'title_desc', 'publishedAt' and
    the computed 'raw_sentiment_score', ordered from newest to oldest article.
    '''
    output_columns = ['source', 'title_desc', 'publishedAt', 'raw_sentiment_score']

    articles = pd.read_csv(csvpath, parse_dates=['publishedAt'])
    # Score every row with the row-wise helper.
    articles['raw_sentiment_score'] = articles.apply(generate_raw_sentiment_score, axis=1)

    scored = articles[output_columns]
    return scored.sort_values(by='publishedAt', ascending=False)

### Unannotated predictions (URLs may need to change later)

In [4]:
# Locations of the model-prediction CSVs, keyed as 'url_<outlet>_<indicator>_prediction'
# (bbg keys point at Bloomberg files, cbc keys at CBC files).
# NOTE(review): every URL carries a `?token=` query parameter. These look like access
# tokens committed in source -- confirm they are expired/revoked, and consider loading
# the files from local paths or an environment-provided location instead.
dict_unannotated_predictions = {'url_bbg_gdp_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_GDP_Bloomberg_predictions.csv?token=AAAAOMVIFM7F56MD6NBHFOS64BUZA',
'url_cbc_gdp_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_GDP_CBC_predictions.csv?token=AAAAOMRTCX6IIU5WYLOZN4S64AHNW',
'url_bbg_employment_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_employment_Bloomberg_predictions.csv?token=AAAAOMVW3VDF4NHVJDLZM5C64AOEG',
'url_cbc_employment_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_employment_CBC_predictions.csv?token=AAAAOMTI2MJB2HTSLPNHZ3K64AOIG',
'url_bbg_housing_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_housing_Bloomberg_predictions.csv?token=AAAAOMWNFICAYV5WWRM7YFS64AOL4',
'url_cbc_housing_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_housing_cbc_predictions.csv?token=AAAAOMV7OHFAEKVNIUEDFTK64AOPK',
'url_bbg_interestrates_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_interestrates_bloomberg_predictions.csv?token=AAAAOMULQKA5TIASIXVFMJC64EX2C',
'url_cbc_interestrates_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_interestrate_cbc_predictions.csv?token=AAAAOMWXALIC2RJXODRH54C64EV7C',
'url_bbg_mortgagerates_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_mortgagerates_Bloomberg_predictions.csv?token=AAAAOMWXDUHBDM3HYQM7NAS64EYK4',
'url_cbc_mortgagerates_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_mortgagerates_CBC_predictions.csv?token=AAAAOMQXDG2J2VHNEWUGJ3264EY5S',
'url_bbg_stockmarket_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_stockmarket_Bloomberg_predictions.csv?token=AAAAOMXXRVPOXOE6EHVTCWK64EZCC',
'url_cbc_stockmarket_prediction': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_stockmarket_CBC_predictions.csv?token=AAAAOMT2N6KBDGKB46CD5K264EZEA'}


In [5]:
#url_bbg_gdp_prediction = '../week_4/predictions/prediction_output/unannotated_GDP_Bloomberg_predictions.csv'
#url_cbc_gdp_prediction = '../week_4/predictions/prediction_output/unannotated_GDP_CBC_predictions.csv'

### Annotated data (URLs may need to change later)

In [11]:
# Locations of the hand-annotated CSVs from the 'combined' folder, one per indicator.
# NOTE(review): the URLs embed `?token=` query parameters -- confirm these tokens are
# expired/revoked before sharing the notebook.
dict_annotated_data = {
    'url_annotated_gdp': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/combined/annotated_GDP_bnn%26CBC.csv?token=AAAAOMTX72MPXGUUBW2LBBK64E2BI',
    'url_annotated_employment': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/combined/annotated_employment_bnn%26CBC.csv?token=AAAAOMV7C3KOSHKYUIECHP264E2FM',
    'url_annotated_housing': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/combined/annotated_housing_bnn%26CBC.csv?token=AAAAOMWHXFMWYVF6I4XRQK264E2IO',
    'url_annotated_interestrate': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/combined/annotated_interest_rate_bnn%26CBC.csv?token=AAAAOMSXLSK54Z4LKODP4RC64E2OU',
    'url_annotated_mortgagerate': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/combined/annotated_mortgage_rate_bnn%26CBC.csv?token=AAAAOMWUWJFAY2CD4PGJSRC64E2XQ',
    'url_annotated_stockmarket': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/combined/annotated_stock_bnn%26CBC.csv?token=AAAAOMQR63AFCU6NXWJH7Q264E2ZM'
}

# Locations of the Bloomberg-only annotated CSVs ('*_annotated_agreed.csv'), one per indicator.
# NOTE(review): the URLs embed `?token=` query parameters -- confirm these tokens are
# expired/revoked before sharing the notebook.
dict_annotated_bbg = {
    'url_annotated_gdp_bbg': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/bloomberg/Bloomberg_GDP_annotated_agreed.csv?token=AAAAOMXJW5XVJIBY5DUG2QC64FY2M',
    'url_annotated_employment_bbg': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/bloomberg/Bloomberg_employment_annotated_agreed.csv?token=AAAAOMUMUXY7SB57JSNRCZS64FY4S',
    'url_annotated_housing_bbg': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/bloomberg/Bloomberg_housing_annotated_agreed.csv?token=AAAAOMX26RY4L4NWDLTNYRK64FY66',
    'url_annotated_interestrates_bbg': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/bloomberg/Bloomberg_interestrate_annotated_agreed.csv?token=AAAAOMU53TSTTDL32S2BSFK64FZBM',
    'url_annotated_mortgagerates_bbg': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/bloomberg/Bloomberg_mortgagerate_annotated_agreed.csv?token=AAAAOMUZW5XFOPN53IMFWBS64FZD6',
    'url_annotated_stockmarket_bbg': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/bloomberg/Bloomberg_TSX_annotated_agreed.csv?token=AAAAOMTGULZE4GAX66TD6GC64FZFW'
    
}

# Locations of the CBC-only annotated CSVs ('*_annotated_agreed.csv'), one per indicator.
# NOTE(review): the URLs embed `?token=` query parameters -- confirm these tokens are
# expired/revoked before sharing the notebook.
dict_annotated_cbc = {
    'url_annotated_gdp_cbc': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/cbc/CBC_GDP_annotated_agreed.csv?token=AAAAOMU3F3S7ZCGC2ISZBIS64F3US',
    'url_annotated_employment_cbc': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/cbc/CBC_employment_annotated_agreed.csv?token=AAAAOMVJ2HYZBK4YLFSGPIS64F3XW',
    'url_annotated_housing_cbc': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/cbc/CBC_housing_annotated_agreed.csv?token=AAAAOMQWGFRTGMMJA25SB6S64F32C',
    'url_annotated_interestrates_cbc': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/cbc/CBC_interestrates_annotated_agreed.csv?token=AAAAOMTFNCWEMFCS7UCYVP264F346',
    'url_annotated_mortgagerates_cbc': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/cbc/CBC_mortgagerates_annotated_agreed.csv?token=AAAAOMQF7SPJE37I6DI6UFS64F36I',
    'url_annotated_stockmarket_cbc': 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/data_extraction/data/annotated_data/cbc/CBC_stockmarket_annotated_agreed.csv?token=AAAAOMXC5P3WA7KDTXJ4GZC64F4AW'
}


In [7]:
# url_bbg_gdp_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_GDP_Bloomberg_predictions.csv?token=AAAAOMVIFM7F56MD6NBHFOS64BUZA'
# #url_bbg_gdp_prediction = '../week_4/predictions/prediction_output/unannotated_GDP_Bloomberg_predictions.csv'
# url_cbc_gdp_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_GDP_CBC_predictions.csv?token=AAAAOMRTCX6IIU5WYLOZN4S64AHNW'
# #url_cbc_gdp_prediction = '../week_4/predictions/prediction_output/unannotated_GDP_CBC_predictions.csv'
# url_bbg_employment_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_employment_Bloomberg_predictions.csv?token=AAAAOMVW3VDF4NHVJDLZM5C64AOEG'
# url_cbc_employment_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_employment_CBC_predictions.csv?token=AAAAOMTI2MJB2HTSLPNHZ3K64AOIG'
# url_bbg_housing_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_housing_Bloomberg_predictions.csv?token=AAAAOMWNFICAYV5WWRM7YFS64AOL4'
# url_cbc_housing_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_housing_cbc_predictions.csv?token=AAAAOMV7OHFAEKVNIUEDFTK64AOPK'

# bbg_gdp_with_raw = get_raw_sentiment_score(url_bbg_gdp_prediction)
# cbc_gdp_with_raw = get_raw_sentiment_score(url_cbc_gdp_prediction)


In [8]:
# test code for combine_annotated_and_predicted function
# indicators = ['gdp','employment','housing','interest','mortgage','stock']

# df_a = pd.read_csv(dict_annotated_data['url_annotated_gdp'], parse_dates=['publishedAt'])
# df_a = df_a[['source', 'title_desc','publishedAt', 'title_desc_sent_1']]
# df_a['title_desc_sent_1'] = df_a['title_desc_sent_1'].apply(lambda x: x + 0.5 if x == 1 else (x - 0.5 if x == -1 else x))
# indicator = ['gdp','employment','housing','interest','mortgage','stock']

# for indicator in indicators:
#     if indicator in dict_annotated_data['url_annotated_gdp'].lower():
#         df_a['indicator'] = indicator
        

# if 'bnn' in dict_annotated_data['url_annotated_gdp'].lower():
#     df_a['annotation_type'] = 'annotated'
# elif 'predictions' in dict_annotated_data['url_annotated_gdp'].lower():
#     df_a['annotation_type'] = 'predicted'

# df_a = df_a.rename(columns={'title_desc_sent_1':'raw_sentiment_score'}).sort_values(by='publishedAt', ascending=False)


In [9]:
def combine_annotated_and_predicted(dict_annotated_data, dict_unannotated_predictions, source=None):
    """
    Combine annotated data and predicted data of all sources and all economic
    indicators into one long dataframe for visualization.

    inputs:
    dict_annotated_data: dictionary whose values are the locations of the
        annotated CSV files. Each file must provide the columns 'source',
        'title_desc', 'publishedAt' and 'title_desc_sent_1'.
    dict_unannotated_predictions: dictionary whose values are the locations of
        the prediction CSV files; each one is loaded with
        get_raw_sentiment_score.
    source: not used by this function. It is optional so that two-argument
        calls work, while callers that still pass it keep working.

    output:
    A dataframe with the columns 'source', 'title_desc', 'publishedAt',
    'raw_sentiment_score', 'indicator' and 'annotation_type' ('annotated' or
    'predicted'). A file's rows get an 'indicator' value only when one of the
    known indicator keywords occurs in its location. Row labels are kept from
    the individual files, so the index is not unique. When both dictionaries
    are empty, an empty dataframe with those columns is returned.
    """
    indicators = ['gdp', 'employment', 'housing', 'interest', 'mortgage', 'stock']
    output_columns = ['source', 'title_desc', 'publishedAt',
                      'raw_sentiment_score', 'indicator', 'annotation_type']

    def _tag_indicator(frame, location):
        # Each matching keyword is assigned in turn, so the last match wins.
        lowered = location.lower()
        for indicator in indicators:
            if indicator in lowered:
                frame['indicator'] = indicator

    frames = []

    for url in dict_annotated_data.values():
        df_a = pd.read_csv(url, parse_dates=['publishedAt'])
        # .copy() so the column assignments below act on an independent frame
        # rather than on a slice of the freshly read one.
        df_a = df_a[['source', 'title_desc', 'publishedAt', 'title_desc_sent_1']].copy()
        # Stretch hand labels 1 / -1 to 1.5 / -1.5; every other value is kept.
        df_a['title_desc_sent_1'] = df_a['title_desc_sent_1'].apply(
            lambda x: x + 0.5 if x == 1 else (x - 0.5 if x == -1 else x))

        _tag_indicator(df_a, url)
        df_a['annotation_type'] = 'annotated'
        df_a = (df_a.rename(columns={'title_desc_sent_1': 'raw_sentiment_score'})
                    .sort_values(by='publishedAt', ascending=False))
        frames.append(df_a)

    for url in dict_unannotated_predictions.values():
        df = get_raw_sentiment_score(url)
        _tag_indicator(df, url)
        df['annotation_type'] = 'predicted'
        frames.append(df)

    # pd.concat raises on an empty list, so handle "nothing to combine" here.
    if not frames:
        return pd.DataFrame(columns=output_columns)

    # Annotated frames come first, then predicted ones.
    return pd.concat(frames)
        
        


In [10]:
# Build one long frame holding both the annotated and the predicted scores for
# every file listed in the two URL dictionaries.
aggregate_df = combine_annotated_and_predicted(dict_annotated_data, dict_unannotated_predictions)

In [11]:
aggregate_df

Unnamed: 0,source,title_desc,publishedAt,raw_sentiment_score,indicator,annotation_type
0,CBC,Hamilton economy in 'extraordinary pain' but w...,2020-05-13,-1.500000,gdp,annotated
1,CBC,Federal deficit likely to be higher than $252 ...,2020-05-13,-1.500000,gdp,annotated
2,CBC,Post-secondary schools face rough autumn if pa...,2020-05-07,0.000000,gdp,annotated
3,CBC,"May and Blanchet declare the oilpatch 'dead,' ...",2020-05-06,-1.500000,gdp,annotated
4,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...",Setback to Montreal retail reopening shows roc...,2020-05-04,-1.500000,gdp,annotated
...,...,...,...,...,...,...
76,CBC,N.B. pension management employees score record...,2019-06-17,-0.084275,stock,predicted
90,CBC,Does approving TMX do anything for the Liberal...,2019-06-16,1.311211,stock,predicted
88,CBC,"Progress, but still no deal to avert U.S.-Mexi...",2019-06-07,-1.002063,stock,predicted
70,CBC,Threat of U.S. tariffs on Mexico roils markets...,2019-05-31,-1.067293,stock,predicted


In [12]:
# Save the combined frame next to the notebook.
# NOTE(review): index=False is not passed, so the index is written as an extra
# leading column; the displayed frame shows per-file row labels there --
# confirm downstream readers expect that column.
aggregate_df.to_csv('combined_annotation_prediction.csv')

In [13]:
def get_monthly_avg_score(prediction_path, annotation_path):
    """
    Calculate the monthly average sentiment scores for one indicator of one source
    using both the predicted data and the hand annotated data.

    input:
    prediction_path: the path of the predicted data (loaded with
                     get_raw_sentiment_score)
    annotation_path: the path of the annotated data; the file must provide the
                     columns 'source', 'title', 'description', 'publishedAt' and
                     'title_desc_sent_1'

    output:
    A one column dataframe indexed by date (publishedAt), one row per calendar
    month labelled with the month end. The monthly_avg_sent_score column is the
    mean raw sentiment score of that month; months without any article are NaN.
    """
    prediction_df = get_raw_sentiment_score(prediction_path)

    annotation_df = pd.read_csv(annotation_path, parse_dates=['publishedAt'])
    annotation_df['title_desc'] = annotation_df['title'] + '. ' + annotation_df['description']
    # .copy() so the assignment below acts on an independent frame, not a slice.
    annotation_df = annotation_df[['source', 'title_desc', 'publishedAt', 'title_desc_sent_1']].copy()
    # Stretch hand labels 1 / -1 to 1.5 / -1.5; every other value is kept.
    annotation_df['title_desc_sent_1'] = annotation_df['title_desc_sent_1'].apply(
        lambda x: x + 0.5 if x == 1 else (x - 0.5 if x == -1 else x))
    annotation_df = annotation_df.rename(columns={'title_desc_sent_1': 'raw_sentiment_score'})

    df = pd.concat([prediction_df, annotation_df]).sort_values(by='publishedAt')

    # Select the score column BEFORE aggregating: the frame also holds the text
    # columns 'source' and 'title_desc', and averaging those raises a TypeError
    # on pandas >= 2.0 instead of silently dropping them.
    # NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
    # 'ME'; 'M' is kept so older installations keep working.
    monthly_mean = df.resample('M', on='publishedAt')['raw_sentiment_score'].mean()
    #ave_df = ave_df.fillna(method='ffill')
    return monthly_mean.to_frame(name='monthly_avg_sent_score')

# Monthly GDP sentiment averages per outlet: each call pairs the outlet's
# prediction file with that outlet's own annotated file.
bbg_gdp_avg = get_monthly_avg_score(dict_unannotated_predictions['url_bbg_gdp_prediction'], dict_annotated_bbg['url_annotated_gdp_bbg'])
cbc_gdp_avg = get_monthly_avg_score(dict_unannotated_predictions['url_cbc_gdp_prediction'], dict_annotated_cbc['url_annotated_gdp_cbc'])


In [14]:
bbg_gdp_avg

Unnamed: 0_level_0,monthly_avg_sent_score
publishedAt,Unnamed: 1_level_1
2018-03-31,-1.5
2018-04-30,
2018-05-31,1.5
2018-06-30,0.0
2018-07-31,1.5
2018-08-31,0.5
2018-09-30,1.0
2018-10-31,-0.5
2018-11-30,0.0
2018-12-31,0.5


In [15]:
cbc_gdp_avg

Unnamed: 0_level_0,monthly_avg_sent_score
publishedAt,Unnamed: 1_level_1
2019-05-31,-0.742719
2019-06-30,-0.424983
2019-07-31,0.19034
2019-08-31,-0.864665
2019-09-30,-0.585624
2019-10-31,0.106173
2019-11-30,-0.74847
2019-12-31,-0.137999
2020-01-31,-0.426799
2020-02-29,-0.814668


In [16]:
# Input for monthly_weighted_average: source name -> [monthly average frame, weight].
# Both outlets are given the same weight (0.5).
source_dict = {'bloomberg': [bbg_gdp_avg, 0.5], 'cbc': [cbc_gdp_avg, 0.5]}
# df_list = []
# for value_pair in source_dict.values():
#     df = value_pair[0]
#     df = df['monthly_avg_sent_score'] * value_pair[1]
#     df = df.to_frame()
#     #print(value_pair[0])
    
#     df_list.append(df)
#     if len(df_list) > 1:
#         df_list[0] = df_list[0].add(df_list[1], fill_value=0)
# df_list[0]

In [17]:
def monthly_weighted_average(source_dict):
    """
    Calculate the monthly weighted sentiment score for indicators such as
    GDP, employment rate, housing index, stock index, and mortgage rate.

    inputs:
    source_dict: a dictionary of different sources. The keys of the dictionary are
                 the names of news sources. The values are lists: the first
                 element is the dataframe of that news source, which must have a
                 'monthly_avg_sent_score' column; the second element is the
                 weight of that news source.

    output:
    A one column dataframe indexed by the union of the sources' indexes (the
    publishedAt dates). The monthly_weighted_ave_sent_score column is the sum
    over all sources of weight * monthly_avg_sent_score. A source that has no
    value for a month contributes 0 to that month; a month for which no source
    has a value stays NaN. An empty source_dict yields an empty dataframe with
    that column.

    NOTE(review): the weights are applied as given and are not re-normalised
    when a source is missing for a month, so such months are scaled down
    relative to months covered by every source -- confirm this is intended.
    """
    total = None
    for value_pair in source_dict.values():
        weighted = (value_pair[0]['monthly_avg_sent_score'] * value_pair[1]).to_frame()
        # Fold EVERY source into the running total. (Previously the second
        # source was re-added on each later iteration and sources after the
        # second were never included.)
        total = weighted if total is None else total.add(weighted, fill_value=0)

    if total is None:
        return pd.DataFrame(columns=['monthly_weighted_ave_sent_score'])

    return total.rename(columns={'monthly_avg_sent_score': 'monthly_weighted_ave_sent_score'})
        

In [18]:
# Combine the per-outlet monthly averages using the weights in source_dict,
# then display the result.
weighted_monthly_ave = monthly_weighted_average(source_dict)
weighted_monthly_ave

Unnamed: 0_level_0,monthly_weighted_ave_sent_score
publishedAt,Unnamed: 1_level_1
2018-03-31,-0.75
2018-04-30,
2018-05-31,0.75
2018-06-30,0.0
2018-07-31,0.75
2018-08-31,0.25
2018-09-30,0.5
2018-10-31,-0.25
2018-11-30,0.0
2018-12-31,0.25
