In [1]:
## below test Jon's visualization code

import dash
import dash_core_components as dcc
import dash_html_components as html
from datetime import datetime
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import random
from plotly.subplots import make_subplots

def parse_govt_data(df, daterow_ind, valuerow_ind, datetime_format_in):
    """returns a list of datetime strings and financial indicator values from 
    dataframe of govt data
    
    assume datetime format for output datetime list = YYYY-MM-DD
    
    """
    date_row = df.iloc[daterow_ind]
    date_row = date_row[2:]
    date_list = date_row.tolist()
    datetime_list = []

    for item in date_list:
        dt = datetime.strptime(item, datetime_format_in)
        datetime_list.append(dt.strftime("%Y-%m-%d"))


    value_row = df.iloc[valuerow_ind]
    value_row = value_row[2:]
    value_list = value_row.tolist()
    
    assert (len(value_list) == len(datetime_list))

    return datetime_list, value_list



filepath = r"C:\Users\gen80\OneDrive\Documents\MDSlectures\capstone_sentiment_analysis\testout\mortgage_rate_jan-2019_feb-2020.csv"
mortgage_df = pd.read_csv(filepath, skiprows=5)
mortgage_df.head()

datetime_mortgage, values_mortgage = parse_govt_data(mortgage_df, 0, 2, "%B %Y")

final_df = pd.DataFrame(values_mortgage, 
                        index =datetime_mortgage, 
                        columns =['values']).reset_index()

final_df["indicator"] = "mortgage"
test_mrg_df = final_df.rename(columns={"index": "dates"})

test_mrg_df.head()

#make toy sentiment data
test_senti = [random.uniform(-1, 1) for i in range(len(values_mortgage))]
test_senti_df = pd.DataFrame(test_senti, columns=["final_sentiment"])
test_senti_df.head()

Unnamed: 0,final_sentiment
0,0.103541
1,0.255537
2,-0.383935
3,0.176109
4,-0.987071


In [50]:
## use senti_df produced from Aaron's code -- containing continuous sentiment score

import pandas as pd
from datetime import datetime

def generate_raw_sentiment_score(row):
    '''calculate sentiment score based on best_label'''
    if row['best_label'] == 1:
        result = row['best_confidence'] + 0.5
    elif row['best_label'] == -1:
        result = -row['best_confidence'] - 0.5
    else:
        # total = row['best_confidence'] + row['second_confidence'] + row['least_confidence'] these add up to 1
        if row['second_likely'] == 1:
            result = row['second_confidence'] - row['least_confidence']
        else:
            result = row['least_confidence'] - row['second_confidence']
    return result

def get_raw_sentiment_score(csvpath):
    '''outputs a dataframe that contains the raw sentiment score for all the articles'''
    df = pd.read_csv(csvpath)
    df['publishedAt'] = df['publishedAt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').date())
    df['raw_sentiment_score'] = df.apply(lambda row: generate_raw_sentiment_score(row), axis=1)
    return df

url_gdp_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_GDP_Bloomberg_predictions.csv?token=AAAAOMT7F4GPS4GRINYCZC2633F5Q'
gdp_with_raw = get_raw_sentiment_score(url_gdp_prediction)
gdp_with_raw

Unnamed: 0.1,Unnamed: 0,source,title_desc,publishedAt,best_label,best_confidence,second_likely,second_confidence,least_likely,least_confidence,raw_sentiment_score
0,0,Bloomberg News,Economic growth stalled in February ahead of v...,2020-04-30,-1,0.997521,1,0.002091,0,0.000388,-1.497521
1,67,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...",America’s longest economic expansion is over —...,2020-04-29,-1,0.998286,1,0.000875,0,0.000839,-1.498286
2,69,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...","Once seen as safer than gold, Canadian real es...",2020-04-16,-1,0.990286,1,0.0059,0,0.003815,-1.490286
3,1,Bloomberg News,China won't be able to bail us out this time. ...,2020-04-16,-1,0.973306,1,0.018563,0,0.008131,-1.473306
4,71,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...",Michael Burry of 'The Big Short' slams coronav...,2020-04-07,-1,0.965097,0,0.017516,1,0.017386,-1.465097
5,2,Bloomberg News,Canada ekes out meager growth in January befor...,2020-03-31,-1,0.998931,1,0.000762,0,0.000307,-1.498931
6,72,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...",Canada ekes out meagre growth in January befor...,2020-03-31,-1,0.989178,1,0.010187,0,0.000635,-1.489178
7,73,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...","White House, senators strike deal on America's...",2020-03-25,-1,0.990717,1,0.009045,0,0.000238,-1.490717
8,75,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...",Stock losses deepen after latest U.S. stimulus...,2020-03-23,-1,0.831419,1,0.163293,0,0.005287,-1.331419
9,76,"{'id': 'fp-bloomberg-news', 'name': 'Bloomberg...","Searching for a bottom, Canada's stock manager...",2020-03-18,-1,0.921441,1,0.040193,0,0.038366,-1.421441


In [60]:
bbg_sentiment_df = gdp_with_raw.set_index('publishedAt')

In [51]:
url_gdp_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/amylam/sentiment_analyzer/data/prediction_output/cbc/unannotated_GDP_CBC_predictions.csv?token=AAAAPK4DI6H43GJIUKXHWJK64AHL6'
gdp_with_raw2 = get_raw_sentiment_score(url_gdp_prediction)
gdp_with_raw2

Unnamed: 0,source,title_desc,publishedAt,best_label,best_confidence,second_likely,second_confidence,least_likely,least_confidence,raw_sentiment_score
0,CBC,Canada's economy was slowing down even before ...,2020-04-30,-1,0.998120,1,0.001353,0,0.000526,-1.498120
1,CBC,Canada's economy was slowing even before COVID...,2020-03-31,-1,0.493540,0,0.413250,1,0.093211,-0.993540
2,CBC,Canada's economy rebounded with 0.1% GDP incre...,2020-01-31,-1,0.969221,1,0.021293,0,0.009485,-1.469221
3,CBC,"U.S. reports on GDP, consumer spending remain ...",2019-11-27,-1,0.942319,1,0.053510,0,0.004171,-1.442319
4,CBC,A crisis like no other: Canada's finances coul...,2020-05-16,-1,0.910275,0,0.064336,1,0.025390,-1.410275
...,...,...,...,...,...,...,...,...,...,...
92,CBC,"Budget aimed at health care, education and pov...",2020-02-25,-1,0.970833,1,0.020794,0,0.008373,-1.470833
93,CBC,What Canada's economy would look like if Alber...,2019-09-30,0,0.609898,-1,0.313420,1,0.076682,-0.236738
94,CBC,Liberals to boost spending and extend deficits...,2019-09-29,-1,0.616859,0,0.324614,1,0.058527,-1.116859
95,CBC,"Winter is coming, and so is an uncharted econo...",2019-08-13,0,0.876635,-1,0.085343,1,0.038022,-0.047320


In [172]:
# function by Aaron
def get_monthly_avg_score(df):
    """
    Calculate the monthly average sentiment scores for one indicator of one source.
    
    input:
    df: a dataframe that includes the raw sentiment scores of prediction
    
    output:
    A one column dataframe, the index of which is date (publishedAt) which represents 
    the month of the average score. The monthly_avg_sent_score column is the 
    average sentiment score.
    
    """
    
    #ave_df = df.resample('M', on='publishedAt').mean() #changed on Jun3 5:20pm. publishedAt is now the index column|
    ave_df = df.resample('M').mean()
    ave_df = ave_df[['raw_sentiment_score']].rename(columns={'raw_sentiment_score': 'monthly_avg_sent_score'})
    ave_df = ave_df.fillna(method='ffill')
    return ave_df


In [63]:
#bbg_sentiment_df = gdp_with_raw.set_index('publishedAt')

# bbg_gdp_avg = get_monthly_avg_score(gdp_with_raw)
# bbg_gdp_avg

In [4]:
# unify the source naming for later filtering

test_senti_df = gdp_with_raw[['publishedAt','source','title_desc','raw_sentiment_score']]
test_senti_df['source'] = "Bloomberg"
test_senti_df.head()

test_senti_df2= gdp_with_raw2[['publishedAt','source','title_desc','raw_sentiment_score']]
test_senti_df2['source'] = "CBC"
test_senti_df2.head()

mixedsource_senti_df = pd.concat([test_senti_df,test_senti_df2])
mixedsource_senti_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,publishedAt,source,title_desc,raw_sentiment_score
0,2020-04-30,Bloomberg,Economic growth stalled in February ahead of v...,-1.497521
1,2020-04-29,Bloomberg,America’s longest economic expansion is over —...,-1.498286
2,2020-04-16,Bloomberg,"Once seen as safer than gold, Canadian real es...",-1.490286
3,2020-04-16,Bloomberg,China won't be able to bail us out this time. ...,-1.473306
4,2020-04-07,Bloomberg,Michael Burry of 'The Big Short' slams coronav...,-1.465097


In [5]:
mixedsource_senti_df = mixedsource_senti_df.sort_values('publishedAt')
mixedsource_senti_df = mixedsource_senti_df.rename(columns= {"raw_sentiment_score":"final_sentiment"})

mixedsource_senti_df

Unnamed: 0,publishedAt,source,title_desc,final_sentiment
43,2019-04-01,Bloomberg,"Stephen Poloz confident slowdown is temporary,...",-1.358499
42,2019-04-02,Bloomberg,WTO says tariff war will hammer global trade g...,-1.363895
41,2019-04-15,Bloomberg,"Trump is slamming the Fed again, saying stocks...",1.346158
40,2019-04-15,Bloomberg,"Trump is slamming the Fed again, saying stocks...",1.346158
39,2019-04-29,Bloomberg,Why are economists so bad at forecasting reces...,0.035580
...,...,...,...,...
11,2020-05-09,CBC,Deficit reduction will have to wait for the ec...,-0.272068
7,2020-05-13,CBC,England tiptoes out of lockdown as economy div...,-1.440326
5,2020-05-13,CBC,Federal deficit likely to be higher than $252 ...,-1.494338
6,2020-05-14,CBC,New Zealand plans spending spree to counter vi...,-1.161278


In [169]:
# a giant senti_df made by Aaron

combined_senti_df = pd.read_csv("combined_annotation_prediction.csv",parse_dates=['publishedAt'],index_col='publishedAt')
#clean source naming inconsistency
combined_senti_df.source = combined_senti_df.source.str.replace("{'id': 'fp-bloomberg-news', 'name': 'Bloomberg News From FP'}","Bloomberg")
combined_senti_df.source = combined_senti_df.source.str.replace("Bloomberg News","Bloomberg")
combined_senti_df.source = combined_senti_df.source.str.replace("BNN Bloomberg","Bloomberg")


combined_senti_df.head()

Unnamed: 0_level_0,Unnamed: 0,source,title_desc,raw_sentiment_score,indicator,annotation_type
publishedAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-05-13,0,CBC,Hamilton economy in 'extraordinary pain' but w...,-1.5,gdp,annotated
2020-05-13,1,CBC,Federal deficit likely to be higher than $252 ...,-1.5,gdp,annotated
2020-05-07,2,CBC,Post-secondary schools face rough autumn if pa...,0.0,gdp,annotated
2020-05-06,3,CBC,"May and Blanchet declare the oilpatch 'dead,' ...",-1.5,gdp,annotated
2020-05-04,4,Bloomberg,Setback to Montreal retail reopening shows roc...,-1.5,gdp,annotated


In [112]:
combined_senti_df.source.unique()

array(['CBC', 'Bloomberg'], dtype=object)

In [82]:
combined_senti_df.indicator.unique()

array(['gdp', 'employment', 'housing', 'interest', 'mortgage', 'stock'],
      dtype=object)

In [90]:
# filtering works for get_monthly_avg_score ! 
monthly_senti_df_gdp_cbc =get_monthly_avg_score(combined_senti_df.query('indicator=="gdp" & source == "CBC" '))
monthly_senti_df_mortgage =get_monthly_avg_score(combined_senti_df.query('indicator=="mortgage" '))

In [115]:
monthly_senti_df_gdp_cbc.index

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31'],
              dtype='datetime64[ns]', name='publishedAt', freq='M')

In [186]:
## UPDATED FUNCTION - WORKING
def plot_combined_graph_new(indicator_df, senti_df, indicator_name="y-axis label", title="Default Title", add_rangeslide=True): 
    """
    returns a Plotly graph given a dataframes containing financial indicator data and 
    a dataframe containing sentiment values 
    INPUT:
    indicator_df: dataframe containing a "date" column of datetimes and a "values" column of float values
    senti_df: dataframe containing a "final_sentiment" column of float values
    indicator_name: string label for y axis (what is tracked in indicator_df["values"])
    title: string of title 
    add_rangeslide: boolean of displaying
    """
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    indi_y = indicator_df["values"].astype('float64')
    indi_dates = indicator_df["dates"].astype('datetime64[ns]') 
    senti_y = senti_df["final_sentiment"].astype('float64')
    #senti_dates = senti_df["publishedAt"].astype('datetime64[ns]')
    senti_dates = senti_df.index.astype('datetime64[ns]') #Jun3 4:44 pm changed to get dates from index of senti_df
    
    
    #set x axis limits for visualization
    x_axis_limit_l = max([min(indi_dates), min(senti_dates)]) #latest of two start periods
    x_axis_limit_r = min([max(indi_dates), max(senti_dates)]) #earliest of two end periods
    #Add indicator area visualization - set boundaries, add to fig object, update axis
    indi_axis_min = min(indi_y) - 0.1
    indi_axis_max = max(indi_y) + 0.1
    fig.add_trace(
        go.Scatter(x=indi_dates, 
                   y=indi_y, 
                    name=indicator_name,
                    fill='tonexty', 
                    mode='lines', 
                    line_color='blue'),
                    secondary_y=False)
    fig['layout']['yaxis1'].update(title=indicator_name, 
                                   range=[indi_axis_min, indi_axis_max], 
                                   autorange=False)
    #Add sentiment line visualization - set boundaries, add to fig object, update axis
    senti_axis_min = min(senti_y) - 1
    senti_axis_max = max(senti_y) + 1
    fig.add_trace(
        go.Scatter(x=senti_dates, 
                   y=senti_y,
                   name="test sentiment", ## we should change this? 
                  line_color='orange'),
                    secondary_y=True)
    fig['layout']['yaxis2'].update(title='Sentiment Score', 
                                   range=[senti_axis_min, senti_axis_max])
    #update x axis, add title, and show
    fig.update_xaxes(rangeslider_visible=add_rangeslide,
                     range = [x_axis_limit_l, x_axis_limit_r],
                     title_text="Date")
    fig.update_layout(title_text=title)
    #fig.show()
    return fig

In [127]:
def plot_combined_graph_old(indicator_df, senti_df, indicator_name="y-axis label", title="Default Title", add_rangeslide=True): 
    """
    returns a Plotly graph given a dataframes containing financial indicator data and 
    a dataframe containing sentiment values 
    
    INPUT:
    
    indicator_df: dataframe containing a "date" column of datetimes and a "values" column of float values
    senti_df: dataframe containing a "final_sentiment" column of float values
    
    indicator_name: string label for y axis (what is tracked in indicator_df["values"])
    title: string of title 
    add_rangeslide: boolean of displaying
    
    
    """
    
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    indi_y = indicator_df["values"].astype('float64')
    dates = indicator_df["dates"]
    senti_y = senti_df["final_sentiment"].astype('float64')
    
    #check that 
    #assert(len(dates) == len(senti_y))
    
    #Add indicator area visualization - set boundaries, add to fig object, update axis
    indi_axis_min = min(indi_y) - 0.1
    indi_axis_max = max(indi_y) + 0.1

    fig.add_trace(
        go.Scatter(x=dates, 
                   y=indi_y, 
                    name=indicator_name,
                    fill='tonexty', 
                    mode='lines', 
                    line_color='blue'),
                    secondary_y=False)
    fig['layout']['yaxis1'].update(title=indicator_name, 
                                   range=[indi_axis_min, indi_axis_max], 
                                   autorange=False)
    
    #Add sentiment line visualization - set boundaries, add to fig object, update axis
    senti_axis_min = min(senti_y) - 1
    senti_axis_max = max(senti_y) + 1
    
    fig.add_trace(
        go.Scatter(x=dates, 
                   y=senti_y, 
                   name="test sentiment", ## we should change this? 
                  line_color='orange'),
                    secondary_y=True)
    fig['layout']['yaxis2'].update(title='Sentiment Score', 
                                   range=[senti_axis_min, senti_axis_max])
    
    #update x axis, add title, and show
    fig.update_xaxes(rangeslider_visible=add_rangeslide, title_text="Date")
    fig.update_layout(title_text=title)
    #fig.show()
    return fig

In [187]:
def plot_combined_graph_scatter(indicator_df, senti_df, indicator_name="y-axis label", title="Default Title", add_rangeslide=True): 
    """
    returns a Plotly graph given a dataframes containing financial indicator data and 
    a dataframe containing sentiment values 
    INPUT:
    indicator_df: dataframe containing a "date" column of datetimes and a "values" column of float values
    senti_df: dataframe containing a "final_sentiment" column of float values
    indicator_name: string label for y axis (what is tracked in indicator_df["values"])
    title: string of title 
    add_rangeslide: boolean of displaying
    """
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    indi_y = indicator_df["values"].astype('float64')
    indi_dates = indicator_df["dates"].astype('datetime64[ns]') 
    senti_y = senti_df["final_sentiment"].astype('float64')
    #senti_dates = senti_df["publishedAt"].astype('datetime64[ns]') ##Jun3 changed to get dates from index of senti_df
    senti_dates = senti_df.index.astype('datetime64[ns]')
    
    
    #set x axis limits for visualization
    x_axis_limit_l = max([min(indi_dates), min(senti_dates)]) #latest of two start periods
    x_axis_limit_r = min([max(indi_dates), max(senti_dates)]) #earliest of two end periods
    #Add indicator area visualization - set boundaries, add to fig object, update axis
    indi_axis_min = min(indi_y) - 0.1
    indi_axis_max = max(indi_y) + 0.1
    fig.add_trace(
        go.Scatter(x=indi_dates, 
                   y=indi_y, 
                    name=indicator_name,
                    #fill="tozeroy",
                    mode='lines', 
                    line_color='blue'),
                    secondary_y=False)
    fig['layout']['yaxis1'].update(title=indicator_name, 
                                   range=[indi_axis_min, indi_axis_max], 
                                   autorange=False)
    #Add sentiment line visualization - set boundaries, add to fig object, update axis
    senti_axis_min = min(senti_y) - 1
    senti_axis_max = max(senti_y) + 1
    fig.add_trace(
        go.Scatter(
            x=senti_dates, 
            y=senti_y,
            mode="markers",
            opacity=0.5, #0.95
            name="Sentiment Score",
            marker=dict(
                color="green" #orange
            )
        ),
        secondary_y=True
    )
    fig['layout']['yaxis2'].update(title='Sentiment Score', 
                                   range=[senti_axis_min, senti_axis_max])
    #update x axis, add title, and show
    fig.update_xaxes(rangeslider_visible=add_rangeslide,
                     range = [x_axis_limit_l, x_axis_limit_r],
                     title_text="Date")
    fig.update_layout(title_text=title)
    #fig.show()
    return fig

In [215]:
# additional feature: add a new trace for annotated data/ to distinguish it from predicted data


def plot_combined_graph_scatter_anno(indicator_df, senti_df, anno_senti_df ,indicator_name="y-axis label", title="Default Title", add_rangeslide=True): 
    """
    returns a Plotly graph given a dataframes containing financial indicator data and 
    a dataframe containing sentiment values 
    INPUT:
    indicator_df: dataframe containing a "date" column of datetimes and a "values" column of float values
    senti_df: dataframe containing a "final_sentiment" column of float values
    indicator_name: string label for y axis (what is tracked in indicator_df["values"])
    title: string of title 
    add_rangeslide: boolean of displaying
    """
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    indi_y = indicator_df["values"].astype('float64')
    indi_dates = indicator_df["dates"].astype('datetime64[ns]') 
    senti_y = senti_df["final_sentiment"].astype('float64')
    #senti_dates = senti_df["publishedAt"].astype('datetime64[ns]') ##Jun3 changed to get dates from index of senti_df
    senti_dates = senti_df.index.astype('datetime64[ns]')
    
    anno_senti_y = anno_senti_df["final_sentiment"].astype('float64')
    
    
    #set x axis limits for visualization
    x_axis_limit_l = max([min(indi_dates), min(senti_dates)]) #latest of two start periods
    x_axis_limit_r = min([max(indi_dates), max(senti_dates)]) #earliest of two end periods
    #Add indicator area visualization - set boundaries, add to fig object, update axis
    indi_axis_min = min(indi_y) - 0.1
    indi_axis_max = max(indi_y) + 0.1
    fig.add_trace(
        go.Scatter(x=indi_dates, 
                   y=indi_y, 
                    name=indicator_name,
                    #fill="tozeroy",
                    mode='lines', 
                    line_color='blue'),
                    secondary_y=False)
    fig['layout']['yaxis1'].update(title=indicator_name, 
                                   range=[indi_axis_min, indi_axis_max], 
                                   autorange=False)
    #Add sentiment line visualization - set boundaries, add to fig object, update axis
    senti_axis_min = min(senti_y) - 1
    senti_axis_max = max(senti_y) + 1
    fig.add_trace(
        go.Scatter(
            x=senti_dates, 
            y=senti_y,
            mode="markers",
            opacity=0.5, #0.95
            name="Predicted Sentiment",
            marker=dict(
                color="green" #orange
            )
        ),
        secondary_y=True
    )
    fig['layout']['yaxis2'].update(title='Sentiment Score', 
                                   range=[senti_axis_min, senti_axis_max])
    
    #add one more line for annotated data points
    fig.add_trace(
        go.Scatter(
            x=senti_dates, 
            y=anno_senti_y,
            mode="markers",
            opacity=0.5, #0.95
            name="Golden Sentiment",
            marker=dict(
                color="orange" 
            )
        ),
        secondary_y=True
    )
    
    #update x axis, add title, and show
    fig.update_xaxes(rangeslider_visible=add_rangeslide,
                     range = [x_axis_limit_l, x_axis_limit_r],
                     title_text="Date")
    fig.update_layout(title_text=title)
    #fig.show()
    return fig

In [119]:
test_mrg_df

Unnamed: 0,dates,values,indicator
0,2019-01-01,3.84,mortgage
1,2019-02-01,3.82,mortgage
2,2019-03-01,3.76,mortgage
3,2019-04-01,3.59,mortgage
4,2019-05-01,3.45,mortgage
5,2019-06-01,3.33,mortgage
6,2019-07-01,3.22,mortgage
7,2019-08-01,3.16,mortgage
8,2019-09-01,3.12,mortgage
9,2019-10-01,3.12,mortgage


In [128]:
#test function  
test_indicator_name = "gdp"
test_title = "Canadian News Sentiment VS GDP (test)"

###  show all mixed-source

###note: 
test_fig = plot_combined_graph(test_mrg_df, mixedsource_senti_df, test_indicator_name, test_title)
test_fig.show()


## now try visualize monthly avg sentiment
monthly_senti_df_gdp_cbc.columns = ['final_sentiment']
test_fig = plot_combined_graph_new(test_mrg_df, monthly_senti_df_gdp_cbc, test_indicator_name, test_title)


test_fig.show()

In [129]:
test_indicator_name = "mortgage"
test_title = "Canadian News Sentiment VS mortgage (test)"

monthly_senti_df_mortgage.columns = ['final_sentiment']
test_fig = plot_combined_graph_new(test_mrg_df, monthly_senti_df_mortgage, test_indicator_name, test_title, False)
test_fig.show()

In [132]:
#test function  
test_indicator_name = "Interest Rate  <br> (total, funds advanced, <br> residential mortgages, insured)"
test_title = "Canadian News Sentiment - Mortgage Rate (test)"

###  show all mixed-source

test_fig = plot_combined_graph_new(test_mrg_df, monthly_senti_df_mortgage, test_indicator_name, test_title)
test_fig.show()

In [133]:
monthly_senti_df_mortgage

Unnamed: 0_level_0,final_sentiment
publishedAt,Unnamed: 1_level_1
2017-07-31,1.5
2017-08-31,1.5
2017-09-30,1.5
2017-10-31,0.0
2017-11-30,0.0
2017-12-31,0.0
2018-01-31,0.0
2018-02-28,0.0
2018-03-31,0.0
2018-04-30,-1.5


In [174]:
# #test function  
# test_indicator_name = "Interest Rate  <br> (total, funds advanced, <br> residential mortgages, insured)"
# test_title = "Canadian News Sentiment - Mortgage Rate (test)"

# ###  show all mixed-source

# test_fig = plot_combined_graph_scatter(test_mrg_df, mixedsource_senti_df, test_indicator_name, test_title)
# test_fig.show()

In [137]:
source_name = "CBC"

test_fig = plot_combined_graph(test_mrg_df, mixedsource_senti_df.query('source==@source_name '), test_indicator_name, test_title)
test_fig.show()

In [9]:
source_name = "Bloomberg"

test_fig = plot_combined_graph(test_mrg_df, mixedsource_senti_df.query('source==@source_name '), test_indicator_name, test_title)
test_fig.show()

In [10]:
# test drawing test_fig in Dash app

import dash
import dash_core_components as dcc
import dash_html_components as html

app = dash.Dash()
app.layout = html.Div([
    
    html.H1("Hello World"),
    
    dcc.Markdown('''
    Trying out paragraph writing
    '''),
    
    dcc.Graph(figure=test_fig)
])

app.run_server(debug=True, use_reloader=False)  

Running on http://127.0.0.1:8050/
Debugger PIN: 676-100-173
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


In [157]:
# dictionary of y-axis labels from Jon


indic_to_value = {}
indic_to_value["value_GDP"] = "Chained (2012) Dollars (seasonally adjusted)"
indic_to_value["value_TSX"] = "S&P/TSX Composite Index - Close (Unadjusted - CAD)"
indic_to_value["value_mortgage_rates"]= "Interest Rate - Total, funds advanced, residential mortgages, insured"
indic_to_value["value_housing_prices"] = "Composite Home Price Index (Unadjusted for seasonality)"
indic_to_value["value_employment"] = "Employment Rate - Both sexes, 15 years and over"
indic_to_value["value_interest_rates"] = "Interest Rate - Overnight Target Rate (Bank of Canada)"

#If using overnight money market rate, uncomment the following
#indic_to_value["interest_rate"] = "Interest Rate - Overnight market financing"
indic_to_value



{'value_GDP': 'Chained (2012) Dollars (seasonally adjusted)',
 'value_TSX': 'S&P/TSX Composite Index - Close (Unadjusted - CAD)',
 'value_mortgage_rates': 'Interest Rate - Total, funds advanced, residential mortgages, insured',
 'value_housing_prices': 'Composite Home Price Index (Unadjusted for seasonality)',
 'value_employment': 'Employment Rate - Both sexes, 15 years and over',
 'value_interest_rates': 'Interest Rate - Overnight Target Rate (Bank of Canada)'}

In [16]:
#giant indicators df imported
indicators_df = pd.read_csv('combined_indicator_data.csv')#,index_col='date')
indicators_df

Unnamed: 0,date,value_GDP,value_TSX,value_mortgage_rates,value_employment,value_housing_prices,value_interest_rates
0,2019-01-01,1.952558e+12,15540.599609,3.84,60.8,227.4,
1,2019-02-01,1.948783e+12,15999.000000,3.82,61.1,228.2,
2,2019-03-01,1.961298e+12,16102.099609,3.76,61.0,229.8,
3,2019-04-01,1.966131e+12,16580.699219,3.59,61.6,231.3,
4,2019-05-01,1.971712e+12,16037.500000,3.45,62.7,232.0,
...,...,...,...,...,...,...,...
267,2020-05-26,,,,,,0.25
268,2020-05-27,,,,,,0.25
269,2020-05-28,,,,,,0.25
270,2020-05-29,,,,,,0.25


In [125]:
indicators_df[['date','value_TSX']].dropna()

Unnamed: 0,date,value_TSX
0,2019-01-01,15540.599609
1,2019-02-01,15999.0
2,2019-03-01,16102.099609
3,2019-04-01,16580.699219
4,2019-05-01,16037.5
6,2019-06-01,16382.200195
27,2019-07-01,16406.599609
50,2019-08-01,16442.099609
72,2019-09-01,16658.599609
94,2019-10-01,16483.199219


In [147]:
#indicators_df[['date','value_interest_rates']].dropna().rename(columns={'value_GDP':"values","date":"dates"}).set_index('publishedAt')

In [146]:
senti_df = combined_senti_df.query(' indicator == "gdp" ')
senti_df

Unnamed: 0.1,Unnamed: 0,source,title_desc,publishedAt,raw_sentiment_score,indicator,annotation_type
0,0,CBC,Hamilton economy in 'extraordinary pain' but w...,2020-05-13,-1.500000,gdp,annotated
1,1,CBC,Federal deficit likely to be higher than $252 ...,2020-05-13,-1.500000,gdp,annotated
2,2,CBC,Post-secondary schools face rough autumn if pa...,2020-05-07,0.000000,gdp,annotated
3,3,CBC,"May and Blanchet declare the oilpatch 'dead,' ...",2020-05-06,-1.500000,gdp,annotated
4,4,Bloomberg,Setback to Montreal retail reopening shows roc...,2020-05-04,-1.500000,gdp,annotated
...,...,...,...,...,...,...,...
862,53,CBC,Business sentiment improves after downturn ear...,2019-06-28,-1.488752,gdp,predicted
863,45,CBC,Trade war darkens economic forecasts for Canad...,2019-06-04,-1.486129,gdp,predicted
864,84,CBC,Alberta could slip back into recession this ye...,2019-05-28,-1.494717,gdp,predicted
865,91,CBC,Why overlooked green energy sector is an econo...,2019-05-23,-0.052256,gdp,predicted


In [149]:
monthly_senti_df_gdp =get_monthly_avg_score(combined_senti_df.query('indicator=="gdp" '))
monthly_senti_df_gdp

Unnamed: 0_level_0,monthly_avg_sent_score
publishedAt,Unnamed: 1_level_1
2018-03-31,-1.5
2018-04-30,-1.5
2018-05-31,1.5
2018-06-30,0.0
2018-07-31,1.5
2018-08-31,0.5
2018-09-30,1.0
2018-10-31,-0.5
2018-11-30,0.0
2018-12-31,0.5


In [151]:
get_monthly_avg_score(combined_senti_df.query('indicator=="gdp" ')).rename(columns={'raw_sentiment_score':'final_sentiment'} )

Unnamed: 0_level_0,monthly_avg_sent_score
publishedAt,Unnamed: 1_level_1
2018-03-31,-1.5
2018-04-30,-1.5
2018-05-31,1.5
2018-06-30,0.0
2018-07-31,1.5
2018-08-31,0.5
2018-09-30,1.0
2018-10-31,-0.5
2018-11-30,0.0
2018-12-31,0.5


In [188]:
daily_senti_df = combined_senti_df.query('indicator=="gdp" ').rename(columns={'raw_sentiment_score':'final_sentiment'} )
monthly_senti_df =get_monthly_avg_score(combined_senti_df.query('indicator=="gdp" '))
senti_df = monthly_senti_df.rename(columns={'monthly_avg_sent_score':'final_sentiment'} )

indicator_name = "GDP"
indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
#indicator_df = indicators_df[[indicator_colname]].dropna()
indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
indicator_label = indic_to_value[indicator_colname]
test_title = indicator_name + ' VS news sentiment ' # (2019 Jan - 2020 May) -- this is not the same for each indicator data

test_fig = plot_combined_graph_new(indicator_df, senti_df, indicator_label, test_title) 
test_fig.show()

test_fig2 = plot_combined_graph_scatter(indicator_df, daily_senti_df, indicator_label, test_title) 
test_fig2.show()

In [195]:
combined_senti_df.annotation_type.unique()

array(['annotated', 'predicted'], dtype=object)

In [216]:
# test annotated trace
#combined_senti_df

combined_senti_df.query('indicator=="gdp"& annotation_type=="annotated" ')

daily_senti_df = combined_senti_df.query('indicator=="gdp"& annotation_type=="predicted" ').rename(columns={'raw_sentiment_score':'final_sentiment'} )
daily_senti_df_anno = combined_senti_df.query('indicator=="gdp"& annotation_type=="annotated" ').rename(columns={'raw_sentiment_score':'final_sentiment'} )

test_fig3 = plot_combined_graph_scatter_anno(indicator_df, daily_senti_df, daily_senti_df_anno, indicator_label, test_title) 
test_fig3.show()

In [181]:
combined_senti_df.indicator.unique()

array(['gdp', 'employment', 'housing', 'interest', 'mortgage', 'stock'],
      dtype=object)

In [180]:
indicators_df

Unnamed: 0,date,value_GDP,value_TSX,value_mortgage_rates,value_employment,value_housing_prices,value_interest_rates
0,2019-01-01,1.952558e+12,15540.599609,3.84,60.8,227.4,
1,2019-02-01,1.948783e+12,15999.000000,3.82,61.1,228.2,
2,2019-03-01,1.961298e+12,16102.099609,3.76,61.0,229.8,
3,2019-04-01,1.966131e+12,16580.699219,3.59,61.6,231.3,
4,2019-05-01,1.971712e+12,16037.500000,3.45,62.7,232.0,
...,...,...,...,...,...,...,...
267,2020-05-26,,,,,,0.25
268,2020-05-27,,,,,,0.25
269,2020-05-28,,,,,,0.25
270,2020-05-29,,,,,,0.25


In [217]:
#replace real_indicator_name with variable name
indicator_name = "interest rates"
indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
#indicator_df = indicators_df[[indicator_colname]].dropna() 

# ref : indicators = ['GDP','mortgage rates','interest rates','employment','housing prices','TSX']
senti_colname_dict = {'GDP':'gdp', 'employment':'employment', 'housing prices':'housing','interest rates': 'interest', 'mortgage rates':'mortgage','TSX': 'stock'}
senti_colname = senti_colname_dict[indicator_name]

daily_senti_df = combined_senti_df.query('indicator==@senti_colname ').rename(columns={'raw_sentiment_score':'final_sentiment'} )
monthly_senti_df =get_monthly_avg_score(combined_senti_df.query('indicator==@senti_colname '))
senti_df = monthly_senti_df.rename(columns={'monthly_avg_sent_score':'final_sentiment'} )

indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
indicator_label = indic_to_value[indicator_colname]
test_title = indicator_name + ' VS news sentiment (2019 Jan - 2020 May) '

test_fig = plot_combined_graph_new(indicator_df, senti_df, indicator_label, test_title) 
test_fig.show()

test_fig2 = plot_combined_graph_scatter(indicator_df, daily_senti_df, indicator_label, test_title) 
test_fig2.show()

##adding here

daily_senti_df = combined_senti_df.query('indicator==@senti_colname & annotation_type=="predicted" ').rename(columns={'raw_sentiment_score':'final_sentiment'} )
daily_senti_df_anno = combined_senti_df.query('indicator==@senti_colname & annotation_type=="annotated" ').rename(columns={'raw_sentiment_score':'final_sentiment'} )

test_fig3 = plot_combined_graph_scatter_anno(indicator_df, daily_senti_df, daily_senti_df_anno, indicator_label, test_title) 
test_fig3.show()

In [None]:
## 7:02pm try  the version with a third trace for annotations



In [214]:
## checkpoint: works as at Jun1 8:20pm (2 indicators)
##checkpint: added vis of 6 indicators successfully Jun2 5:32pm

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

indicators = ['GDP','mortgage rates','interest rates','employment','housing prices','TSX']
# indicators = ['Gross Domestic Product','Mortgage rates','Interest rates','Employment','Housing prices','Toronto Stock Exchange(TSX) Index']
sources = ['All','Bloomberg','CBC']
default_indicator = 'GDP'
default_source = 'All'

senti_colname_dict = {'GDP':'gdp', 'employment':'employment', 'housing prices':'housing','interest rates': 'interest', 'mortgage rates':'mortgage','TSX': 'stock'}


app = dash.Dash()
app.layout = html.Div([
    
    html.H1("Hello World"),
    
    dcc.Markdown('''
    Trying out paragraph writing
    '''),
    
    dcc.Dropdown(
                id='indicator-name',
                options=[{'label': i[0].capitalize()+i[1:], 'value': i} for i in indicators],
                value=default_indicator # set a default value
            ),
    
    dcc.Dropdown(
                id='source-name',
                options=[{'label': i, 'value': i} for i in sources],
                value=default_source # set a default value
            ),
    
    html.P("Pick a chart type: "),
    
    dcc.RadioItems(
        id = 'chart-type',
        options=[
            {'label': 'Monthly average', 'value': 'line'},
            {'label': 'Daily datapoints', 'value': 'scatter'}
        ],
        value='line'
    ),
    
    dcc.Graph(id = 'indicator-senti-graph') #figure=test_fig
])

@app.callback(
Output("indicator-senti-graph","figure"),# must be a single Output item when returns only one value
[Input("indicator-name","value"),
Input("source-name","value"),
Input("chart-type","value")])
def update_graph(indicator_name, source_name,chart_type):
    
    indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
    #indicator_df = indicators_df[[indicator_colname]].dropna()
    indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
    indicator_label = indic_to_value[indicator_colname]
    test_title = indicator_name + ' VS news sentiment (2019 Jan - 2020 May) '
    
    
    senti_colname = senti_colname_dict[indicator_name]
    
    if source_name == "All":
        senti_df = combined_senti_df.query('indicator == @senti_colname')
    else:
        senti_df = combined_senti_df.query('source ==@source_name & indicator == @senti_colname ')
    
    
    daily_senti_df = senti_df.rename(columns={'raw_sentiment_score':'final_sentiment'} )
    monthly_senti_df =get_monthly_avg_score(senti_df)
    month_senti_df = monthly_senti_df.rename(columns={'monthly_avg_sent_score':'final_sentiment'} )
    

    if chart_type == 'line':
        return plot_combined_graph_new(indicator_df, month_senti_df, indicator_label, test_title) 

    else:
        return plot_combined_graph_scatter(indicator_df, daily_senti_df, indicator_label, test_title) 


app.run_server(debug=True, use_reloader=False) 

Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-068
Debugger PIN: 438-978-06

In [None]:
#below is a backup

## checkpoint: works as at Jun1 8:20pm (2 indicators)
##checkpint: added vis of 6 indicators successfully Jun2 5:32pm

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

indicators = ['GDP','mortgage rates','interest rates','employment','housing prices','TSX']
# indicators = ['Gross Domestic Product','Mortgage rates','Interest rates','Employment','Housing prices','Toronto Stock Exchange(TSX) Index']
sources = ['All','Bloomberg','CBC']
default_indicator = 'GDP'
default_source = 'All'

df = indicators_df
senti_df = combined_senti_df

app = dash.Dash()
app.layout = html.Div([
    
    html.H1("Hello World"),
    
    dcc.Markdown('''
    Trying out paragraph writing
    '''),
    
    dcc.Dropdown(
                id='indicator-name',
                options=[{'label': i[0].capitalize()+i[1:], 'value': i} for i in indicators],
                value=default_indicator # set a default value
            ),
    
    dcc.Dropdown(
                id='source-name',
                options=[{'label': i, 'value': i} for i in sources],
                value=default_source # set a default value
            ),
    
    dcc.Graph(id = 'indicator-senti-graph') #figure=test_fig
])

@app.callback(
Output("indicator-senti-graph","figure"),# must be a single Output item when returns only one value
[Input("indicator-name","value"),
Input("source-name","value")])
def update_graph(indicator_name, source_name):
    
    indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
    #indicator_df = indicators_df[[indicator_colname]].dropna()
    indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
    indicator_label = indic_to_value[indicator_colname]
    test_title = indicator_name + ' VS news sentiment (2019 Jan - 2020 May) '
    
#     if source_name == "All":
#         senti_df = senti_df
    if source_name != "All":
        senti_df = senti_df.query('source ==@source_name')
    
    
    test_fig = plot_combined_graph(indicator_df, senti_df, indicator_label, test_title)
    test_fig = plot_combined_graph_scatter(indicator_df, senti_df, indicator_label, test_title) 

    
    return test_fig


app.run_server(debug=True, use_reloader=False) 