In [98]:
## below test Jon's visualization code

import dash
import dash_core_components as dcc
import dash_html_components as html
from datetime import datetime
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import random
from plotly.subplots import make_subplots

def parse_govt_data(df, daterow_ind, valuerow_ind, datetime_format_in):
    """returns a list of datetime strings and financial indicator values from 
    dataframe of govt data
    
    assume datetime format for output datetime list = YYYY-MM-DD
    
    """
    date_row = df.iloc[daterow_ind]
    date_row = date_row[2:]
    date_list = date_row.tolist()
    datetime_list = []

    for item in date_list:
        dt = datetime.strptime(item, datetime_format_in)
        datetime_list.append(dt.strftime("%Y-%m-%d"))


    value_row = df.iloc[valuerow_ind]
    value_row = value_row[2:]
    value_list = value_row.tolist()
    
    assert (len(value_list) == len(datetime_list))

    return datetime_list, value_list



filepath = r"C:\Users\gen80\OneDrive\Documents\MDSlectures\capstone_sentiment_analysis\testout\mortgage_rate_jan-2019_feb-2020.csv"
mortgage_df = pd.read_csv(filepath, skiprows=5)
mortgage_df.head()

datetime_mortgage, values_mortgage = parse_govt_data(mortgage_df, 0, 2, "%B %Y")

final_df = pd.DataFrame(values_mortgage, 
                        index =datetime_mortgage, 
                        columns =['values']).reset_index()

final_df["indicator"] = "mortgage"
test_mrg_df = final_df.rename(columns={"index": "dates"})

test_mrg_df.head()

#make toy sentiment data
test_senti = [random.uniform(-1, 1) for i in range(len(values_mortgage))]
test_senti_df = pd.DataFrame(test_senti, columns=["final_sentiment"])
test_senti_df.head()

Unnamed: 0,final_sentiment
0,0.041407
1,0.379483
2,-0.389945
3,0.669241
4,0.70661


In [118]:
## use senti_df produced from Aaron's code -- containing continuous sentiment score

import pandas as pd
from datetime import datetime

def generate_raw_sentiment_score(row):
    '''calculate sentiment score based on best_label'''
    if row['best_label'] == 1:
        result = row['best_confidence'] + 0.5
    elif row['best_label'] == -1:
        result = -row['best_confidence'] - 0.5
    else:
        # total = row['best_confidence'] + row['second_confidence'] + row['least_confidence'] these add up to 1
        if row['second_likely'] == 1:
            result = row['second_confidence'] - row['least_confidence']
        else:
            result = row['least_confidence'] - row['second_confidence']
    return result

def get_raw_sentiment_score(csvpath):
    '''outputs a dataframe that contains the raw sentiment score for all the articles'''
    df = pd.read_csv(csvpath)
    df['publishedAt'] = df['publishedAt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').date())
    df['raw_sentiment_score'] = df.apply(lambda row: generate_raw_sentiment_score(row), axis=1)
    return df

url_gdp_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/master/week_4/predictions/prediction_output/unannotated_GDP_Bloomberg_predictions.csv?token=AAAAOMT7F4GPS4GRINYCZC2633F5Q'
gdp_with_raw = get_raw_sentiment_score(url_gdp_prediction)

In [126]:
url_gdp_prediction = 'https://raw.github.ubc.ca/ltian05/better_dwelling_capstone/amylam/sentiment_analyzer/data/prediction_output/cbc/unannotated_GDP_CBC_predictions.csv?token=AAAAPK4DI6H43GJIUKXHWJK64AHL6'
gdp_with_raw2 = get_raw_sentiment_score(url_gdp_prediction)

In [132]:
# unify the source naming for later filtering

test_senti_df = gdp_with_raw[['publishedAt','source','title_desc','raw_sentiment_score']]
test_senti_df['source'] = "Bloomberg"
test_senti_df.head()

test_senti_df2= gdp_with_raw2[['publishedAt','source','title_desc','raw_sentiment_score']]
test_senti_df2['source'] = "CBC"
test_senti_df2.head()

mixedsource_senti_df = pd.concat([test_senti_df,test_senti_df2])
mixedsource_senti_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,publishedAt,source,title_desc,raw_sentiment_score
0,2020-04-30,Bloomberg,Economic growth stalled in February ahead of v...,-1.497521
1,2020-04-29,Bloomberg,America’s longest economic expansion is over —...,-1.498286
2,2020-04-16,Bloomberg,"Once seen as safer than gold, Canadian real es...",-1.490286
3,2020-04-16,Bloomberg,China won't be able to bail us out this time. ...,-1.473306
4,2020-04-07,Bloomberg,Michael Burry of 'The Big Short' slams coronav...,-1.465097


In [139]:
mixedsource_senti_df = mixedsource_senti_df.sort_values('publishedAt')
mixedsource_senti_df = mixedsource_senti_df.rename(columns= {"raw_sentiment_score":"final_sentiment"})

mixedsource_senti_df

Unnamed: 0,publishedAt,source,title_desc,final_sentiment
43,2019-04-01,Bloomberg,"Stephen Poloz confident slowdown is temporary,...",-1.358499
42,2019-04-02,Bloomberg,WTO says tariff war will hammer global trade g...,-1.363895
41,2019-04-15,Bloomberg,"Trump is slamming the Fed again, saying stocks...",1.346158
40,2019-04-15,Bloomberg,"Trump is slamming the Fed again, saying stocks...",1.346158
39,2019-04-29,Bloomberg,Why are economists so bad at forecasting reces...,0.035580
...,...,...,...,...
11,2020-05-09,CBC,Deficit reduction will have to wait for the ec...,-0.272068
7,2020-05-13,CBC,England tiptoes out of lockdown as economy div...,-1.440326
5,2020-05-13,CBC,Federal deficit likely to be higher than $252 ...,-1.494338
6,2020-05-14,CBC,New Zealand plans spending spree to counter vi...,-1.161278


In [109]:
def plot_combined_graph(indicator_df, senti_df, indicator_name="y-axis label", title="Default Title", add_rangeslide=True): 
    """
    returns a Plotly graph given a dataframes containing financial indicator data and 
    a dataframe containing sentiment values 
    
    INPUT:
    
    indicator_df: dataframe containing a "date" column of datetimes and a "values" column of float values
    senti_df: dataframe containing a "final_sentiment" column of float values
    
    indicator_name: string label for y axis (what is tracked in indicator_df["values"])
    title: string of title 
    add_rangeslide: boolean of displaying
    
    
    """
    
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    indi_y = indicator_df["values"].astype('float64')
    dates = indicator_df["dates"]
    senti_y = senti_df["final_sentiment"].astype('float64')
    
    #check that 
    #assert(len(dates) == len(senti_y))
    
    #Add indicator area visualization - set boundaries, add to fig object, update axis
    indi_axis_min = min(indi_y) - 0.1
    indi_axis_max = max(indi_y) + 0.1

    fig.add_trace(
        go.Scatter(x=dates, 
                   y=indi_y, 
                    name=indicator_name,
                    fill='tonexty', 
                    mode='lines', 
                    line_color='blue'),
                    secondary_y=False)
    fig['layout']['yaxis1'].update(title=indicator_name, 
                                   range=[indi_axis_min, indi_axis_max], 
                                   autorange=False)
    
    #Add sentiment line visualization - set boundaries, add to fig object, update axis
    senti_axis_min = min(senti_y) - 1
    senti_axis_max = max(senti_y) + 1
    
    fig.add_trace(
        go.Scatter(x=dates, 
                   y=senti_y, 
                   name="test sentiment", ## we should change this? 
                  line_color='orange'),
                    secondary_y=True)
    fig['layout']['yaxis2'].update(title='Sentiment Score', 
                                   range=[senti_axis_min, senti_axis_max])
    
    #update x axis, add title, and show
    fig.update_xaxes(rangeslider_visible=add_rangeslide, title_text="Date")
    fig.update_layout(title_text=title)
    #fig.show()
    return fig

In [145]:
#test function  
test_indicator_name = "Interest Rate  <br> (total, funds advanced, <br> residential mortgages, insured)"
test_title = "Canadian News Sentiment - Mortgage Rate (test)"

###  show all mixed-source

test_fig = plot_combined_graph(test_mrg_df, mixedsource_senti_df, test_indicator_name, test_title)
test_fig.show()


In [144]:
source_name = "CBC"

test_fig = plot_combined_graph(test_mrg_df, mixedsource_senti_df.query('source==@source_name '), test_indicator_name, test_title)
test_fig.show()

In [143]:
source_name = "Bloomberg"

test_fig = plot_combined_graph(test_mrg_df, mixedsource_senti_df.query('source==@source_name '), test_indicator_name, test_title)
test_fig.show()

In [146]:
# test drawing test_fig in Dash app

import dash
import dash_core_components as dcc
import dash_html_components as html

app = dash.Dash()
app.layout = html.Div([
    
    html.H1("Hello World"),
    
    dcc.Markdown('''
    Trying out paragraph writing
    '''),
    
    dcc.Graph(figure=test_fig)
])

app.run_server(debug=True, use_reloader=False)  

Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger PIN: 719-041-941
Debugger P

In [155]:
#need to make a giant indicator df for cross-filtering 

#simulation data
indicators_df = pd.merge(test_mrg_df,test_emp_df,on="dates",how='outer') #both 14 rows
indicators_df['values_y'] = range(14)

In [158]:
indicators_df = indicators_df[['dates','values_x',"values_y"]]

In [161]:
#above to be replaced by a giant indicators_df created by Jon's function

indicators_df.columns= ['date','value_mortgage_rates',"value_GDP"]
indicators_df = indicators_df.set_index('date')
indicators_df

Unnamed: 0_level_0,value_mortgage_rates,value_GDP
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,3.84,0
2019-02-01,3.82,1
2019-03-01,3.76,2
2019-04-01,3.59,3
2019-05-01,3.45,4
2019-06-01,3.33,5
2019-07-01,3.22,6
2019-08-01,3.16,7
2019-09-01,3.12,8
2019-10-01,3.12,9


In [164]:
# try get a subset

indicator_name = 'mortgage rates'

indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	

indicators_df[indicator_colname]

date
2019-01-01    3.84
2019-02-01    3.82
2019-03-01    3.76
2019-04-01    3.59
2019-05-01    3.45
2019-06-01    3.33
2019-07-01    3.22
2019-08-01    3.16
2019-09-01    3.12
2019-10-01    3.12
2019-11-01    3.13
2019-12-01    3.23
2020-01-01    3.24
2020-02-01    3.19
Name: value_mortgage_rates, dtype: object

In [201]:
# dictionary of y-axis labels from Jon


indic_to_value = {}
indic_to_value["value_GDP"] = "Chained (2012) Dollars (seasonally adjusted)"
indic_to_value["value_TSX"] = "S&P/TSX Composite Index - Close (Unadjusted - CAD)"
indic_to_value["value_mortgage_rates"]= "Interest Rate - Total, funds advanced, residential mortgages, insured"
indic_to_value["value_interest_rates"] = "Interest Rate - Overnight money market financing"
indic_to_value["value_employment"] = "Employment Rate - Both sexes, 15 years and over"
indic_to_value["interest_rate"] = "Interest Rate - Overnight Target Rate (Bank of Canada)"

#housing is missing 
indic_to_value["value_housing_prices"] = 'housing prices'





In [185]:
#giant indicators df imported
indicators_df = pd.read_csv('combined_indicator_data.csv')#,index_col='date')
indicators_df

Unnamed: 0,date,value_GDP,value_TSX,value_mortgage_rates,value_employment,value_housing_prices,value_interest_rates
0,2019-01-01,1.952558e+12,15540.599609,3.84,60.8,227.4,
1,2019-02-01,1.948783e+12,15999.000000,3.82,61.1,228.2,
2,2019-03-01,1.961298e+12,16102.099609,3.76,61.0,229.8,
3,2019-04-01,1.966131e+12,16580.699219,3.59,61.6,231.3,
4,2019-05-01,1.971712e+12,16037.500000,3.45,62.7,232.0,
...,...,...,...,...,...,...,...
267,2020-05-26,,,,,,0.25
268,2020-05-27,,,,,,0.25
269,2020-05-28,,,,,,0.25
270,2020-05-29,,,,,,0.25


In [204]:
indicators_df[['date','value_interest_rates']].dropna().rename(columns={'value_GDP':"values","date":"dates"})

Unnamed: 0,dates,value_interest_rates
5,2019-05-31,1.75
7,2019-06-03,1.75
8,2019-06-04,1.75
9,2019-06-05,1.75
10,2019-06-06,1.75
...,...,...
267,2020-05-26,0.25
268,2020-05-27,0.25
269,2020-05-28,0.25
270,2020-05-29,0.25


In [203]:
senti_df

Unnamed: 0,publishedAt,source,title_desc,final_sentiment
43,2019-04-01,Bloomberg,"Stephen Poloz confident slowdown is temporary,...",-1.358499
42,2019-04-02,Bloomberg,WTO says tariff war will hammer global trade g...,-1.363895
41,2019-04-15,Bloomberg,"Trump is slamming the Fed again, saying stocks...",1.346158
40,2019-04-15,Bloomberg,"Trump is slamming the Fed again, saying stocks...",1.346158
39,2019-04-29,Bloomberg,Why are economists so bad at forecasting reces...,0.035580
...,...,...,...,...
11,2020-05-09,CBC,Deficit reduction will have to wait for the ec...,-0.272068
7,2020-05-13,CBC,England tiptoes out of lockdown as economy div...,-1.440326
5,2020-05-13,CBC,Federal deficit likely to be higher than $252 ...,-1.494338
6,2020-05-14,CBC,New Zealand plans spending spree to counter vi...,-1.161278


In [205]:
indicator_name = "GDP"
indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
#indicator_df = indicators_df[[indicator_colname]].dropna()
indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
indicator_label = indic_to_value[indicator_colname]
test_title = indicator_name + 'vs news sentiment (2019 Jan - 2020 May) '

test_fig = plot_combined_graph(indicator_df, senti_df, indicator_label, test_title) 
test_fig.show()

In [206]:
indicator_name = "interest rates"
indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
#indicator_df = indicators_df[[indicator_colname]].dropna()
indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
indicator_label = indic_to_value[indicator_colname]
test_title = indicator_name + 'vs news sentiment (2019 Jan - 2020 May) '

test_fig = plot_combined_graph(indicator_df, senti_df, indicator_label, test_title) 
test_fig.show()

In [None]:
## checkpoint: works as at Jun1 8:20pm

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

indicators = ['GDP','mortgage rates','interest rates','employment','housing prices','TSX']
sources = ['All','Bloomberg','CBC']
default_indicator = 'GDP'
default_source = 'All'

df = indicators_df
senti_df = mixedsource_senti_df

app = dash.Dash()
app.layout = html.Div([
    
    html.H1("Hello World"),
    
    dcc.Markdown('''
    Trying out paragraph writing
    '''),
    
    dcc.Dropdown(
                id='indicator-name',
                options=[{'label': i, 'value': i} for i in indicators],
                value=default_indicator # set a default value
            ),
    
    dcc.Dropdown(
                id='source-name',
                options=[{'label': i, 'value': i} for i in sources],
                value=default_source # set a default value
            ),
    
    dcc.Graph(id = 'indicator-senti-graph') #figure=test_fig
])

@app.callback(
Output("indicator-senti-graph","figure"),# must be a single Output item when returns only one value
[Input("indicator-name","value"),
Input("source-name","value")])
def update_graph(indicator_name, source_name):
    
    indicator_colname = 'value_'+"_".join(indicator_name.split())# value_mortgage_rates	
    #indicator_df = indicators_df[[indicator_colname]].dropna()
    indicator_df = indicators_df[['date', indicator_colname]].dropna().rename(columns={indicator_colname:"values","date":"dates"})
    indicator_label = indic_to_value[indicator_colname]
    test_title = indicator_name + ' VS news sentiment (2019 Jan - 2020 May) '
    
#     if source_name == "All":
#         senti_df = senti_df
    senti_df =  mixedsource_senti_df
    if source_name != "All":
        senti_df = senti_df.query('source ==@source_name  ')
    
    
    test_fig = plot_combined_graph(indicator_df, senti_df, indicator_label, test_title) 

    
    return test_fig


app.run_server(debug=True, use_reloader=False) 

Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on htt