### Generating Bokeh Time Series Plots

This notebook was used to generate the plots over time used for Games 1 and 2

In [1]:
import numpy as np
import pandas as pd
import math
import re

from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, CDSView, GroupFilter, FuncTickFormatter, Span
from bokeh.plotting import figure, show, output_file
from bokeh.io import export_svgs, export_png
import colorcet as cc

In [2]:
warriors_entities = [
    "Golden State",
    "Warriors",
    "Steve Kerr",
    "Kevin Durant",
    "Stephen Curry",
    "DeMarcus Cousins",
    "Klay Thompson",
    "Draymond Green",
    "Andre Iguodala",
    "Andrew Bogut",
    "Damion Lee",
    "Jordan Bell",
    "Shaun Livingston",
    "Kevon Looney",
    "Jonas Jerebko",
    "Quinn Cook",
    "Alfonzo McKinnie",
    "Jacob Evans",
    "Damian Jones",
    "Marcus Derrickson"
]

raptors_entities = [
    "Toronto",
    "Raptors",
    "Nick Nurse",
    "Kawhi Leonard",
    "Kyle Lowry",
    "Jeremy Lin",
    "Fred VanVleet",
    "Marc Gasol",
    "Pascal Siakam",
    "Danny Green",
    "Serge Ibaka",
    "OG Anunoby",
    "Norman Powell",
    "Patrick McCaw"
    "Chris Boucher",
    "Jodie Meeks",
    "Eric Moreland",
    "Malcolm Miller",
    "Jordan Loyd",
    "Nav Bhatia",

]

league_entities = [
    "NBA finals",
    "NBA"
    ]

hashtags = [
    "#DubNation",
    "#WeTheNorth",
    "#Basketball",
    "#Sports",
    "#NBAFinals",
    "#Warriors",
    "#Raptors",
    "#GoldenState",
    "#ESPN",
    "#BBall",
    "#Dunk",
    "#Basket",
    "#StephCurry",
    "#KevinDurant",
    "#NBAbasketball",
    "#GoldenStateWarriors",
    "#Curry",
    "#Hoops",
    "#Player",
    "#Game",
    "#NBAhistory"
]

In [4]:
# Color palettes of each team's colors in hex values

raptors_palette = ['#ce1141', '#000000','#B4975A']
warriors_palette = ['#006BB6','#FDB927','#26282A']
nba_palette = ['#17408b','#c9082a','#000000']

## Game 1 Bar Plots

In [None]:
nba_senti = pd.read_csv('results/nba_finals_game1_3pm_to_midnight_pdt.csv', parse_dates=["interval_start"],
                        date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d %H:%M:%S %Z"))

nba_senti.sort_values(by='interval_start')

nba_senti['minutes_since_start'] = (nba_senti['interval_start'] - nba_senti.loc[0,'interval_start']).astype('timedelta64[m]')

nba_senti['pdt'] = pd.DatetimeIndex(nba_senti['interval_start']).tz_localize('UTC').tz_convert('US/Pacific')

nba_senti.head()

In [None]:
nba_senti.sort_values(by='interval_start').head()

In [None]:
pre_game_df = nba_senti[nba_senti.interval_start < "2019-05-31 03:11:00"]
post_game_df = nba_senti[nba_senti.interval_start > "2019-05-31 05:37:00"]

In [None]:
nba_senti['num_datapoints'].value_counts()

### Pre-Game and Post-Game Aggregation

Getting that working

In [None]:
def replace_nans(num):
    if math.isnan(num):
        return 0
    else:
        return num

def generate_pre_post_lits(entities):
    
    filtered_entities = []
    pre_list = []
    post_list = []

    for entity in entities:
        pre_total_datapoints = pre_game_df[pre_game_df.entity == entity].num_datapoints.sum()
        post_total_datapoints = post_game_df[post_game_df.entity == entity].num_datapoints.sum()

        if pre_total_datapoints > 30 and post_total_datapoints > 30:
            filtered_entities.append(entity)
            
            pre_total_senti = pre_game_df[pre_game_df.entity == entity].avg_senti.sum()
            pre_avg_senti = pre_total_senti / len(pre_game_df[pre_game_df.entity == entity])
            pre_list.append(pre_avg_senti)

            post_total_senti = post_game_df[post_game_df.entity == entity].avg_senti.sum()
            post_avg_senti = post_total_senti / len(post_game_df[post_game_df.entity == entity])
            post_list.append(post_avg_senti)
        
    pre_cleaned = [replace_nans(num) for num in pre_list]
    post_cleaned = [replace_nans(num) for num in post_list]
    
    return filtered_entities, pre_cleaned, post_cleaned
    

In [None]:
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral5

output_file("bars.html")

periods = ['Pre-game','Post-game']

CURRENT_GROUP = warriors_entities
CURRENT_NAME = "Golden State Warriors"
# Choices are warriors_palette, raptors_palette, nba_palette
CURRENT_PALETTE = warriors_palette

filtered_entities, pre_team, post_team = generate_pre_post_lits(CURRENT_GROUP)

data = {'entities' : filtered_entities,
        'pregame'   : pre_team,
        'postgame'   : post_team}

# this creates [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015), ... ]
x = [ (entity, period) for entity in filtered_entities for period in periods ]
sentis_zipped = sum(zip(data['pregame'], data['postgame']), ()) # like an hstack

source = ColumnDataSource(data=dict(x=x, sentis_zipped=sentis_zipped))

p = figure(x_range=FactorRange(*x), plot_height=450, title=f"Before and After Sentiment for {CURRENT_NAME}",
           toolbar_location=None, tools="")

p.vbar(x='x', top='sentis_zipped', width=0.9, source=source, line_color="white",
       fill_color=factor_cmap('x', palette=CURRENT_PALETTE, factors=periods, start=1, end=3))

p.xaxis.major_label_orientation = "vertical"
p.xaxis.major_label_text_font_size="12pt"
p.xaxis.group_label_orientation = "vertical"
p.xaxis.group_text_font_size = "14pt"
p.yaxis.major_label_text_font_size="12pt"
p.title.text_font_size="18pt"
p.toolbar.logo = None
p.toolbar_location = None
p.background_fill_color = None
p.border_fill_color = None

export_png(p, f"plots/{CURRENT_NAME}.png", height=600, width=800)

## Game 2 Streaming Plots (Like the Game of Thrones plots)

In [6]:
nba_senti_g2_raw = pd.read_csv('results/nba_finals_game2_5pm_to_midnight_est.csv', parse_dates=["interval_start"],
                        date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d %H:%M:%S %Z"))

nba_senti_g2_sorted = nba_senti_g2_raw.sort_values(by='interval_start')

nba_senti_g2_sorted['minutes_since_start'] = (nba_senti_g2_sorted['interval_start'] - nba_senti_g2_sorted.loc[0,'interval_start']).astype('timedelta64[m]')

nba_senti_g2_sorted['est'] = pd.DatetimeIndex(nba_senti_g2_sorted['interval_start']).tz_localize('UTC').tz_convert('US/Eastern')

nba_senti_g2_sorted.head()

Unnamed: 0,interval_start,entity,low_senti,avg_senti,high_senti,num_datapoints,minutes_since_start,est
1889,2019-06-03,#Basket,0.4939,0.494,0.4939,1,-3.0,2019-06-02 20:00:00-04:00
6581,2019-06-03,NBA Finals,-0.5574,0.226,0.7964,34,-3.0,2019-06-02 20:00:00-04:00
11545,2019-06-03,Kawhi Leonard,-0.4019,0.16,0.8519,5,-3.0,2019-06-02 20:00:00-04:00
6395,2019-06-03,Marc Gasol,0.5106,0.511,0.5106,1,-3.0,2019-06-02 20:00:00-04:00
12639,2019-06-03,Pascal Siakam,-0.4137,0.003,0.7177,3,-3.0,2019-06-02 20:00:00-04:00


In [57]:
output_file("nba_results.html")

warriors_category_palette = cc.glasbey_cool[:30]
raptors_category_palette = cc.glasbey_warm[:30]
nba_category_palette = cc.glasbey_dark[:30]

# You can change the GROUP to "warriors" and GROUP_ENTITIES to warriors_entities
# and GROUP_PALETTE to any of the palettes listed right above this comment
GROUP = "raptors"
GROUP_ENTITIES = raptors_entities
GROUP_PALETTE = nba_category_palette

source = ColumnDataSource(nba_senti_g2_sorted)

plot_size_and_tools = {'plot_height': 300, 'plot_width': 400,
                        'tools':['box_select', 'reset', 'help']}

p1 = figure(title="All Data", x_axis_type='datetime',
            **plot_size_and_tools)

#  x_range=(got_senti.loc[0,'pdt'], got_senti.loc[0,'pdt'] + pd.to_timedelta(6,'h')),


# This next block is Javascript that formats the dates.
js_formatter = FuncTickFormatter(code="""
    var utc_time = new Date(tick)
    var suffix = "PM"
    mil_time = (utc_time.getHours() + 10) % 12
    if (mil_time < 5) {
       suffix = "AM"
    }     
    return mil_time + suffix;
""")

# Let's start with all the data
p1.xaxis.formatter = js_formatter
#p1.xaxis.major_label_orientation = "vertical"
p1.xaxis.major_label_text_font_size="12pt"
p1.yaxis.major_label_text_font_size="12pt"
p1.title.text_font_size="18pt"

p1.circle(x='interval_start', y='avg_senti', source=source, color='black', alpha=0.2, size=8)

# Game started 9:11EST and ended 11:37EST according to
# https://www.theguardian.com/sport/live/2019/may/30/nba-finals-golden-state-warriors-toronto-raptors-game-1-live-score
episode_start = Span(location=pd.to_datetime("2019-06-03 03:11:00 UTC", format="%Y-%m-%d %H:%M:%S %Z"),
                      dimension='height', line_color=GROUP_PALETTE[-1],
                      line_dash='dashed', line_width=2)
episode_end = Span(location=pd.to_datetime("2019-06-03 05:37:00 UTC", format="%Y-%m-%d %H:%M:%S %Z"),
                      dimension='height', line_color=GROUP_PALETTE[-2],
                      line_dash='dashed', line_width=2)

p1.add_layout(episode_start)
p1.add_layout(episode_end)

#p1.background_fill_color = "cornsilk"
p1.toolbar.logo = None
p1.toolbar_location = None
p1.border_fill_color = None
p1.background_fill_color = None

# p1.background_fill_alpha = 0.4

# p1.output_backend = "svg"
# export_svgs(p1, "plots/overall.svg")

export_png(p1, "plots/game2/All_Data.png",height=400, width=600)

# Now the rest
got_views = dict()
got_plots = dict()

for idx, entity in enumerate(GROUP_ENTITIES):

#     par = np.polyfit(got_senti['minutes_since_start'], got_senti['avg_senti'], 1, full=True)
#     slope=par[0][0]
#     intercept=par[0][1]
#     got_senti['y_predicted'] = intercept + slope * got_senti['minutes_since_start']

    got_views[entity] = CDSView(source=source, filters=[GroupFilter(column_name='entity', group=entity)])

    got_plots[entity] = figure(title=f"Average sentiment associated with {entity}", x_axis_type='datetime', x_range=p1.x_range, **plot_size_and_tools)

    got_plots[entity].border_fill_color = None
    got_plots[entity].background_fill_color = None
    got_plots[entity].xaxis.formatter = js_formatter
#    got_plots[entity].ygrid.grid_line_color = "goldenrod"
    got_plots[entity].ygrid.grid_line_alpha = 0.2
    got_plots[entity].xgrid.grid_line_color = None
    got_plots[entity].xaxis.major_label_text_font_size="12pt"
    got_plots[entity].yaxis.major_label_text_font_size="12pt"
    got_plots[entity].title.text_font_size="18pt"
    got_plots[entity].toolbar.logo = None
    got_plots[entity].toolbar_location = None

    got_plots[entity].circle(x='interval_start', y='avg_senti', source=source, view=got_views[entity], color=GROUP_PALETTE[idx], alpha=0.2, size=8)

#     got_plots[entity].line(x='est', y='y_predicted', source=source, line_width=2, view=got_views[entity], color=got_palette[idx]) 

    episode_start = Span(location=pd.to_datetime("2019-06-03 03:11:00 UTC", format="%Y-%m-%d %H:%M:%S %Z"),
                          dimension='height', line_color=GROUP_PALETTE[-1],
                          line_dash='dashed', line_width=2)
    episode_end = Span(location=pd.to_datetime("2019-06-03 05:37:00 UTC", format="%Y-%m-%d %H:%M:%S %Z"),
                          dimension='height', line_color=GROUP_PALETTE[-2],
                          line_dash='dashed', line_width=2)

    got_plots[entity].add_layout(episode_start)
    got_plots[entity].add_layout(episode_end) 

#     got_plots[entity].output_backend = "svg"
#     export_svgs(got_plots[entity], f"plots/{entity}.svg")

    print(entity)
    export_png(got_plots[entity], f"plots/game2/{GROUP}/{entity}.png", height=400, width=600)

# show(gridplot([[p1] + [got_plots[ent] for ent in entities[0:3]],
#                [got_plots[ent] for ent in entities[3:7]],
#                [got_plots[ent] for ent in entities[7:11]],
#                [got_plots[ent] for ent in entities[11:15]],
#                [got_plots[ent] for ent in entities[15:19]],
#                [got_plots[ent] for ent in entities[19:23]],
#                [got_plots[ent] for ent in entities[23:]],
#                ]))

Toronto
Raptors
Nick Nurse
Kawhi Leonard
Kyle Lowry
Jeremy Lin
Fred VanVleet
Marc Gasol
Pascal Siakam
Danny Green
Serge Ibaka
OG Anunoby
Norman Powell
Patrick McCawChris Boucher
Jodie Meeks
Eric Moreland
Malcolm Miller
Jordan Loyd
Nav Bhatia


### VADER Scratch 

This is just to play with the sentiment analyzer and see how various text is scored.

In [50]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [52]:
TEST_TEXT = "Danny Green paying homage to Craig Sager and Stuart Scott. LOVE THIS!"

analyzer.polarity_scores(TEST_TEXT)

{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'compound': 0.7371}