In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
from plotly.subplots import make_subplots
from scipy import stats

In [3]:
## Load the player stats table; every later cell reads it through the name `df`.
# NOTE(review): the path is relative to the notebook's working directory — confirm where
# player_dataframe.csv is expected to live.
df = pd.read_csv('player_dataframe.csv')

## To do

- [x] change times to floats
- [x] add blank rows for players who didn't play games
- [x] finish bar chart
  - [x] rename game IDs to ints for bar chart
  - [x] figure out stacking
- [x] make the points-per-category weights map to fantasy points
- [x] come up with the rest of the visuals

<br>
<br>
<br>
<br>

## Data cleaning + feature engineering

In [4]:
# ==== Points per category =====

def fantasy_points(df, goals, assists, shots, hits, powerPlayGoals, powerPlayAssists, 
                   penaltyMinutes, shortHandedAssists, blocks, shortHandedGoals, faceOffPct=0, takeaways=0, giveaways=0, 
                   plusMinus=0):
    """Add a ``fantasyPoints`` column holding the weighted sum of each row's stat columns.

    Every keyword is the number of points awarded per unit of the stat column with
    the same name; the one exception is ``blocks``, which weights the ``blocked``
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``goals``, ``assists``, ``shots``, ``hits``,
        ``powerPlayGoals``, ``powerPlayAssists``, ``penaltyMinutes``, ``giveaways``,
        ``takeaways``, ``shortHandedGoals``, ``shortHandedAssists``, ``blocked``,
        ``plusMinus`` and ``faceOffPct``.
    goals, assists, shots, hits, powerPlayGoals, powerPlayAssists, penaltyMinutes,
    shortHandedAssists, blocks, shortHandedGoals : number
        Points per unit of the corresponding stat.
    faceOffPct, takeaways, giveaways, plusMinus : number, default 0
        Optional weights; the default of 0 contributes no points.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. The column is written in place, so the
        caller's frame is modified.

    Raises
    ------
    KeyError
        If any of the required stat columns is missing from ``df``.
    """
    # Stat column -> points per unit. Each column appears exactly once; the previous
    # expression added the plusMinus term twice, doubling its contribution.
    weights = {
        "goals": goals,
        "assists": assists,
        "shots": shots,
        "hits": hits,
        "powerPlayGoals": powerPlayGoals,
        "powerPlayAssists": powerPlayAssists,
        "penaltyMinutes": penaltyMinutes,
        "giveaways": giveaways,
        "takeaways": takeaways,
        "shortHandedGoals": shortHandedGoals,
        "shortHandedAssists": shortHandedAssists,
        "blocked": blocks,
        "plusMinus": plusMinus,
        "faceOffPct": faceOffPct,
    }

    total = 0
    for column, weight in weights.items():
        # A missing value in any stat still propagates to the total, as before.
        total = total + df[column] * weight

    df["fantasyPoints"] = total
    return df



In [5]:
# Score every row with this league's point weights (points awarded per unit of each stat).
# NOTE(review): power-play / short-handed weights are written as "base + bonus"; if the
# `goals` / `assists` columns already count special-teams points, those events are scored
# under both columns — confirm this matches the league rules.
df = fantasy_points(
    df,
    goals=472,
    assists=295,
    shots=37,
    hits=37,
    powerPlayGoals=472+128,
    powerPlayAssists=295+80,
    penaltyMinutes=47,
    shortHandedAssists=295+80,
    blocks=20,
    shortHandedGoals=472+128,
    faceOffPct=0,
    takeaways=0,
    giveaways=0,
    plusMinus=0,
)

df.head(15)

Unnamed: 0,playerID,playerName,gameID,teamID,timeOnIce,assists,goals,shots,hits,powerPlayGoals,...,giveaways,shortHandedGoals,shortHandedAssists,blocked,plusMinus,evenTimeOnIce,powerPlayTimeOnIce,shortHandedTimeOnIce,teamGameOrder,fantasyPoints
0,ID8480172,Jan Rutta,2021020001,14,15.916667,0,0,3,0,0,...,0,0,0,1,0,15.916667,0.0,0.0,1,131.0
1,ID8478519,Anthony Cirelli,2021020001,14,17.583333,1,1,2,3,0,...,0,0,0,0,0,16.05,0.45,1.083333,1,952.0
2,ID8478416,Erik Cernak,2021020001,14,20.316667,0,0,4,2,0,...,0,0,0,0,-1,19.666667,0.0,0.65,1,222.0
3,ID8474567,Zach Bogosian,2021020001,14,14.233333,0,0,1,3,0,...,1,0,0,0,-2,14.233333,0.0,0.0,1,148.0
4,ID8470621,Corey Perry,2021020001,14,12.666667,0,0,0,2,0,...,0,0,0,1,-2,12.216667,0.45,0.0,1,94.0
5,ID8473986,Alex Killorn,2021020001,14,19.266667,0,1,3,2,0,...,0,0,0,1,1,16.466667,1.55,1.25,1,677.0
6,ID8476453,Nikita Kucherov,2021020001,14,19.883333,0,0,0,1,0,...,1,0,0,1,-1,18.333333,1.55,0.0,1,57.0
7,ID8475167,Victor Hedman,2021020001,14,24.0,1,0,3,0,0,...,0,0,0,3,0,21.533333,1.55,0.916667,1,466.0
8,ID8479525,Ross Colton,2021020001,14,11.333333,0,0,2,3,0,...,0,0,0,1,-1,11.333333,0.0,0.0,1,205.0
9,ID8474151,Ryan McDonagh,2021020001,14,21.6,0,0,2,2,0,...,1,0,0,2,-3,20.516667,0.0,1.083333,1,188.0


In [6]:
# Hex color per teamID. The plotting cells read it with team_colors[...] (line/box color)
# and team_colors.get(...) (per-point marker color).
# NOTE(review): there is no entry for IDs 11 and 27 — team_colors[...] raises KeyError and
# .get() returns None for a missing ID; confirm those IDs cannot occur in the data.
# NOTE(review): several teams share a hex value (e.g. 5 and 6, 7 and 29) — confirm intended.
team_colors = {
    1: '#ce1126',   # Devils
    2: '#f47d30',   # Islanders
    3: '#0038a8',   # Rangers
    4: '#f74902',   # Flyers
    5: '#fcb514',   # Penguins
    6: '#fcb514', # Bruins
    7: '#002654', # Sabres
    8: '#af1e2d',   # Habs
    9: '#bf910c',   # Senators
    10: '#003e7e',   # Leafs
    12: '#ce1126',   # Canes
    13: '#b9975b',   # Panthers
    14: '#002868',   # Lightning
    15: '#cf0a2c',   # Caps
    16: '#d18a00',   # Blackhawks
    17: '#ce1126',   # Wings
    18: '#ffb81c',   # Predators
    19: '#002f87',   # Blues
    20: '#ce1126', # Flames
    21: '#6f263d',   # Avalanche
    22: '#ff4c00',   # Oilers
    23: '#001f5b',   # Canucks
    24: '#f95602', # Ducks
    25: '#006847',   # Stars
    26: '#acaea9',   # LA
    28: '#006d75',   # Sharks
    29: '#002654',   # Blue Jackets
    30: '#024930',   # Wild
    52: '#041e41',   # Jets
    53: '#8c2633', # Coyotes
    54: '#b4975a',   # Golden Knights
    55: '#9CDBD9',   # Kraken
}

<br>
<br>
<br>

## Player setup

In [21]:
# Look up the playerID that belongs to a display name (first match in the table).
player_name = 'Blake Coleman'
df.loc[df["playerName"] == player_name, "playerID"].unique()[0]

'ID8476399'

In [None]:
# ID of the player every chart below is drawn for
player = "ID8475167"

# Rows of the selected player only
reactive_df = df[df["playerID"] == player]

# Chart color and display name come from the player's first row
color = team_colors[reactive_df["teamID"].iloc[0]]
player_name = reactive_df["playerName"].iloc[0]

<br>
<br>
<br>

## Stacked bar chart

In [15]:
# ===== Stacked bar chart by time on ice & fantasy points ======
# Primary y-axis: minutes on ice per game, stacked by game situation.
# Secondary y-axis: fantasy points per game (raw points plus a LOWESS trend line).

fig = make_subplots(specs=[[{"secondary_y": True}]])

## Bar charts of time on ice: (legend name, column, bar color), added in stacking order
ice_time_bars = [
    ("Power Play Time", "powerPlayTimeOnIce", "#f58c9b"),
    ("Short Handed Time", "shortHandedTimeOnIce", "#5b96fc"),
    ("Even Strength Time", "evenTimeOnIce", "#c8c8f7"),
]
for bar_name, bar_column, bar_color in ice_time_bars:
    fig.add_trace(
        go.Bar(
            name=bar_name,
            x=reactive_df['teamGameOrder'],
            y=reactive_df[bar_column],
            offsetgroup=0,
            opacity=0.5,
            marker=dict(color=bar_color)
        ))


## Scatter chart of fantasy points
# lowess() returns its fit sorted by x and drops missing observations by default, so
# pairing its output with the frame's own x column can mismatch in order or length.
# Sort and drop missing rows first, then request fitted values in that same row order
# so x, y and the marker colors all come from identical rows.
fit_df = (
    reactive_df
    .dropna(subset=['teamGameOrder', 'fantasyPoints'])
    .sort_values('teamGameOrder')
)
fitted_points = sm.nonparametric.lowess(
    fit_df['fantasyPoints'],
    fit_df['teamGameOrder'],
    return_sorted=False,
)

fig.add_trace(
    go.Scatter(x=fit_df['teamGameOrder'], 
               y=fitted_points, # lowess fit line
               name="Fitted Fantasy Points/Game", 
               line_color=color,
               marker=dict(color=[team_colors.get(team_id) for team_id in fit_df['teamID']])
    ),
    secondary_y=True)

# Raw per-game points, colored by the team the player was on for that game
fig.add_trace(
    go.Scatter(x=reactive_df['teamGameOrder'], 
               y=reactive_df['fantasyPoints'],
               name="Fantasy Points/Game", 
               mode="markers",
               marker=dict(color=[team_colors.get(team_id) for team_id in reactive_df['teamID']]),
               opacity=.8,
               # TODO: add a hovertemplate showing the points scored and the game number
    ),
    secondary_y=True)


## Set axes titles
fig.update_yaxes(title_text="Minutes per Game", 
                 secondary_y=False, 
                 range=[0, 30], 
                 showgrid=False 
                 )
fig.update_yaxes(title_text="Fantasy Points per Game", 
                 secondary_y=True, 
                 range=[0, 4000],
                 dtick=500
                 )
fig.update_xaxes(title_text="Game number", range=[0, 82])

fig.update_layout(bargap=0, 
                  barmode="stack",
                  plot_bgcolor='#f7f7f7',
                  paper_bgcolor='rgba(0,0,0,0)',
                  title=f"{player_name}'s Fantasy Points & Time on Ice",
                  )
# Only the markers-only trace (raw points) is restyled here
fig.update_traces(
    marker=dict(size=6, 
                line=dict(width=0),
                ),
    selector=dict(mode="markers"),
)

fig.show()


<br>
<br>
<br>

## Point Distribution Chart

In [7]:
# Minimum time on ice (minutes) a game needs to be included in either box
mins = 10

# Compare the selected player's per-game fantasy points with all players'.
# Both boxes use the same `mins` threshold so the comparison and the title stay in sync
# when the threshold is changed.
fig_box = go.Figure()
fig_box.add_trace(go.Box(
    x=df.query("timeOnIce > @mins")['fantasyPoints'],
    name="All players", 
    marker_color='grey',
    line_color='grey'
    )
)
fig_box.add_trace(go.Box(
    x=reactive_df.query("timeOnIce > @mins")['fantasyPoints'],
    name=f"{player_name}",
    marker_color=color,
    line_color=color
    )
)
fig_box.update_xaxes(title_text="Fantasy points per game", range=[-5, 4005])

fig_box.update_layout(plot_bgcolor='#f7f7f7',
                  paper_bgcolor='rgba(0,0,0,0)',
                  title=f"{player_name}'s Fantasy Points vs. All Skaters (games with >{mins} minutes played)",
                  )

fig_box.show()

<br>
<br>
<br>
<br>
<br>
<br>

## Info about the player

In [8]:
# Summary numbers for the selected player: mean and sample standard deviation (ddof=1)
# of per-game fantasy points, number of rows, and how many times the team changed.
mean = np.mean(reactive_df['fantasyPoints'])
std_dev = np.std(reactive_df['fantasyPoints'], ddof=1)
games_played = len(reactive_df)

# More than one distinct teamID is reported as (number of teams - 1) trades
team_count = len(reactive_df['teamID'].unique())
if team_count > 1:
    traded = f"{team_count - 1}: Amount of times {player_name} was traded this season"
else:
    traded = f"{player_name} was not traded this season"


# print the results
print(f"{mean:.0f}: Average Points per Game")
print(f"{std_dev:.0f}: Standard Deviation of Points per Game")
print(f"{games_played:.0f}: Games played")
print(traded)


721: Average Points per Game
589: Standard Deviation of Points per Game
80: Games played
Victor Hedman was not traded this season


In [9]:
# Histogram of every player's mean fantasy points per game, with the selected player's
# mean marked by a dashed vertical line.

# Computed here so the cell does not depend on a `mean` variable left behind by
# whichever other cell ran last.
player_mean = reactive_df['fantasyPoints'].mean()

fig_pts_hist = px.histogram(df.groupby("playerID")['fantasyPoints'].mean(),
                   nbins=50,
                   color_discrete_sequence=['grey']
                   )
fig_pts_hist.add_vline(player_mean,
                       line_color=color,
                       line_width=2, 
                       line_dash='dash'
                       )
fig_pts_hist.add_annotation(x=player_mean, 
                            y=100, 
                            text=f'{player_name} averaged:<br> {player_mean:.0f} points/game', 
                            showarrow=True, 
                            arrowhead=1,
                            )
fig_pts_hist.update_layout(plot_bgcolor='#f7f7f7',
                           paper_bgcolor='rgba(0,0,0,0)',
                           title=f"{player_name}'s Average (mean) Points/Game vs. All Players",
                           )
fig_pts_hist.update_yaxes(title_text="Count per Bin",
                          showgrid=False 
                          )
fig_pts_hist.update_xaxes(title_text="Average Points/Game", 
                          showgrid=True,
                          dtick=100
                          )


fig_pts_hist.show()

In [10]:
# Histogram of each player's consistency ratio: mean points per game divided by the
# standard deviation of points per game. Despite its name, `std_array` holds that ratio.
points_by_player = df.groupby("playerID")['fantasyPoints']
std_array = points_by_player.mean() / points_by_player.std()
# Keep ratios below 2 only, matching the x-axis range used below
std_array = std_array[std_array < 2]

# Same ratio for the selected player, computed once and reused for line and annotation
player_ratio = reactive_df["fantasyPoints"].mean() / reactive_df["fantasyPoints"].std()

fig_pts_hist = px.histogram(std_array,
                   nbins=50,
                   color_discrete_sequence=['grey']
                   )
fig_pts_hist.add_vline(player_ratio,
                       line_color=color,
                       line_width=2, 
                       line_dash='dash'
                       )
fig_pts_hist.add_annotation(x=player_ratio, 
                            y=60, 
                            text=f'{player_name}', 
                            showarrow=True, 
                            arrowhead=1,
                            )
fig_pts_hist.update_layout(plot_bgcolor='#f7f7f7',
                           paper_bgcolor='rgba(0,0,0,0)',
                           title=f"{player_name}'s Variability vs. All Players (higher is better: more consistent)",
                           )
fig_pts_hist.update_yaxes(title_text="Count per Bin",
                          showgrid=False 
                          )
fig_pts_hist.update_xaxes(title_text="Average Points/Game divided by Standard Deviation in Points", 
                          range=[0, 2],
                          showgrid=True,
                          # Tick spacing sized for the 0-2 range; the previous value of 100
                          # left the axis without usable ticks or grid lines.
                          dtick=0.25
                          )


fig_pts_hist.show()

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

# SCRAP

In [241]:
# Scratch: distribution of per-game fantasy points for rows with timeOnIce above 10
fig = px.histogram(df[df["timeOnIce"] > 10], x="fantasyPoints", nbins=50)
fig.show()

In [None]:
# Scratch: read the `.minute` attribute of the timeOnIce value at index label 4 for one player.
# NOTE(review): this needs timeOnIce to hold time-like objects; the table shown earlier in
# the notebook displays timeOnIce as floats, so this cell looks stale — confirm before re-running.
df.query("playerID == 'ID8470621'")['timeOnIce'][4].minute    ##).total_seconds()

12

In [None]:
### Parse the ice-time columns with the '%M:%S' format and keep only the time-of-day part

timecolumn = ['timeOnIce', 'evenTimeOnIce', 'powerPlayTimeOnIce', 'shortHandedTimeOnIce']

for column_name in timecolumn:
    print(column_name)
    parsed = pd.to_datetime(df[column_name], format='%M:%S')
    df[column_name] = parsed.dt.time




timeOnIce
evenTimeOnIce
powerPlayTimeOnIce
shortHandedTimeOnIce


In [None]:
# Confidence interval for the selected player's mean fantasy points per game,
# based on Student's t distribution.
points = reactive_df['fantasyPoints']
n_games = len(points)

# sample mean and sample standard deviation (ddof=1)
mean = np.mean(points)
std_dev = np.std(points, ddof=1)

# set the confidence level and degrees of freedom
confidence_level = 0.95
df_len = n_games - 1

# two-sided critical value; `t` was never imported in this notebook, so the
# distribution is taken from scipy.stats (imported in the first cell)
cv = stats.t.ppf((1 + confidence_level) / 2, df_len)

# half-width of the interval: critical value times the standard error of the mean
margin = cv * std_dev / np.sqrt(n_games)
lower_bound = mean - margin
upper_bound = mean + margin

# print the results
print("Mean: {:.2f}".format(mean))
print("Standard Deviation: {:.2f}".format(std_dev))
print("{:.0%} Confidence Interval: [{:.2f}, {:.2f}]".format(confidence_level, lower_bound, upper_bound))


Mean: 811.12
Standard Deviation: 603.23
95% Confidence Interval: [675.11, 947.12]


In [26]:
    # Scratch: Plotly Express scatter of fantasy points by game order with a LOWESS trendline.
    # NOTE(review): `bar_df` is not defined anywhere in the visible notebook — presumably an
    # earlier name for `reactive_df`; confirm, then rename or delete this cell.
    px.scatter(x=bar_df['teamGameOrder'], 
               y=bar_df['fantasyPoints'], 
            #    name="Fantasy Points per Game", 
            #    mode="markers",
               trendline="lowess",
    )

In [None]:
# Scratch: failed attempt to derive a numeric ice-time column.
# NOTE(review): this raises AttributeError (see the recorded output) because a Series has
# no `.minute` / `.second`; the values would have to be read per element. It also looks
# like it adds a minute count to a second count without converting units — confirm the
# intent, or delete the cell now that the to-do list marks the float conversion as done.
df['timeOnIceMinutes'] = df['timeOnIce'].minute + df['timeOnIce'].second

AttributeError: 'Series' object has no attribute 'minute'

In [None]:
# Scratch: turn an "MM:SS" string into total seconds (minutes * 60 + seconds)
sum(int(part) * factor for part, factor in zip('01:01'.split(':'), (60, 1)))

61

In [None]:
# Scratch: number of distinct gameID values recorded for teamID 25
len(df.loc[df["teamID"] == 25, "gameID"].unique())

81

In [None]:
# df['newCol'] = df['timeOnIce'].apply(lambda x: int(str.split(x, ':')[0])*60 + int(str.split(x, ':')[1]))