In [1]:
import pandas as pd 
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Load the per-game player data.
# index_col=[0] reuses the CSV's first column as the index, so pandas does not add an extra index column.
player_data_csv = r'C:\Users\akobe\OneDrive\Desktop\Lighthouse\After\NHL-Stats\Scoring_Rates\Data\player_data\player_data.csv'
player_data = pd.read_csv(player_data_csv, index_col=[0])

In [9]:
# Preview the first rows of the player data.
# NOTE(review): this cell's execution count (In [9]) is out of sequence; the output shows the
# time-on-ice columns already as float minutes, i.e. it was run after the cleaning cells further down.
player_data.head()

Unnamed: 0,game_num,date,team_id,team_code,player_id,name,shoots,position,toi,assists,...,takeaways,giveaways,sh_goals,sh_assists,blocked,plus_minus,even_toi,pp_toi,sh_toi,home_away
0,2022020001,2022-10-07,18,NSH,8478508,Yakov Trenin,L,C,17.05,0,...,0,0,0,0,0,0,13.916667,0.0,3.133333,home
1,2022020001,2022-10-07,18,NSH,8475218,Mattias Ekholm,L,D,21.033333,1,...,0,1,0,0,0,2,18.633333,0.0,2.4,home
2,2022020001,2022-10-07,18,NSH,8476925,Colton Sissons,R,C,14.133333,0,...,0,1,0,0,1,-1,12.116667,0.0,2.016667,home
3,2022020001,2022-10-07,18,NSH,8476887,Filip Forsberg,R,LW,16.733333,1,...,0,1,0,0,0,1,12.616667,4.05,0.066667,home
4,2022020001,2022-10-07,18,NSH,8475798,Mikael Granlund,L,C,17.866667,1,...,0,0,0,0,0,1,13.466667,4.05,0.35,home


In [4]:
# Column dtypes and non-null counts: in this output date and toi are still object dtype,
# and fo_perc is populated for only 17747 of the 47208 rows.
# NOTE(review): 47208 entries with index labels running 0 to 5610 suggests the index is not unique — confirm.
player_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47208 entries, 0 to 5610
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   game_num    47208 non-null  int64  
 1   date        47208 non-null  object 
 2   team_id     47208 non-null  int64  
 3   team_code   47208 non-null  object 
 4   player_id   47208 non-null  int64  
 5   name        47208 non-null  object 
 6   shoots      47208 non-null  object 
 7   position    47208 non-null  object 
 8   toi         47208 non-null  object 
 9   assists     47208 non-null  int64  
 10  goals       47208 non-null  int64  
 11  shots       47208 non-null  int64  
 12  hits        47208 non-null  int64  
 13  pp_goals    47208 non-null  int64  
 14  pp_assists  47208 non-null  int64  
 15  pim         47208 non-null  int64  
 16  fo_perc     17747 non-null  float64
 17  fo_wins     47208 non-null  int64  
 18  fo_taken    47208 non-null  int64  
 19  takeaways   47208 non-null

## Cleaning

In [5]:
# Parse the date column so it holds pandas datetimes.
parsed_dates = pd.to_datetime(player_data['date'])
player_data['date'] = parsed_dates

In [6]:
# Drop the time-of-day component while keeping the column as datetime64.
# Formatting with strftime would turn every value back into a plain string
# (object dtype), undoing the datetime conversion and breaking .dt access and
# date arithmetic. normalize() sets the time to midnight instead, so the
# values still display as YYYY-MM-DD.
# pd.to_datetime is a no-op on an already-datetime column, which keeps this
# cell safe to re-run.
player_data['date'] = pd.to_datetime(player_data['date']).dt.normalize()

In [7]:
# Parse every time-on-ice column with the minutes:seconds format and keep
# only the time-of-day part of the parsed value.
toi_columns = ['toi', 'even_toi', 'pp_toi', 'sh_toi']
for toi_col in toi_columns:
    parsed_toi = pd.to_datetime(player_data[toi_col], format='%M:%S')
    player_data[toi_col] = parsed_toi.dt.time

In [8]:
# Turn each time-on-ice column into a timedelta (via its string form) and
# divide by one minute, so the values are float minutes that can be summed.
one_minute = pd.Timedelta('60s')
for toi_col in ('toi', 'even_toi', 'pp_toi', 'sh_toi'):
    toi_as_text = player_data[toi_col].astype(str)
    player_data[toi_col] = pd.to_timedelta(toi_as_text) / one_minute

## Different Player Statistics 

## Leafs players: minutes played vs. goals scored

In [10]:
# Keep only the rows whose team_code is 'TOR' (Toronto).
is_toronto = player_data['team_code'].eq('TOR')
leaf_data = player_data[is_toronto]

In [11]:
# Spot-check the Toronto subset: the two rows shown should both have team_code TOR.
leaf_data.head(2)

Unnamed: 0,game_num,date,team_id,team_code,player_id,name,shoots,position,toi,assists,...,takeaways,giveaways,sh_goals,sh_assists,blocked,plus_minus,even_toi,pp_toi,sh_toi,home_away
251,2022020015,2022-10-13,10,TOR,8470966,Mark Giordano,L,D,19.716667,1,...,1,1,0,0,2,1,15.3,2.15,2.266667,home
252,2022020015,2022-10-13,10,TOR,8475718,Justin Holl,R,D,20.466667,0,...,2,1,0,0,4,0,16.05,0.0,4.416667,home


In [12]:
# Season goal totals per player: sum goals within each name group, label the
# resulting series 'total_goals', then move the names out of the index so the
# result is a two-column frame (name, total_goals).
goals_by_player = leaf_data.groupby('name')['goals']
num_goals_season = goals_by_player.sum().rename('total_goals').reset_index()

In [13]:
# Season time-on-ice totals per player, as a two-column frame (name, total_toi).
toi_by_player = leaf_data.groupby('name')['toi']
season_toi = toi_by_player.sum().to_frame('total_toi').reset_index()

In [15]:
# Join the goal totals and the time-on-ice totals on player name; the outer
# join keeps any player that appears in either frame.
goals_toi_total = num_goals_season.merge(season_toi, how='outer', on='name')

In [20]:
# Scoring rate: season goals divided by season time on ice.
season_goals = goals_toi_total['total_goals']
season_minutes = goals_toi_total['total_toi']
goals_toi_total['goals_time'] = season_goals.div(season_minutes)

In [21]:
# Display the per-player totals together with the goals_time rate.
goals_toi_total

Unnamed: 0,name,total_goals,total_toi,goals_time
0,Alex Kerfoot,10,1198.75,0.008342
1,Alex Steeves,0,23.533333,0.0
2,Auston Matthews,40,1500.6,0.026656
3,Bobby McMann,0,105.716667,0.0
4,Calle Jarnkrok,20,1049.916667,0.019049
5,Conor Timmins,2,390.3,0.005124
6,David Kampf,7,1255.216667,0.005577
7,Denis Malgin,2,267.25,0.007484
8,Dryden Hunt,1,75.916667,0.013172
9,Erik Gustafsson,0,141.783333,0.0
