In [1]:
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import re
import dateutil.parser as dparser
from multiprocessing import  Pool
import numpy as np

## Querying Data

In [None]:
# Opening connection
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, 'draft_kings', creds.user, creds.password)
connection = conn.open()
cursor = connection.cursor()

In [None]:
contests = pd.read_sql_query("""select contestID from contest_details where entryFee <= 10 and contestID in (select id from contests where IsWinnerTakeAll is null)
""",
    connection)

contests.to_csv('data/contests.csv')

In [None]:
playerOptions = pd.read_sql_query("""select * from draft_groups_players_webdriver where contestID in (select contestID from contest_details where entryFee <= 10 and contestID in (select id from contests where IsWinnerTakeAll is null))
""",
    connection)

playerOptions.to_csv('data/playerOptionsWebDriver.csv')

In [None]:
otGames = pd.read_sql_query("""
select distinct gameID
from hockey.live_feed
where gameID in (select gameID from hockey.schedules where seasonID=20212022 and gameType='R') and periodNum >= 4
""",
    connection)

otGames.to_csv('data/otGames.csv')

In [None]:
gameInfo = pd.read_sql_query("""
select gameID,
       away.abbreviation as awayTeam,
       awayTeamID,
       home.abbreviation as homeTeam,
       homeTeamID,
       date_add(gameDate, interval -4 hour ) as newGameDate
from hockey.schedules
inner join hockey.teams away on schedules.awayTeamID = away.teamID
inner join hockey.teams home on schedules.homeTeamID = home.teamID
where seasonID=20212022 and gameType='R'

""",
    connection)

gameInfo.to_csv('data/gameInfo.csv')

In [None]:
shootoutGoals = pd.read_sql_query("""
select gameID, playerID
from hockey.live_feed
where gameID in (select gameID from hockey.schedules where seasonID=20212022 and gameType='R') and
      periodNum =5 and
      playerType = 'Scorer'

""",
    connection)

shootoutGoals.to_csv('data/shootoutGoals.csv')

In [None]:
conn.close()

## Loading Data

In [2]:
shootoutGoals = pd.read_csv('data/shootoutGoals.csv',index_col=0)

In [3]:
playerOptionsWebDriver = pd.read_csv('data/playerOptionsWebDriver.csv', index_col=0)

In [4]:
contests = pd.read_csv('data/contests.csv',index_col=0)

In [5]:
playerSelections = pd.read_csv('transformed_data/playerSelections.csv',index_col=0)
playerSelections = playerSelections[playerSelections['contestID'].isin(contests['contestID'])]

In [6]:
goalieBoxscores = pd.read_csv('data/goalies_boxscores.csv', index_col=0)

In [7]:
skaterBoxscores = pd.read_csv('data/skaters_boxscores.csv', index_col=0)

In [8]:
otGames = pd.read_csv('data/otGames.csv', index_col=0)

In [9]:
gameInfo = pd.read_csv('data/gameInfo.csv',index_col=0)

## Prepping Hockey DB Data

In [10]:
# Extracting just the date
gameInfo['newGameDate'] = gameInfo['newGameDate'].apply(lambda x: x[0:10])

In [11]:
# Adding on if the game went to OT
otGames['OT'] = 1

goalieBoxscores = pd.merge(goalieBoxscores, otGames, how='left', on=['gameID'])

goalieBoxscores.loc[goalieBoxscores['OT'].isna(), 'OT'] = 0

In [12]:
# Getting the number of shootout goals in case a player went twice
shootoutGoals = shootoutGoals.groupby(['gameID', 'playerID']).size().reset_index().rename({0:'numShootOutGoals'}, axis=1)

In [13]:
# Adding on the number of shootout goals
skaterBoxscores = pd.merge(skaterBoxscores, shootoutGoals, how='left')

skaterBoxscores.loc[skaterBoxscores['numShootOutGoals'].isna(),'numShootOutGoals'] = 0

In [14]:
# Computing Skater Points

allPoints = []
for index, player in skaterBoxscores.iterrows():
    
    points = 1.3*player['blocked'] + \
                5*player['assists'] + \
                8.5*player['goals'] + \
                2*(player['shortHandedGoals'] + player['shortHandedAssists']) + \
                1.5*player['shots'] + \
                1.5*player['numShootOutGoals']
    
    if player['blocked'] >= 3:
        points += 3
        
    if player['assists'] + player['goals'] >= 3:
        points += 3
        
    if player['shots'] >= 5:
        points += 3
        
    if player['goals'] >= 3:
        points += 3
        
    allPoints.append(points)
        
skaterBoxscores['points'] = allPoints

skaterBoxscores = skaterBoxscores[['gameID','teamID','playerID','playerName','points']]

In [15]:
# Computing Goalie Points

allPoints = []

for index, goalie in goalieBoxscores.iterrows():
    points = -3.5*(goalie['shots']-goalie['saves']) + \
                0.7*goalie['shots']
    
    if goalie['shots'] == goalie['saves']:
        points += 4
        
    if goalie['shots'] >= 35:
        points += 3
        
    if (goalie['decision'] == 'L') & (goalie['OT'] == 1):
        points += 2
        
    if goalie['decision'] == 'W':
        points += 6
        
    allPoints.append(points)

goalieBoxscores['points'] = allPoints

goalieBoxscores = goalieBoxscores[['gameID','teamID','playerID','playerName','points']]

## Prepping Draft King Data

In [31]:
playerSelections = pd.merge(playerSelections.drop('entryID', axis=1), 
         playerOptionsWebDriver[['name', 'gameInfo', 'TeamAbbrev','contestID']], 
         left_on = ['contestID', 'player'],
         right_on = ['contestID', 'name'])

In [46]:
mapping = [['PHI'], 
           ['DET'], 
           ['CLS'], 
           ['BUF'], 
           ['EDM'], 
           ['TOR'], 
           ['CGY'], 
           ['NJ'], 
           ['OTT'],
           ['CHI'], 
           ['MIN'],
           ['SJ'], 
           ['ANH'],
           ['WAS'], 
           ['BOS'], 
           ['COL'], 
           ['PIT'], 
           ['LA'], 
           ['DAL'],
           ['VGK'], 
           ['NSH'], 
           ['SEA'], 
           ['CAR'], 
           ['STL'], 
           ['ARI'], 
           ['VAN'], 
           ['MON'], 
           ['NYR'],
           ['WPG'], 
           ['TB'],
           ['FLA'],
           ['NYI']]

mapping = pd.DataFrame(mapping, columns=['draftKings'])

mapping['hockeyDB'] = mapping['draftKings']

mapping.iloc[7,1] = 'NJD'
mapping.iloc[11,1] = 'SJS'
mapping.iloc[17,1] = 'LAK'
mapping.iloc[29,1] = 'TBL'
mapping.iloc[2,1] = 'CBJ'
mapping.iloc[12,1] = 'ANA'
mapping.iloc[13,1] = 'WSH'
mapping.iloc[26,1] = 'MTL'

In [47]:
playerSelections = pd.merge(playerSelections, mapping, left_on=['TeamAbbrev'], right_on='draftKings')

In [48]:
playerSelections = playerSelections.drop(
    ['draftKings','TeamAbbrev','name'],
    axis=1
).rename(
    {'hockeyDB':'teamAbbrev'}, 
    axis=1
)

In [49]:
# playerSelections = playerSelections[~(playerSelections['gameInfo'] == 'Postponed')]

In [50]:
def extract_date(df):
    dates = []
    for index, row in df.iterrows():
        try:
            date = dparser.parse(row['gameInfo'], fuzzy=True).strftime("%Y-%m-%d")
        except dparser.ParserError:
            date = None
    
    
        dates.append(date)
    
    df['gameDate'] = dates
    return df

In [51]:
def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [52]:
playerSelections = parallelize_dataframe(playerSelections, extract_date, 14)



In [54]:
# playerSelections.to_csv('transformed_data/playerSelectionsParsed.csv')

In [16]:
playerSelections = pd.read_csv('transformed_data/playerSelectionsParsed.csv',index_col=0)

In [17]:
playerSelections

Unnamed: 0,contestID,entryName,position,player,gameInfo,teamAbbrev,gameDate
0,114760201,zarnold,C,Sean Couturier,BOS@PHI 10/04/2021 07:00PM ET,PHI,2021-10-04
1,114760201,gggg76,C,Sean Couturier,BOS@PHI 10/04/2021 07:00PM ET,PHI,2021-10-04
2,114760201,RiccardoG (1/2),C,Sean Couturier,BOS@PHI 10/04/2021 07:00PM ET,PHI,2021-10-04
3,114760201,SmallT19,C,Sean Couturier,BOS@PHI 10/04/2021 07:00PM ET,PHI,2021-10-04
4,114760201,coys3380 (3/6),C,Sean Couturier,BOS@PHI 10/04/2021 07:00PM ET,PHI,2021-10-04
...,...,...,...,...,...,...,...
16445607,123158389,kj4261,W,Anders Lee,PHI@NYI 01/25/2022 07:30PM ET,NYI,2022-01-25
16445608,123158389,Joevega842,W,Anders Lee,PHI@NYI 01/25/2022 07:30PM ET,NYI,2022-01-25
16445609,123158389,ericdaoust,D,Scott Mayfield,PHI@NYI 01/25/2022 07:30PM ET,NYI,2022-01-25
16445610,123158389,besco,D,Scott Mayfield,PHI@NYI 01/25/2022 07:30PM ET,NYI,2022-01-25


In [23]:
playerSelections[playerSelections['gameDate'].isnull()]

Unnamed: 0,contestID,entryName,position,player,gameInfo,teamAbbrev,gameDate
444027,122329530,mkifer55,W,James van Riemsdyk,Postponed,PHI,
444028,122331987,Purple1purple,G,Martin Jones,Postponed,PHI,
444029,122331988,l4efs35,C,Kevin Hayes,Postponed,PHI,
444030,122331992,Purple1purple,G,Martin Jones,Postponed,PHI,
444031,122332067,Purple1purple,G,Martin Jones,Postponed,PHI,
...,...,...,...,...,...,...,...
12896478,117544201,JTroop,CPT,Scott Wedgewood,-,ARI,
12896479,117544205,Buttstains,CPT,Scott Wedgewood,-,ARI,
12896480,117544205,Buttstains,CPT,Scott Wedgewood,-,ARI,
12898317,117604694,jrsr411,CPT,Scott Wedgewood,-,ARI,


## Merging PlaySelections with Hockey DB Games

In [18]:
selectionsAndGamesAway = pd.merge(
    gameInfo, 
    playerSelections, 
    left_on=['newGameDate', 'awayTeam'],
    right_on=['gameDate', 'teamAbbrev']
)

In [19]:
selectionsAndGamesHome = pd.merge(
    gameInfo, 
    playerSelections, 
    left_on=['newGameDate', 'homeTeam'],
    right_on=['gameDate', 'teamAbbrev']
)

In [20]:
selectionsAndGames = pd.concat([selectionsAndGamesAway, selectionsAndGamesHome])

In [22]:
ds1 = set(map(tuple, selectionsAndGames[playerSelections.columns].values))
ds2 = set(map(tuple, playerSelections.values))

In [23]:
x = ds2.difference(ds1)

In [24]:
z = pd.DataFrame(list(x))

In [25]:
# Removing contests that had bad games (i.e. rescheduled etc)
selectionsAndGames = selectionsAndGames[~selectionsAndGames['contestID'].isin(z[0].drop_duplicates().values)]

In [34]:
scoresSkaters = pd.merge(selectionsAndGames, 
         skaterBoxscores, 
         left_on=['gameID', 'player'], 
         right_on=['gameID','playerName'], 
         how='inner')

In [35]:
scoresGoalies = pd.merge(selectionsAndGames, 
         goalieBoxscores, 
         left_on=['gameID', 'player'], 
         right_on=['gameID','playerName'], 
         how='inner')



In [63]:
scores = pd.concat([scoresSkaters, scoresGoalies])

In [67]:
scores['points'] = np.where(scores['position'] == 'CPT', scores['points'] * 1.5, scores['points'])

In [68]:
scores = scores[['contestID', 'entryName','points']].groupby(
    ['contestID', 'entryName']
).sum('points').sort_values('points',ascending=False).reset_index()

In [69]:
scores.sort_values(['contestID','points'],ascending=False)

Unnamed: 0,contestID,entryName,points
58366,123666691,damoney8888,184.1
70434,123666691,94expos4ever,178.1
70435,123666691,aylsg,178.1
70567,123666691,madstoper,178.1
70586,123666691,mrnorrad,178.1
...,...,...,...
1641022,115146218,cw1tt (12/20),29.4
1641026,115146218,cw1tt (3/20),29.4
1641027,115146218,cw1tt (20/20),29.4
1641029,115146218,cw1tt (19/20),29.4


In [52]:
scoreSizes = scores.groupby(['contestID']).size().reset_index()