In [2]:
import pandas as pd
import numpy as np
import nfl_data_py as nfl
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# NHL
## Read in data

In [44]:
nhl_draft=pd.read_csv('..\\..\\Data\\stata-files\\nhl_draft.csv')

In [39]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(nhl_draft['overall_pick'].value_counts().sort_index())

overall_pick
1       41
2       42
3       41
4       43
5       43
6       41
7       41
8       41
9       42
10      41
11      40
12      41
13      41
14      41
15      41
16      41
17      41
18      41
19      41
20      41
21      41
22      41
23      41
24      41
25      41
26      41
27      41
28      41
29      41
30      41
31      41
32      41
33      41
34      41
35      41
36      41
37      41
38      41
39      41
40      41
41      41
42      41
43      41
44      41
45      41
46      41
47      41
48      41
49     208
50      41
51      42
52      41
53      41
54      41
55      41
56      41
57      41
58      40
59      41
60      41
61      41
62      42
63      41
64      41
65      41
66      41
67      41
68      41
69      40
70      41
71      41
72      41
73      41
74      41
75      41
76      41
77      41
78      41
79      41
80      41
81      41
82      41
83      41
84      41
85      41
86      41
87      41
88      41
89      41
90      

## Create NHL round 

In [35]:
teams={1979:21,1991:22,1992:24,1993:26,1998:27,1999:28,2000:30,2017:31,2021:32}
num_rounds={}
for i in range(1982,2022):
    if i<1992:
        num_rounds[i]=12
    elif i < 1995:
        num_rounds[i]=11
    elif i < 2005:
        num_rounds[i]=9
    else:
        num_rounds[i]=7
nhl_draft=nhl_draft[nhl_draft['year']>=1982]

In [45]:
for index, row in nhl_draft.iterrows():
    if  row['overall_pick']+1!=nhl_draft.iloc[index+1]['overall_pick'] and row['year']==nhl_draft.iloc[index+1]['year']:
        forfeit=pd.DataFrame(({'id':0, 'overall_pick':row['overall_pick']+1,'team':'Forfeit', 'year':row['year']}), index=[0])
        nhl_draft=pd.concat([nhl_draft.iloc[:index+1],forfeit,nhl_draft.iloc[index+1:]], ignore_index=True)

In [42]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(nhl_draft['overall_pick'].value_counts().sort_index())

overall_pick
1      43
2      43
3      43
4      43
5      43
6      43
7      43
8      43
9      43
10     43
11     43
12     43
13     44
14     44
15     44
16     44
17     44
18     44
19     44
20     44
21     44
22     44
23     44
24     44
25     44
26     44
27     44
28     44
29     44
30     44
31     44
32     44
33     44
34     44
35     44
36     44
37     44
38     44
39     44
40     44
41     44
42     44
43     44
44     44
45     44
46     44
47     44
48     44
49     43
50     44
51     44
52     44
53     44
54     44
55     44
56     44
57     44
58     43
59     44
60     44
61     44
62     44
63     44
64     44
65     44
66     44
67     44
68     44
69     43
70     44
71     44
72     44
73     44
74     44
75     44
76     44
77     44
78     44
79     44
80     44
81     44
82     44
83     44
84     44
85     44
86     44
87     44
88     44
89     44
90     44
91     44
92     44
93     44
94     44
95     44
96     44
97     44
98     44
99     

In [46]:
round=[]
for index, row in nhl_draft.iterrows():
    num_teams=len(nhl_draft[nhl_draft['year']==row['year']].team.unique())
    overall=row['overall_pick']-1
    r=overall//(num_teams)+1
    round.append(r)
    comp_picks=nhl_draft[nhl_draft['year']==row['year']].overall_pick.max()%num_teams
    if comp_picks!=0:
        if round[-1]==3 and comp_picks>round.count(2)-num_teams:
            round[-1]=2
        elif round[-1]>3 and round.count(r-1)<num_teams:
            round[-1]=r-1
nhl_draft['round']=round

In [20]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(nhl_draft[['id','year','overall_pick','round']])

          id  year  overall_pick  round
0          1  2022             1      1
1          2  2022             2      1
2          3  2022             3      1
3          4  2022             4      1
4          5  2022             5      1
5          6  2022             6      1
6          7  2022             7      1
7          8  2022             8      1
8          9  2022             9      1
9         10  2022            10      1
10        11  2022            11      1
11        12  2022            12      1
12        13  2022            13      1
13        14  2022            14      1
14        15  2022            15      1
15        16  2022            16      1
16        17  2022            17      1
17        18  2022            18      1
18        19  2022            19      1
19        20  2022            20      1
20        21  2022            21      1
21        22  2022            22      1
22        23  2022            23      1
23        24  2022            24      1


## Create pick, and running variable (distRound)

In [47]:


rounds=nhl_draft['round'].to_numpy()
prev=0
pick=1
picks=[]
for r in rounds:
    if r==prev:
        pick+=1
    else:
        pick=1

    prev=r
    picks.append(pick)


nhl_draft['pick']=picks

distRound=[]
for index, row in nhl_draft.iterrows():
    numPicks=nhl_draft[(nhl_draft['year']==row['year']) & (nhl_draft['round']==row['round'])]['pick'].max()
    m=numPicks//2
    if row['pick']>m:
        distRound.append(row['pick']-numPicks)
    else:
        distRound.append(row['pick'])

nhl_draft['distRound']=distRound

## Fill NHL NA's

In [48]:
nhl_draft.columns

Index(['id', 'year', 'overall_pick', 'team', 'player', 'nationality',
       'position', 'age', 'to_year', 'amateur_team', 'games_played', 'goals',
       'assists', 'points', 'plus_minus', 'penalties_minutes',
       'goalie_games_played', 'goalie_wins', 'goalie_losses',
       'goalie_ties_overtime', 'save_percentage', 'goals_against_average',
       'point_shares', 'round', 'pick', 'teams', 'distRound', 'Years_Played',
       'position_code', 'age_of_retirement', 'pos_years',
       'pos_age_of_retirement', 'pos_games_played', 'round_years',
       'round_age_of_retirement', 'round_games_played', 'scaled_years',
       'scaled_age_of_retirement', 'scaled_games_played'],
      dtype='object')

In [49]:
nhl_draft[['to_year']]=nhl_draft[['to_year']].fillna(2022)
nhl_draft[['games_played']]=nhl_draft[['games_played']].fillna(0)
nhl_draft=nhl_draft.fillna(0)

In [52]:
nhl_draft=nhl_draft[nhl_draft['id']!=0]

## Create years played and age of retirement

In [50]:
nhl_draft['Years_Played']= nhl_draft['to_year']-nhl_draft['year']

In [51]:
nhl_draft['age_of_retirement']=nhl_draft['age']+nhl_draft['Years_Played']

## Condense position codes

In [7]:

for index, row in nhl_draft.iterrows():
    if type(row['position'])==float:
        nhl_draft.drop(index, inplace=True)

positions = nhl_draft['position'].unique()
positions

array(['LW', 'D', 'C', 'RW', 'G', 'W', 'L', 'F'], dtype=object)

In [8]:
positions=nhl_draft['position'].to_numpy()
for i in range(len(positions)):
        
    if len(positions[i])==1:
        pass
    else:
        positions[i]=positions[i][:2].strip('/;')
    if positions[i]=='Ce':
        positions[i]='C'
nhl_draft['position']=positions
nhl_draft['position'].unique()

array(['LW', 'D', 'C', 'RW', 'G', 'W', 'L', 'F'], dtype=object)

In [9]:
## Ordinal Encode Position
ord_enc = OrdinalEncoder()
nhl_draft["position_code"] = ord_enc.fit_transform(nhl_draft[["position"]])

## Standardize the data

In [53]:
## Scale by position
scaler=StandardScaler()
pos_scaled_nhl_draft=pd.DataFrame()
for pos in nhl_draft['position'].unique():
    temp=nhl_draft[nhl_draft['position']==pos].copy()
    temp['Years_Played']=scaler.fit_transform(temp[['Years_Played']])
    temp['age_of_retirement']=scaler.fit_transform(temp[['age_of_retirement']])
    temp['games_played']=scaler.fit_transform(temp[['games_played']])
    pos_scaled_nhl_draft=pd.concat([pos_scaled_nhl_draft, temp])


In [54]:
## Scale by round
scaler=StandardScaler()
round_scaled_nhl_draft=pd.DataFrame()
for round in nhl_draft['round'].unique():
    temp=nhl_draft[nhl_draft['round']==round].copy()
    temp['Years_Played']=scaler.fit_transform(temp[['Years_Played']])
    temp['age_of_retirement']=scaler.fit_transform(temp[['age_of_retirement']])
    temp['games_played']=scaler.fit_transform(temp[['games_played']])    
    round_scaled_nhl_draft=pd.concat([round_scaled_nhl_draft, temp])

In [55]:
## Scale by round and position
scaler=StandardScaler()
scaled_nhl_draft=pd.DataFrame()
for round in pos_scaled_nhl_draft['round'].unique():
    temp=pos_scaled_nhl_draft[pos_scaled_nhl_draft['round']==round].copy()
    temp['Years_Played']=scaler.fit_transform(temp[['Years_Played']])
    temp['age_of_retirement']=scaler.fit_transform(temp[['age_of_retirement']])
    temp['games_played']=scaler.fit_transform(temp[['games_played']])
    scaled_nhl_draft=pd.concat([scaled_nhl_draft, temp])

In [56]:
nhl_draft['pos_years']=pos_scaled_nhl_draft['Years_Played']
nhl_draft['pos_age_of_retirement']=pos_scaled_nhl_draft['age_of_retirement']
nhl_draft['pos_games_played']=pos_scaled_nhl_draft['games_played']


In [57]:
nhl_draft['round_years']=round_scaled_nhl_draft['Years_Played']
nhl_draft['round_age_of_retirement']=round_scaled_nhl_draft['age_of_retirement']
nhl_draft['round_games_played']=round_scaled_nhl_draft['games_played']

In [58]:
nhl_draft['scaled_years']=scaled_nhl_draft['Years_Played']
nhl_draft['scaled_age_of_retirement']=scaled_nhl_draft['age_of_retirement']
nhl_draft['scaled_games_played']=scaled_nhl_draft['games_played']

## Write to CSV

In [59]:
nhl_draft.to_csv('..\\..\\Data\\stata-files\\nhl_draft.csv',index=False)

# NFL

In [None]:
nfl_draft=pd.read_csv('..\\..\\Data\\stata-files\\nfl_draft.csv')
nfl_draft.head(32)

Unnamed: 0,season,round,pick,team,gsis_id,pfr_player_id,cfb_player_id,pfr_player_name,hof,position,...,pass_ints,rush_atts,rush_yards,rush_tds,receptions,rec_yards,rec_tds,def_solo_tackles,def_ints,def_sacks
0,1980,1,1,DET,,SimsBi00,billy-sims-1,Billy Sims,False,RB,...,0.0,1131.0,5106.0,42.0,186.0,2072.0,5.0,,,
1,1980,1,2,NYJ,,JoneLa00,lam-jones-1,Lam Jones,False,WR,...,0.0,9.0,17.0,0.0,138.0,2322.0,13.0,,,
2,1980,1,3,CIN,,MunoAn00,,Anthony Munoz,True,T,...,0.0,0.0,0.0,0.0,7.0,18.0,4.0,,,
3,1980,1,4,GNB,,ClarBr23,bruce-clark-1,Bruce Clark,False,DE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,39.5
4,1980,1,5,BAL,,DickCu00,curtis-dickey-1,Curtis Dickey,False,RB,...,0.0,937.0,4019.0,32.0,134.0,1577.0,8.0,,,
5,1980,1,6,STL,,GreeCu21,curtis-greer-1,Curtis Greer,False,DE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,69.5
6,1980,1,7,ATL,,MillJu00,junior-miller-1,Junior Miller,False,TE,...,0.0,3.0,0.0,0.0,122.0,1409.0,14.0,,,
7,1980,1,8,NYG,,HaynMa00,mark-haynes-1,Mark Haynes,False,DB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,17.0,1.0
8,1980,1,9,MIN,,MartDo21,,Doug Martin,False,DE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,61.5
9,1980,1,10,SEA,,GreeJa01,jacob-green-2,Jacob Green,False,DE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,115.5


## Create pick from overall pick

In [None]:
teams=nfl_draft['round'].to_numpy()
prev=0
pick=1
picks=[]
for p in teams:
    if p==prev:
        pick+=1
    else:
        pick=1

    prev=p
    picks.append(pick)


In [None]:
nfl_draft=nfl_draft.rename(columns={'pick':'overall_pick'})
nfl_draft['pick']=picks

## Recenter data and create running variable distRound

In [None]:
distRound=[]
for index, row in nfl_draft.iterrows():
    numPicks=nfl_draft[(nfl_draft['season']==row['season']) & (nfl_draft['round']==row['round'])]['pick'].max()
    m=numPicks//2
    if row['pick']>m:
        distRound.append(row['pick']-numPicks)
    else:
        distRound.append(row['pick'])

nfl_draft['distRound']=distRound


## Create variables and encode positions

In [None]:
nfl_draft['seasonsPlayed']=nfl_draft['to']-nfl_draft['season']
nfl_draft['age_of_retirement']=nfl_draft['age']+nfl_draft['seasonsPlayed']
ord_enc = OrdinalEncoder()
nfl_draft["position_code"] = ord_enc.fit_transform(nfl_draft[["position"]])


## Standardize the data

In [None]:
## Scale by position
scaler=StandardScaler()
pos_scaled_nfl_draft=pd.DataFrame()
for pos in nfl_draft['position'].unique():
    temp=nfl_draft[nfl_draft['position']==pos].copy()
    temp['seasonsPlayed']=scaler.fit_transform(temp[['seasonsPlayed']])
    temp['age_of_retirement']=scaler.fit_transform(temp[['age_of_retirement']])
    temp['games']=scaler.fit_transform(temp[['games']])
    pos_scaled_nfl_draft=pd.concat([pos_scaled_nfl_draft, temp])

In [None]:
## Scale by round
scaler=StandardScaler()
round_scaled_nfl_draft=pd.DataFrame()
for round in nfl_draft['round'].unique():
    temp=nfl_draft[nfl_draft['round']==round].copy()
    temp['seasonsPlayed']=scaler.fit_transform(temp[['seasonsPlayed']])
    temp['age_of_retirement']=scaler.fit_transform(temp[['age_of_retirement']])
    temp['games']=scaler.fit_transform(temp[['games']])
    round_scaled_nfl_draft=pd.concat([round_scaled_nfl_draft, temp])

In [None]:
## Scale by round and position
scaler=StandardScaler()
scaled_nfl_draft=pd.DataFrame()
for round in pos_scaled_nfl_draft['round'].unique():
    temp=pos_scaled_nfl_draft[pos_scaled_nfl_draft['round']==round].copy()
    temp['seasonsPlayed']=scaler.fit_transform(temp[['seasonsPlayed']])
    temp['age_of_retirement']=scaler.fit_transform(temp[['age_of_retirement']])
    temp['games']=scaler.fit_transform(temp[['games']])
    scaled_nfl_draft=pd.concat([scaled_nfl_draft, temp])

In [None]:
nfl_draft['pos_years']=pos_scaled_nfl_draft['seasonsPlayed']
nfl_draft['pos_age_of_retirement']=pos_scaled_nfl_draft['age_of_retirement']
nfl_draft['pos_games_played']=pos_scaled_nfl_draft['games']

nfl_draft['round_years']=round_scaled_nfl_draft['seasonsPlayed']
nfl_draft['round_age_of_retirement']=round_scaled_nfl_draft['age_of_retirement']
nfl_draft['round_games_played']=round_scaled_nfl_draft['games']

nfl_draft['scaled_years']=scaled_nfl_draft['seasonsPlayed']
nfl_draft['scaled_age_of_retirement']=scaled_nfl_draft['age_of_retirement']
nfl_draft['scaled_games_played']=scaled_nfl_draft['games']

## Write to CSV

In [None]:
nfl_draft.to_csv('..\\..\\Data\\stata-files\\nfl_draft.csv',index=False)

In [None]:
nhl_draft=pd.read_csv('..\\..\\Data\\stata-files\\nhl_draft.csv')
nhl_draft