In [219]:
import numpy as np
import pandas as pd
import fancyimpute
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cluster import KMeans
from sklearn.svm import SVC
import xgboost as xgb

In [109]:
df = pd.read_csv("data/full2.csv")
# Examine head of data
df.head()

Unnamed: 0,Year,Name,Position,Round,Pick,College,Conference,Games,Seasons,Height,...,PR.TD,KRs,KR.Yds,KR.TD,XPA,XP.,FGA,FG.,Punts,Punt.Avg
0,2008,Husain Abdullah,FS,0,0,Small,Pac-10,45,4,72,...,0,0,0,0,0,0.0,0,0.0,0,0.0
1,2008,Jamar Adams,SS,0,0,Michigan,Big Ten,38,3,74,...,0,0,0,0,0,0.0,0,0.0,0,0.0
2,2008,Xavier Adibi,OLB,4,118,Virginia Tech,ACC,47,4,74,...,0,0,0,0,0,0.0,0,0.0,0,0.0
3,2008,Erik Ainge,QB,5,162,Tennessee,SEC,43,4,77,...,0,0,0,0,0,0.0,0,0.0,0,0.0
4,2008,Branden Albert,OG,1,15,Virginia,ACC,0,0,78,...,0,0,0,0,0,0.0,0,0.0,0,0.0


In [110]:
# We know that combine data is missing, so let's impute it.
imp = df[["Position", "Height", "Weight", "X40.Yard", "Bench", "Broad.Jump", "Shuttle", "X3.Cone", "Vertical"]]
imp=imp.replace({0:np.nan})
number = LabelEncoder()
imp["Position"] = number.fit_transform(imp["Position"])
imp=fancyimpute.KNN(k=5).complete(imp)
imp = pd.DataFrame(imp)
imp.columns = ["Position", "Height", "Weight", "X40.Yard", "Bench", "Broad.Jump", "Shuttle", "X3.Cone", "Vertical"]
imp = imp.round({"X40.Yard": 2, "Bench": 0, "Broad.Jump": 0, "X3.Cone": 2, "Shuttle": 2})
imp["Vertical"] = round(imp["Vertical"] * 2) / 2
df[["Height", "Weight", "X40.Yard", "Bench", "Broad.Jump", "Shuttle", "X3.Cone", "Vertical"]] = imp.drop("Position", axis=1)

Imputing row 1/3151 with 1 missing, elapsed time: 1.772
Imputing row 101/3151 with 0 missing, elapsed time: 1.776
Imputing row 201/3151 with 0 missing, elapsed time: 1.782
Imputing row 301/3151 with 0 missing, elapsed time: 1.782
Imputing row 401/3151 with 1 missing, elapsed time: 1.792
Imputing row 501/3151 with 0 missing, elapsed time: 1.792
Imputing row 601/3151 with 0 missing, elapsed time: 1.802
Imputing row 701/3151 with 0 missing, elapsed time: 1.802
Imputing row 801/3151 with 5 missing, elapsed time: 1.812
Imputing row 901/3151 with 0 missing, elapsed time: 1.816
Imputing row 1001/3151 with 0 missing, elapsed time: 1.816
Imputing row 1101/3151 with 0 missing, elapsed time: 1.822
Imputing row 1201/3151 with 1 missing, elapsed time: 1.822
Imputing row 1301/3151 with 0 missing, elapsed time: 1.832
Imputing row 1401/3151 with 5 missing, elapsed time: 1.832
Imputing row 1501/3151 with 1 missing, elapsed time: 1.842
Imputing row 1601/3151 with 3 missing, elapsed time: 1.842
Imputing 

In [111]:
# Impute missing games and season values
imp = df[["Position", "Games", "Seasons"]]
imp=imp.replace({0:np.nan})
imp["Position"] = number.fit_transform(imp["Position"])
imp=fancyimpute.KNN(k=10).complete(imp)
imp = pd.DataFrame(imp)
imp = imp.round(0)
df[["Games", "Seasons"]] = imp.drop(0, 1)

Imputing row 1/3151 with 0 missing, elapsed time: 1.294
Imputing row 101/3151 with 0 missing, elapsed time: 1.294
Imputing row 201/3151 with 2 missing, elapsed time: 1.294
Imputing row 301/3151 with 0 missing, elapsed time: 1.298
Imputing row 401/3151 with 0 missing, elapsed time: 1.298
Imputing row 501/3151 with 0 missing, elapsed time: 1.298
Imputing row 601/3151 with 2 missing, elapsed time: 1.298
Imputing row 701/3151 with 0 missing, elapsed time: 1.302
Imputing row 801/3151 with 0 missing, elapsed time: 1.302
Imputing row 901/3151 with 0 missing, elapsed time: 1.302
Imputing row 1001/3151 with 0 missing, elapsed time: 1.302
Imputing row 1101/3151 with 0 missing, elapsed time: 1.306
Imputing row 1201/3151 with 0 missing, elapsed time: 1.306
Imputing row 1301/3151 with 0 missing, elapsed time: 1.306
Imputing row 1401/3151 with 0 missing, elapsed time: 1.306
Imputing row 1501/3151 with 0 missing, elapsed time: 1.310
Imputing row 1601/3151 with 0 missing, elapsed time: 1.310
Imputing 

# Fix Position Data

In [112]:
# Let's get some information about the positions.
df.groupby("Position").size()

Position
C        96
CB      342
DB        2
DE      246
DT      247
EDGE     19
FB       45
FS      102
ILB     153
K        46
LB        2
LS       14
NT        3
OG      125
OL        2
OLB     241
OT      175
P        60
QB      177
RB      295
S        24
SS      110
TE      166
WR      459
dtype: int64

In [113]:
df[df.Position=="DB"]

Unnamed: 0,Year,Name,Position,Round,Pick,College,Conference,Games,Seasons,Height,...,PR.TD,KRs,KR.Yds,KR.TD,XPA,XP.,FGA,FG.,Punts,Punt.Avg
2954,2018,Minkah Fitzpatrick,DB,1,11,Alabama,SEC,42.0,3.0,73.0,...,0,1,16,1,0,0.0,0,0.0,0,0.0
3039,2018,Quenton Meeks,DB,0,0,Stanford,Pac-12,36.0,3.0,74.0,...,0,0,0,0,0,0.0,0,0.0,0,0.0


In [114]:
# Fitzpatrick will play mostly as a free safety and Meeks as a CB.
df.at[2954, "Position"] = "FS"
df.at[3039, "Position"] = "CB"

In [115]:
df[df.Position=="OL"]

Unnamed: 0,Year,Name,Position,Round,Pick,College,Conference,Games,Seasons,Height,...,PR.TD,KRs,KR.Yds,KR.TD,XPA,XP.,FGA,FG.,Punts,Punt.Avg
3030,2018,Cole Madison,OL,0,0,Small,Pac-12,40.0,4.0,77.0,...,0,0,0,0,0,0.0,0,0.0,0,0.0
3073,2018,Martinas Rankin,OL,3,80,Small,SEC,41.0,4.0,76.0,...,0,0,0,0,0,0.0,0,0.0,0,0.0


In [116]:
# Madison is a guard and Rankin is a tackle.
df.at[3030, "Position"] = "OG"
df.at[3073, "Position"] = "OT"

In [117]:
df.loc[df.Position=="LB", "Position"] = "OLB"
df.loc[df.Position=="NT", "Position"] = "DT"

In [118]:
# We next need to divide free safeties and strong safeties in our 2018 data. Let's
# examine what our safeties look like in our data. Let's use a simple logistic
# regression model based on Height, Weight, Total Tackles, and Interceptions.
s = df[(df.Position=="FS") | (df.Position=="SS")]
s = s[["Position", "Height", "Weight", "Total.Tackles", "Int", "Sacks"]]
s.head()
s_predict = LogisticRegressionCV()
s_predict.fit(s.drop("Position", 1), s.Position)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [119]:
safeties = df[df.Position=="S"]
df.loc[safeties.index, "Position"] = s_predict.predict(safeties[["Height", "Weight", "Total.Tackles", "Int", "Sacks"]])

In [120]:
# Let's do the same for edge rushers, forecasting their position by height and weight differences,
# as well as total tackles
outside = df[(df.Position=="OLB") | (df.Position=="DE")]
outside = outside[["Position", "Height", "Weight", "Total.Tackles"]]
outside.head()
edge_predict = LogisticRegressionCV()
edge_predict.fit(outside.drop("Position", 1), outside.Position)
edge = df[df.Position=="EDGE"]
df.loc[edge.index, "Position"] = edge_predict.predict(edge[["Height", "Weight", "Total.Tackles"]])

# Exploratory Analysis

In [152]:
# All of our "hybrid" positions have been dealt with. Let's now examine preliminary trends.
round_info = pd.crosstab(index=df["Position"], columns=df["Round"], margins=True) 
round_info / round_info.loc["All"]

Round,0,1,2,3,4,5,6,7,All
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C,0.034156,0.023669,0.028481,0.026549,0.039275,0.013699,0.035714,0.034934,0.030467
CB,0.083491,0.130178,0.136076,0.147493,0.108761,0.136986,0.095238,0.078603,0.108854
DE,0.062619,0.121302,0.094937,0.097345,0.075529,0.05137,0.083333,0.09607,0.080292
DT,0.063567,0.100592,0.10443,0.109145,0.063444,0.065068,0.075397,0.087336,0.07934
FB,0.018027,0.0,0.0,0.00295,0.012085,0.023973,0.019841,0.039301,0.014281
FS,0.044592,0.026627,0.037975,0.038348,0.030211,0.037671,0.043651,0.026201,0.037766
ILB,0.052182,0.026627,0.047468,0.038348,0.057402,0.061644,0.051587,0.048035,0.048556
K,0.032258,0.0,0.003165,0.0,0.003021,0.013699,0.011905,0.0131,0.014599
LS,0.008539,0.0,0.0,0.0,0.0,0.003425,0.015873,0.0,0.004443
OG,0.034156,0.038462,0.031646,0.047198,0.045317,0.065068,0.039683,0.030568,0.039987


As can be seen, there are a relatively high proportion of WRs, OTs, CBs, DEs, and DTs in the first round. In later rounds, More RBs and skill positions are taken. Let's now examine the proportions of rounds each position is typically drafted in.

In [154]:
round_info = pd.crosstab(index=df["Position"], columns=df["Round"], margins=True) 
round_info.div(round_info["All"], axis=0)

Round,0,1,2,3,4,5,6,7,All
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C,0.375,0.083333,0.09375,0.09375,0.135417,0.041667,0.09375,0.083333,1.0
CB,0.25656,0.12828,0.125364,0.145773,0.104956,0.116618,0.069971,0.052478,1.0
DE,0.26087,0.162055,0.118577,0.130435,0.098814,0.059289,0.083004,0.086957,1.0
DT,0.268,0.136,0.132,0.148,0.084,0.076,0.076,0.08,1.0
FB,0.422222,0.0,0.0,0.022222,0.088889,0.155556,0.111111,0.2,1.0
FS,0.394958,0.07563,0.10084,0.109244,0.084034,0.092437,0.092437,0.05042,1.0
ILB,0.359477,0.058824,0.098039,0.084967,0.124183,0.117647,0.084967,0.071895,1.0
K,0.73913,0.0,0.021739,0.0,0.021739,0.086957,0.065217,0.065217,1.0
LS,0.642857,0.0,0.0,0.0,0.0,0.071429,0.285714,0.0,1.0
OG,0.285714,0.103175,0.079365,0.126984,0.119048,0.150794,0.079365,0.055556,1.0


At least 1/3 of players in most positions are undrafted; OTs have a significantly low amount undrafted, however. Most OTs are picked in the first round, with mostly decreasing numbers after that. We can see that positions such as kickers, punters, and fullbacks are primarily drafted in the late rounds.

# Simple Regression

In [204]:
all_X = df.drop(["Name", "Round", "Pick", "College", "Conference"], 1)
all_X = pd.get_dummies(all_X)

train_X = all_X[(all_X.Year != 2018)].drop("Year", 1)
test_X = all_X[all_X.Year == 2018].drop("Year", 1)
train_y = df[(df.Year != 2018)].Round
test_y = df[df.Year == 2018].Round

In [205]:
log = LogisticRegressionCV()
log.fit(train_X, train_y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [217]:
count = 1
for i in pd.DataFrame(log.predict_proba(test_X)).sort_values(by=1, ascending=False).index:
    print(str(count) + " " + str(df[df.Year==2018].reset_index().at[i, "Name"]))
    count += 1

1 Quinton Flowers
2 James Washington
3 Lamar Jackson
4 Darius Phillips
5 Rashaad Penny
6 Vita Vea
7 Genard Avery
8 Nick Chubb
9 D.J. Reed
10 Saquon Barkley
11 Ogbonnia Okoronkwo
12 Parry Nickerson
13 Will Hernandez
14 Harold Landry
15 Dominick Sanders
16 Baker Mayfield
17 Braden Smith
18 Courtland Sutton
19 Jojo Wicker
20 Hercules Mata'afa
21 Tim Settle
22 Quenton Nelson
23 Kylie Fitts
24 Sam Darnold
25 Dane Cruikshank
26 Wyatt Teller
27 Harrison Phillips
28 Cedrick Wilson
29 Tegray Scales
30 Andre Chachere
31 Malik Jefferson
32 Tyquan Lewis
33 Anthony Miller
34 Josh Jackson
35 J.T. Barrett
36 Rashard Fant
37 Minkah Fitzpatrick
38 Leighton Vander esch
39 Isaiah Wynn
40 Mike Hughes
41 Deshon Elliott
42 Duke Ejiofor
43 Avonte Maddox
44 Deadrin Senat
45 Jordan Akins
46 Trevon Young
47 Jester Weah
48 Christian Kirk
49 Logan Woodside
50 Taven Bryan
51 Salesi Uhatafe
52 Josh Rosen
53 Bradley Chubb
54 Duke Dawson
55 Ian Thomas
56 Kendrick Norton
57 Danny Etling
58 Josh Allen
59 Jordan Thomas


We can also use XGBoost.

In [251]:
XGB = xgb.XGBClassifier(colsample_bytree=0.7,
 eta= 0.001,
 eval_metric= 'mae',
 max_depth= 6,
 min_child_weight= 15,
 objective= 'reg:linear',
 subsample= 0.7)

In [252]:
XGB.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eta=0.001, eval_metric='mae', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=15, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [261]:
count = 1
for i in pd.DataFrame(XGB.predict_proba(test_X)).sort_values(by=1, ascending=False).index:
    print(str(count) + " " + str(df[df.Year==2018].reset_index().at[i, "Name"]))
    count += 1

1 Mike McGlinchey
2 Saquon Barkley
3 Kolton Miller
4 Terrell Edmunds
5 Troy Apke
6 Justin Reid
7 Harrison Phillips
8 David Bright
9 Jaire Alexander
10 Derwin James
11 D.J. Chark
12 Bradley Chubb
13 Josh Allen
14 Josh Rosen
15 Lamar Jackson
16 Harold Landry
17 Joseph Noteboom
18 Sam Darnold
19 Genard Avery
20 Martinas Rankin
21 Taven Bryan
22 Leighton Vander esch
23 D.J. Moore
24 Dane Cruikshank
25 Frank Ragnow
26 Riley Ferguson
27 Duke Ejiofor
28 Malik Jefferson
29 Quenton Meeks
30 Marcus Davenport
31 Denzel Ward
32 Michael Gallup
33 Rick Leonard
34 Sam Hubbard
35 Courtland Sutton
36 B.J. Hill
37 Marquis Haynes
38 Avonte Maddox
39 Minkah Fitzpatrick
40 Jojo Wicker
41 Anthony Averett
42 Brian O'neill
43 Tyquan Lewis
44 Justin Jones
45 Matthew Thomas
46 Wyatt Teller
47 Rod Taylor
48 Mike Gesicki
49 Josh Jackson
50 Scott Quessenberry
51 Kylie Fitts
52 Keke Coutee
53 Shaquem Griffin
54 Will Richardson
55 Oren Burks
56 Godwin Igwebuike
57 Isaiah Wynn
58 Chase Litton
59 Joshua Kalu
60 Cole M