# Import Libraries

In [56]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

# Obtain Data

Read the CSV file containing the combine data into a pandas DataFrame.

In [57]:
# Load the combine CSV (path is relative to the notebook's working directory)
# and preview the first rows to confirm the columns parsed as expected.
df = pd.read_csv('Data/Combine_Data.csv')
df.head()

Unnamed: 0,Year,Player,Position,School,College Stats,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump,3 Cone,Shuttle,Draft
0,2015,Ameer Abdullah,RB,Nebraska,College Stats,5-9,205.0,4.6,42.5,24.0,130.0,6.79,3.95,Detroit Lions / 2nd / 54th pick / 2015
1,2015,Nelson Agholor,WR,USC,College Stats,6-0,198.0,4.42,,12.0,,,,Philadelphia Eagles / 1st / 20th pick / 2015
2,2015,Jay Ajayi,RB,Boise State,College Stats,6-0,221.0,4.57,39.0,19.0,121.0,7.1,4.1,Miami Dolphins / 5th / 149th pick / 2015
3,2015,Kwon Alexander,OLB,LSU,College Stats,6-1,227.0,4.55,36.0,24.0,121.0,7.14,4.2,Tampa Bay Buccaneers / 4th / 124th pick / 2015
4,2015,Mario Alford,WR,West Virginia,College Stats,5-8,180.0,4.43,34.0,13.0,121.0,6.64,4.07,Cincinnati Bengals / 7th / 238th pick / 2015


Filter `df` to include only players whose Position is WR or CB.

In [58]:
# Keep only wide receivers (WR) and cornerbacks (CB).
# The chained reset_index returns a new, renumbered DataFrame rather than
# mutating the filtered selection in place, so later cells that assign
# columns work on an independent frame.
df = df[df['Position'].isin(['WR', 'CB'])].reset_index(drop=True)
print('df shape:', df.shape)
df.head()

df shape: (506, 14)


Unnamed: 0,Year,Player,Position,School,College Stats,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump,3 Cone,Shuttle,Draft
0,2015,Nelson Agholor,WR,USC,College Stats,6-0,198.0,4.42,,12.0,,,,Philadelphia Eagles / 1st / 20th pick / 2015
1,2015,Mario Alford,WR,West Virginia,College Stats,5-8,180.0,4.43,34.0,13.0,121.0,6.64,4.07,Cincinnati Bengals / 7th / 238th pick / 2015
2,2015,Dres Anderson,WR,Utah,College Stats,6-1,187.0,4.54,,13.0,,,,
3,2015,Kenny Bell,WR,Nebraska,College Stats,6-1,197.0,4.42,41.5,7.0,129.0,6.66,4.15,Tampa Bay Buccaneers / 5th / 162nd pick / 2015
4,2015,Da'Ron Brown,WR,Northern Illinois,College Stats,6-0,205.0,4.54,37.0,17.0,120.0,7.04,4.11,Kansas City Chiefs / 7th / 233rd pick / 2015


# Scrub Data

In [59]:
# Summarize dtypes and non-null counts to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           506 non-null    int64  
 1   Player         506 non-null    object 
 2   Position       506 non-null    object 
 3   School         506 non-null    object 
 4   College Stats  464 non-null    object 
 5   Height         506 non-null    object 
 6   Weight         506 non-null    float64
 7   40 Yd Dash     453 non-null    float64
 8   Vertical       424 non-null    float64
 9   Bench          385 non-null    float64
 10  Broad Jump     420 non-null    float64
 11  3 Cone         299 non-null    float64
 12  Shuttle        309 non-null    float64
 13  Draft          318 non-null    object 
dtypes: float64(7), int64(1), object(6)
memory usage: 55.5+ KB


Drop columns not useful for modeling

In [60]:
# Remove columns that are not used as model inputs; reassign the result
# instead of mutating the frame in place.
df = df.drop(['Draft', 'School', 'Player', 'College Stats'], axis='columns')
df.head()

Unnamed: 0,Year,Position,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump,3 Cone,Shuttle
0,2015,WR,6-0,198.0,4.42,,12.0,,,
1,2015,WR,5-8,180.0,4.43,34.0,13.0,121.0,6.64,4.07
2,2015,WR,6-1,187.0,4.54,,13.0,,,
3,2015,WR,6-1,197.0,4.42,41.5,7.0,129.0,6.66,4.15
4,2015,WR,6-0,205.0,4.54,37.0,17.0,120.0,7.04,4.11


Convert Height from feet-inches strings (e.g. `6-0`) to total inches, stored as integers.

In [61]:
def feet_to_inches(val):
    """Convert a 'feet-inches' height string to total inches.

    Parameters
    ----------
    val : str or any
        Height written as '<feet>-<inches>', e.g. '6-0'. Any non-string
        value (an already converted number, or NaN) is returned unchanged,
        so re-running the conversion cell does not raise.

    Returns
    -------
    int
        Total height in inches for string input; otherwise ``val`` as given.

    Raises
    ------
    ValueError
        If either part of the string is not an integer.
    IndexError
        If the string contains no '-' separator.
    """
    # Pass through values that are not text: they are either already in
    # inches or missing, and have no '-' to split on.
    if not isinstance(val, str):
        return val
    # Split once and reuse both parts instead of splitting twice.
    parts = val.split('-')
    return int(parts[0]) * 12 + int(parts[1])
# Convert every Height entry to inches, then write the result back to the column.
height_inches = df['Height'].apply(feet_to_inches)
df['Height'] = height_inches
df.head()

Unnamed: 0,Year,Position,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump,3 Cone,Shuttle
0,2015,WR,72,198.0,4.42,,12.0,,,
1,2015,WR,68,180.0,4.43,34.0,13.0,121.0,6.64,4.07
2,2015,WR,73,187.0,4.54,,13.0,,,
3,2015,WR,73,197.0,4.42,41.5,7.0,129.0,6.66,4.15
4,2015,WR,72,205.0,4.54,37.0,17.0,120.0,7.04,4.11


Encode the Position variable numerically (CB = 0, WR = 1).

In [62]:
# Encode the position label as an integer: cornerback -> 0, wide receiver -> 1.
position_codes = {'CB':0, 'WR':1}
df['Position'] = df['Position'].replace(position_codes)
df.head()

Unnamed: 0,Year,Position,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump,3 Cone,Shuttle
0,2015,1,72,198.0,4.42,,12.0,,,
1,2015,1,68,180.0,4.43,34.0,13.0,121.0,6.64,4.07
2,2015,1,73,187.0,4.54,,13.0,,,
3,2015,1,73,197.0,4.42,41.5,7.0,129.0,6.66,4.15
4,2015,1,72,205.0,4.54,37.0,17.0,120.0,7.04,4.11


Drop features that are missing more than one quarter of their values (3 Cone and Shuttle).

In [63]:
# Remove the two drill columns with the most missing values; reassign the
# result instead of mutating the frame in place.
df = df.drop(['3 Cone', 'Shuttle'], axis='columns')
df.head()

Unnamed: 0,Year,Position,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump
0,2015,1,72,198.0,4.42,,12.0,
1,2015,1,68,180.0,4.43,34.0,13.0,121.0
2,2015,1,73,187.0,4.54,,13.0,
3,2015,1,73,197.0,4.42,41.5,7.0,129.0
4,2015,1,72,205.0,4.54,37.0,17.0,120.0


Drop players who are missing all 4 of the remaining drill measurements (40 Yd Dash, Vertical, Bench, Broad Jump)

In [64]:
# Keep only rows with fewer than 4 missing values, i.e. drop players with
# four or more NaN entries. The explicit .copy() makes df an independent
# frame, so later column assignments (such as imputing missing values) do
# not write into a filtered selection of the previous frame.
df = df[df.isna().sum(axis=1) < 4].copy()
print('df shape:', df.shape)
df.head()

df shape: (487, 8)


Unnamed: 0,Year,Position,Height,Weight,40 Yd Dash,Vertical,Bench,Broad Jump
0,2015,1,72,198.0,4.42,,12.0,
1,2015,1,68,180.0,4.43,34.0,13.0,121.0
2,2015,1,73,187.0,4.54,,13.0,
3,2015,1,73,197.0,4.42,41.5,7.0,129.0
4,2015,1,72,205.0,4.54,37.0,17.0,120.0


Impute missing values