In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load file into dataframe
file_path = "./NFL.csv"
nfl_df = pd.read_csv(file_path)
nfl_df.head()

Unnamed: 0,Year,Player,Age,School,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,Drafted..tm.rnd.yr.,BMI,Player_Type,Position_Type,Position,Drafted
0,2009,Beanie Wells\WellCh00,20.0,Ohio St.,1.8542,106.594207,4.38,85.09,25.0,325.12,,,Arizona Cardinals / 1st / 31st pick / 2009,31.004194,offense,backs_receivers,RB,Yes
1,2009,Will Davis\DaviWi99,22.0,Illinois,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,Arizona Cardinals / 6th / 204th pick / 2009,33.510073,defense,defensive_lineman,DE,Yes
2,2009,Herman Johnson\JohnHe23,24.0,LSU,2.0066,165.107623,5.5,,21.0,,,,Arizona Cardinals / 5th / 167th pick / 2009,41.005821,offense,offensive_lineman,OG,Yes
3,2009,Rashad Johnson\JohnRa98,23.0,Alabama,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,Arizona Cardinals / 3rd / 95th pick / 2009,28.312463,defense,defensive_back,FS,Yes
4,2009,Cody Brown\BrowCo96,22.0,Connecticut,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,Arizona Cardinals / 2nd / 63rd pick / 2009,31.327425,defense,line_backer,OLB,Yes


In [3]:
# Drop columns that we won't use
bad_columns = ['Year', 'Player', 'Age', 'School', 'Drafted..tm.rnd.yr.', 'Player_Type', 'Position']
nfl_df = nfl_df.drop(bad_columns, axis=1)
nfl_df.head()

Unnamed: 0,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,BMI,Position_Type,Drafted
0,1.8542,106.594207,4.38,85.09,25.0,325.12,,,31.004194,backs_receivers,Yes
1,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,33.510073,defensive_lineman,Yes
2,2.0066,165.107623,5.5,,21.0,,,,41.005821,offensive_lineman,Yes
3,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,28.312463,defensive_back,Yes
4,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,31.327425,line_backer,Yes


In [4]:
# Look at number of values for each column
nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3477 entries, 0 to 3476
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Height            3477 non-null   float64
 1   Weight            3477 non-null   float64
 2   Sprint_40yd       3303 non-null   float64
 3   Vertical_Jump     2780 non-null   float64
 4   Bench_Press_Reps  2572 non-null   float64
 5   Broad_Jump        2749 non-null   float64
 6   Agility_3cone     2260 non-null   float64
 7   Shuttle           2337 non-null   float64
 8   BMI               3477 non-null   float64
 9   Position_Type     3477 non-null   object 
 10  Drafted           3477 non-null   object 
dtypes: float64(9), object(2)
memory usage: 298.9+ KB


In [5]:
# Drop all other null values and see that we have 1731 data points remaining
nfl_df_dropped = nfl_df.dropna()
nfl_df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1731 entries, 1 to 3475
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Height            1731 non-null   float64
 1   Weight            1731 non-null   float64
 2   Sprint_40yd       1731 non-null   float64
 3   Vertical_Jump     1731 non-null   float64
 4   Bench_Press_Reps  1731 non-null   float64
 5   Broad_Jump        1731 non-null   float64
 6   Agility_3cone     1731 non-null   float64
 7   Shuttle           1731 non-null   float64
 8   BMI               1731 non-null   float64
 9   Position_Type     1731 non-null   object 
 10  Drafted           1731 non-null   object 
dtypes: float64(9), object(2)
memory usage: 162.3+ KB


In [6]:
# Create df with mean values in place of null values
nfl_df_meaned = nfl_df.copy()

#Find the mean value for each column
mean_sprint = nfl_df['Sprint_40yd'].mean()
mean_vert = nfl_df['Vertical_Jump'].mean()
mean_bench = nfl_df['Bench_Press_Reps'].mean()
mean_broad = nfl_df['Broad_Jump'].mean()
mean_3cone = nfl_df['Agility_3cone'].mean()
mean_shuttle = nfl_df['Shuttle'].mean()

# Replace all null values with that mean value
nfl_df_meaned['Sprint_40yd'].fillna(value=mean_sprint, inplace=True)
nfl_df_meaned['Vertical_Jump'].fillna(value=mean_vert, inplace=True)
nfl_df_meaned['Bench_Press_Reps'].fillna(value=mean_bench, inplace=True)
nfl_df_meaned['Broad_Jump'].fillna(value=mean_broad, inplace=True)
nfl_df_meaned['Agility_3cone'].fillna(value=mean_3cone, inplace=True)
nfl_df_meaned['Shuttle'].fillna(value=mean_shuttle, inplace=True)

#Display the meaned dataframe
nfl_df_meaned.head()

Unnamed: 0,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,BMI,Position_Type,Drafted
0,1.8542,106.594207,4.38,85.09,25.0,325.12,7.237416,4.403843,31.004194,backs_receivers,Yes
1,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,33.510073,defensive_lineman,Yes
2,2.0066,165.107623,5.5,83.392403,21.0,291.629698,7.237416,4.403843,41.005821,offensive_lineman,Yes
3,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,28.312463,defensive_back,Yes
4,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,31.327425,line_backer,Yes


In [7]:
# Create df with mean values in place of null values
nfl_df_meaned2 = nfl_df.copy()

for position_type in nfl_df['Position_Type']:
    
    df = nfl_df.loc[nfl_df['Position_Type'] ==position_type]
    
    #Find the mean value for each column for each position type
    mean_sprint = df['Sprint_40yd'].mean()
    mean_vert = df['Vertical_Jump'].mean()
    mean_bench = df['Bench_Press_Reps'].mean()
    mean_broad = df['Broad_Jump'].mean()
    mean_3cone = df['Agility_3cone'].mean()
    mean_shuttle = df['Shuttle'].mean()

    # Replace all null values with that mean value
    df['Sprint_40yd'].fillna(value=mean_sprint, inplace=True)
    df['Vertical_Jump'].fillna(value=mean_vert, inplace=True)
    df['Bench_Press_Reps'].fillna(value=mean_bench, inplace=True)
    df['Broad_Jump'].fillna(value=mean_broad, inplace=True)
    df['Agility_3cone'].fillna(value=mean_3cone, inplace=True)
    df['Shuttle'].fillna(value=mean_shuttle, inplace=True)

    nfl_df_meaned2.loc[nfl_df_meaned['Position_Type'] ==position_type] = df
#Display the meaned dataframe
nfl_df_meaned2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,BMI,Position_Type,Drafted
0,1.8542,106.594207,4.38,85.09,25.0,325.12,7.033385,4.290321,31.004194,backs_receivers,Yes
1,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,33.510073,defensive_lineman,Yes
2,2.0066,165.107623,5.5,69.944605,21.0,258.528591,7.820439,4.770353,41.005821,offensive_lineman,Yes
3,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,28.312463,defensive_back,Yes
4,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,31.327425,line_backer,Yes


In [8]:
# WE WILL RUN THE MODEL FOR BOTH DROPPED NULL VALUES AND MEANED NULL VALUES
# Dropped first:
# Create dataframe for features
X_drop = nfl_df_dropped.drop(["Drafted", "Position_Type"], axis=1)

In [9]:
# Create target variable
y_drop = nfl_df_dropped.Drafted.values

In [10]:
# Seperate into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X_drop,y_drop,random_state=5)

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create and fit the model
rf_dropped = BalancedRandomForestClassifier(n_estimators = 128, random_state = 10)
rf_dropped.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=10)

In [13]:
# Use the model to predict
y_pred = rf_dropped.predict(X_test_scaled)

In [14]:
# Calculated the balanced accuracy score
accuracy_score(y_test, y_pred)

0.6628175519630485

In [15]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[100  52]
 [ 94 187]]


In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.52      0.66      0.67      0.58      0.66      0.44       152
        Yes       0.78      0.67      0.66      0.72      0.66      0.44       281

avg / total       0.69      0.66      0.66      0.67      0.66      0.44       433



In [18]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_dropped.feature_importances_, X_drop.columns), reverse=True)

[(0.148165817655903, 'Sprint_40yd'),
 (0.12171272660241703, 'BMI'),
 (0.12142392843878268, 'Agility_3cone'),
 (0.12067955840030756, 'Weight'),
 (0.1144735895270801, 'Shuttle'),
 (0.10902847082991358, 'Bench_Press_Reps'),
 (0.0991115832295414, 'Broad_Jump'),
 (0.09855485963691278, 'Vertical_Jump'),
 (0.06684946567914182, 'Height')]

In [19]:
# Now meaned
# Create dataframe for features
X = nfl_df_meaned.drop(["Drafted", "Position_Type"], axis=1)

In [20]:
# Create target variable
y = nfl_df_meaned.Drafted.values

In [21]:
# Seperate into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=5)

In [22]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Create and fit the model
rf_meaned = BalancedRandomForestClassifier(n_estimators = 128, random_state = 10)
rf_meaned.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=10)

In [24]:
# Use the model to predict
y_pred = rf_meaned.predict(X_test_scaled)

In [25]:
# Calculated the balanced accuracy score
accuracy_score(y_test, y_pred)

0.6298850574712643

In [26]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[199 130]
 [192 349]]


In [27]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.51      0.60      0.65      0.55      0.62      0.39       329
        Yes       0.73      0.65      0.60      0.68      0.62      0.39       541

avg / total       0.65      0.63      0.62      0.63      0.62      0.39       870



In [28]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_meaned.feature_importances_, X.columns), reverse=True)

[(0.1686495149172964, 'Sprint_40yd'),
 (0.1531355136723559, 'BMI'),
 (0.1449952937446715, 'Weight'),
 (0.09683164990825262, 'Bench_Press_Reps'),
 (0.09288218794760487, 'Broad_Jump'),
 (0.09259605627361793, 'Vertical_Jump'),
 (0.09176505657835668, 'Agility_3cone'),
 (0.0859485118152406, 'Shuttle'),
 (0.07319621514260344, 'Height')]

In [29]:
# Now meaned by position group
# Create dataframe for features
X = nfl_df_meaned2.drop(["Drafted", "Position_Type"], axis=1)

In [30]:
# Create target variable
y = nfl_df_meaned2.Drafted.values

In [31]:
# Seperate into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=5)

In [32]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
# Create and fit the model
rf_meaned2 = BalancedRandomForestClassifier(n_estimators = 128, random_state = 10)
rf_meaned2.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=10)

In [34]:
# Use the model to predict
y_pred = rf_meaned2.predict(X_test_scaled)

In [35]:
# Calculated the balanced accuracy score
accuracy_score(y_test, y_pred)

0.6367816091954023

In [36]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[204 125]
 [191 350]]


In [37]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.52      0.62      0.65      0.56      0.63      0.40       329
        Yes       0.74      0.65      0.62      0.69      0.63      0.40       541

avg / total       0.65      0.64      0.63      0.64      0.63      0.40       870



In [38]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_meaned.feature_importances_, X.columns), reverse=True)

[(0.1686495149172964, 'Sprint_40yd'),
 (0.1531355136723559, 'BMI'),
 (0.1449952937446715, 'Weight'),
 (0.09683164990825262, 'Bench_Press_Reps'),
 (0.09288218794760487, 'Broad_Jump'),
 (0.09259605627361793, 'Vertical_Jump'),
 (0.09176505657835668, 'Agility_3cone'),
 (0.0859485118152406, 'Shuttle'),
 (0.07319621514260344, 'Height')]

In [39]:
X = nfl_df_dropped.drop(["Drafted", "Position_Type"], axis=1)
y = nfl_df_dropped.Drafted.values

# Seperate into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=5)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
from imblearn.ensemble import EasyEnsembleClassifier
boost_null = EasyEnsembleClassifier(n_estimators=64,random_state=10)
boost_null.fit(X_train_scaled,y_train)
y_pred = boost_null.predict(X_test_scaled)

In [41]:
# Calculate the balanced accuracy score
accuracy_score(y_test, y_pred)

0.6535796766743649

In [42]:
# Try simple logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=10)
classifier.fit(X_train_scaled,y_train)
y_pred = classifier.predict(X_test_scaled)

In [43]:
# Calculate the balanced accuracy score
accuracy_score(y_test, y_pred)

0.6697459584295612

In [44]:
# Try encoding the position type
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
nfl_df_encoded = pd.DataFrame(enc.fit_transform(nfl_df.Position_Type.values.reshape(-1,1)))

# Rename encoded columns
nfl_df_encoded.columns = enc.get_feature_names(['Position_Type'])
nfl_df_encoded.head()



Unnamed: 0,Position_Type_backs_receivers,Position_Type_defensive_back,Position_Type_defensive_lineman,Position_Type_kicking_specialist,Position_Type_line_backer,Position_Type_offensive_lineman,Position_Type_other_special
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [45]:
# Merge the two DataFrames together and drop the Country column
nfl_df = nfl_df.merge(nfl_df_encoded,left_index=True,right_index=True).drop("Position_Type",1)
nfl_df.head()

  


Unnamed: 0,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,BMI,Drafted,Position_Type_backs_receivers,Position_Type_defensive_back,Position_Type_defensive_lineman,Position_Type_kicking_specialist,Position_Type_line_backer,Position_Type_offensive_lineman,Position_Type_other_special
0,1.8542,106.594207,4.38,85.09,25.0,325.12,,,31.004194,Yes,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,33.510073,Yes,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2.0066,165.107623,5.5,,21.0,,,,41.005821,Yes,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,28.312463,Yes,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,31.327425,Yes,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [46]:
# Drop all other null values and see that we have 1731 data points remaining
nfl_df_encoded = nfl_df.dropna()
nfl_df_encoded.head()

Unnamed: 0,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,BMI,Drafted,Position_Type_backs_receivers,Position_Type_defensive_back,Position_Type_defensive_lineman,Position_Type_kicking_specialist,Position_Type_line_backer,Position_Type_offensive_lineman,Position_Type_other_special
1,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,33.510073,Yes,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,28.312463,Yes,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,31.327425,Yes,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,2.032,140.160042,5.32,55.88,19.0,238.76,7.87,4.88,33.945078,Yes,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,1.8796,120.65557,4.53,88.9,28.0,304.8,7.46,4.43,34.152029,Yes,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [51]:
# Create dataframe for features
X = nfl_df_encoded.drop(["Drafted"], axis=1)

# Create target variable
y = nfl_df_encoded.Drafted.values

# Seperate into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=5)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# Create and fit the model
rf_encoded = BalancedRandomForestClassifier(n_estimators = 128, random_state = 10)
rf_encoded.fit(X_train_scaled, y_train)

# Use the model to predict
y_pred = rf_encoded.predict(X_test_scaled)

# Calculated the balanced accuracy score
accuracy_score(y_test, y_pred)

0.6558891454965358

In [49]:
# Try resampling model
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=10)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
smoteen_model = LogisticRegression(solver='lbfgs', random_state=1)
smoteen_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [50]:
y_pred = smoteen_model.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))

0.5635103926096998


In [52]:
# Try simple logistic regression with encoded Position_Type
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=10)
classifier.fit(X_train_scaled,y_train)
y_pred = classifier.predict(X_test_scaled)

In [53]:
print(accuracy_score(y_test, y_pred))

0.674364896073903
