In [1]:
# Import dependencies
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [10]:
# Load file into dataframe
file_path = "NFL.csv"
nfl_df = pd.read_csv(file_path)
nfl_df.head()

Unnamed: 0,Year,Player,Age,School,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,Drafted..tm.rnd.yr.,BMI,Player_Type,Position_Type,Position,Drafted
0,2009,Beanie Wells\WellCh00,20.0,Ohio St.,1.8542,106.594207,4.38,85.09,25.0,325.12,,,Arizona Cardinals / 1st / 31st pick / 2009,31.004194,offense,backs_receivers,RB,Yes
1,2009,Will Davis\DaviWi99,22.0,Illinois,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,Arizona Cardinals / 6th / 204th pick / 2009,33.510073,defense,defensive_lineman,DE,Yes
2,2009,Herman Johnson\JohnHe23,24.0,LSU,2.0066,165.107623,5.5,,21.0,,,,Arizona Cardinals / 5th / 167th pick / 2009,41.005821,offense,offensive_lineman,OG,Yes
3,2009,Rashad Johnson\JohnRa98,23.0,Alabama,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,Arizona Cardinals / 3rd / 95th pick / 2009,28.312463,defense,defensive_back,FS,Yes
4,2009,Cody Brown\BrowCo96,22.0,Connecticut,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,Arizona Cardinals / 2nd / 63rd pick / 2009,31.327425,defense,line_backer,OLB,Yes


In [11]:
# Drop columns that we won't use
bad_columns = ['Year', 'Player', 'Age', 'School', 'Drafted..tm.rnd.yr.', 'Player_Type', 'Position']
nfl_df = nfl_df.drop(bad_columns, axis=1)
nfl_df.head()

Unnamed: 0,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,BMI,Position_Type,Drafted
0,1.8542,106.594207,4.38,85.09,25.0,325.12,,,31.004194,backs_receivers,Yes
1,1.8796,118.387609,4.84,83.82,27.0,292.1,7.38,4.45,33.510073,defensive_lineman,Yes
2,2.0066,165.107623,5.5,,21.0,,,,41.005821,offensive_lineman,Yes
3,1.8034,92.079251,4.49,93.98,15.0,304.8,7.09,4.23,28.312463,defensive_back,Yes
4,1.8796,110.676538,4.76,92.71,26.0,304.8,7.1,4.4,31.327425,line_backer,Yes


In [20]:
nfl_df_dropped = nfl_df.dropna()
nfl_df_dropped['Position_Type'].value_counts()

backs_receivers      565
offensive_lineman    345
defensive_back       330
defensive_lineman    290
line_backer          199
other_special          2
Name: Position_Type, dtype: int64

In [40]:
def rf_model(df, position_type):
    # Create dataframe for features
    X = df.drop(["Drafted", "Position_Type"], axis=1)
    
    # Create target variable
    y = df.Drafted.values
    
    # Seperate into training and testing sets
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=5)
    
    # Creating a StandardScaler instance.
    scaler = StandardScaler()
    # Fitting the Standard Scaler with the training data.
    X_scaler = scaler.fit(X_train)
    # Scaling the data.
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    
    # Create and fit the model
    model = BalancedRandomForestClassifier(n_estimators = 128, random_state = 10)
    model.fit(X_train_scaled, y_train)
    
    # Use the model to predict
    y_pred = model.predict(X_test_scaled)
    
    # Calculated the balanced accuracy score
    accuracy = balanced_accuracy_score(y_test, y_pred)
    
    # Display the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    # Print the imbalanced classification report
    class_report = classification_report_imbalanced(y_test, y_pred)
    
    # List the features sorted in descending order by feature importance
    features = sorted(zip(model.feature_importances_, X.columns), reverse=True)
    
    #Put all useful results into a dictionary
    results = {'Position_Type': position_type,'Accuracy': accuracy, 'cm': cm, 'class_report': class_report, 'features': features}
    
    # Return that dictionary
    return results

In [28]:
# Create an empty list to hold results
results = []

# Go through each position type
for position_type in nfl_df_dropped['Position_Type'].unique():
    if position_type == 'other_special':
        pass
    
    else:
        # Make a dataframe of players just in that position
        df = nfl_df_dropped.loc[nfl_df_dropped['Position_Type']==position_type]
        # Make a random forest model for this position and append the results to the results list
        results.append(rf_model(df, position_type))

In [34]:
for result in results:
    print(result['Position_Type'], result['Accuracy'])

defensive_lineman 0.5565953654188949
defensive_back 0.6874201787994891
line_backer 0.7017543859649122
offensive_lineman 0.5948275862068966
backs_receivers 0.6656403940886699


In [35]:
def nullfill(df):
    #Find the mean value for each column for each position type
    mean_sprint = df['Sprint_40yd'].mean()
    mean_vert = df['Vertical_Jump'].mean()
    mean_bench = df['Bench_Press_Reps'].mean()
    mean_broad = df['Broad_Jump'].mean()
    mean_3cone = df['Agility_3cone'].mean()
    mean_shuttle = df['Shuttle'].mean()

    # Replace all null values with that mean value
    df['Sprint_40yd'].fillna(value=mean_sprint, inplace=True)
    df['Vertical_Jump'].fillna(value=mean_vert, inplace=True)
    df['Bench_Press_Reps'].fillna(value=mean_bench, inplace=True)
    df['Broad_Jump'].fillna(value=mean_broad, inplace=True)
    df['Agility_3cone'].fillna(value=mean_3cone, inplace=True)
    df['Shuttle'].fillna(value=mean_shuttle, inplace=True)
    
    return df

In [41]:
# Create an empty list to hold results
results = []

# Go through each position type
for position_type in nfl_df_dropped['Position_Type'].unique():
    if position_type == 'other_special':
        pass
    
    else:
        # Make a dataframe of players just in that position
        df = nfl_df.loc[nfl_df['Position_Type']==position_type]
        
        # Fill null values with mean values
        df = nullfill(df)
                
        # Make a random forest model for this position and append the results to the results list
        results.append(rf_model(df, position_type))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be 

In [42]:
for result in results:
    print(result['Position_Type'], result['Accuracy'])

defensive_lineman 0.5361111111111111
defensive_back 0.6557994757536042
line_backer 0.6200980392156863
offensive_lineman 0.6416345742404792
backs_receivers 0.5910869565217391
