Import modules

In [22]:
import os
import math 
import numpy as np
import pandas as pd
import scipy as sp

from datetime import datetime
#
# plotting and visualization
#
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
#
import seaborn as sns
#
# modeling
#
from sklearn.preprocessing import OneHotEncoder as OHE
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

#ignore warning messages to ensure clean outputs
import warnings
warnings.filterwarnings('ignore')

Load pertinent CSV files

In [23]:
path = r'C:\Users\Joseph Shire\Documents\Springboard Python Data Science\Python Scripts\springboard\NFL Projects\Predicting prospects'

In [24]:
combine = pd.read_csv(path+'\combine.csv')
draft = pd.read_csv(path+'\draft.csv')
passer = pd.read_csv(path+'\passer.csv')
games = pd.read_csv(path+'\games.csv')
gameParticipation = pd.read_csv(path+'\gameParticipation.csv')


View info on each dataset

In [25]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   combineId           10080 non-null  int64  
 1   playerId            10080 non-null  int64  
 2   combineYear         10080 non-null  int64  
 3   combinePosition     10080 non-null  object 
 4   combineHeight       10080 non-null  float64
 5   combineWeight       10080 non-null  int64  
 6   combineHand         3592 non-null   float64
 7   nameFirst           10078 non-null  object 
 8   nameLast            10078 non-null  object 
 9   nameFull            10078 non-null  object 
 10  position            10077 non-null  object 
 11  collegeId           10078 non-null  float64
 12  nflId               8993 non-null   object 
 13  college             10078 non-null  object 
 14  heightInches        10073 non-null  float64
 15  weight              10073 non-null  float64
 16  dob 

In [26]:
passer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397265 entries, 0 to 397264
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   passId         397265 non-null  int64 
 1   playId         397265 non-null  int64 
 2   teamId         397265 non-null  int64 
 3   playerId       397265 non-null  int64 
 4   passPosition   397265 non-null  object
 5   passOutcomes   397265 non-null  object
 6   passDirection  327142 non-null  object
 7   passDepth      324935 non-null  object
 8   passLength     397265 non-null  int64 
 9   passAtt        397265 non-null  int64 
 10  passComp       397265 non-null  int64 
 11  passTd         397265 non-null  int64 
 12  passInt        397265 non-null  int64 
 13  passIntTd      397265 non-null  int64 
 14  passSack       397265 non-null  int64 
 15  passSackYds    397265 non-null  int64 
 16  passHit        397265 non-null  int64 
 17  passDef        397265 non-null  int64 
 18  pass

In [27]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5308 entries, 0 to 5307
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   gameId                  5308 non-null   int64 
 1   season                  5308 non-null   int64 
 2   week                    5308 non-null   int64 
 3   gameDate                5308 non-null   object
 4   gameTimeEastern         5308 non-null   object
 5   gameTimeLocal           5308 non-null   object
 6   homeTeamId              5308 non-null   int64 
 7   visitorTeamId           5308 non-null   int64 
 8   seasonType              5308 non-null   object
 9   weekNameAbbr            5308 non-null   object
 10  siteId                  5308 non-null   int64 
 11  homeTeamDistance        5308 non-null   int64 
 12  visitingTeamDistance    5308 non-null   int64 
 13  homeTeamFinalScore      5308 non-null   int64 
 14  visitingTeamFinalScore  5308 non-null   int64 
 15  winn

In [28]:
gameParticipation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423185 entries, 0 to 423184
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   gamePartId         423185 non-null  int64  
 1   gameId             423185 non-null  int64  
 2   teamId             423185 non-null  int64  
 3   playerId           423185 non-null  int64  
 4   gamePartUnit       423185 non-null  object 
 5   gamePartSnapCount  423185 non-null  int64  
 6   nameFirst          423113 non-null  object 
 7   nameLast           423113 non-null  object 
 8   nameFull           423113 non-null  object 
 9   position           423113 non-null  object 
 10  collegeId          423113 non-null  float64
 11  nflId              423113 non-null  float64
 12  combineId          269552 non-null  float64
 13  college            423113 non-null  object 
 14  heightInches       423113 non-null  float64
 15  weight             423113 non-null  float64
 16  do

#### Clean the data

##### Combine

Percent missing data for each column of combine dataset:

In [29]:
100*combine.isna().sum()/len(combine)

combineId              0.000000
playerId               0.000000
combineYear            0.000000
combinePosition        0.000000
combineHeight          0.000000
combineWeight          0.000000
combineHand           64.365079
nameFirst              0.019841
nameLast               0.019841
nameFull               0.019841
position               0.029762
collegeId              0.019841
nflId                 10.783730
college                0.019841
heightInches           0.069444
weight                 0.069444
dob                   15.763889
ageAtDraft            15.763889
playerProfileUrl      10.783730
homeCity              25.376984
homeState             26.855159
homeCountry           24.910714
highSchool            58.611111
hsCity                76.914683
hsState               58.759921
hsCountry             58.611111
combineArm            67.490079
combine40yd           10.496032
combineVert           16.061508
combineBench          28.888889
combineShuttle        25.734127
combineB

In [30]:
combine.columns

Index(['combineId', 'playerId', 'combineYear', 'combinePosition',
       'combineHeight', 'combineWeight', 'combineHand', 'nameFirst',
       'nameLast', 'nameFull', 'position', 'collegeId', 'nflId', 'college',
       'heightInches', 'weight', 'dob', 'ageAtDraft', 'playerProfileUrl',
       'homeCity', 'homeState', 'homeCountry', 'highSchool', 'hsCity',
       'hsState', 'hsCountry', 'combineArm', 'combine40yd', 'combineVert',
       'combineBench', 'combineShuttle', 'combineBroad', 'combine3cone',
       'combine60ydShuttle', 'combineWonderlic'],
      dtype='object')

We can eliminate columns with an extreme amount of missing data, as well as useless information:

In [31]:
combine = combine.drop(columns=['playerProfileUrl',
       'homeCity', 'homeState', 'homeCountry', 'highSchool', 'hsCity',
       'hsState', 'hsCountry','combine60ydShuttle', 'combineWonderlic','dob'])

In [32]:
for col in combine:
    if combine[col].isna().sum() >0 and combine[col].dtype == 'float64' :
        combine[col].fillna(combine[col].mean(), inplace=True)

In [33]:
combine.dropna(subset=['nameFirst',
       'nameLast', 'nameFull'], inplace=True)

In [35]:
combine.fillna("", inplace=True)

In [36]:
100*combine.isna().sum()/len(combine)

combineId          0.0
playerId           0.0
combineYear        0.0
combinePosition    0.0
combineHeight      0.0
combineWeight      0.0
combineHand        0.0
nameFirst          0.0
nameLast           0.0
nameFull           0.0
position           0.0
collegeId          0.0
nflId              0.0
college            0.0
heightInches       0.0
weight             0.0
ageAtDraft         0.0
combineArm         0.0
combine40yd        0.0
combineVert        0.0
combineBench       0.0
combineShuttle     0.0
combineBroad       0.0
combine3cone       0.0
dtype: float64

##### Passer

Percent missing data for each column of combine dataset:

In [37]:
100*passer.isna().sum()/len(combine)

passId             0.000000
playId             0.000000
teamId             0.000000
playerId           0.000000
passPosition       0.000000
passOutcomes       0.000000
passDirection    695.802739
passDepth        717.701925
passLength         0.000000
passAtt            0.000000
passComp           0.000000
passTd             0.000000
passInt            0.000000
passIntTd          0.000000
passSack           0.000000
passSackYds        0.000000
passHit            0.000000
passDef            0.000000
passNull           0.000000
dtype: float64

In [38]:
passer.columns

Index(['passId', 'playId', 'teamId', 'playerId', 'passPosition',
       'passOutcomes', 'passDirection', 'passDepth', 'passLength', 'passAtt',
       'passComp', 'passTd', 'passInt', 'passIntTd', 'passSack', 'passSackYds',
       'passHit', 'passDef', 'passNull'],
      dtype='object')

In [39]:
passer = passer.drop(columns=['passOutcomes', 'passDirection', 'passDepth'])

##### Games

Percent missing data for each column of combine dataset: