In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup, Comment
import requests
import time

Below I am reading in CSV files that I downloaded from profootballfocus.com for the years and positions that I am looking at, from 2018-2009.  Each CSV represented one season of play for one position, so you can see below that I am joining all the years together for each position, creating dataframes for each differnet category (passing, rushing, receiving, defense), and creating a unique key for each player and the year they are playing in that I will use later to map this data to my larger dataset. 

In [19]:
draft_years = ['2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009']
passing_df = pd.DataFrame()

for year in draft_years: 
    temp_df = pd.read_csv('./data/targets/passing/passing_summary_' + year + '.csv')
    temp_df['key'] = temp_df['player'] + year
    temp_df['year'] = int(year)
    passing_df = pd.concat([passing_df, temp_df])

In [20]:
draft_years = ['2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009']
rushing_df = pd.DataFrame()

for year in draft_years: 
    temp_df = pd.read_csv('./data/targets/rushing/rushing_summary_' + year + '.csv')
    temp_df['key'] = temp_df['player'] + year
    temp_df['year'] = int(year)
    rushing_df = pd.concat([rushing_df, temp_df])

In [21]:
draft_years = ['2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009']
receiving_df = pd.DataFrame()

for year in draft_years: 
    temp_df = pd.read_csv('./data/targets/receiving/receiving_summary_' + year + '.csv')
    temp_df['key'] = temp_df['player'] + year
    temp_df['year'] = int(year)
    receiving_df = pd.concat([receiving_df, temp_df])

In [22]:
draft_years = ['2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009']
defense_df = pd.DataFrame()

for year in draft_years: 
    temp_df = pd.read_csv('./data/targets/defense/defense_summary_' + year + '.csv')
    temp_df['key'] = temp_df['player'] + year
    temp_df['year'] = int(year)
    defense_df = pd.concat([defense_df, temp_df])

In [23]:
passing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 904 entries, 0 to 91
Data columns (total 27 columns):
player                 904 non-null object
player_id              904 non-null int64
position               903 non-null object
player_game_count      904 non-null int64
team_name              904 non-null object
dropbacks              904 non-null int64
attempts               904 non-null int64
completions            904 non-null int64
completion_percent     904 non-null float64
yards                  904 non-null int64
ypa                    904 non-null float64
touchdowns             904 non-null int64
interceptions          904 non-null int64
grades_offense         904 non-null float64
grades_pass            904 non-null float64
grades_run             777 non-null float64
grades_hands_fumble    903 non-null float64
sacks                  904 non-null int64
bats                   904 non-null int64
drops                  904 non-null int64
thrown_aways           904 non-null int64

In [24]:
all_drafts = pd.read_csv('./data/college/all_drafts.csv') 

In [25]:
all_drafts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2546 entries, 0 to 2545
Data columns (total 11 columns):
Unnamed: 0           2546 non-null int64
college_stats_url    2110 non-null object
draft_age            2522 non-null float64
draft_overall        2546 non-null int64
draft_round          2546 non-null int64
draft_year           2546 non-null int64
player               2546 non-null object
position             2546 non-null object
pro_stats_url        2528 non-null object
school               2534 non-null object
team_nfl             2546 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 218.9+ KB


In [26]:
all_drafts.head()

Unnamed: 0.1,Unnamed: 0,college_stats_url,draft_age,draft_overall,draft_round,draft_year,player,position,pro_stats_url,school,team_nfl
0,0,http://www.sports-reference.com/cfb/players/ba...,23.0,1,1,2018,Baker Mayfield,QB,https://www.pro-football-reference.com/players...,Oklahoma,CLE
1,1,http://www.sports-reference.com/cfb/players/sa...,21.0,2,1,2018,Saquon Barkley,RB,https://www.pro-football-reference.com/players...,Penn St.,NYG
2,2,http://www.sports-reference.com/cfb/players/sa...,21.0,3,1,2018,Sam Darnold,QB,https://www.pro-football-reference.com/players...,USC,NYJ
3,3,http://www.sports-reference.com/cfb/players/de...,21.0,4,1,2018,Denzel Ward,CB,https://www.pro-football-reference.com/players...,Ohio St.,CLE
4,4,http://www.sports-reference.com/cfb/players/br...,22.0,5,1,2018,Bradley Chubb,DE,https://www.pro-football-reference.com/players...,North Carolina St.,DEN


Below I drop two subsets of data, first I drop data that does not have a url for college stats in my dataframe, as some of the player data I scraped off of pro-football-reference did not have associated college stats records.  Since I cannot use these players in my model I dropped those datapoints.  I also choose to drop players of positions I deemed irrelevant because they had few or no stats associated with their college careers (such as Offensive Linemen, or Punters) on their college stats page, so modeling these positions would be impossible. 

In [27]:
all_drafts.dropna(subset=['college_stats_url'], inplace=True)

In [28]:
relevant_pos = ['QB', 'RB', 'FB', 'WR', 'TE', 'CB', 'DE', 'ILB', 'S', 'DT', 'OLB', 'LB', 'DB', 'DL', 'NT']
irrelevant_pos = [i for i in all_drafts['position'].unique() if i not in relevant_pos]

In [29]:
irrelevant_pos

['G', 'T', 'C', 'P', 'K', 'LS', 'OL']

In [30]:
for i in irrelevant_pos:
    all_drafts.drop(index=all_drafts[all_drafts['position']==i].index, inplace=True)


In [31]:
all_drafts['position'].unique()

array(['QB', 'RB', 'CB', 'DE', 'ILB', 'S', 'DT', 'OLB', 'WR', 'TE', 'LB',
       'DB', 'FB', 'DL', 'NT'], dtype=object)

In [32]:
all_drafts.isnull().sum()

Unnamed: 0            0
college_stats_url     0
draft_age            16
draft_overall         0
draft_round           0
draft_year            0
player                0
position              0
pro_stats_url        12
school                0
team_nfl              0
dtype: int64

Below I am looping through the larger all_drafts dataframe and encoding columns for grades, and number of games for each player for their first, second, third, and fourth years in the NFL. 

In [33]:
all_drafts['games_y1'] = np.nan
all_drafts['ov_grade_y1'] = np.nan
all_drafts['games_y2'] = np.nan
all_drafts['ov_grade_y2'] = np.nan
all_drafts['games_y3'] = np.nan
all_drafts['ov_grade_y3'] = np.nan
all_drafts['games_y4'] = np.nan
all_drafts['ov_grade_y4'] = np.nan

for index, row in all_drafts.iterrows():
    print(row['player'])
    
    if row['position'] == 'QB':
        
        try:
            passing_mask = passing_df['key']== row['player']+str(row['draft_year'])
            passing_index = passing_df[passing_mask].index[0]
            all_drafts.loc[index, 'games_y1'] = passing_df[passing_mask].loc[passing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y1'] = passing_df[passing_mask].loc[passing_index, 'grades_offense']
        except IndexError:
            pass
        
        try:
            passing_mask = passing_df['key'] == row['player']+str(row['draft_year']+1)
            passing_index = passing_df[passing_mask].index[0]
            all_drafts.loc[index, 'games_y2'] = passing_df[passing_mask].loc[passing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y2'] = passing_df[passing_mask].loc[passing_index, 'grades_offense']
        except IndexError:
            pass

        try:
            passing_mask = passing_df['key'] == row['player']+str(row['draft_year']+2)
            passing_index = passing_df[passing_mask].index[0]
            all_drafts.loc[index, 'games_y3'] = passing_df[passing_mask].loc[passing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y3'] = passing_df[passing_mask].loc[passing_index, 'grades_offense']
        except IndexError:
            pass
        
        try:
            passing_mask = passing_df['key'] == row['player']+str(row['draft_year']+2)
            passing_index = passing_df[passing_mask].index[0]
            all_drafts.loc[index, 'games_y4'] = passing_df[passing_mask].loc[passing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y4'] = passing_df[passing_mask].loc[passing_index, 'grades_offense']
        except IndexError:
            pass
    
    
    elif (row['position'] == 'RB') | (row['position'] == 'FB'):
        try:
            rushing_mask = rushing_df['key']== row['player']+str(row['draft_year'])
            rushing_index = rushing_df[rushing_mask].index[0]
            all_drafts.loc[index, 'games_y1'] = rushing_df[rushing_mask].loc[rushing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y1'] = rushing_df[rushing_mask].loc[rushing_index, 'grades_offense']
        except:
            pass
        
        try:
            rushing_mask = rushing_df['key']== row['player']+str(row['draft_year']+1)
            rushing_index = rushing_df[rushing_mask].index[0]
            all_drafts.loc[index, 'games_y2'] = rushing_df[rushing_mask].loc[rushing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y2'] = rushing_df[rushing_mask].loc[rushing_index, 'grades_offense']
        except IndexError:
            pass
        
        try:
            rushing_mask = rushing_df['key']== row['player']+str(row['draft_year']+2)
            rushing_index = rushing_df[rushing_mask].index[0]
            all_drafts.loc[index, 'games_y3'] = rushing_df[rushing_mask].loc[rushing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y3'] = rushing_df[rushing_mask].loc[rushing_index, 'grades_offense']
            
        except IndexError:
            pass
            
        try:
            rushing_mask = rushing_df['key']== row['player']+str(row['draft_year']+3)
            rushing_index = rushing_df[rushing_mask].index[0]
            all_drafts.loc[index, 'games_y4'] = rushing_df[rushing_mask].loc[rushing_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y4'] = rushing_df[rushing_mask].loc[rushing_index, 'grades_offense']
            
        except:
            pass
        
    
    elif (row['position'] == 'WR') | (row['position'] == 'TE'):
        try:
            receiving_mask = receiving_df['key']== row['player']+str(row['draft_year'])
            receiving_index = receiving_df[receiving_mask].index[0]
            all_drafts.loc[index, 'games_y1'] = receiving_df[receiving_mask].loc[receiving_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y1'] = receiving_df[receiving_mask].loc[receiving_index, 'grades_offense']
        except IndexError:
            pass
        
        try:
            receiving_mask = receiving_df['key']== row['player']+str(row['draft_year']+1)
            receiving_index = receiving_df[receiving_mask].index[0]
            all_drafts.loc[index, 'games_y2'] = receiving_df[receiving_mask].loc[receiving_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y2'] = receiving_df[receiving_mask].loc[receiving_index, 'grades_offense']
        except IndexError:
            pass
        
        try:
            receiving_mask = receiving_df['key']== row['player']+str(row['draft_year']+2)
            receiving_index = receiving_df[receiving_mask].index[0]
            all_drafts.loc[index, 'games_y3'] = receiving_df[receiving_mask].loc[receiving_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y3'] = receiving_df[receiving_mask].loc[receiving_index, 'grades_offense']
        except IndexError:
            pass
            
        try:
            receiving_mask = receiving_df['key']== row['player']+str(row['draft_year']+3)
            receiving_index = receiving_df[receiving_mask].index[0]
            all_drafts.loc[index, 'games_y4'] = receiving_df[receiving_mask].loc[receiving_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y4'] = receiving_df[receiving_mask].loc[receiving_index, 'grades_offense']
            
        except IndexError:
            pass
        
    elif row['position'] in ['CB', 'DE','ILB', 'S', 'DT', 'OLB', 'LB', 'DB', 'DL', 'NT']:
        try:
            defense_mask = defense_df['key']== row['player']+str(row['draft_year'])
            defense_index = defense_df[defense_mask].index[0]
            all_drafts.loc[index, 'games_y1'] = defense_df[defense_mask].loc[defense_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y1'] = defense_df[defense_mask].loc[defense_index, 'grades_defense']
        except IndexError:
            pass
        
        try:
            defense_mask = defense_df['key']== row['player']+str(row['draft_year']+1)
            defense_index = defense_df[defense_mask].index[0]
            all_drafts.loc[index, 'games_y2'] = defense_df[defense_mask].loc[defense_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y2'] = defense_df[defense_mask].loc[defense_index, 'grades_defense']
        except IndexError:
            pass
        
        try:
            defense_mask = defense_df['key']== row['player']+str(row['draft_year']+2)
            defense_index = defense_df[defense_mask].index[0]
            all_drafts.loc[index, 'games_y3'] = defense_df[defense_mask].loc[defense_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y3'] = defense_df[defense_mask].loc[defense_index, 'grades_defense']
        except IndexError:
            pass
            
        try:
            defense_mask = defense_df['key']== row['player']+str(row['draft_year']+3)
            defense_index = defense_df[defense_mask].index[0]
            all_drafts.loc[index, 'games_y4'] = defense_df[defense_mask].loc[defense_index, 'player_game_count']
            all_drafts.loc[index, 'ov_grade_y4'] = defense_df[defense_mask].loc[defense_index, 'grades_defense']            
        except IndexError:
            pass
            

Baker Mayfield
Saquon Barkley
Sam Darnold
Denzel Ward
Bradley Chubb
Josh Allen
Roquan Smith
Josh Rosen
Minkah Fitzpatrick
Vita Vea
Daron Payne
Marcus Davenport
Tremaine Edmunds
Derwin James
Jaire Alexander
Leighton Vander Esch
Rashaan Evans
D.J. Moore
Hayden Hurst
Calvin Ridley
Rashaad Penny
Terrell Edmunds
Taven Bryan
Mike Hughes
Sony Michel
Lamar Jackson
Nick Chubb
Ronald Jones
Courtland Sutton
Harold Landry
Mike Gesicki
Kerryon Johnson
Dante Pettis
Josh Jackson
Breeland Speaks
Christian Kirk
Uchenna Nwosu
Anthony Miller
Kemoko Turay
M.J. Stewart
Jessie Bates
Donte Jackson
Duke Dawson
Isaiah Oliver
Derrius Guice
James Washington
DJ Chark
Carlton Davis
Tyquan Lewis
Lorenzo Carter
Chad Thomas
Justin Reid
B.J. Hill
Fred Warner
Royce Freeman
Jerome Baker
Derrick Nnadi
Mason Rudolph
Sam Hubbard
Malik Jefferson
Rasheem Green
Michael Gallup
Tracy Walker
Justin Jones
Rashaan Gaulden
Mark Andrews
Arden Key
Oren Burks
Deadrin Senat
Tre'Quan Smith
Ronnie Harrison
Tarvarius Moore
Harrison Philli

Jalen Collins
Benardrick McKinney
Hau'oli Kikaha
Eric Kendricks
Eric Rowe
Denzel Perryman
Ronald Darby
Nate Orchard
Jordan Phillips
Ameer Abdullah
Maxx Williams
Senquez Golson
Markus Golden
Randy Gregory
Quinten Rollins
Frank Clark
Jordan Richards
D'Joun Smith
Clive Walford
Tyler Lockett
Jaelen Strong
Tevin Coleman
Owamagbe Odighizuwa
Garrett Grayson
Chris Conley
Duke Johnson
P.J. Williams
Eli Harold
Alex Carter
Lorenzo Mauldin
Craig Mager
Jordan Hicks
Tyler Kroft
Sammie Coates
Danielle Hunter
Sean Mannion
Carl Davis
Jeff Heuerman
Henry Anderson
Ty Montgomery
Matt Jones
Xavier Cooper
Geneo Grissom
Steven Nelson
Paul Dawson
Trey Flowers
Bryce Petty
James Sample
Jamison Crowder
Jeremy Langford
Justin Hardy
Jalston Fowler
Clayton Geathers
Gabe Wright
Ibraheim Campbell
Blake Bell
Ramik Wilson
Josh Shaw
Doran Grant
Za'Darius Smith
Vince Mayle
Kwon Alexander
Javorius Allen
Mike Davis
Damien Wilson
Jake Ryan
Marcus Hardison
Grady Jarrett
David Cobb
Rashad Greene
Ben Heeney
Martrell Spaight
Ad

Malik Jackson
Tahir Whitehead
Robert Blanton
Najee Goode
Taylor Thompson
DeQuan Menzie
Tank Carder
Danny Coale
Korey Toomer
Josh Kaddu
Shaun Prater
Bradie Ewing
Jack Crawford
Chris Rainey
Terrell Manning
Jonathan Massaquoi
Darius Fleming
Marvin Jones
George Iloka
Juron Criner
Vick Ballard
Alfred Morris
Keith Tandy
Mike Harris
Winston Guy
Cyrus Gray
B.J. Cunningham
Ryan Lindley
James Hanna
Josh Bush
Danny Trevathan
Markelle Martin
Dan Herron
Charles Mitchell
Marvin McNutt
Jonte Green
Nate Ebner
Tommy Streeter
Terrance Ganaway
Emmanuel Acho
Billy Winn
LaVon Brazill
Aaron Brown
Audie Cole
Scott Solomon
Michael Smith
Richard Crawford
Tim Fugger
Kheeston Randall
D.J. Campbell
Jerome Long
Trevor Guyton
Greg McCoy
Travis Lewis
Alfonzo Dennard
J.R. Sweezy
Bryce Brown
Toney Clemons
Greg Scruggs
Drake Dunsmore
Jeremy Ebert
DeAngelo Tyson
Cam Johnson
Junior Hemingway
Markus Kuhn
David Paulson
Antonio Allen
Trevin Wade
Terrence Frederick
Brad Smelley
Travian Robertson
Edwin Baker
Chandler Harnish


In [34]:
all_drafts.columns

Index(['Unnamed: 0', 'college_stats_url', 'draft_age', 'draft_overall',
       'draft_round', 'draft_year', 'player', 'position', 'pro_stats_url',
       'school', 'team_nfl', 'games_y1', 'ov_grade_y1', 'games_y2',
       'ov_grade_y2', 'games_y3', 'ov_grade_y3', 'games_y4', 'ov_grade_y4'],
      dtype='object')

In [35]:
all_drafts[['player', 'position', 'draft_year', 'games_y1', 'ov_grade_y1','games_y2', 'ov_grade_y2', 'games_y3', 
            'ov_grade_y3', 'games_y4', 'ov_grade_y4']]

Unnamed: 0,player,position,draft_year,games_y1,ov_grade_y1,games_y2,ov_grade_y2,games_y3,ov_grade_y3,games_y4,ov_grade_y4
0,Baker Mayfield,QB,2018,14.0,84.5,,,,,,
1,Saquon Barkley,RB,2018,16.0,85.9,,,,,,
2,Sam Darnold,QB,2018,13.0,64.7,,,,,,
3,Denzel Ward,CB,2018,13.0,78.9,,,,,,
4,Bradley Chubb,DE,2018,16.0,68.1,,,,,,
6,Josh Allen,QB,2018,12.0,65.3,,,,,,
7,Roquan Smith,ILB,2018,16.0,65.0,,,,,,
9,Josh Rosen,QB,2018,14.0,49.1,,,,,,
10,Minkah Fitzpatrick,S,2018,16.0,61.9,,,,,,
11,Vita Vea,DT,2018,13.0,74.2,,,,,,


In [36]:
all_drafts.isnull().sum()

Unnamed: 0              0
college_stats_url       0
draft_age              16
draft_overall           0
draft_round             0
draft_year              0
player                  0
position                0
pro_stats_url          12
school                  0
team_nfl                0
games_y1              422
ov_grade_y1           422
games_y2              595
ov_grade_y2           595
games_y3              838
ov_grade_y3           838
games_y4             1054
ov_grade_y4          1054
dtype: int64

In [37]:
len(all_drafts)

1799

In [38]:
for index, row in all_drafts.head().iterrows():
    print(np.isnan(row['games_y2']))

True
True
True
True
True


I created the below variables to examine how many players in my dataframe did not end up playing any years in the NFL.  I did not drop these however because those would be important datapoints, indicating that these players were not successful compared to someone who did play. 

In [39]:
all_drafts['played_y1'] = all_drafts['games_y1'].map(lambda x: 1 if np.isnan(x) == False else 0)

In [40]:
all_drafts['played_y2'] = all_drafts['games_y2'].map(lambda x: 1 if np.isnan(x) == False else 0)

In [41]:
all_drafts['played_y3'] = all_drafts['games_y3'].map(lambda x: 1 if np.isnan(x) == False else 0)

In [42]:
all_drafts['played_y4'] = all_drafts['games_y4'].map(lambda x: 1 if np.isnan(x) == False else 0)

In [43]:
all_drafts['years_played'] = all_drafts.apply(lambda row: row['played_y1'] + row['played_y2'] + row['played_y3'] + row['played_y4'], axis=1)

In [44]:
all_drafts[['player', 'played_y1', 'played_y2', 'played_y3', 'played_y4', 'years_played']]

Unnamed: 0,player,played_y1,played_y2,played_y3,played_y4,years_played
0,Baker Mayfield,1,0,0,0,1
1,Saquon Barkley,1,0,0,0,1
2,Sam Darnold,1,0,0,0,1
3,Denzel Ward,1,0,0,0,1
4,Bradley Chubb,1,0,0,0,1
6,Josh Allen,1,0,0,0,1
7,Roquan Smith,1,0,0,0,1
9,Josh Rosen,1,0,0,0,1
10,Minkah Fitzpatrick,1,0,0,0,1
11,Vita Vea,1,0,0,0,1


In [45]:
all_drafts['years_played'].value_counts()

4    596
2    330
3    312
1    307
0    254
Name: years_played, dtype: int64

In [3]:
college_stats = pd.read_csv('./data/college/all_player_stats.csv')

In [4]:
college_stats.head()

Unnamed: 0.1,Unnamed: 0,player,year,school,conference,class,pos,games,pass_cmp,pass_att,...,fumbles_rec_td,fumbles_forced,punt_ret,punt_ret_yds,punt_ret_yds_per_ret,punt_ret_td,kick_ret,kick_ret_yds,kick_ret_yds_per_ret,kick_ret_td
0,0,Baker Mayfield,2013,Texas Tech,Big 12,FR,QB,8.0,218.0,340.0,...,,0.0,,,,,,,,
1,1,Baker Mayfield,2015,Oklahoma,Big 12,SO,QB,13.0,269.0,395.0,...,,0.0,,,,,,,,
2,2,Baker Mayfield,2016,Oklahoma,Big 12,JR,QB,13.0,254.0,358.0,...,,0.0,,,,,,,,
3,3,Baker Mayfield,2017,Oklahoma,Big 12,SR,QB,14.0,285.0,404.0,...,,0.0,,,,,,,,
4,4,Saquon Barkley,2015,Penn State,Big Ten,FR,RB,11.0,0.0,0.0,...,,0.0,,,,,,,,


In [5]:
college_stats.isnull().sum()

Unnamed: 0                 0
player                     0
year                       0
school                     0
conference                 1
class                     67
pos                        1
games                    198
pass_cmp                4527
pass_att                4527
pass_cmp_pct            4662
pass_yds                4527
pass_yds_per_att        4662
adj_pass_yds_per_att    4662
pass_td                 4527
pass_int                4527
pass_rating             4662
rush_att                3126
rush_yds                3126
rush_yds_per_att        3596
rush_td                 3126
rec                     2892
rec_yds                 2892
rec_yds_per_rec         3207
rec_td                  2892
scrim_att               2585
scrim_yds               2585
scrim_yds_per_att       2807
scrim_td                2585
tackles_solo             803
tackles_assists          803
tackles_total            803
tackles_loss             803
sacks                    803
def_int       

In [6]:
college_stats[['pass_cmp', 'pass_att', 'pass_cmp_pct', 'pass_yds',
       'pass_yds_per_att', 'adj_pass_yds_per_att', 'pass_td', 'pass_int',
       'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td',
       'rec', 'rec_yds', 'rec_yds_per_rec', 'rec_td', 'scrim_att', 'scrim_yds',
       'scrim_yds_per_att', 'scrim_td', 'tackles_solo', 'tackles_assists',
       'tackles_total', 'tackles_loss', 'sacks', 'def_int', 'def_int_yds',
       'def_int_yds_per_int', 'def_int_td', 'pass_defended', 'fumbles_rec',
       'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced', 'punt_ret',
       'punt_ret_yds', 'punt_ret_yds_per_ret', 'punt_ret_td', 'kick_ret',
       'kick_ret_yds', 'kick_ret_yds_per_ret', 'kick_ret_td']] = college_stats[['pass_cmp', 'pass_att', 'pass_cmp_pct', 'pass_yds',
       'pass_yds_per_att', 'adj_pass_yds_per_att', 'pass_td', 'pass_int',
       'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td',
       'rec', 'rec_yds', 'rec_yds_per_rec', 'rec_td', 'scrim_att', 'scrim_yds',
       'scrim_yds_per_att', 'scrim_td', 'tackles_solo', 'tackles_assists',
       'tackles_total', 'tackles_loss', 'sacks', 'def_int', 'def_int_yds',
       'def_int_yds_per_int', 'def_int_td', 'pass_defended', 'fumbles_rec',
       'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced', 'punt_ret',
       'punt_ret_yds', 'punt_ret_yds_per_ret', 'punt_ret_td', 'kick_ret',
       'kick_ret_yds', 'kick_ret_yds_per_ret', 'kick_ret_td']].fillna(value=0)

I chose to fill null values in my college stats dataframe with 0 values, because the assumption is that if it was not included on a table on pro-football-reference.com there were not stats accumulated in that category that year for that player.  Rather than having a table full of zeros for multiple categories, that website opted to just have those specific categories missing.  Because of this I have to assume those values are zeros for those players, however this could be a limitation of my modeling later on, and other than manually checking and imputing values I don't really have a way of knowing whether or not a stat is actually missing, or just not accumulated that season meaning it would be a zero. 

In [7]:
college_stats.isnull().sum()

Unnamed: 0                0
player                    0
year                      0
school                    0
conference                1
class                    67
pos                       1
games                   198
pass_cmp                  0
pass_att                  0
pass_cmp_pct              0
pass_yds                  0
pass_yds_per_att          0
adj_pass_yds_per_att      0
pass_td                   0
pass_int                  0
pass_rating               0
rush_att                  0
rush_yds                  0
rush_yds_per_att          0
rush_td                   0
rec                       0
rec_yds                   0
rec_yds_per_rec           0
rec_td                    0
scrim_att                 0
scrim_yds                 0
scrim_yds_per_att         0
scrim_td                  0
tackles_solo              0
tackles_assists           0
tackles_total             0
tackles_loss              0
sacks                     0
def_int                   0
def_int_yds         

I create two dataframes from the larger college stats dataframe that I have, those being avg_college_stats and final_year_college_stats, as these would be my way of creating one single datapoint for each player that I could map to my draft_picks, rather than having multiple lines for different years of college stats.  Later on in modeling I opt to use the avg_college_stats over final_year_college_stats. 

In [8]:
avg_college_stats = college_stats.groupby('player', sort=False).mean()

In [9]:
avg_college_stats.head()

Unnamed: 0_level_0,Unnamed: 0,year,games,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_yds_per_att,adj_pass_yds_per_att,pass_td,...,fumbles_rec_td,fumbles_forced,punt_ret,punt_ret_yds,punt_ret_yds_per_ret,punt_ret_td,kick_ret,kick_ret_yds,kick_ret_yds_per_ret,kick_ret_td
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baker Mayfield,1.5,2015.25,12.0,256.5,374.25,68.4,3651.75,9.7,10.475,32.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Saquon Barkley,5.0,2016.0,12.666667,0.666667,0.666667,33.333333,12.0,6.0,9.333333,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sam Darnold,8.0,2016.0,13.5,183.0,282.0,43.433333,2409.666667,5.666667,5.833333,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Denzel Ward,11.0,2016.0,8.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,3.333333,3.333333,0.0,0.0,0.0,0.0,0.0
Bradley Chubb,14.5,2015.5,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
max_year = college_stats.groupby('player', sort=False).max()['year']

In [11]:
college_stats['final_year'] = np.nan
for name, year in max_year.items():
    print(name)
    for index, row in college_stats.iterrows():
        if row['player'] == name:
            college_stats.loc[index, 'final_year'] = year

Baker Mayfield
Saquon Barkley
Sam Darnold
Denzel Ward
Bradley Chubb
Josh Allen
Roquan Smith
Josh Rosen
Minkah Fitzpatrick
Vita Vea
Daron Payne
Marcus Davenport
Tremaine Edmunds
Derwin James
Jaire Alexander
Leighton Vander Esch
Rashaan Evans
D.J. Moore
Hayden Hurst
Calvin Ridley
Rashaad Penny
Terrell Edmunds
Taven Bryan
Mike Hughes
Sony Michel
Lamar Jackson
Nick Chubb
Ronald Jones
Courtland Sutton
Harold Landry
Mike Gesicki
Kerryon Johnson
Dante Pettis
Joshua Jackson
Breeland Speaks
Christian Kirk
Uchenna Nwosu
Anthony Miller
Kemoko Turay
M.J. Stewart
Jessie Bates
Donte Jackson
Duke Dawson
Isaiah Oliver
Derrius Guice
James Washington
D.J. Chark
Carlton Davis
Tyquan Lewis
Lorenzo Carter
Chad Thomas
Justin Reid
B.J. Hill
Fred Warner
Royce Freeman
Jerome Baker
Derrick Nnadi
Mason Rudolph
Sam Hubbard
Malik Jefferson
Rasheem Green
Michael Gallup
Tracy Walker
Justin Jones
Rashaan Gaulden
Mark Andrews
Arden Key
Oren Burks
Deadrin Senat
Tre'Quan Smith
Ronnie Harrison
Tarvarius Moore
Harrison Ph

Stephone Anthony
Landon Collins
Mario Edwards Jr.
T.J. Yeldon
Devin Smith
Preston Smith
Eddie Goldman
Dorial Green-Beckham
Devin Funchess
Jalen Collins
Benardrick McKinney
Hau'oli Kikaha
Eric Kendricks
Eric Rowe
Denzel Perryman
Ronald Darby
Nate Orchard
Ameer Abdullah
Maxx Williams
Senquez Golson
Markus Golden
Randy Gregory
Quinten Rollins
Frank Clark
Jordan Richards
D'Joun Smith
Clive Walford
Tyler Lockett
Jaelen Strong
Tevin Coleman
Owamagbe Odighizuwa
Garrett Grayson
Chris Conley
Duke Johnson
P.J. Williams
Eli Harold
Alex Carter
Lorenzo Mauldin
Craig Mager
Jordan Hicks
Tyler Kroft
Sammie Coates
Danielle Hunter
Sean Mannion
Carl Davis
Jeff Heuerman
Henry Anderson
Ty Montgomery
Matt Jones
Xavier Cooper
Geneo Grissom
Steven Nelson
Paul Dawson
Trey Flowers
Bryce Petty
James Sample
Jamison Crowder
Jeremy Langford
Justin Hardy
Jalston Fowler
Clayton Geathers
Gabe Wright
Ibraheim Campbell
Blake Bell
Ramik Wilson
Josh Shaw
Doran Grant
Za'Darius Smith
Vince Mayle
Kwon Alexander
Javorius Alle

Randall Cobb
Terrell McClain
Dontay Moch
Kelvin Sheppard
Rob Housler
Justin Houston
Demarco Murray
Martez Wilson
Stevan Ridley
Ryan Mallett
Jurrell Casey
Austin Pettis
Leonard Hankerson
Vincent Brown
Jerrel Jernigan
Allen Bailey
Drake Nevis
Akeem Dent
Alex Green
Sione Fua
Jordan Cameron
Luke Stocker
Roy Helu
Christian Ballard
Kris Durham
Colin McCarthy
Greg Salas
Kendall Hunter
Casey Matthews
Delone Carter
Tandon Doss
Owen Marecic
Bilal Powell
Jamie Harper
Kealoha Pilares
Johnny White
Anthony Sherman
Gabe Miller
D.J. Williams
Karl Klug
Jacquizz Rodgers
Denarius Moore
Dion Lewis
T.J. Yates
Jeremy Kerley
Niles Paul
Doug Hogue
Lee Smith
Nathan Enderle
Chris Carter
Pernell McPhee
Lawrence Wilson
Ryan Whalen
Chris White
Quan Sturdivant
Charles Clay
Dwayne Harris
Evan Royster
Aldrick Robinson
Tyrod Taylor
Richard Gordon
Ronald Johnson
Jordan Todman
David Carter
Greg Jones
Allen Bradford
Mike Mohamed
Brian Rolle
Jerrell Powe
Ross Homan
Jacquian Williams
Virgil Green
Greg McElroy
Bruce Miller


In [12]:
college_stats[['player', 'year', 'final_year']]

Unnamed: 0,player,year,final_year
0,Baker Mayfield,2013,2017.0
1,Baker Mayfield,2015,2017.0
2,Baker Mayfield,2016,2017.0
3,Baker Mayfield,2017,2017.0
4,Saquon Barkley,2015,2017.0
5,Saquon Barkley,2016,2017.0
6,Saquon Barkley,2017,2017.0
7,Sam Darnold,2015,2017.0
8,Sam Darnold,2016,2017.0
9,Sam Darnold,2017,2017.0


In [13]:
final_year_college_stats = college_stats[college_stats['year']==college_stats['final_year']]

In [14]:
final_year_college_stats.head()

Unnamed: 0.1,Unnamed: 0,player,year,school,conference,class,pos,games,pass_cmp,pass_att,...,fumbles_forced,punt_ret,punt_ret_yds,punt_ret_yds_per_ret,punt_ret_td,kick_ret,kick_ret_yds,kick_ret_yds_per_ret,kick_ret_td,final_year
3,3,Baker Mayfield,2017,Oklahoma,Big 12,SR,QB,14.0,285.0,404.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0
6,6,Saquon Barkley,2017,Penn State,Big Ten,JR,RB,13.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0
9,9,Sam Darnold,2017,USC,Pac-12,SO,QB,14.0,303.0,480.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0
12,12,Denzel Ward,2017,Ohio State,Big Ten,JR,CB,11.0,0.0,0.0,...,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,2017.0
16,16,Bradley Chubb,2017,North Carolina State,ACC,SR,DE,12.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0


In [15]:
college_stats.drop(columns='Unnamed: 0', inplace=True)

In [16]:
college_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5341 entries, 0 to 5340
Data columns (total 51 columns):
player                  5341 non-null object
year                    5341 non-null int64
school                  5341 non-null object
conference              5340 non-null object
class                   5274 non-null object
pos                     5340 non-null object
games                   5143 non-null float64
pass_cmp                5341 non-null float64
pass_att                5341 non-null float64
pass_cmp_pct            5341 non-null float64
pass_yds                5341 non-null float64
pass_yds_per_att        5341 non-null float64
adj_pass_yds_per_att    5341 non-null float64
pass_td                 5341 non-null float64
pass_int                5341 non-null float64
pass_rating             5341 non-null float64
rush_att                5341 non-null float64
rush_yds                5341 non-null float64
rush_yds_per_att        5341 non-null float64
rush_td                 53

In [17]:
new_cols = college_stats.drop(columns=['player', 'school', 'conference', 'class', 'pos', 'games', 'year', 'final_year']).columns
avg_cols = ['avg_' + i for i in new_cols]  
final_year_cols = ['final_year_' + i for i in new_cols]

In [46]:
all_drafts['conference'] = np.nan

for i in avg_cols:
    all_drafts[i] = np.nan

for i  in final_year_cols:
    all_drafts[i] = np.nan

Mapping average college stats to draft picks dataframe with new column names

In [47]:
for index, row in all_drafts.iterrows():
    try:
        all_drafts.loc[index, 'avg_pass_cmp'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_cmp'][0]
        all_drafts.loc[index, 'avg_pass_att'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_att'][0]
        all_drafts.loc[index, 'avg_pass_cmp_pct'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_cmp_pct'][0]
        all_drafts.loc[index, 'avg_pass_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_yds'][0]
        all_drafts.loc[index, 'avg_pass_yds_per_att'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_yds_per_att'][0]
        all_drafts.loc[index, 'avg_adj_pass_yds_per_att'] = avg_college_stats[avg_college_stats.index==row['player']]['adj_pass_yds_per_att'][0]
        all_drafts.loc[index, 'avg_pass_td'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_td'][0]
        all_drafts.loc[index, 'avg_pass_int'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_int'][0]
        all_drafts.loc[index, 'avg_pass_rating'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_rating'][0]
        all_drafts.loc[index, 'avg_rush_att'] = avg_college_stats[avg_college_stats.index==row['player']]['rush_att'][0]
        all_drafts.loc[index, 'avg_rush_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['rush_yds'][0]
        all_drafts.loc[index, 'avg_rush_yds_per_att'] = avg_college_stats[avg_college_stats.index==row['player']]['rush_yds_per_att'][0]
        all_drafts.loc[index, 'avg_rush_td'] = avg_college_stats[avg_college_stats.index==row['player']]['rush_td'][0]
        all_drafts.loc[index, 'avg_rec'] = avg_college_stats[avg_college_stats.index==row['player']]['rec'][0]
        all_drafts.loc[index, 'avg_rec_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['rec_yds'][0]
        all_drafts.loc[index, 'avg_rec_yds_per_rec'] = avg_college_stats[avg_college_stats.index==row['player']]['rec_yds_per_rec'][0]
        all_drafts.loc[index, 'avg_rec_td'] = avg_college_stats[avg_college_stats.index==row['player']]['rec_td'][0]
        all_drafts.loc[index, 'avg_scrim_att'] = avg_college_stats[avg_college_stats.index==row['player']]['scrim_att'][0]
        all_drafts.loc[index, 'avg_scrim_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['scrim_yds'][0]
        all_drafts.loc[index, 'avg_scrim_yds_per_att'] = avg_college_stats[avg_college_stats.index==row['player']]['scrim_yds_per_att'][0]
        all_drafts.loc[index, 'avg_scrim_td'] = avg_college_stats[avg_college_stats.index==row['player']]['scrim_td'][0]
        all_drafts.loc[index, 'avg_tackles_solo'] = avg_college_stats[avg_college_stats.index==row['player']]['tackles_solo'][0]
        all_drafts.loc[index, 'avg_tackles_assists'] = avg_college_stats[avg_college_stats.index==row['player']]['tackles_assists'][0]
        all_drafts.loc[index, 'avg_tackles_total'] = avg_college_stats[avg_college_stats.index==row['player']]['tackles_total'][0]
        all_drafts.loc[index, 'avg_tackles_loss'] = avg_college_stats[avg_college_stats.index==row['player']]['tackles_loss'][0]
        all_drafts.loc[index, 'avg_sacks'] = avg_college_stats[avg_college_stats.index==row['player']]['sacks'][0]
        all_drafts.loc[index, 'avg_def_int'] = avg_college_stats[avg_college_stats.index==row['player']]['def_int'][0]
        all_drafts.loc[index, 'avg_def_int_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['def_int_yds'][0]
        all_drafts.loc[index, 'avg_def_int_yds_per_int'] = avg_college_stats[avg_college_stats.index==row['player']]['def_int_yds_per_int'][0]
        all_drafts.loc[index, 'avg_def_int_td'] = avg_college_stats[avg_college_stats.index==row['player']]['def_int_td'][0]
        all_drafts.loc[index, 'avg_pass_defended'] = avg_college_stats[avg_college_stats.index==row['player']]['pass_defended'][0]
        all_drafts.loc[index, 'avg_fumbles_rec'] = avg_college_stats[avg_college_stats.index==row['player']]['fumbles_rec'][0]
        all_drafts.loc[index, 'avg_fumbles_rec_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['fumbles_rec_yds'][0]
        all_drafts.loc[index, 'avg_fumbles_rec_td'] = avg_college_stats[avg_college_stats.index==row['player']]['fumbles_rec_td'][0]
        all_drafts.loc[index, 'avg_fumbles_forced'] = avg_college_stats[avg_college_stats.index==row['player']]['fumbles_forced'][0]
        all_drafts.loc[index, 'avg_punt_ret'] = avg_college_stats[avg_college_stats.index==row['player']]['punt_ret'][0]
        all_drafts.loc[index, 'avg_punt_ret_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['punt_ret_yds'][0]
        all_drafts.loc[index, 'avg_punt_ret_yds_per_ret'] = avg_college_stats[avg_college_stats.index==row['player']]['punt_ret_yds_per_ret'][0]
        all_drafts.loc[index, 'avg_punt_ret_td'] = avg_college_stats[avg_college_stats.index==row['player']]['punt_ret_td'][0]
        all_drafts.loc[index, 'avg_kick_ret'] = avg_college_stats[avg_college_stats.index==row['player']]['kick_ret'][0]
        all_drafts.loc[index, 'avg_kick_ret_yds'] = avg_college_stats[avg_college_stats.index==row['player']]['kick_ret_yds'][0]
        all_drafts.loc[index, 'avg_kick_ret_yds_per_ret'] = avg_college_stats[avg_college_stats.index==row['player']]['kick_ret_yds_per_ret'][0]
        all_drafts.loc[index, 'avg_kick_ret_td'] = avg_college_stats[avg_college_stats.index==row['player']]['kick_ret_td'][0]
        print(index)
        print(row['player'])
    except IndexError:
        pass

0
Baker Mayfield
1
Saquon Barkley
2
Sam Darnold
3
Denzel Ward
4
Bradley Chubb
6
Josh Allen
7
Roquan Smith
9
Josh Rosen
10
Minkah Fitzpatrick
11
Vita Vea
12
Daron Payne
13
Marcus Davenport
15
Tremaine Edmunds
16
Derwin James
17
Jaire Alexander
18
Leighton Vander Esch
21
Rashaan Evans
23
D.J. Moore
24
Hayden Hurst
25
Calvin Ridley
26
Rashaad Penny
27
Terrell Edmunds
28
Taven Bryan
29
Mike Hughes
30
Sony Michel
31
Lamar Jackson
34
Nick Chubb
37
Ronald Jones
39
Courtland Sutton
40
Harold Landry
41
Mike Gesicki
42
Kerryon Johnson
43
Dante Pettis
45
Breeland Speaks
46
Christian Kirk
47
Uchenna Nwosu
50
Anthony Miller
51
Kemoko Turay
52
M.J. Stewart
53
Jessie Bates
54
Donte Jackson
55
Duke Dawson
57
Isaiah Oliver
58
Derrius Guice
59
James Washington
62
Carlton Davis
63
Tyquan Lewis
65
Lorenzo Carter
66
Chad Thomas
67
Justin Reid
68
B.J. Hill
69
Fred Warner
70
Royce Freeman
72
Jerome Baker
74
Derrick Nnadi
75
Mason Rudolph
76
Sam Hubbard
77
Malik Jefferson
78
Rasheem Green
80
Michael Gallup
81

683
Jatavis Brown
684
Andy Janovich
686
D.J. White
690
Keenan Reynolds
691
Devante Bond
692
Jerell Adams
694
Jakeem Grant
695
Nate Sudfeld
696
David Morgan
697
Anthony Brown
698
Josh Forrest
699
Jake Rudock
700
Kolby Listenbee
702
Cory James
704
Blake Countess
705
Dan Vitale
706
Derek Watt
707
Cody Core
709
Brandon Allen
710
Anthony Zettel
711
Dadi Nicolas
712
Jordan Lucas
714
Mike Thomas
715
Jeff Driskel
717
Maurice Canady
719
Kelvin Taylor
720
Kavon Frazier
721
Aaron Burbridge
722
Elandon Roberts
726
Kevon Seymour
728
Travis Feeney
730
Aaron Wallace
731
Brandon Doughty
733
Devin Lucien
735
Stephen Weatherly
737
Demarcus Ayers
738
Daniel Braverman
739
Thomas Duarte
740
Steven Daniels
741
Jalen Mills
744
Dwayne Washington
745
Daniel Lasco
746
Devin Fuller
748
Alex McCalister
749
Charone Peake
750
Keith Marshall
751
Kenny Lawler
752
Jayron Kearse
753
Clayton Fejedelem
754
Tyler Matakevich
755
Zac Brooks
758
Scooby Wright
759
Joe Walker
761
Kalan Reed
762
Jameis Winston
763
Marcus Mariot

1507
Zac Dysert
1508
Steve Beauharnais
1509
Marquess Wilson
1510
B.J. Daniels
1512
David King
1522
Sean Renfree
1526
Michael Cox
1527
Justice Cunningham
1528
Andrew Luck
1530
Trent Richardson
1532
Justin Blackmon
1535
Ryan Tannehill
1536
Luke Kuechly
1538
Dontari Poe
1539
Fletcher Cox
1540
Michael Floyd
1541
Michael Brockers
1542
Bruce Irvin
1543
Quinton Coples
1545
Melvin Ingram
1546
Shea McClellin
1547
Kendall Wright
1548
Chandler Jones
1549
Brandon Weeden
1552
Dont'a Hightower
1553
Whitney Mercilus
1555
Nick Perry
1557
A.J. Jenkins
1558
Doug Martin
1559
David Wilson
1561
Coby Fleener
1562
Courtney Upshaw
1563
Derek Wolfe
1565
Andre Branch
1570
Stephen Hill
1572
Alshon Jeffery
1573
Mychal Kendricks
1574
Bobby Wagner
1576
Kendall Reyes
1577
Isaiah Pead
1578
Jerel Worthy
1579
Zach Brown
1580
Devon Still
1581
Ryan Broyles
1584
Brock Osweiler
1585
Lavonte David
1586
Vinny Curry
1588
LaMichael James
1590
Rueben Randle
1591
Dwayne Allen
1593
Josh Robinson
1594
Ronnie Hillman
1599
Olivier V

2434
Quinn Johnson
2435
Scott McKillop
2436
Nic Harris
2439
Jasper Brinkley
2441
James Casey
2443
Marcus Freeman
2447
Cody Glenn
2458
Frank Summers
2462
Javon Ringer
2463
Tom Brandstater
2465
Spencer Adkins
2471
Jarius Wynn
2473
Bear Pascoe
2474
Cedric Peerman
2475
Robert Henson
2478
Kevin Ellison
2481
Aaron Brown
2483
Brandon Gibson
2484
James Davis
2488
Stryker Sulak
2490
Curtis Painter
2491
Brandon Myers
2493
Will Davis
2496
Myron Pryor
2497
John Phillips
2499
Vance Walker
2500
Chris Ogbonnaya
2507
Brad Jones
2513
Demetrius Byrd
2518
Manuel Johnson
2519
Moise Fokou
2521
Julian Edelman
2522
Sammie Stroughter
2524
Zack Follett
2526
Jake O'Connell
2529
LaRod Stephens-Howling
2530
David Johnson
2532
Marko Mitchell
2533
Ricky Jean-Francois
2536
Nick Reed
2537
Cameron Morrah
2538
Clinton McDonald
2540
Derek Kinder
2541
Freddie Brown
2542
Tiquan Underwood
2544
Dan Gronkowski


Mapping final year college stats to draft picks dataframe with new column names

In [48]:
for index, row in all_drafts.iterrows():
    try:    
        print(index)
        print(row['player'])
        player_index = final_year_college_stats[final_year_college_stats['player']==row['player']].index[0]
        
        all_drafts.loc[index, 'conference'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['conference'][player_index]
        all_drafts.loc[index, 'final_year_pass_cmp'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_cmp'][player_index]
        all_drafts.loc[index, 'final_year_pass_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_att'][player_index]
        all_drafts.loc[index, 'final_year_pass_cmp_pct'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_cmp_pct'][player_index]
        all_drafts.loc[index, 'final_year_pass_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_yds'][player_index]
        all_drafts.loc[index, 'final_year_pass_yds_per_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_yds_per_att'][player_index]
        all_drafts.loc[index, 'final_year_adj_pass_yds_per_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['adj_pass_yds_per_att'][player_index]
        all_drafts.loc[index, 'final_year_pass_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_td'][player_index]
        all_drafts.loc[index, 'final_year_pass_int'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_int'][player_index]
        all_drafts.loc[index, 'final_year_pass_rating'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_rating'][player_index]
        all_drafts.loc[index, 'final_year_rush_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rush_att'][player_index]
        all_drafts.loc[index, 'final_year_rush_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rush_yds'][player_index]
        all_drafts.loc[index, 'final_year_rush_yds_per_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rush_yds_per_att'][player_index]
        all_drafts.loc[index, 'final_year_rush_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rush_td'][player_index]
        all_drafts.loc[index, 'final_year_rec'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rec'][player_index]
        all_drafts.loc[index, 'final_year_rec_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rec_yds'][player_index]
        all_drafts.loc[index, 'final_year_rec_yds_per_rec'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rec_yds_per_rec'][player_index]
        all_drafts.loc[index, 'final_year_rec_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['rec_td'][player_index]
        all_drafts.loc[index, 'final_year_scrim_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['scrim_att'][player_index]
        all_drafts.loc[index, 'final_year_scrim_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['scrim_yds'][player_index]
        all_drafts.loc[index, 'final_year_scrim_yds_per_att'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['scrim_yds_per_att'][player_index]
        all_drafts.loc[index, 'final_year_scrim_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['scrim_td'][player_index]
        all_drafts.loc[index, 'final_year_tackles_solo'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['tackles_solo'][player_index]
        all_drafts.loc[index, 'final_year_tackles_assists'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['tackles_assists'][player_index]
        all_drafts.loc[index, 'final_year_tackles_total'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['tackles_total'][player_index]
        all_drafts.loc[index, 'final_year_tackles_loss'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['tackles_loss'][player_index]
        all_drafts.loc[index, 'final_year_sacks'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['sacks'][player_index]
        all_drafts.loc[index, 'final_year_def_int'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['def_int'][player_index]
        all_drafts.loc[index, 'final_year_def_int_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['def_int_yds'][player_index]
        all_drafts.loc[index, 'final_year_def_int_yds_per_int'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['def_int_yds_per_int'][player_index]
        all_drafts.loc[index, 'final_year_def_int_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['def_int_td'][player_index]
        all_drafts.loc[index, 'final_year_pass_defended'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['pass_defended'][player_index]
        all_drafts.loc[index, 'final_year_fumbles_rec'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['fumbles_rec'][player_index]
        all_drafts.loc[index, 'final_year_fumbles_rec_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['fumbles_rec_yds'][player_index]
        all_drafts.loc[index, 'final_year_fumbles_rec_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['fumbles_rec_td'][player_index]
        all_drafts.loc[index, 'final_year_fumbles_forced'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['fumbles_forced'][player_index]
        all_drafts.loc[index, 'final_year_punt_ret'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['punt_ret'][player_index]
        all_drafts.loc[index, 'final_year_punt_ret_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['punt_ret_yds'][player_index]
        all_drafts.loc[index, 'final_year_punt_ret_yds_per_ret'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['punt_ret_yds_per_ret'][player_index]
        all_drafts.loc[index, 'final_year_punt_ret_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['punt_ret_td'][player_index]
        all_drafts.loc[index, 'final_year_kick_ret'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['kick_ret'][player_index]
        all_drafts.loc[index, 'final_year_kick_ret_yds'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['kick_ret_yds'][player_index]
        all_drafts.loc[index, 'final_year_kick_ret_yds_per_ret'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['kick_ret_yds_per_ret'][player_index]
        all_drafts.loc[index, 'final_year_kick_ret_td'] = final_year_college_stats[final_year_college_stats['player']==row['player']]['kick_ret_td'][player_index]
    except IndexError:
        pass

0
Baker Mayfield
1
Saquon Barkley
2
Sam Darnold
3
Denzel Ward
4
Bradley Chubb
6
Josh Allen
7
Roquan Smith
9
Josh Rosen
10
Minkah Fitzpatrick
11
Vita Vea
12
Daron Payne
13
Marcus Davenport
15
Tremaine Edmunds
16
Derwin James
17
Jaire Alexander
18
Leighton Vander Esch
21
Rashaan Evans
23
D.J. Moore
24
Hayden Hurst
25
Calvin Ridley
26
Rashaad Penny
27
Terrell Edmunds
28
Taven Bryan
29
Mike Hughes
30
Sony Michel
31
Lamar Jackson
34
Nick Chubb
37
Ronald Jones
39
Courtland Sutton
40
Harold Landry
41
Mike Gesicki
42
Kerryon Johnson
43
Dante Pettis
44
Josh Jackson
45
Breeland Speaks
46
Christian Kirk
47
Uchenna Nwosu
50
Anthony Miller
51
Kemoko Turay
52
M.J. Stewart
53
Jessie Bates
54
Donte Jackson
55
Duke Dawson
57
Isaiah Oliver
58
Derrius Guice
59
James Washington
60
DJ Chark
62
Carlton Davis
63
Tyquan Lewis
65
Lorenzo Carter
66
Chad Thomas
67
Justin Reid
68
B.J. Hill
69
Fred Warner
70
Royce Freeman
72
Jerome Baker
74
Derrick Nnadi
75
Mason Rudolph
76
Sam Hubbard
77
Malik Jefferson
78
Rashee

600
Brandon Williams
601
Cody Kessler
602
Nick Vannett
604
Vincent Valentine
606
Justin Simmons
607
Joe Schobert
608
Connor Cook
609
Charles Tapper
610
Joshua Perry
611
Sheldon Day
612
Tavon Young
614
Eric Murray
615
Chris Moore
617
B.J. Goodson
618
Tyler Higbee
620
Malcolm Mitchell
621
Nick Kwiatkoski
622
Ricardo Louis
623
De'Vondre Campbell
624
Hassan Ridgeway
625
Pharoh Cooper
626
Juston Burris
627
Tyler Ervin
630
Andrew Billings
632
Deon Bush
633
Antonio Morrison
634
Demarcus Robinson
637
Derrick Kindred
639
Blake Martinez
640
Willie Henry
641
Rashard Robinson
642
Kenneth Dixon
643
Dak Prescott
644
Devontae Booker
645
Dean Lowry
647
Cardale Jones
648
Tajae Sharpe
649
Zack Sanchez
650
Ronald Blair
651
DeAndre Washington
655
Quinton Jefferson
657
Paul Perkins
658
Jordan Howard
660
Matthew Ioannidis
661
Wendell Smallwood
662
Jordan Payton
664
Jonathan Williams
667
K.J. Dillon
668
Kentrell Brothers
670
Kevin Hogan
671
Trevor Davis
674
D.J. Reader
677
Antwione Williams
679
Alex Collins


1318
Kevin Minter
1319
Kiko Alonso
1320
Gavin Escobar
1321
Le'Veon Bell
1322
Johnathan Hankins
1323
Jonathan Bostic
1324
David Amerson
1325
Jamie Collins
1326
Margus Hunt
1327
Jamar Taylor
1328
Vance McDonald
1329
Arthur Brown
1330
D.J. Swearinger
1331
Montee Ball
1332
Aaron Dobson
1334
Eddie Lacy
1335
Christine Michael
1336
Travis Kelce
1337
Dwayne Gratz
1339
Sio Moore
1340
Bennie Logan
1341
Leon McFadden
1343
Blidi Wreh-Wilson
1344
T.J. McDonald
1346
Mike Glennon
1347
Terrance Williams
1349
Keenan Allen
1351
Marquise Goodwin
1352
Markus Wheaton
1354
Damontre Moore
1355
John Jenkins
1356
Logan Ryan
1357
Shawn Williams
1358
Jordan Reed
1360
Jordan Hill
1361
Corey Lemonier
1363
Kayvon Webster
1364
Duron Harmon
1365
Stedman Bailey
1366
Will Davis
1368
Sam Montgomery
1369
Knile Davis
1370
Zaviar Gooden
1371
Matt Barkley
1372
Nico Johnson
1373
Akeem Spence
1374
Ace Sanders
1375
Josh Boyce
1376
Alex Okafor
1377
Jelani Jenkins
1378
Duke Williams
1379
Dion Sims
1383
Ryan Nassib
1384
Shamarko 

1991
Bruce Miller
1992
Zach Clayton
1993
Brandyn Thompson
1995
D'Aundre Reed
1996
Mikail Baker
1998
Ryan Taylor
1999
Malcolm Williams
2000
Shaun Chapas
2001
Da'Rel Scott
2002
Anthony Gaitor
2005
Anthony Allen
2006
Greg Romeus
2007
Scotty McKnight
2009
Jonathan Nelson
2010
Cliff Matthews
2012
Baron Batch
2013
Lawrence Guy
2014
Andrew Gachkar
2017
Greg Lloyd
2018
Daniel Hardy
2020
Stanley Havili
2021
David Ausberry
2022
Malcolm Smith
2023
Nate Bussey
2026
Jay Finley
2027
Jeremy Beal
2028
Eric Hagg
2029
DeMarco Sampson
2031
Tommie Campbell
2033
Chris Neild
2034
Cheta Ozougwu
2035
Sam Bradford
2036
Ndamukong Suh
2037
Gerald McCoy
2039
Eric Berry
2041
Joe Haden
2042
Rolando McClain
2043
C.J. Spiller
2044
Tyson Alualu
2046
Ryan Mathews
2047
Brandon Graham
2048
Earl Thomas
2049
Jason Pierre-Paul
2050
Derrick Morgan
2053
Sean Weatherspoon
2054
Kareem Jackson
2055
Jermaine Gresham
2056
Demaryius Thomas
2058
Dez Bryant
2059
Tim Tebow
2060
Dan Williams
2061
Devin McCourty
2062
Jared Odrick
2063
K

In [49]:
all_drafts[final_year_cols]

Unnamed: 0,final_year_pass_cmp,final_year_pass_att,final_year_pass_cmp_pct,final_year_pass_yds,final_year_pass_yds_per_att,final_year_adj_pass_yds_per_att,final_year_pass_td,final_year_pass_int,final_year_pass_rating,final_year_rush_att,...,final_year_fumbles_rec_td,final_year_fumbles_forced,final_year_punt_ret,final_year_punt_ret_yds,final_year_punt_ret_yds_per_ret,final_year_punt_ret_td,final_year_kick_ret,final_year_kick_ret_yds,final_year_kick_ret_yds_per_ret,final_year_kick_ret_td
0,285.0,404.0,70.5,4627.0,11.5,12.9,43.0,6.0,198.9,97.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,2.0,100.0,36.0,18.0,28.0,1.0,0.0,416.2,217.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,303.0,480.0,63.1,4143.0,8.6,8.5,26.0,13.0,148.1,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,152.0,270.0,56.3,1812.0,6.7,6.9,16.0,6.0,127.8,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,283.0,452.0,62.6,3756.0,8.3,8.5,26.0,10.0,147.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,39.0,39.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,17.0,17.0,0.0,0.0,0.0,0.0,0.0


In [51]:
all_drafts.drop(columns=['Unnamed: 0'], inplace=True)

In [52]:
for i in all_drafts.columns:
    print(i)

college_stats_url
draft_age
draft_overall
draft_round
draft_year
player
position
pro_stats_url
school
team_nfl
games_y1
ov_grade_y1
games_y2
ov_grade_y2
games_y3
ov_grade_y3
games_y4
ov_grade_y4
played_y1
played_y2
played_y3
played_y4
years_played
conference
avg_pass_cmp
avg_pass_att
avg_pass_cmp_pct
avg_pass_yds
avg_pass_yds_per_att
avg_adj_pass_yds_per_att
avg_pass_td
avg_pass_int
avg_pass_rating
avg_rush_att
avg_rush_yds
avg_rush_yds_per_att
avg_rush_td
avg_rec
avg_rec_yds
avg_rec_yds_per_rec
avg_rec_td
avg_scrim_att
avg_scrim_yds
avg_scrim_yds_per_att
avg_scrim_td
avg_tackles_solo
avg_tackles_assists
avg_tackles_total
avg_tackles_loss
avg_sacks
avg_def_int
avg_def_int_yds
avg_def_int_yds_per_int
avg_def_int_td
avg_pass_defended
avg_fumbles_rec
avg_fumbles_rec_yds
avg_fumbles_rec_td
avg_fumbles_forced
avg_punt_ret
avg_punt_ret_yds
avg_punt_ret_yds_per_ret
avg_punt_ret_td
avg_kick_ret
avg_kick_ret_yds
avg_kick_ret_yds_per_ret
avg_kick_ret_td
final_year_pass_cmp
final_year_pass_att
fi

In [53]:
all_drafts[all_drafts.columns[0:50]].isnull().sum()

college_stats_url              0
draft_age                     16
draft_overall                  0
draft_round                    0
draft_year                     0
player                         0
position                       0
pro_stats_url                 12
school                         0
team_nfl                       0
games_y1                     422
ov_grade_y1                  422
games_y2                     595
ov_grade_y2                  595
games_y3                     838
ov_grade_y3                  838
games_y4                    1054
ov_grade_y4                 1054
played_y1                      0
played_y2                      0
played_y3                      0
played_y4                      0
years_played                   0
conference                   340
avg_pass_cmp                 339
avg_pass_att                 339
avg_pass_cmp_pct             339
avg_pass_yds                 339
avg_pass_yds_per_att         339
avg_adj_pass_yds_per_att     339
avg_pass_t

In [54]:
all_drafts[all_drafts.columns[50:]].isnull().sum()

avg_def_int                        339
avg_def_int_yds                    339
avg_def_int_yds_per_int            339
avg_def_int_td                     339
avg_pass_defended                  339
avg_fumbles_rec                    339
avg_fumbles_rec_yds                339
avg_fumbles_rec_td                 339
avg_fumbles_forced                 339
avg_punt_ret                       339
avg_punt_ret_yds                   339
avg_punt_ret_yds_per_ret           339
avg_punt_ret_td                    339
avg_kick_ret                       339
avg_kick_ret_yds                   339
avg_kick_ret_yds_per_ret           339
avg_kick_ret_td                    339
final_year_pass_cmp                339
final_year_pass_att                339
final_year_pass_cmp_pct            339
final_year_pass_yds                339
final_year_pass_yds_per_att        339
final_year_adj_pass_yds_per_att    339
final_year_pass_td                 339
final_year_pass_int                339
final_year_pass_rating   

In [55]:
all_drafts[all_drafts['player'].isnull()==True]

Unnamed: 0,college_stats_url,draft_age,draft_overall,draft_round,draft_year,player,position,pro_stats_url,school,team_nfl,...,final_year_fumbles_rec_td,final_year_fumbles_forced,final_year_punt_ret,final_year_punt_ret_yds,final_year_punt_ret_yds_per_ret,final_year_punt_ret_td,final_year_kick_ret,final_year_kick_ret_yds,final_year_kick_ret_yds_per_ret,final_year_kick_ret_td


In [56]:
all_drafts.dropna(subset=['player'], inplace=True)

In [57]:
all_drafts.isnull().sum()

college_stats_url                     0
draft_age                            16
draft_overall                         0
draft_round                           0
draft_year                            0
player                                0
position                              0
pro_stats_url                        12
school                                0
team_nfl                              0
games_y1                            422
ov_grade_y1                         422
games_y2                            595
ov_grade_y2                         595
games_y3                            838
ov_grade_y3                         838
games_y4                           1054
ov_grade_y4                        1054
played_y1                             0
played_y2                             0
played_y3                             0
played_y4                             0
years_played                          0
conference                          340
avg_pass_cmp                        339


In [58]:
all_drafts[all_drafts['avg_pass_cmp'].isnull()==True].isnull().sum()

college_stats_url                    0
draft_age                            3
draft_overall                        0
draft_round                          0
draft_year                           0
player                               0
position                             0
pro_stats_url                        3
school                               0
team_nfl                             0
games_y1                            85
ov_grade_y1                         85
games_y2                           101
ov_grade_y2                        101
games_y3                           133
ov_grade_y3                        133
games_y4                           160
ov_grade_y4                        160
played_y1                            0
played_y2                            0
played_y3                            0
played_y4                            0
years_played                         0
conference                         339
avg_pass_cmp                       339
avg_pass_att             

In [59]:
all_drafts[all_drafts['avg_pass_cmp'].isnull()==True]

Unnamed: 0,college_stats_url,draft_age,draft_overall,draft_round,draft_year,player,position,pro_stats_url,school,team_nfl,...,final_year_fumbles_rec_td,final_year_fumbles_forced,final_year_punt_ret,final_year_punt_ret_yds,final_year_punt_ret_yds_per_ret,final_year_punt_ret_td,final_year_kick_ret,final_year_kick_ret_yds,final_year_kick_ret_yds_per_ret,final_year_kick_ret_td
44,http://www.sports-reference.com/cfb/players/jo...,22.0,45,2,2018,Josh Jackson,CB,https://www.pro-football-reference.com/players...,Iowa,GNB,...,,,,,,,,,,
60,http://www.sports-reference.com/cfb/players/dj...,21.0,61,2,2018,DJ Chark,WR,https://www.pro-football-reference.com/players...,LSU,JAX,...,,,,,,,,,,
106,http://www.sports-reference.com/cfb/players/ch...,22.0,107,4,2018,Chris Herndon,TE,https://www.pro-football-reference.com/players...,Miami (FL),NYJ,...,,,,,,,,,,
115,http://www.sports-reference.com/cfb/players/do...,21.0,116,4,2018,Dorance Armstrong,DE,https://www.pro-football-reference.com/players...,Kansas,DAL,...,,,,,,,,,,
138,http://www.sports-reference.com/cfb/players/rj...,22.0,139,5,2018,R.J. McIntosh,DT,https://www.pro-football-reference.com/players...,Miami (FL),NYG,...,,,,,,,,,,
139,http://www.sports-reference.com/cfb/players/ma...,23.0,140,5,2018,Maurice Hurst,DT,https://www.pro-football-reference.com/players...,Michigan,OAK,...,,,,,,,,,,
141,http://www.sports-reference.com/cfb/players/dj...,21.0,142,5,2018,D.J. Reed,CB,https://www.pro-football-reference.com/players...,Kansas St.,SFO,...,,,,,,,,,,
142,http://www.sports-reference.com/cfb/players/ja...,22.0,143,5,2018,Ja'Whaun Bentley,ILB,https://www.pro-football-reference.com/players...,Purdue,NWE,...,,,,,,,,,,
160,http://www.sports-reference.com/cfb/players/je...,23.0,161,5,2018,Jermaine Carter,LB,https://www.pro-football-reference.com/players...,Maryland,CAR,...,,,,,,,,,,
207,http://www.sports-reference.com/cfb/players/ce...,22.0,208,6,2018,Cedrick Wilson,WR,https://www.pro-football-reference.com/players...,Boise St.,DAL,...,,,,,,,,,,


In [60]:
college_stats[college_stats['player']=='Mitchell Trubisky']

Unnamed: 0,player,year,school,conference,class,pos,games,pass_cmp,pass_att,pass_cmp_pct,...,fumbles_forced,punt_ret,punt_ret_yds,punt_ret_yds_per_ret,punt_ret_td,kick_ret,kick_ret_yds,kick_ret_yds_per_ret,kick_ret_td,final_year


#### Any rows that are showing null values for their college statistics at this point are rows that were included in the original college draft table, but for some reason threw an error when their college stats were scraped so they did not get any values.  For the time being I am going to drop these, but it would be good to revisit these in the future and try to include them for more inclusive data. 

In [61]:
all_drafts.dropna(subset=['avg_pass_cmp'], inplace=True)

In [62]:
all_drafts.isnull().sum()

college_stats_url                    0
draft_age                           13
draft_overall                        0
draft_round                          0
draft_year                           0
player                               0
position                             0
pro_stats_url                        9
school                               0
team_nfl                             0
games_y1                           337
ov_grade_y1                        337
games_y2                           494
ov_grade_y2                        494
games_y3                           705
ov_grade_y3                        705
games_y4                           894
ov_grade_y4                        894
played_y1                            0
played_y2                            0
played_y3                            0
played_y4                            0
years_played                         0
conference                           1
avg_pass_cmp                         0
avg_pass_att             

In [63]:
all_drafts[all_drafts['conference'].isnull()==True]

Unnamed: 0,college_stats_url,draft_age,draft_overall,draft_round,draft_year,player,position,pro_stats_url,school,team_nfl,...,final_year_fumbles_rec_td,final_year_fumbles_forced,final_year_punt_ret,final_year_punt_ret_yds,final_year_punt_ret_yds_per_ret,final_year_punt_ret_td,final_year_kick_ret,final_year_kick_ret_yds,final_year_kick_ret_yds_per_ret,final_year_kick_ret_td
555,http://www.sports-reference.com/cfb/players/mi...,23.0,47,2,2016,Michael Thomas,WR,https://www.pro-football-reference.com/players...,Ohio St.,NOR,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Ohio State is in the Big Ten conference, probably the only manual impuation I will do.  

#### The remaining null values for my target variables (games_y1, ov_grade_y1... etc.) that come from ProFootballFocus.com are null if there is no data for that player in the given year meaning they did not play a snap that season.  Since I am attempting to model a players immediate effecivness in the NFL, I am hoping to be able to factor out situations where a player did not play rather than give them a grade of zero for that year, considering factors such as getting injured before the season started, or being second string to an established starter or superstar on a given team.  I will however consider players who did not player at all for their first four years in the NFL a failure, as regular rookie contracts are generally 3 or 4 years long.  

In [64]:
all_drafts['conference'].unique()

array(['Big 12', 'Big Ten', 'Pac-12', 'ACC', 'MWC', 'SEC', 'CUSA',
       'American', 'Ind', 'Sun Belt', 'MAC', 'MVC', nan, 'Pac-10',
       'Big East', 'WAC'], dtype=object)

In [65]:
all_drafts.loc[555, 'conference'] = 'Big Ten'

In [66]:
all_drafts.loc[[555]]

Unnamed: 0,college_stats_url,draft_age,draft_overall,draft_round,draft_year,player,position,pro_stats_url,school,team_nfl,...,final_year_fumbles_rec_td,final_year_fumbles_forced,final_year_punt_ret,final_year_punt_ret_yds,final_year_punt_ret_yds_per_ret,final_year_punt_ret_td,final_year_kick_ret,final_year_kick_ret_yds,final_year_kick_ret_yds_per_ret,final_year_kick_ret_td
555,http://www.sports-reference.com/cfb/players/mi...,23.0,47,2,2016,Michael Thomas,WR,https://www.pro-football-reference.com/players...,Ohio St.,NOR,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Below I am creating one average grade as an average of a players first four years grades in the NFL.  I created logic such that if a player didn't play in a given year ('games_y1 would be null for example) or they played less than 5 games that year, then that year would not be included in the average.

In [67]:
for index, row in all_drafts.iterrows():
    numerator = 0
    denominator = 0
    
    if row['games_y1'] > 4:
        numerator += row['ov_grade_y1']
        denominator += 1
    
    if row['games_y2'] > 4:
        numerator += row['ov_grade_y2']
        denominator += 1
    
    if row['games_y3'] > 4:
        numerator += row['ov_grade_y3']
        denominator += 1
    
    if row['games_y4'] > 4:
        numerator += row['ov_grade_y4']
        denominator += 1

    try:
        all_drafts.loc[index, 'avg_grade'] = numerator / denominator
    except ZeroDivisionError:
        all_drafts.loc[index, 'avg_grade'] = 0

In [68]:
all_drafts.isnull().sum()

college_stats_url                    0
draft_age                           13
draft_overall                        0
draft_round                          0
draft_year                           0
player                               0
position                             0
pro_stats_url                        9
school                               0
team_nfl                             0
games_y1                           337
ov_grade_y1                        337
games_y2                           494
ov_grade_y2                        494
games_y3                           705
ov_grade_y3                        705
games_y4                           894
ov_grade_y4                        894
played_y1                            0
played_y2                            0
played_y3                            0
played_y4                            0
years_played                         0
conference                           0
avg_pass_cmp                         0
avg_pass_att             

In [69]:
all_drafts.drop(columns=['college_stats_url', 'draft_age', 'pro_stats_url', 'games_y1', 'ov_grade_y1', 'games_y2', 
                         'ov_grade_y2', 'games_y3', 'ov_grade_y3', 'games_y4', 'ov_grade_y4', 'played_y1', 'played_y2',
                         'played_y3', 'played_y4', 'years_played'], axis=1).columns

Index(['draft_overall', 'draft_round', 'draft_year', 'player', 'position',
       'school', 'team_nfl', 'conference', 'avg_pass_cmp', 'avg_pass_att',
       'avg_pass_cmp_pct', 'avg_pass_yds', 'avg_pass_yds_per_att',
       'avg_adj_pass_yds_per_att', 'avg_pass_td', 'avg_pass_int',
       'avg_pass_rating', 'avg_rush_att', 'avg_rush_yds',
       'avg_rush_yds_per_att', 'avg_rush_td', 'avg_rec', 'avg_rec_yds',
       'avg_rec_yds_per_rec', 'avg_rec_td', 'avg_scrim_att', 'avg_scrim_yds',
       'avg_scrim_yds_per_att', 'avg_scrim_td', 'avg_tackles_solo',
       'avg_tackles_assists', 'avg_tackles_total', 'avg_tackles_loss',
       'avg_sacks', 'avg_def_int', 'avg_def_int_yds',
       'avg_def_int_yds_per_int', 'avg_def_int_td', 'avg_pass_defended',
       'avg_fumbles_rec', 'avg_fumbles_rec_yds', 'avg_fumbles_rec_td',
       'avg_fumbles_forced', 'avg_punt_ret', 'avg_punt_ret_yds',
       'avg_punt_ret_yds_per_ret', 'avg_punt_ret_td', 'avg_kick_ret',
       'avg_kick_ret_yds', 'avg_kick_

In [70]:
modeling_df = all_drafts[['draft_overall', 'draft_round', 'draft_year', 'player', 'position',
       'school', 'conference', 'team_nfl', 'avg_grade', 'avg_pass_cmp', 'avg_pass_att',
       'avg_pass_cmp_pct', 'avg_pass_yds', 'avg_pass_yds_per_att',
       'avg_adj_pass_yds_per_att', 'avg_pass_td', 'avg_pass_int',
       'avg_pass_rating', 'avg_rush_att', 'avg_rush_yds',
       'avg_rush_yds_per_att', 'avg_rush_td', 'avg_rec', 'avg_rec_yds',
       'avg_rec_yds_per_rec', 'avg_rec_td', 'avg_scrim_att', 'avg_scrim_yds',
       'avg_scrim_yds_per_att', 'avg_scrim_td', 'avg_tackles_solo',
       'avg_tackles_assists', 'avg_tackles_total', 'avg_tackles_loss',
       'avg_sacks', 'avg_def_int', 'avg_def_int_yds',
       'avg_def_int_yds_per_int', 'avg_def_int_td', 'avg_pass_defended',
       'avg_fumbles_rec', 'avg_fumbles_rec_yds', 'avg_fumbles_rec_td',
       'avg_fumbles_forced', 'avg_punt_ret', 'avg_punt_ret_yds',
       'avg_punt_ret_yds_per_ret', 'avg_punt_ret_td', 'avg_kick_ret',
       'avg_kick_ret_yds', 'avg_kick_ret_yds_per_ret', 'avg_kick_ret_td',
       'final_year_pass_cmp', 'final_year_pass_att', 'final_year_pass_cmp_pct',
       'final_year_pass_yds', 'final_year_pass_yds_per_att',
       'final_year_adj_pass_yds_per_att', 'final_year_pass_td',
       'final_year_pass_int', 'final_year_pass_rating', 'final_year_rush_att',
       'final_year_rush_yds', 'final_year_rush_yds_per_att',
       'final_year_rush_td', 'final_year_rec', 'final_year_rec_yds',
       'final_year_rec_yds_per_rec', 'final_year_rec_td',
       'final_year_scrim_att', 'final_year_scrim_yds',
       'final_year_scrim_yds_per_att', 'final_year_scrim_td',
       'final_year_tackles_solo', 'final_year_tackles_assists',
       'final_year_tackles_total', 'final_year_tackles_loss',
       'final_year_sacks', 'final_year_def_int', 'final_year_def_int_yds',
       'final_year_def_int_yds_per_int', 'final_year_def_int_td',
       'final_year_pass_defended', 'final_year_fumbles_rec',
       'final_year_fumbles_rec_yds', 'final_year_fumbles_rec_td',
       'final_year_fumbles_forced', 'final_year_punt_ret',
       'final_year_punt_ret_yds', 'final_year_punt_ret_yds_per_ret',
       'final_year_punt_ret_td', 'final_year_kick_ret',
       'final_year_kick_ret_yds', 'final_year_kick_ret_yds_per_ret',
       'final_year_kick_ret_td']]

In [71]:
modeling_df.to_csv('./data/modeling_data')

Reordered and dropped unneeded columns above and then saved the dataframe that is read for analysis and modeling. 