In [5]:
import pandas as pd
import numpy as np

# Raise both pandas display limits to 100 so wide/long frames are shown in full.
for _limit in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_limit, 100)

In [180]:
# Load the raw table from a path relative to the current working directory.
tdc = pd.read_csv('data_sets/wr_draft_combine.csv')

### Re-organizing the columns in proper order

In [181]:
# Re-select every column in a fixed order, grouped by name prefix.
# First group: draft and career-total columns (no prefix).
tdc = tdc[['year', 'round', 'pick', 'player', 'pos', 'draft age',
       'team', 'entry year', 'last year', '1st team pro select', 'pro select',
       'weighted career av', 'years as primary starter', 'games',
       'games started', 'rushing attempts', 'rushing yards', 'rushing td',
       'receiving attemps', 'receiving yards', 'receiving td',
        # Columns prefixed 'nfl'.
        'nfl url','nfl age', 'nfl team', 'nfl pos', 'nfl no', 'nfl game',
       'nfl game started', 'nfl target', 'nfl receptions', 'nfl yards',
       'nfl y/r', 'nfl td', 'nfl first downs', 'nfl longest rec',
       'nfl rec per game', 'nfl yards per game', 'nfl catch ratio',
       'nfl yards per target', 'nfl rushes', 'nfl rush yards', 'nfl rush td',
       'nfl first downs rush', 'nfl longest rush',
       'nfl rush yards per attempt', 'nfl rush yards per game',
       'nfl rush attempt per games', 'nfl total touches',
       'nfl yards per touch', 'nfl yards from scrimmage', 'nfl total td',
       'nfl fumbles', 'nfl av', 'nfl table type',
        # 'college' followed by the columns prefixed 'cfb'.
        'college', 'cfb url', 'cfb school', 'cfb conference', 'cfb class', 'cfb pos',
       'cfb games', 'cfb receptions', 'cfb yards', 'cfb average', 'cfb td',
       'cfb attemps rushing', 'cfb yards rushing', 'cfb avg rushing',
       'cfb td rushing', 'cfb scrimmages', 'cfb yards total', 'cfb avg total',
       'cfb td total',
       # Columns prefixed 'combine'.
       'combine player', 'combine pos', 'combine ht', 'combine wt',
       'combine forty', 'combine vertical', 'combine benchreps',
       'combine broadjump', 'combine cone', 'combine shuttle', 'combine year',
       'combine pfr_id', 'combine av', 'combine team', 'combine round',
       'combine pick',
        # The three '... method' columns that the later cells test for 'fail'.
        'nfl method','cfb method', 'combine method']]

### Cleaning the obvious columns
Rows flagged as `fail` in the `... method` columns are set to NaN so that the obvious numeric columns can later be converted to floats.

In [182]:
# Flag rows whose 'cfb method' starts with the token 'fail'.
# The vectorised .str accessor yields False for missing or blank values,
# whereas calling str.split on each element raises on NaN and on empty strings.
cond = tdc['cfb method'].astype(str).str.split().str[0].eq('fail')

In [183]:
# Blank every column from 'cfb school' through 'cfb td total' on the flagged rows.
# A scalar NaN broadcasts over the whole selection, so no column count has to be
# hard-coded and kept in sync with the column list.
tdc.loc[cond, 'cfb school':'cfb td total'] = np.nan

In [184]:
# Flag rows whose 'combine method' starts with the token 'fail'.
# NOTE(review): the next cell reassigns `cond` before this mask is applied to any
# column, so the combine columns are never blanked — confirm whether that is intended.
cond = tdc['combine method'].astype(str).str.split().str[0].eq('fail')

In [185]:
# A row counts as failed only when BOTH 'nfl method' and 'nfl team' start with
# the token 'fail'. Vectorised masks replace the row-by-row iterrows() scan and
# evaluate to False (instead of raising) for missing or blank cells.
method_failed = tdc['nfl method'].astype(str).str.split().str[0].eq('fail')
team_failed = tdc['nfl team'].astype(str).str.split().str[0].eq('fail')
cond = method_failed & team_failed

In [186]:
# Blank every column from 'nfl age' through 'nfl table type' on the flagged rows.
# A scalar NaN broadcasts over the whole selection, so no column count has to be
# hard-coded and kept in sync with the column list.
tdc.loc[cond, 'nfl age':'nfl table type'] = np.nan

### Catching an error
Some tables still have the rushing and receiving columns swapped. We detect those rows by checking the value of 'nfl catch ratio': if it is a string that does not end with '%', the row is swapped.

In [198]:
col = 'nfl catch ratio'
# Collect the index label of every row whose catch ratio is a string
# that does not finish with a percent sign.
error_list = [
    label
    for label, value in tdc[col].items()
    if isinstance(value, str) and value[-1] != '%'
]
            
#display(tdc.loc[[0]+error_list])
#[print(x) for x in tdc.loc[[0]+error_list]['nfl url']]

In [189]:
# Position of the first column in the receiving/rushing stat run.
i_col = tdc.columns.get_loc('nfl target')
# Number of rushing columns ('nfl rushes' .. 'nfl rush attempt per games').
lr = 8
# Number of receiving columns ('nfl target' .. 'nfl yards per target').
lw = 11

def switch_rushing_receiving(table, error_list, first_col='nfl target', n_rush=8, n_rec=11):
    """Swap the rushing and receiving stat runs on the rows listed in ``error_list``.

    On the affected rows the ``n_rush`` rushing values sit first (starting at
    ``first_col``) and the ``n_rec`` receiving values follow. After the swap the
    receiving values come first, followed by the rushing values.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame to fix. It is modified in place.
    error_list : iterable
        Index *labels* of the rows to fix. The index is assumed to be unique.
    first_col : str, default 'nfl target'
        Name of the first column of the combined stat run.
    n_rush : int, default 8
        Number of rushing columns.
    n_rec : int, default 11
        Number of receiving columns.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object that was passed in.
    """
    # Resolve the column position here instead of relying on a module-level
    # variable that has to be defined before the function is called.
    start = table.columns.get_loc(first_col)
    end = start + n_rush + n_rec
    split_before = start + n_rush   # rush/receiving boundary in the swapped layout
    split_after = start + n_rec     # receiving/rush boundary in the correct layout

    for label in error_list:
        # error_list holds index labels; translate each one to a row position so
        # the positional .iloc access below is correct for any unique index.
        row = table.index.get_loc(label)
        rush = table.iloc[row, start:split_before].tolist()
        receiving = table.iloc[row, split_before:end].tolist()
        table.iloc[row, start:split_after] = receiving
        table.iloc[row, split_after:end] = rush
    return table

# switch_rushing_receiving edits `tdc` in place and returns the same object,
# so this rebinding keeps the name pointing at the corrected table.
tdc = switch_rushing_receiving(tdc,error_list)

#tdc.loc[error_list]

In [234]:
col = 'nfl catch ratio'
ratio = tdc[col]
# Only string cells hold percentages; numeric cells (including NaN) are left
# untouched, so running this cell a second time is a no-op.
is_text = ratio.map(lambda value: isinstance(value, str))
converted = ratio.astype(object)
if is_text.any():
    # rstrip removes only a trailing '%'; a blind [:-1] slice would silently
    # drop a digit from any value that is missing the sign.
    converted[is_text] = ratio[is_text].str.rstrip('%').astype(float) / 100

tdc[col] = converted.astype(float)

### Converting str to floats carefully

In [241]:
# Resolve the bounds of the cfb stat block by name rather than by hard-coded
# position, so the cell keeps working if columns are added or reordered.
i_s = tdc.columns.get_loc('cfb games')
i_f = tdc.columns.get_loc('cfb td total') + 1
for name in tdc.columns[i_s:i_f]:
    print(name)
    # Assign by label: this replaces the column, so the float dtype sticks.
    # Writing through .iloc[:, i] sets values into the existing column and,
    # on pandas >= 2.0, leaves an object column as object.
    tdc[name] = tdc[name].astype(float)

cfb games
cfb receptions
cfb yards
cfb average
cfb td
cfb attemps rushing
cfb yards rushing
cfb avg rushing
cfb td rushing
cfb scrimmages
cfb yards total
cfb avg total
cfb td total


In [243]:
# List every column with its position and dtype.
for i, (name, dtype) in enumerate(tdc.dtypes.items()):
    print(i, name, dtype)

0 year int64
1 round int64
2 pick int64
3 player object
4 pos object
5 draft age float64
6 team object
7 entry year float64
8 last year float64
9 1st team pro select int64
10 pro select int64
11 weighted career av int64
12 years as primary starter float64
13 games float64
14 games started float64
15 rushing attempts float64
16 rushing yards float64
17 rushing td float64
18 receiving attemps float64
19 receiving yards float64
20 receiving td float64
21 nfl url object
22 nfl age float64
23 nfl team object
24 nfl pos object
25 nfl no object
26 nfl game float64
27 nfl game started float64
28 nfl target float64
29 nfl receptions float64
30 nfl yards float64
31 nfl y/r float64
32 nfl td float64
33 nfl first downs float64
34 nfl longest rec float64
35 nfl rec per game float64
36 nfl yards per game float64
37 nfl catch ratio float64
38 nfl yards per target float64
39 nfl rushes float64
40 nfl rush yards float64
41 nfl rush td float64
42 nfl first downs rush float64
43 nfl longest rush float64


In [244]:
# Persist the cleaned table; index=False keeps the row index out of the file.
tdc.to_csv('data_sets/wr_tdc_clean.csv', index=False)