# Load Data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from collections import Counter

# Show all columns in DataFrames
pd.options.display.max_columns = None

features = pd.read_csv(os.path.join('NeuralPassing', 'Data', 'qbPassingFeatures.csv'), index_col=0, header=[0,1,2])
features_columns = features.columns.values

# Clean Data

### Remove

Remove rows with:
* Labels < 75
* QBs missed 3 or more of past 6 games
* QB plays a minor role (e.g. Taysom Hill) defined by < 75 yards in 3 or more past 6 games
* Fewer than 60 total pass attempts over past 6 games

In [2]:
print(len(features))

# Labels < 75
features = features[features[('Label', 'pass_yds', 'pass_yds')] >= 75]

print(len(features))

# Missed 3 or more of past 6 games
features = features[(pd.isna(features[[('QB', 'pass_yds', str(i)) for i in range(1,7)]])).sum(axis=1) < 3]

print(len(features))

# Minor role
features = features[(features[[('QB', 'pass_yds', str(i)) for i in range(1,7)]] <= 75).sum(axis=1) < 3]

print(len(features))

# Fewer 60 pass attempts in 6 games
features[('QB','Total_pass_att','Total_pass_att')] = features[[('QB', 'pass_cmp', str(i)) for i in range(1,7)]].sum(axis=1)
features = features[features[('QB','Total_pass_att','Total_pass_att')] >= 60]

print(len(features))


# Only keep orignal columns
features = features[features_columns]

1589
1503
1409
1398
1380


### Replace Empty Values

Fill in empty values (mostly due to BYE weeks, but some missed weeks due to injury too) with mean of values in row. E.g. if BYE week, then the QB's numbers for this will be filled in with the mean of the other 5 values. This is preferred to the mean values of the overall data.

In [3]:
print('Number of NaNs: ' , features.isna().sum().sum())

Number of NaNs:  18756


#### QB Empty Values

In [4]:
idx = pd.IndexSlice

# Loop over the numeric second-level QB columns (e.g. pass_yds, pass_att ...)
for f in [n for n in set(features.loc[:2,idx['QB',:]].columns.get_level_values(1)) if n != 'home_game']:
    
    # Using: https://stackoverflow.com/questions/33058590/pandas-dataframe-replacing-nan-with-row-average
    features.loc[:,idx['QB',f,:]] = features.loc[:,idx['QB',f,:]].T.fillna(features.loc[:,idx['QB',f,:]].mean(axis=1)).T

print('Number of NaNs: ' , features.isna().sum().sum())   

Number of NaNs:  9792


#### Offence BYE weeks

In [5]:
# Loop over the numeric second-level offence columns (e.g. rush_yds, rush_att ...)
for f in set(features.loc[:2,idx['Offence',:]].columns.get_level_values(1)):
    
    # Using: https://stackoverflow.com/questions/33058590/pandas-dataframe-replacing-nan-with-row-average
    features.loc[:,idx['Offence',f,:]] = features.loc[:,idx['Offence',f,:]].T.fillna(features.loc[:,idx['Offence',f,:]].mean(axis=1)).T

print('Number of NaNs: ' , features.isna().sum().sum())   

Number of NaNs:  7344


#### Defence BYE weeks

In [6]:
# Loop over the numeric second-level defence columns (e.g. pass_yds, pass_att ...)
for f in set(features.loc[:2,idx['Defence_upcom_mean',:]].columns.get_level_values(1)):
    
    # Using: https://stackoverflow.com/questions/33058590/pandas-dataframe-replacing-nan-with-row-average
    features.loc[:,idx['Defence_prev_mean',f,:]] = features.loc[:,idx['Defence_prev_mean',f,:]].T.fillna(features.loc[:,idx['Defence_prev_mean',f,:]].mean(axis=1)).T

print('Number of NaNs: ' , features.isna().sum().sum())   

Number of NaNs:  0


In [7]:
# Save cleaned data
features.to_csv(os.path.join('NeuralPassing', 'Data', 'qbPassingFeatures_CLEAN.csv'))