In [1]:
# Fix NumPy's global random state so NumPy-driven randomness repeats across runs
from numpy.random import seed

NUMPY_SEED = 1
seed(NUMPY_SEED)

In [2]:
# Dependencies
import numpy as np
import pandas as pd

In [3]:
import tensorflow

# Seeding NumPy alone does not seed TensorFlow's own random generator, which
# Keras relies on (e.g. for initial layer weights). Seed it here, before any
# model is built, so repeated runs are more repeatable.
tensorflow.random.set_seed(1)

# Show the bundled Keras version for the record
tensorflow.keras.__version__

'2.2.4-tf'

In [4]:
# Read in the 2019 NFL data (separate home and away files)
home_epa = pd.read_csv('home_data.csv')
away_epa = pd.read_csv('away_data.csv')

# Merge home + away data for 2019 on game_id. Columns present in both frames
# get pandas' default suffixes: _x for the left frame (away_epa) and _y for
# the right frame (home_epa).
epa_2019 = pd.merge(away_epa, home_epa, on='game_id', how='inner')

# 2018 data, already merged (home + away) ahead of time
epa_2018 = pd.read_csv('mergedepa_2018.csv')

# Stack the 2018 rows under the 2019 rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
epa = pd.concat([epa_2019, epa_2018], ignore_index=True)
epa.shape

(463, 37)

In [5]:
# Preview the first five rows of the combined frame
epa.head()

Unnamed: 0,game_id,home_team_x,away_team_x,posteam_x,Total_EPA_x,Total_Yards_Gained_x,Air_Yards_x,Turnovers_x,Total_Rush_Attempts_x,Total_Pass_Attempts_x,...,Total_Pass_Attempts_y,total_away_score_y,total_home_score_y,Winner_y,MA_EPA_y,MA_yards_gained_y,MA_air_yards_y,MA_turnovers_y,MA_rush_attempts_y,MA_pass_attempts_y
0,2019091500,BAL,ARI,ARI,1.885633,351,312,0,11,44,...,39,17,23,Home,51.706295,643,262,1.0,46.0,27.0
1,2019100601,CIN,ARI,ARI,15.487371,514,247,0,38,33,...,39,26,23,Away,-10.77779,306,296,2.25,17.75,46.25
2,2019102006,NYG,ARI,ARI,2.310009,245,112,0,38,23,...,43,27,21,Away,-7.564422,299,298,2.75,23.0,36.75
3,2019102707,NO,ARI,ARI,-10.899145,237,289,0,11,38,...,43,9,31,Home,7.330501,368,237,0.5,29.75,37.25
4,2019111006,TB,ARI,ARI,8.570837,417,560,2,18,47,...,52,27,30,Home,1.113415,368,433,3.0,23.25,47.25


## Data Pre-Processing

## dropped turnovers (60%)

In [6]:
# # dropping columns that pertain to the game that happened. only want moving average data. probably could have just said what to grab instead of what to drop
# #dropped MA of turnovers (result = 60%)
# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id", "MA_turnovers_x", "MA_turnovers_y", 
#               ], axis=1)
# # who won the game 
# y = epa["Winner_x"]

# print(X.shape, y.shape)

## Dropped Turnovers and yards gained (56%)

In [7]:
# # dropping turnovers and yards gained (result = 56%)
# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id", "MA_turnovers_x", "MA_turnovers_y", 
#               "MA_yards_gained_x", "MA_yards_gained_y"], axis=1)
# # who won the game 
# y = epa["Winner_x"]

# print(X.shape, y.shape)

## Dropped EPA (60%)

In [8]:
# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id", "MA_EPA_x", "MA_EPA_y", 
#               ], axis=1)
# # who won the game 
# y = epa["Winner_x"]

# print(X.shape, y.shape)

## Dropped rush and pass attempts

In [9]:
# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id", "MA_pass_attempts_x", "MA_pass_attempts_y", 
#               "MA_rush_attempts_x", "MA_rush_attempts_y"], axis=1)
# # who won the game 
# y = epa["Winner_x"]

# print(X.shape, y.shape)

## Dropped turnovers, rush + pass attempts

In [10]:
# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id", "MA_pass_attempts_x", "MA_pass_attempts_y", 
#               "MA_rush_attempts_x", "MA_rush_attempts_y", "MA_turnovers_x", "MA_turnovers_y"], axis=1)
# # who won the game 
# y = epa["Winner_x"]

# print(X.shape, y.shape)

## Keeping EPA and Air yards only

In [11]:
# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id", "MA_pass_attempts_x", "MA_pass_attempts_y", 
#               "MA_rush_attempts_x", "MA_rush_attempts_y", "MA_turnovers_x", "MA_turnovers_y", "MA_yards_gained_y", "MA_yards_gained_x",
#              "MA_air_yards_x", "MA_air_yards_y"], axis=1)
# # who won the game 
# y = epa["Winner_x"]

# print(X.shape, y.shape)

## Dropping air yards, EPA

In [12]:
# Every dropped column comes in both an _x and a _y flavour, so list each base
# name once and expand it; game_id is the only column without a suffix.
dropped_bases = [
    "home_team", "away_team", "posteam",
    "total_home_score", "total_away_score",
    "Total_EPA", "Total_Yards_Gained", "Air_Yards", "Turnovers",
    "Total_Rush_Attempts", "Total_Pass_Attempts",
    "Winner",
    # moving-average features left out of this experiment
    "MA_EPA", "MA_air_yards",
]
dropped_columns = ["game_id"] + [
    f"{base}_{side}" for base in dropped_bases for side in ("x", "y")
]

# Features: whatever remains after the drop
X = epa.drop(columns=dropped_columns)

# Target: who won the game
y = epa["Winner_x"]

print(X.shape, y.shape)

(463, 8) (463,)


## Not dropping any moving-average features

In [13]:
# # dropping columns that pertain to the game that happened. only want moving average data. probably could have just said what to grab instead of what to drop

# X = epa.drop(["home_team_x", "home_team_y", "away_team_x", "away_team_y", "posteam_x", "posteam_y", "total_home_score_x", "total_home_score_y",
#              "total_away_score_x", "total_away_score_y", "Total_EPA_x", "Total_EPA_y", "Total_Yards_Gained_x", "Total_Yards_Gained_y",
#              "Air_Yards_x", "Air_Yards_y", "Turnovers_x", "Turnovers_y","Total_Rush_Attempts_x", "Total_Rush_Attempts_y", 
#               "Total_Pass_Attempts_x", "Total_Pass_Attempts_y", "Winner_y", "Winner_x", "game_id" 
#               ], axis=1)
# # who won the game
# y = epa["Winner_x"]

# print(X.shape, y.shape)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [15]:
# Split into train and test parts; stratify=y keeps the class proportions
# of the target similar in both, random_state pins the shuffle.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Learn the min/max of each feature from the training rows only...
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

# ...then apply that same scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)


In [17]:
# Step 1: map the Winner_x labels to integer class ids.
# The encoder is fitted on the training labels and reused for the test labels.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [18]:
# Step 2: Convert encoded labels to one-hot encoding.
# num_classes is passed explicitly: left to itself, to_categorical infers the
# width from the largest label in each array, so the train and test matrices
# could come out with different shapes if a split lacked the highest class.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

# Create a Deep Learning Model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

## input_dim below must match the number of columns in X

In [20]:
# Create model and add layers
model = Sequential()
# Take the input width from the training matrix instead of hard-coding it, so
# switching to a different feature-selection cell cannot leave a stale value.
n_features = X_train_scaled.shape[1]
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=100, activation='relu'))
# Two softmax outputs: one probability per one-hot label column
model.add(Dense(units=2, activation='softmax'))

In [21]:
# Compile the model (fitting happens in a separate cell)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Print each layer with its output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               900       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 11,202
Trainable params: 11,202
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train on the scaled training features and one-hot labels;
# verbose=2 logs one line per epoch.
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    shuffle=True,
    epochs=200,
    verbose=2,
)

Train on 347 samples
Epoch 1/200
347/347 - 1s - loss: 0.6909 - accuracy: 0.5331
Epoch 2/200
347/347 - 0s - loss: 0.6828 - accuracy: 0.5476
Epoch 3/200
347/347 - 0s - loss: 0.6813 - accuracy: 0.5591
Epoch 4/200
347/347 - 0s - loss: 0.6788 - accuracy: 0.5562
Epoch 5/200
347/347 - 0s - loss: 0.6774 - accuracy: 0.5533
Epoch 6/200
347/347 - 0s - loss: 0.6773 - accuracy: 0.5965
Epoch 7/200
347/347 - 0s - loss: 0.6716 - accuracy: 0.6081
Epoch 8/200
347/347 - 0s - loss: 0.6720 - accuracy: 0.5706
Epoch 9/200
347/347 - 0s - loss: 0.6697 - accuracy: 0.5994
Epoch 10/200
347/347 - 0s - loss: 0.6690 - accuracy: 0.5908
Epoch 11/200
347/347 - 0s - loss: 0.6731 - accuracy: 0.5793
Epoch 12/200
347/347 - 0s - loss: 0.6656 - accuracy: 0.5937
Epoch 13/200
347/347 - 0s - loss: 0.6627 - accuracy: 0.6023
Epoch 14/200
347/347 - 0s - loss: 0.6634 - accuracy: 0.5965
Epoch 15/200
347/347 - 0s - loss: 0.6592 - accuracy: 0.5937
Epoch 16/200
347/347 - 0s - loss: 0.6570 - accuracy: 0.6254
Epoch 17/200
347/347 - 0s - 

Epoch 137/200
347/347 - 0s - loss: 0.4415 - accuracy: 0.8012
Epoch 138/200
347/347 - 0s - loss: 0.4344 - accuracy: 0.8098
Epoch 139/200
347/347 - 0s - loss: 0.4299 - accuracy: 0.8242
Epoch 140/200
347/347 - 0s - loss: 0.4309 - accuracy: 0.8184
Epoch 141/200
347/347 - 0s - loss: 0.4247 - accuracy: 0.8213
Epoch 142/200
347/347 - 0s - loss: 0.4263 - accuracy: 0.8069
Epoch 143/200
347/347 - 0s - loss: 0.4259 - accuracy: 0.8156
Epoch 144/200
347/347 - 0s - loss: 0.4432 - accuracy: 0.7839
Epoch 145/200
347/347 - 0s - loss: 0.4533 - accuracy: 0.7637
Epoch 146/200
347/347 - 0s - loss: 0.4462 - accuracy: 0.7867
Epoch 147/200
347/347 - 0s - loss: 0.4318 - accuracy: 0.8069
Epoch 148/200
347/347 - 0s - loss: 0.4239 - accuracy: 0.8300
Epoch 149/200
347/347 - 0s - loss: 0.4367 - accuracy: 0.7867
Epoch 150/200
347/347 - 0s - loss: 0.4287 - accuracy: 0.8069
Epoch 151/200
347/347 - 0s - loss: 0.4122 - accuracy: 0.8530
Epoch 152/200
347/347 - 0s - loss: 0.4126 - accuracy: 0.8300
Epoch 153/200
347/347 - 

<tensorflow.python.keras.callbacks.History at 0x1c7be98f240>

## Quantify our Trained Model

In [24]:
# Score the trained network on the test split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
test_scores = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_scores
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

116/1 - 0s - loss: 1.7537 - accuracy: 0.5000
Normal Neural Network - Loss: 1.344303295530122, Accuracy: 0.5


In [25]:
# List the feature columns remaining in X
X.columns

Index(['MA_yards_gained_x', 'MA_turnovers_x', 'MA_rush_attempts_x',
       'MA_pass_attempts_x', 'MA_yards_gained_y', 'MA_turnovers_y',
       'MA_rush_attempts_y', 'MA_pass_attempts_y'],
      dtype='object')

## Make Predictions -- future plans, be able to make predictions on specific games

In [26]:
# encoded_predictions = model.predict_classes(X_test_scaled[:])
# prediction_labels = label_encoder.inverse_transform(encoded_predictions)


In [27]:
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test1[:])}")

In [28]:
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test[:])}")

In [29]:
# test = pd.read_csv('test.csv')
# test.head()
# then do pre-processing 

In [30]:
# print(model.summary())

In [32]:
# model.to_yaml()

In [33]:
# model.outputs

In [34]:
# model.inputs