# Predicting the Next Play


## Imports

In [159]:
from metaflow import Flow
import pandas
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
import xgboost
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
run = Flow('NFLStatsFlow').latest_successful_run

In [163]:
new_df = run.data.san_fran_df.groupby(['full_play_type'])['play_id'].count().to_frame()
percentage_df = new_df.apply(lambda x: x/x.sum())
totals_df = pandas.concat([percentage_df, new_df], axis = 1)
totals_df

Unnamed: 0_level_0,play_id,play_id
full_play_type,Unnamed: 1_level_1,Unnamed: 2_level_1
pass_left_deep,0.028939,27
pass_left_short,0.188639,176
pass_middle_deep,0.02358,22
pass_middle_short,0.132905,124
pass_right_deep,0.027867,26
pass_right_short,0.163987,153
run_left,0.160772,150
run_middle,0.115756,108
run_right,0.157556,147


In [164]:
# PASS IN FULL FEATURE SET
final_df = run.data.san_fran_df[[
## STANDARD METRICS
    'game_seconds_remaining',
    'yardline_100',
    'down',
    'ydstogo',
    'shotgun',
    'score_differential',
    'total_home_score',
    'total_away_score',
    'quarter_seconds_remaining',
    'half_seconds_remaining',
    'qtr',
    'goal_to_go',
    'no_huddle',
    'posteam_timeouts_remaining',
    'defteam_timeouts_remaining',
## CUSTOM METRICS
     'previous_play_in_drive',
     'drive_yards_gained',
     'game_yards_gained',
     'drive_rushing_yards_gained',
     'game_rushing_yards_gained',
     'drive_passing_yards_gained',
     'game_passing_yards_gained',
     'drive_sack',
     'game_sack',
     'drive_incomplete_pass',
     'game_incomplete_pass',
     'drive_no_huddle',
     'game_no_huddle',
     'drive_interception',
     'game_interception',
     'drive_first_down_rush',
     'game_first_down_rush',
     'drive_first_down_pass',
     'game_first_down_pass',
     'drive_first_down_penalty',
     'game_first_down_penalty',
     'game_third_down_converted',
     'game_third_down_failed',
     'game_fumble',
#      'drive_qb_hit',
     'game_qb_hit',
     'drive_rush_attempt',
     'game_rush_attempt',
     'drive_pass_attempt',
     'game_pass_attempt',
     'game_pass_touchdown',
     'game_rush_touchdown',
### GOAL METRIC
    'full_play_type'
]]


final_df= pandas.get_dummies(data=final_df, columns=['previous_play_in_drive'])
X = final_df.loc[:, final_df.columns != 'full_play_type']
Y = final_df.full_play_type

# Encode string class Values
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
seed = 13
test_size = 0.2

# Randomizing
X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed)

# Fit GBC model
GBC_model = xgboost.XGBClassifier(learning_rate=0.01,n_estimators=400)
GBC_model.fit(X_train, y_train)

# Make predictions
GBC_y_pred = GBC_model.predict(X_test)
GBC_predictions = [round(value) for value in GBC_y_pred]
# Evaluate Predictions
GBC_accuracy = accuracy_score(y_test, GBC_predictions)

CLF_model = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=500, alpha=0.0001,
                     solver='sgd', verbose=10,  random_state=21,tol=0.000000001)
CLF_model.fit(X_train, y_train)
CLF_model_y_pred = CLF_model.predict(X_test)
CLF_model_accuracy_score = accuracy_score(y_test, CLF_model_y_pred)
CLF_model_cm = confusion_matrix(y_test, CLF_model_y_pred)

Iteration 1, loss = 18.91155499
Iteration 2, loss = 14.17241692
Iteration 3, loss = 2.78951859
Iteration 4, loss = 2.48461881
Iteration 5, loss = 2.27875134
Iteration 6, loss = 2.28718508
Iteration 7, loss = 2.23945491
Iteration 8, loss = 2.28549019
Iteration 9, loss = 2.23704335
Iteration 10, loss = 2.24917363
Iteration 11, loss = 2.21439380
Iteration 12, loss = 2.20292680
Iteration 13, loss = 2.19103194
Iteration 14, loss = 2.18537340
Iteration 15, loss = 2.18350949
Iteration 16, loss = 2.17679189
Iteration 17, loss = 2.17199016
Iteration 18, loss = 2.16781012
Iteration 19, loss = 2.16381982
Iteration 20, loss = 2.16035464
Iteration 21, loss = 2.15707647
Iteration 22, loss = 2.15417421
Iteration 23, loss = 2.15140114
Iteration 24, loss = 2.14867942
Iteration 25, loss = 2.14583616
Iteration 26, loss = 2.14335836
Iteration 27, loss = 2.14057172
Iteration 28, loss = 2.13792825
Iteration 29, loss = 2.13552894
Iteration 30, loss = 2.13303740
Iteration 31, loss = 2.13080816
Iteration 32, l

Iteration 284, loss = 1.99795768
Iteration 285, loss = 1.99826771
Iteration 286, loss = 1.99808592
Iteration 287, loss = 1.99774547
Iteration 288, loss = 1.99776840
Iteration 289, loss = 1.99766471
Iteration 290, loss = 1.99781266
Iteration 291, loss = 1.99779843
Iteration 292, loss = 1.99751927
Iteration 293, loss = 1.99752086
Iteration 294, loss = 1.99738350
Iteration 295, loss = 1.99732761
Iteration 296, loss = 1.99733533
Iteration 297, loss = 1.99741825
Iteration 298, loss = 1.99722369
Iteration 299, loss = 1.99730775
Iteration 300, loss = 1.99719386
Iteration 301, loss = 1.99723238
Iteration 302, loss = 1.99692806
Iteration 303, loss = 1.99692206
Iteration 304, loss = 1.99682756
Iteration 305, loss = 1.99704314
Iteration 306, loss = 1.99661342
Iteration 307, loss = 1.99672520
Iteration 308, loss = 1.99684710
Iteration 309, loss = 1.99664658
Iteration 310, loss = 1.99685101
Iteration 311, loss = 1.99642507
Iteration 312, loss = 1.99645005
Iteration 313, loss = 1.99644505
Iteration 

In [167]:
print("GBC Accuracy: %.2f%%" % (GBC_accuracy * 100.0))
print("CLF Accuracy: %.2f%%" % (CLF_model_accuracy_score * 100.0))
# print(classification_report(CLF_model_y_pred,predictions))

GBC Accuracy: 28.34%
CLF Accuracy: 18.18%


In [None]:
list(run.data.san_fran_df.columns)