In [52]:
# Imports necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
np.set_printoptions(precision=4)
import statsmodels.api as sms

# Reads in data
df = pd.read_csv('TexasAll.csv') #------------------------------------------------------------------------Enter PFF Play Feed File Name Here--------------------------------------------------------------------------------------------------------

# Removes rows of data that are classified as no-play
df=df[df.pff_NOPLAY ==0]

# Removes rows of data with null (or empty) entries in the penalty yards column
df=df[df["pff_PENALTYYARDS"].isnull()]
df.pff_OFFPERSONNELBASIC.dropna()
df.pff_OFFPERSONNEL.dropna()
df.pff_DEFPERSONNEL.dropna()

# Replaces a null value within the gain/loss column with a zero
df.pff_GAINLOSS.fillna(0,inplace=True)

# Creates a column that changes the data from "C" as closed to a 1 as a "yes" and a 0 as a "no"
df['shownisclosed'] = np.where((df['pff_MOFOCSHOWN'] == "C") , 1, 0)
df['playedisclosed'] = np.where((df['pff_MOFOCPLAYED'] == "C") , 1, 0)

# Creates a binary (0 or 1) column showing if the defense plays man converage (1) or not (0)
df['isman'] = np.where((df['pff_PASSCOVERAGE']=="0")|(df['pff_PASSCOVERAGE']=='1')|(df['pff_PASSCOVERAGE']=='1D')|(df['pff_PASSCOVERAGE']=='2M')|(df['pff_PASSCOVERAGE']=='GL') , 1, 0)

# Creates a hash position column where 0 is the left hash, 1 is between the hashes, and 2 is the right hash
df['hashposition'] = df['pff_HASH']
df.hashposition=df.hashposition.replace("L",0)
df.hashposition=df.hashposition.replace("C",1)
df.hashposition=df.hashposition.replace("R",2)

# Changes data to binary (0 or 1) where 0 = no and 1 = yes according to the column name 
# Example: if teleft = 1, that means a tight end is lined up on the left side of the formation
df['ispress'] = np.where((df['pff_PRESS'].isna()) , 0, 1)
df['isshotgun'] = np.where((df['pff_SHOTGUN'].isna()) , 0, 1)
df['isemptybackfield'] = np.where((df['pff_RBALIGNMENT']=="EMPTY") , 1, 0)
df['isrush'] = np.where((df['pff_RUNPASS'] == 'R'), 1, 0)
df['teleft'] = np.where((df['pff_TEALIGNMENT'].str[0]=="L")|(df['pff_TEALIGNMENT'].str[2]=="L")|(df['pff_TEALIGNMENT'].str[4]=="L")|(df['pff_TEALIGNMENT'].str[6]=="L") , 1, 0)
df['teright'] = np.where((df['pff_TEALIGNMENT'].str[0]=="R")|(df['pff_TEALIGNMENT'].str[2]=="R")|(df['pff_TEALIGNMENT'].str[4]=="R")|(df['pff_TEALIGNMENT'].str[6]=="R") , 1, 0)
df['teboth']= np.where((df['teleft']== 1)&(df['teright']==1),1,0)

# Removes rows of data where there were plays that had 10 men on the field since they're anomalies
df = df[df.pff_DEFPERSONNEL != "10 Men"]
df = df[df.pff_OFFPERSONNEL != "10 Men"]

# Pulls data from larger string and parses it accordingly
# Example: The pff_DEFPERSONNEL might have 3-4-4 in the format of DL-LB-DB. This code makes the columns with the number of DL = 3, LB = 4, and DB = 3
df['numberlinemen']=df['pff_DEFPERSONNEL'].astype(str).str[0]
df['numberlinebackers']=df['pff_DEFPERSONNEL'].astype(str).str[2]
df['numbersecondary']=df['pff_DEFPERSONNEL'].astype(str).str[4]
df['numberwidereceivers']=df['pff_OFFPERSONNEL'].astype(str).str[0]
df['numberrunningbacks']=df['pff_OFFPERSONNEL'].astype(str).str[4]
df['numbertightends']=df['pff_OFFPERSONNEL'].astype(str).str[6]
df['numberrushers']=df['pff_PASSRUSHPLAYERS'].astype(str).str[0]
df['numbercoverage']=df['pff_PASSCOVERAGEPLAYERS'].astype(str).str[0]
df['numberwrleft']=df['pff_OFFFORMATIONGROUP'].astype(str).str[0]
df['numberwrright']=df['pff_OFFFORMATIONGROUP'].astype(str).str[2]
df['numberpassblocking']=df['pff_PASSBLOCKING'].astype(str).str[0]
df['numberboxplayers']=df['pff_BOXPLAYERS'].astype(str).str[0]

df=df[pd.to_numeric(df['numberlinemen'], errors='coerce').notnull()]

# Only keeps the specified columns below
df.drop(df.columns.difference(['pff_PLAYID','pff_WEEK','pff_GSISGAMEKEY','pff_GSISPLAYID','pff_QUARTER','pff_DRIVE','pff_DRIVEPLAY','pff_DOWN','pff_GAINLOSS','pff_TIMETOPRESSURE','pff_TIMETOTHROW','pff_DISTANCE','pff_FIELDPOSITION','pff_BLITZDOG','pff_OFFFORMATIONUNBALANCED','pff_STUNT','pff_SCREEN','pff_RUNPASSOPTION','shownisclosed','playedisclosed','isman','numberlinemen','numberlinebackers','numbersecondary','numberwidereceivers','numberrunningbacks','numbertightends','numberrushers','numbercoverage','numberboxplayers','hashposition','numberwrleft','numberwrright','numberpassblocking','ispress','isemptybackfield','isshotgun','teleft','teright','teboth']), 1, inplace=True)

# Removes plays where the number of WRs, RBs, or TEs is labelled as "X" since it's a no-play
df = df[df.numberwidereceivers != "X"]
df = df[df.numberrunningbacks != "X"]
df = df[df.numbertightends != "X"]

# Previously, the data showed a "n" when there were no rushers, pass blockers, or defenders in coverage with a 0 so we can work with it
df.numberrushers=df.numberrushers.replace("n",0)
df.numberpassblocking=df.numberpassblocking.replace("n",0)
df.numbercoverage=df.numbercoverage.replace("n",0)

# This block of code converts the data type of these columns as integers/numbers instead of strings/phrases
df['pff_DRIVE'] = df['pff_DRIVE'].astype(np.int64)
df['pff_DRIVEPLAY'] = df['pff_DRIVEPLAY'].astype(np.int64)
df['pff_GAINLOSS'] = df['pff_GAINLOSS'].astype(np.int64)
df['numberlinemen'] = df['numberlinemen'].astype(np.int64)
df['numberlinebackers'] = df['numberlinebackers'].astype(np.int64)
df['numbersecondary'] = df['numbersecondary'].astype(np.int64)
df['numberwidereceivers'] = df['numberwidereceivers'].astype(np.int64)
df['numberrunningbacks'] = df['numberrunningbacks'].astype(np.int64)
df['numbertightends'] = df['numbertightends'].astype(np.int64)
df['numberrushers'] = df['numberrushers'].astype(np.int64)
df['numbercoverage'] = df['numbercoverage'].astype(np.int64)
df['numberwrleft'] = df['numberwrleft'].astype(np.int64)
df['numberwrright'] = df['numberwrright'].astype(np.int64)
df['numberpassblocking'] = df['numberpassblocking'].astype(np.int64)
df['numberboxplayers'] = df['numberboxplayers'].astype(np.int64)

# Creates a column that properly defines field position
df['fieldposition'] = np.where((df['pff_FIELDPOSITION'] >= 0),df['pff_FIELDPOSITION'] , df['pff_FIELDPOSITION']+100) 

 # This defines play efficiency according to K-State's system
df['efficent'] = np.where((df['pff_DOWN'] == 1) & (df['pff_DISTANCE']*.4 <= df['pff_GAINLOSS'])|
    (df['pff_DOWN'] == 2) & (df['pff_DISTANCE']*.5 <= df['pff_GAINLOSS'])|
    (df['pff_DOWN'] == 3) & (df['pff_DISTANCE'] <= df['pff_GAINLOSS'])|
    (df['pff_DOWN'] == 4) & (df['pff_DISTANCE'] <= df['pff_GAINLOSS']), 1, 0)

 # Creates a column that classifies plays with a distance to goal of 0-20 yards as red zone
df['redzone'] = np.where((df['pff_FIELDPOSITION'] <= 20), 1, 0)

 # Creates a column that classifies plays with a distance to goal of 80-100 yards as backed up
df['backedup'] = np.where((df['pff_FIELDPOSITION'] >= 80) , 1, 0)

# Creates a column that classifies plays with more than 4 rushers as blitzes
df['blitz'] = np.where((df['numberrushers'] > 4), 1, 0)
df['boxplayers'] = np.where((df['numberboxplayers'] > 6), 1, 0)

# This section whittles the data down to factors that the offense can control before the snap
presnapoffense = ['pff_QUARTER','pff_DOWN','pff_DRIVE','pff_DRIVEPLAY','pff_OFFFORMATIONUNBALANCED','pff_RUNPASSOPTION',
                  'isshotgun','isemptybackfield','teleft','teright','teboth','numberwidereceivers','numberrunningbacks','numbertightends','numberwrleft',
                 'numberwrright','numberpassblocking','boxplayers']

# This only selects passing plays
passdata = pd.DataFrame(df)
passdata = passdata.iloc[:,1:]
passtarget = pd.DataFrame(passdata.blitz)

#passdata = passdata[passdata.numberrushers !=0]
passdata = passdata.loc[:,presnapoffense]

# This part extracts only plays ran in the third quarter
Q3passdata = passdata[passdata['pff_QUARTER'] == 3]
Q3passdata.dropna(axis=1, inplace=True)

# This part extracts only first-half plays
Q12passdata = passdata[(passdata['pff_QUARTER'] == 1) | (passdata['pff_QUARTER'] == 2)]
Q12passdata.dropna(axis=1, inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)
  df.drop(df.columns.difference(['pff_PLAYID','pff_WEEK','pff_GSISGAMEKEY','pff_GSISPLAYID','pff_QUARTER','pff_DRIVE','pff_DRIVEPLAY','pff_DOWN','pff_GAINLOSS','pff_TIMETOPRESSURE','pff_TIMETOTHROW','pff_DISTANCE','pff_FIELDPOSITION','pff_BLITZDOG','pff_OFFFORMATIONUNBALANCED','pff_STUNT','pff_SCREEN','pff_RUNPASSOPTION','shownisclosed','playedisclosed','isman','numberlinemen','numberlinebackers','numbersecondary','numberwidereceivers','numberrunningbacks','numbertightends','numberrushers','numbercoverage','numberboxplayers','hashposition','numberwrleft','numberwrright','numberpassblocking','ispress','isemptybackfield','isshotgun','teleft','teright','teboth']), 1, inplace=True)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(passdata.drop('boxplayers',axis=1), 
                                                    passdata['boxplayers'], test_size=0.30, 
                                                    random_state=101)

In [54]:
# Creates the model and runs it
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

# Creates a prediction based off of the testing data set
y_pred = logmodel.predict(X_test)

# The classification shows the accuracy of the model
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.82      0.88      0.85      1306
           1       0.72      0.61      0.66       671

    accuracy                           0.79      1977
   macro avg       0.77      0.75      0.75      1977
weighted avg       0.78      0.79      0.78      1977



In [55]:
# Creates the confusion matrix and tells the user how to understand it
print('How to read the confusion matrix:')
print('')
cfdef = [['Correctly predicted 6 or less box players','False positive for 6 or less box players'],['False negative for 7 or more box players','Correctly predicted 7 or more box players']]
print(cfdef[0])
print(cfdef[1])
print('')
print(confusion_matrix(y_test, y_pred))

How to read the confusion matrix:

['Correctly predicted 6 or less box players', 'False positive for 6 or less box players']
['False negative for 7 or more box players', 'Correctly predicted 7 or more box players']

[[1146  160]
 [ 260  411]]
