# For Full Script:

In [3]:
# ------------------------------------------------------------ IMPORTS LIBRARIES ---------------------------------------------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
np.set_printoptions(precision=4)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import statsmodels.api as sms
pd.options.mode.chained_assignment = None  # default='warn'

#------------------------------------------------------------------- DATA CLEANER ----------------------------------------------------------------------------------------------------------------------------

# Reads in historical data
df = pd.read_csv('__________.csv') # -------- ENTER HISTORICAL DATA CSV HERE ---------

# Read in user input file
test = pd.read_csv('__________.csv') # -------- ENTER USER INPUT CSV FILE HERE ---------

# Removes rows of data that are classified as no-play
df=df[df.pff_NOPLAY ==0]

# Removes rows of data with null (or empty) entries in the penalty yards column
df=df[df["pff_PENALTYYARDS"].isnull()]
df.pff_OFFPERSONNELBASIC.dropna()
df.pff_OFFPERSONNEL.dropna()
df.pff_DEFPERSONNEL.dropna()

# Replaces a null value within the gain/loss column with a zero
df.pff_GAINLOSS.fillna(0,inplace=True)

# Creates a column that changes the data from "C" as closed to a 1 as a "yes" and a 0 as a "no"
df['shownisclosed'] = np.where((df['pff_MOFOCSHOWN'] == "C") , 1, 0)
df['playedisclosed'] = np.where((df['pff_MOFOCPLAYED'] == "C") , 1, 0)

# Creates a binary (0 or 1) column showing if the defense plays man converage (1) or not (0)
df['isman'] = np.where((df['pff_PASSCOVERAGE']=="0")|(df['pff_PASSCOVERAGE']=='1')|(df['pff_PASSCOVERAGE']=='1D')|(df['pff_PASSCOVERAGE']=='2M')|(df['pff_PASSCOVERAGE']=='GL') , 1, 0)

# Creates a hash position column where 0 is the left hash, 1 is between the hashes, and 2 is the right hash
df['hashposition'] = df['pff_HASH']
df.hashposition=df.hashposition.replace("L",0)
df.hashposition=df.hashposition.replace("C",1)
df.hashposition=df.hashposition.replace("R",2)

# Changes data to binary (0 or 1) where 0 = no and 1 = yes according to the column name 
# Example: if teleft = 1, that means a tight end is lined up on the left side of the formation
df['ispress'] = np.where((df['pff_PRESS'].isna()) , 0, 1)
df['isshotgun'] = np.where((df['pff_SHOTGUN'].isna()) , 0, 1)
df['isemptybackfield'] = np.where((df['pff_RBALIGNMENT']=="EMPTY") , 1, 0)
df['isrush'] = np.where((df['pff_RUNPASS'] == 'R'), 1, 0)
df['teleft'] = np.where((df['pff_TEALIGNMENT'].str[0]=="L")|(df['pff_TEALIGNMENT'].str[2]=="L")|(df['pff_TEALIGNMENT'].str[4]=="L")|(df['pff_TEALIGNMENT'].str[6]=="L") , 1, 0)
df['teright'] = np.where((df['pff_TEALIGNMENT'].str[0]=="R")|(df['pff_TEALIGNMENT'].str[2]=="R")|(df['pff_TEALIGNMENT'].str[4]=="R")|(df['pff_TEALIGNMENT'].str[6]=="R") , 1, 0)
df['teboth']= np.where((df['teleft']== 1)&(df['teright']==1),1,0)

# Removes rows of data where there were plays that had 10 men on the field since they're anomalies
df = df[df.pff_DEFPERSONNEL != "10 Men"]
df = df[df.pff_OFFPERSONNEL != "10 Men"]

# Pulls data from larger string and parses it accordingly
# Example: The pff_DEFPERSONNEL might have 3-4-4 in the format of DL-LB-DB. This code makes the columns with the number of DL = 3, LB = 4, and DB = 3
df['numberlinemen']=df['pff_DEFPERSONNEL'].astype(str).str[0]
df['numberlinebackers']=df['pff_DEFPERSONNEL'].astype(str).str[2]
df['numbersecondary']=df['pff_DEFPERSONNEL'].astype(str).str[4]
df['numberwidereceivers']=df['pff_OFFPERSONNEL'].astype(str).str[0]
df['numberrunningbacks']=df['pff_OFFPERSONNEL'].astype(str).str[4]
df['numbertightends']=df['pff_OFFPERSONNEL'].astype(str).str[6]
df['numberrushers']=df['pff_PASSRUSHPLAYERS'].astype(str).str[0]
df['numbercoverage']=df['pff_PASSCOVERAGEPLAYERS'].astype(str).str[0]
df['numberwrleft']=df['pff_OFFFORMATIONGROUP'].astype(str).str[0]
df['numberwrright']=df['pff_OFFFORMATIONGROUP'].astype(str).str[2]
df['numberpassblocking']=df['pff_PASSBLOCKING'].astype(str).str[0]
df['numberboxplayers']=df['pff_BOXPLAYERS'].astype(str).str[0]

df=df[pd.to_numeric(df['numberlinemen'], errors='coerce').notnull()]

# Only keeps the specified columns below
df.drop(df.columns.difference(['pff_PLAYID','pff_WEEK','pff_GSISGAMEKEY','pff_GSISPLAYID','pff_QUARTER','pff_DRIVE','pff_DRIVEPLAY','pff_DOWN','pff_GAINLOSS','pff_TIMETOPRESSURE','pff_TIMETOTHROW','pff_DISTANCE','pff_FIELDPOSITION','pff_BLITZDOG','pff_OFFFORMATIONUNBALANCED','pff_STUNT','pff_SCREEN','pff_RUNPASSOPTION','shownisclosed','playedisclosed','isman','numberlinemen','numberlinebackers','numbersecondary','numberwidereceivers','numberrunningbacks','numbertightends','numberrushers','numbercoverage','numberboxplayers','hashposition','numberwrleft','numberwrright','numberpassblocking','ispress','isemptybackfield','isshotgun','teleft','teright','teboth']), 1, inplace=True)

# Removes plays where the number of WRs, RBs, or TEs is labelled as "X" since it's a no-play
df = df[df.numberwidereceivers != "X"]
df = df[df.numberrunningbacks != "X"]
df = df[df.numbertightends != "X"]

# Previously, the data showed a "n" when there were no rushers, pass blockers, or defenders in coverage with a 0 so we can work with it
df.numberrushers=df.numberrushers.replace("n",0)
df.numberpassblocking=df.numberpassblocking.replace("n",0)
df.numbercoverage=df.numbercoverage.replace("n",0)

# This block of code converts the data type of these columns as integers/numbers instead of strings/phrases
df['pff_DRIVE'] = df['pff_DRIVE'].astype(np.int64)
df['pff_DRIVEPLAY'] = df['pff_DRIVEPLAY'].astype(np.int64)
df['pff_GAINLOSS'] = df['pff_GAINLOSS'].astype(np.int64)
df['numberlinemen'] = df['numberlinemen'].astype(np.int64)
df['numberlinebackers'] = df['numberlinebackers'].astype(np.int64)
df['numbersecondary'] = df['numbersecondary'].astype(np.int64)
df['numberwidereceivers'] = df['numberwidereceivers'].astype(np.int64)
df['numberrunningbacks'] = df['numberrunningbacks'].astype(np.int64)
df['numbertightends'] = df['numbertightends'].astype(np.int64)
df['numberrushers'] = df['numberrushers'].astype(np.int64)
df['numbercoverage'] = df['numbercoverage'].astype(np.int64)
df['numberwrleft'] = df['numberwrleft'].astype(np.int64)
df['numberwrright'] = df['numberwrright'].astype(np.int64)
df['numberpassblocking'] = df['numberpassblocking'].astype(np.int64)
df['numberboxplayers'] = df['numberboxplayers'].astype(np.int64)

# Creates a column that properly defines field position
df['fieldposition'] = np.where((df['pff_FIELDPOSITION'] >= 0),df['pff_FIELDPOSITION'] , df['pff_FIELDPOSITION']+100) 

 # This defines play efficiency according to K-State's system
df['efficent'] = np.where((df['pff_DOWN'] == 1) & (df['pff_DISTANCE']*.4 <= df['pff_GAINLOSS'])|
    (df['pff_DOWN'] == 2) & (df['pff_DISTANCE']*.5 <= df['pff_GAINLOSS'])|
    (df['pff_DOWN'] == 3) & (df['pff_DISTANCE'] <= df['pff_GAINLOSS'])|
    (df['pff_DOWN'] == 4) & (df['pff_DISTANCE'] <= df['pff_GAINLOSS']), 1, 0)

 # Creates a column that classifies plays with a distance to goal of 0-20 yards as red zone
df['redzone'] = np.where((df['pff_FIELDPOSITION'] <= 20), 1, 0)

 # Creates a column that classifies plays with a distance to goal of 80-100 yards as backed up
df['backedup'] = np.where((df['pff_FIELDPOSITION'] >= 80) , 1, 0)

# Creates a column that classifies plays with more than 4 rushers as blitzes
df['blitz'] = np.where((df['numberrushers'] > 4), 1, 0)

# This section whittles the data down to factors that the offense can control before the snap
presnapoffense = ['pff_QUARTER','pff_DOWN','pff_OFFFORMATIONUNBALANCED','pff_RUNPASSOPTION',
                  'isshotgun','isemptybackfield','teleft','teright','teboth','numberwidereceivers','numberrunningbacks','numbertightends','numberwrleft',
                 'numberwrright','numberpassblocking','blitz']
psnoblitz = ['pff_QUARTER','pff_DOWN','pff_OFFFORMATIONUNBALANCED','pff_RUNPASSOPTION',
                  'isshotgun','isemptybackfield','teleft','teright','teboth','numberwidereceivers','numberrunningbacks','numbertightends','numberwrleft',
                 'numberwrright','numberpassblocking']

#-------------------------------------------------------------------------------- CREATING MODEL --------------------------------------------------------------------------------------------------------
# This only selects passing plays
passdata = pd.DataFrame(df)
passdata = passdata[passdata.numberrushers !=0] # Takes out running plays since PFF's numberrushers data is calculated poorly for rushing plays
passdata = passdata.loc[:,presnapoffense] # Filters down to presnap factors

test = test.loc[:,psnoblitz]

X_train, X_test = passdata.drop('blitz', axis=1), test # Creates inputs for model from historical and test data without the response (blitz)
y_train = passdata['blitz'] # Creates the output data (the blitz column) for the training and testing data
logmodel = LogisticRegression() # Creates a Logsitic Regression model
logmodel.fit(X_train, y_train) # Fits model to the training data set. This is where the model learns how the opponent operates.
y_pred = logmodel.predict(X_test) # Creates prediction for test data set based on the inputs

#---------------------------------------------------------------------------------OUTPUTTING PREDICTION----------------------------------------------------------------------------------------
for i in range(len(y_pred)):
    if y_pred[i] == 1:
        print('Play',i+1,': Blitz Predicted')
        i = i+1
    else:
        print('Play',i+1,': Blitz Not Predicted')
        i = i+1

  df.drop(df.columns.difference(['pff_PLAYID','pff_WEEK','pff_GSISGAMEKEY','pff_GSISPLAYID','pff_QUARTER','pff_DRIVE','pff_DRIVEPLAY','pff_DOWN','pff_GAINLOSS','pff_TIMETOPRESSURE','pff_TIMETOTHROW','pff_DISTANCE','pff_FIELDPOSITION','pff_BLITZDOG','pff_OFFFORMATIONUNBALANCED','pff_STUNT','pff_SCREEN','pff_RUNPASSOPTION','shownisclosed','playedisclosed','isman','numberlinemen','numberlinebackers','numbersecondary','numberwidereceivers','numberrunningbacks','numbertightends','numberrushers','numbercoverage','numberboxplayers','hashposition','numberwrleft','numberwrright','numberpassblocking','ispress','isemptybackfield','isshotgun','teleft','teright','teboth']), 1, inplace=True)


Play 1 : Blitz Not Predicted
Play 2 : Blitz Not Predicted
Play 3 : Blitz Not Predicted
Play 4 : Blitz Not Predicted
Play 5 : Blitz Not Predicted
Play 6 : Blitz Not Predicted
Play 7 : Blitz Not Predicted
Play 8 : Blitz Not Predicted


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# For Predicting Individual Plays:

In [15]:
print('Enter Quarter: ')
q= input()
print('Enter Down: ')
dwn = input()
print('Formation Unbalanced? 0 for no, 1 for yes')
unb=input()
print('RPO? 0 for no, 1 for yes')
rpo=input()
print('Shotgun? 0 for no, 1 for yes')
sht=input()
print('Empty? 0 for no, 1 for yes')
emp=input()
print("Number of Tight Ends: ")
te=input()
print('TE left? 0 for no, 1 for yes')
tel=input()
print('TE right? 0 for no, 1 for yes')
ter=input()
print('TE left and right? 0 for no, 1 for yes')
teb=input()
print('Number of Wide Receivers: ')
wr=input()
print('Number of eligible WRs/RBs/TEs split out left: ')
wrl=input()
print('Number of eligible WRs/RBs/TEs split out right: ')
wrr=input()
print('Number of Runningbacks: ')
rb=input()
print('Number in Protection: ')
numprot=input()

ColNames= ['pff_QUARTER','pff_DOWN','pff_DRIVE','pff_OFFFORMATIONUNBALANCED','pff_RUNPASSOPTION',
                  'isshotgun','isemptybackfield','teleft','teright','teboth','numberwidereceivers','numberrunningbacks','numbertightends','numberwrleft',
                 'numberwrright','numberpassblocking']

userTest = [q,dwn,dnum,unb,rpo,sht,emp,tel,ter,teb,wr,rb,te,wrl,wrr,numprot] # Puts inputs into an array
UserTestDF= pd.DataFrame(userTest).T # Creates dataframe with inputs
UserTestDF.columns = ColNames # Makes column names the actual factor names

BlitzPrediction= logmodel.predict(UserTestDF) # Creates prediction

print('--------------------------------------Prediction-----------------------------------------------')
if BlitzPrediction ==1: # If a blitz is predicted
    print('Blitz Predicted')
else: # If a blitz is not predicted
    print('No Blitz Predicted') 

Enter Quarter: 
1
Enter Down: 
1
Enter Drive Number in Game: 
3
Formation Unbalanced? 0 for no, 1 for yes
0
RPO? 0 for no, 1 for yes
1
Shotgun? 0 for no, 1 for yes
1
Empty? 0 for no, 1 for yes
0
Number of Tight Ends: 
0
TE left? 0 for no, 1 for yes
0
TE right? 0 for no, 1 for yes
0
TE left and right? 0 for no, 1 for yes
0
Number of Wide Receivers (Number Split Out Wide): 
4
Number of WR left: 
2
Number of WR right: 
2
Number of Runningbacks: 
1
Number in Protection: 
5
--------------------------------------Prediction-----------------------------------------------
No Blitz Predicted
