In [182]:
# IMPORTING LOCAL MODULES
import importlib
import form_pred
import ball_movement
import get_data
import def_clean
import TrainTestNFL

# REFRESHING LOCAL CHANGES
importlib.reload(get_data)
importlib.reload(form_pred)
importlib.reload(def_clean)
importlib.reload(ball_movement)
importlib.reload(TrainTestNFL)

# IMPORTING LOCAL PACKAGES
from get_data import get_assets, get_positional_data
from form_pred import clean_positional
from ball_movement import ball_quadrants, make_quad_chart
from def_clean import DefensiveCleaning
from TrainTestNFL import TrainTestNFL
import os
import pandas as pd

class PrepPipe:
    """Load the positional data and turn it into train/test splits.

    Parameters
    ----------
    first, last : int
        Forwarded to ``TrainTestNFL.split`` in ``clean_data``.
    n_cuts, frameLimit, simMethod
        Forwarded to ``DefensiveCleaning`` when the defensive CSV has to be
        (re)generated.
    quad_num : int
        Forwarded to ``ball_quadrants``.
    def_fp : str
        Path of the cached defensive CSV.
    """

    def __init__(self, first=1, last=14, n_cuts=11, frameLimit=11,
                 simMethod='distance', quad_num=4, def_fp='assets/def_clean_output.csv'):
        self.first, self.last = first, last
        self.n_cuts, self.frameLimit = n_cuts, frameLimit
        self.simMethod = simMethod
        self.quad_num = quad_num
        self.def_fp = def_fp

        # Fetch the raw assets only when the data folder is not there yet.
        raw_data_present = os.path.exists('Kaggle-Data-Files')
        if not raw_data_present:
            get_assets()
        self.positions = get_positional_data()

    def clean_data(self):
        """Build offensive, defensive and quadrant frames and split them.

        The defensive frame is read from ``self.def_fp``; if the file is
        missing, or does not contain gameId 2018123015 (the marker this code
        uses for "full 17 week dataset"), it is regenerated for weeks 1-17.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)`` where both targets are
            restricted to the ``x_quad`` and ``y_quad`` columns.
        """
        target_cols = ['x_quad', 'y_quad']

        quadrant_labels = ball_quadrants(self.positions, self.quad_num)
        offense_df = clean_positional(self.positions)

        try:
            defense_df = pd.read_csv(self.def_fp).reset_index()
            has_final_game = 2018123015 in defense_df['gameId'].to_list()
            if not has_final_game:
                print('missing full 17 week dataset')
                print('getting dataset now.')
                # Reuse the regeneration path below.
                raise LookupError
        except (FileNotFoundError, LookupError):
            builder = DefensiveCleaning(
                weeks_data=self.positions,
                n_cuts=self.n_cuts,
                frameLimit=self.frameLimit,
                simMethod=self.simMethod,
            )
            defense_df = builder.generate_full_df(1, 17, fp=self.def_fp).reset_index()

        self.train_test = TrainTestNFL(offense_df, defense_df, quadrant_labels)
        splits = self.train_test.split(self.first, self.last)
        X_train, X_test, y_train, y_test = splits
        return X_train, X_test, y_train[target_cols], y_test[target_cols]

In [183]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import os
import pickle
from def_clust import return_pca_and_clusters
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

class OffensiveFormation(BaseEstimator, TransformerMixin):
    """Predict the offensive formation and return scaled features plus the prediction.

    Parameters
    ----------
    model : estimator or None
        Classifier to tune. A falsy value selects
        ``LogisticRegression(max_iter=10000)``.
    model_params : dict or None
        Grid passed to ``GridSearchCV``. A falsy value selects
        ``{'C': [1e-4 ... 1e3]}``.
    model_fp : str
        Pickle path used to cache the fitted grid search.
    cv, scoring
        Forwarded to ``GridSearchCV``.
    """

    def __init__(self, model=None, model_params=None, model_fp='models/off_form.pkl',
                 cv=5, scoring='f1_micro'):
        if not model:
            self.model = LogisticRegression(max_iter=10000)
        else:
            self.model = model
        if not model_params:
            self.model_params = {'C': [10**x for x in range(-4, 4)]}
        else:
            self.model_params = model_params
        self.model_fp = model_fp
        self.cv = cv
        self.scoring = scoring

    def fit(self, X, y=None):
        """Fit the scaler and load or train the formation classifier.

        ``X`` must contain an ``offenseFormation`` column; it is used as the
        target and removed from the features. ``y`` is ignored.

        If ``self.model_fp`` exists, the pickled grid search is loaded and only
        the scaler is fitted on ``X``. Otherwise the grid search is trained on
        the scaled features and pickled to ``self.model_fp``.
        """
        X_train = X.drop('offenseFormation', axis=1)
        self.scaler = StandardScaler()

        if os.path.exists(self.model_fp):
            # SECURITY: pickle.load can execute arbitrary code; only point
            # model_fp at files this project wrote itself.
            with open(self.model_fp, 'rb') as model_file:
                self.grid = pickle.load(model_file)
            # NOTE(review): the scaler is not stored with the model, so it is
            # refitted on the current X. That only matches the cached model if
            # X is the data the model was originally trained on.
            self.scaler.fit(X_train)
        else:
            self.grid = GridSearchCV(self.model, param_grid=self.model_params, cv=self.cv,
                                     scoring=self.scoring)
            y_train = X['offenseFormation']
            X_train_scaled = self.scaler.fit_transform(X_train)
            self.grid.fit(X_train_scaled, y_train)

            # Create the full parent directory. A bare file name has no parent,
            # and nested paths need every level to exist before open().
            model_dir = os.path.dirname(self.model_fp)
            if model_dir:
                os.makedirs(model_dir, exist_ok=True)
            with open(self.model_fp, 'wb') as model_file:
                pickle.dump(self.grid, model_file)
        print("Offensive formation model fitted")
        return self

    def transform(self, X):
        """Scale the features and append the predicted formation.

        Returns a DataFrame with the scaled feature columns (same names as the
        input minus ``offenseFormation``) followed by an ``offenseFormation``
        column holding the prediction. The result has a fresh default index.
        """
        X = X.drop('offenseFormation', axis=1)
        X_scaled = self.scaler.transform(X)
        y = self.grid.predict(X_scaled)
        X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
        X_scaled_df['offenseFormation'] = y
        print("Offensive formation predicted")
        return X_scaled_df


class DefensiveClustering(BaseEstimator, TransformerMixin):
    """Cluster defensive plays with StandardScaler -> PCA -> KMeans.

    Parameters
    ----------
    columns : str
        Stored on the instance; not used by the visible code.
    n_clusters : int
        Number of KMeans clusters.
    pca_variance : float or int
        Passed to ``PCA(n_components=...)``.
    """

    # Identifier columns that key every play.
    _ID_COLS = ['gameId', 'playId']
    # Action codes whose per-play shares become the %B / %M / %Z features.
    _ACTION_CODES = ['B', 'M', 'Z']

    def __init__(self, columns='all', n_clusters=5, pca_variance=0.8):
        self.columns = columns
        self.n_clusters = n_clusters
        self.pca_variance = pca_variance

    def _action_shares(self, X):
        """Return per-play shares of each action code as %B, %M, %Z columns."""
        # Columns seen at fit time but absent now are filled with NaN and
        # removed by dropna() after melting.
        long_df = X.reindex(columns=self.melt_cols).melt(self._ID_COLS).dropna()
        counts = long_df.groupby(self._ID_COLS + ['value']).count()
        wide = counts.reset_index().pivot(index=self._ID_COLS,
                                          columns='value', values='variable').fillna(0)
        # Guarantee B/M/Z exist even when a code never occurs in this data.
        wide = wide.reindex(columns=self._ACTION_CODES, fill_value=0)
        totals = wide.sum(axis=1)
        # 0/0 for plays without any B/M/Z action becomes NaN, then 0.
        return wide.div(totals, axis=0).fillna(0).add_prefix('%')

    def _build_features(self, X):
        """Assemble the numeric feature frame shared by fit and transform."""
        base = X[self.orig_cols].set_index(self._ID_COLS)
        # Left merge keeps one output row per input play, so the result stays
        # row-aligned with X; plays without action data get zero shares.
        return base.merge(self._action_shares(X), on=self._ID_COLS, how='left').fillna(0)

    def fit(self, X, y=None):
        """Fit the scaler, PCA and KMeans on the defensive features of ``X``.

        ``X`` must provide ``gameId``, ``playId``, the columns listed in
        ``self.orig_cols`` and any number of columns whose name contains
        ``_act``. ``y`` is ignored.
        """
        actions = [action for action in X.columns if '_act' in action]
        self.melt_cols = self._ID_COLS + actions
        self.orig_cols = ['gameId', 'playId', 'defendersInTheBox', 'numberOfPassRushers',
                          'DB', 'LB', 'DL', 'yardline_first_dir', 'yardline_100_dir']

        features = self._build_features(X)
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(features)
        self.pca = PCA(n_components=self.pca_variance)
        scores_pca = self.pca.fit_transform(scaled)
        self.kmeans_pca = KMeans(n_clusters=self.n_clusters, init='k-means++', random_state=42)
        self.kmeans_pca.fit(scores_pca)
        print('KMeans and PCA fitted')
        return self

    def transform(self, X):
        """Return cluster distances, PCA scores and the cluster label per play.

        The result has columns ``cluster_0..cluster_{k-1}`` (distance to each
        centroid), ``pc_0..`` (PCA scores) and ``cluster`` (assigned label),
        with a default integer index.
        """
        features = self._build_features(X)
        scaled = self.scaler.transform(features)

        scores_pca = self.pca.transform(scaled)
        kmeans_vals = self.kmeans_pca.transform(scores_pca)
        kmeans_vals_df = pd.DataFrame(kmeans_vals, columns=[f'cluster_{i}'
                                                            for i in range(kmeans_vals.shape[1])])
        pca_df = pd.DataFrame(scores_pca, columns=[f'pc_{i}' for i in range(scores_pca.shape[1])])
        df_seg = pd.concat([kmeans_vals_df, pca_df], axis=1)
        # Label the rows being transformed. labels_ holds the labels of the
        # training rows only, so it is wrong (or the wrong length) for any
        # other input.
        df_seg['cluster'] = self.kmeans_pca.predict(scores_pca)
        print("Defensive position transformed")
        return df_seg

class StandardScalerColumns(BaseEstimator, TransformerMixin):
    """Column-wise standardisation that keeps the input's container type.

    ``fit`` stores the per-column mean and the pandas (sample) standard
    deviation; ``transform`` returns ``(X - mean) / std``.
    """

    def fit(self, X, y=None):
        """Learn per-column mean and standard deviation from DataFrame ``X``."""
        self.means = X.mean(axis=0).values.reshape(1, -1)
        stds = X.std(axis=0).values.astype(float).reshape(1, -1)
        # A constant column has std 0 (and a single-row frame has std NaN).
        # Dividing by that would fill the column with NaN/inf, so use a unit
        # scale instead; such columns then transform to 0.
        self.stds = np.where((stds == 0) | np.isnan(stds), 1.0, stds)
        return self

    def transform(self, X):
        """Return ``X`` centred and scaled with the statistics from ``fit``."""
        X_scaled = (X - self.means) / self.stds
        return X_scaled

In [None]:
# Build the preparation pipeline with its default settings. The constructor
# calls get_assets() when 'Kaggle-Data-Files' is missing and then loads the
# positional data.
prep_pipe = PrepPipe()

positional data already downloaded.
reading positional data.


In [None]:
# Build the offensive/defensive/quadrant frames and split them; the targets
# returned here contain only the x_quad and y_quad columns.
X_train, X_test, y_train, y_test = prep_pipe.clean_data()

In [None]:
# Column names of `train_test.ofc` without the identifier and week columns.
# NOTE(review): `ofc` is presumably the offensive frame kept by TrainTestNFL — confirm.
off_col = prep_pipe.train_test.ofc.drop(['gameId', 'playId', 'gamePlayId', 'week'], axis=1).columns

In [None]:
# Column names of `train_test.dfc` without 'week' and the 'index' column
# produced by reset_index(). gameId/playId are kept here.
# NOTE(review): `dfc` is presumably the defensive frame kept by TrainTestNFL — confirm.
def_col = prep_pipe.train_test.dfc.drop(['week', 'index'], axis=1).columns

In [61]:
# Number of trailing columns of `off_col` that go to the "info" branch of the
# offensive pipeline; everything before them goes to the formation branch.
# NOTE(review): the value 9 depends on the column order of the offensive
# frame — confirm it whenever that frame changes.
N_OFF_INFO_COLS = 9
off_form_cols = off_col[:-N_OFF_INFO_COLS]
off_info_cols = off_col[-N_OFF_INFO_COLS:]

In [114]:
# One-element list with the last formation-branch column; this is the column
# handed to the OneHotEncoder in the offensive pipeline.
form_col = [off_form_cols[-1]]

In [176]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression

# --- Offence -----------------------------------------------------------------
# Stage 1: predict the formation from the formation columns and scale the
# info columns, then restore column names on the stacked array.
off_form_pipe = Pipeline([('off_form_pred', OffensiveFormation())])
off_info_pipe = Pipeline([('scaler', StandardScalerColumns())])
off_pre_one_pipe = ColumnTransformer([('form', off_form_pipe, off_form_cols),
                                      ('info', off_info_pipe, off_info_cols)])
off_frame_cols = list(off_form_cols) + list(off_info_cols)
off_pre_one_add_col = Pipeline([('off_pre_one', off_pre_one_pipe),
                                ('func_trans', FunctionTransformer(
                                    lambda x: pd.DataFrame(x, columns=off_frame_cols)))])

# Stage 2: one-hot encode the formation column and pass every OTHER column
# through. The formation column sits in the middle of the frame (end of the
# formation branch), so a positional slice(0, -1) would forward the string
# formation to the model and drop the last info column instead.
off_passthrough_cols = [col for col in off_frame_cols if col not in form_col]
off_one_pipe = ColumnTransformer([('off_form_one', OneHotEncoder(), form_col),
                                  ('nothing', 'passthrough', off_passthrough_cols)])
off_full_pipe = Pipeline([('full_cols', off_pre_one_add_col), ('one_hot', off_one_pipe)])

# --- Defence -----------------------------------------------------------------
# DefensiveClustering.transform puts the cluster label in the last column, so
# positional selection is correct here.
def_cluster_pipe = Pipeline([('def_clust', DefensiveClustering())])
def_one_pipe = ColumnTransformer([('def_clust_one', OneHotEncoder(), [-1]),
                                  ('nothing', 'passthrough', slice(0, -1))])
def_full_pipe = Pipeline([('def_clust_full', def_cluster_pipe), ('def_clust_one', def_one_pipe)])

# Backward-compatible alias: this name previously ended the cell bound to the
# defensive encoder after being reused for both encoders.
form_one_pipe = def_one_pipe

# --- Combined ----------------------------------------------------------------
full_pipe = ColumnTransformer([('off', off_full_pipe, off_col), ('def', def_full_pipe, def_col)])
full_pipe_model = Pipeline([('full_pipe', full_pipe), ('model', LogisticRegression())])

In [181]:
# Display the training targets.
# NOTE(review): the recorded output shows many more columns than the
# x_quad/y_quad pair returned by clean_data(), so it looks stale — re-run.
y_train

Unnamed: 0,gameId,playId,x0,y0,x_zero_base0,y_zero_base0,x1,y1,x_zero_base1,y_zero_base1,x_vec,y_vec,passResult,playDescription,down,yardsToGo,x_quad,y_quad,week
0,2018090600,75,90.11,26.85,29.89,26.45,83.29,46.47,36.71,6.83,6.82,-19.62,C,(15:00) M.Ryan pass short right to J.Jones pus...,1,15,2,0,1
1,2018090600,146,49.17,29.86,70.83,23.44,45.31,36.39,74.69,16.91,3.86,-6.53,I,(13:10) M.Ryan pass incomplete short right to ...,1,10,2,1,1
2,2018090600,168,49.05,29.80,70.95,23.50,51.59,5.81,68.41,47.49,-2.54,23.99,I,(13:05) (Shotgun) M.Ryan pass incomplete short...,2,10,1,3,1
3,2018090600,190,50.00,29.68,70.00,23.62,27.17,11.83,92.83,41.47,22.83,17.85,C,(13:01) (Shotgun) M.Ryan pass deep left to J.J...,3,10,4,3,1
4,2018090600,256,11.74,23.82,108.26,29.48,9.05,42.42,110.95,10.88,2.69,-18.60,I,(10:59) (Shotgun) M.Ryan pass incomplete short...,3,1,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13907,2018121000,3747,74.85,23.43,74.85,23.43,77.34,4.73,77.34,4.73,2.49,-18.70,I,"(2:01) (No Huddle, Shotgun) K.Cousins pass inc...",1,10,2,0,14
13908,2018121000,3786,74.91,23.77,74.91,23.77,85.24,51.15,85.24,51.15,10.33,27.38,C,(1:56) (Shotgun) K.Cousins pass short left to ...,2,10,3,3,14
13909,2018121000,3817,86.71,29.66,86.71,29.66,109.75,27.32,109.75,27.32,23.04,-2.34,I,(1:51) (Shotgun) K.Cousins pass incomplete dee...,1,10,4,1,14
13910,2018121000,3839,86.89,29.79,86.89,29.79,82.40,38.56,82.40,38.56,-4.49,8.77,C,(1:45) (Shotgun) K.Cousins pass short left to ...,2,10,1,2,14


In [180]:
# NOTE(review): `data` is not defined in any cell visible here — confirm where
# it comes from. The recorded run raised
# "ValueError: could not convert string to float: 'I_FORM'", i.e. `data` still
# held a raw string column; every feature must be numeric before fitting.
LogisticRegression().fit(data, y_train)

ValueError: could not convert string to float: 'I_FORM'

In [169]:
# Fit the defensive pipeline on the defensive columns of the training split
# and show the transformed feature matrix.
def_full_pipe.fit_transform(X_train[def_col])

KMeans and PCA fitted
Defensive position transformed


<13920x5 sparse matrix of type '<class 'numpy.float64'>'
	with 13920 stored elements in Compressed Sparse Row format>

In [107]:
(X_train.iloc[:, :10] - X_train.iloc[:, :10].mean(axis=0).values.reshape(1, -1)) /

Unnamed: 0,FBL0_x,FBR0_x,HBL0_x,HBL1_x,HBR0_x,HBR1_x,QB0_x,QB1_x,RBL0_x,RBL1_x
0,-5.051531,0.059902,0.095219,0.001447,0.086014,0.000468,2.541775,0.013234,2.420381,0.031098
1,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,2.671775,0.013234,-4.949619,0.031098
2,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,-1.288225,0.013234,2.420381,0.031098
3,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,-0.898225,0.013234,-1.819619,0.031098
4,0.068469,-4.530098,0.095219,0.001447,0.086014,0.000468,-0.608225,0.013234,1.090381,0.031098
...,...,...,...,...,...,...,...,...,...,...
13915,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,-0.098225,0.013234,-2.439619,0.031098
13916,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,-0.338225,0.013234,2.420381,0.031098
13917,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,-0.578225,0.013234,2.420381,0.031098
13918,0.068469,0.059902,0.095219,0.001447,0.086014,0.000468,-0.258225,0.013234,2.420381,0.031098


In [104]:
# Display the training feature frame (offensive and defensive columns).
X_train

Unnamed: 0,FBL0_x,FBR0_x,HBL0_x,HBL1_x,HBR0_x,HBR1_x,QB0_x,QB1_x,RBL0_x,RBL1_x,...,SSL2_y_start,SSR0_act,SSR0_x_start,SSR0_y_start,SSR1_act,SSR1_x_start,SSR1_y_start,SSR2_act,SSR2_x_start,SSR2_y_start
0,-5.12,0.00,0.0,0.0,0.0,0.0,-1.73,0.0,0.00,0.0,...,0.0,M,1.11,-9.87,0,0.00,0.00,0,0.0,0.0
1,0.00,0.00,0.0,0.0,0.0,0.0,-1.60,0.0,-7.37,0.0,...,0.0,Z,2.45,-12.48,0,0.00,0.00,0,0.0,0.0
2,0.00,0.00,0.0,0.0,0.0,0.0,-5.56,0.0,0.00,0.0,...,0.0,0,0.00,0.00,0,0.00,0.00,0,0.0,0.0
3,0.00,0.00,0.0,0.0,0.0,0.0,-5.17,0.0,-4.24,0.0,...,0.0,0,0.00,0.00,0,0.00,0.00,0,0.0,0.0
4,0.00,-4.59,0.0,0.0,0.0,0.0,-4.88,0.0,-1.33,0.0,...,0.0,M,1.23,-15.01,0,0.00,0.00,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13915,0.00,0.00,0.0,0.0,0.0,0.0,-4.37,0.0,-4.86,0.0,...,0.0,M,11.58,-11.14,M,5.33,-11.15,0,0.0,0.0
13916,0.00,0.00,0.0,0.0,0.0,0.0,-4.61,0.0,0.00,0.0,...,0.0,M,5.69,-10.15,M,13.78,-11.31,0,0.0,0.0
13917,0.00,0.00,0.0,0.0,0.0,0.0,-4.85,0.0,0.00,0.0,...,0.0,0,0.00,0.00,0,0.00,0.00,0,0.0,0.0
13918,0.00,0.00,0.0,0.0,0.0,0.0,-4.53,0.0,0.00,0.0,...,0.0,M,13.88,-5.15,0,0.00,0.00,0,0.0,0.0
