In [4]:
# IMPORTING LOCAL MODULES
import importlib
import form_pred
import ball_movement
import get_data
import def_clean
import TrainTestNFL

# REFRESHING LOCAL CHANGES
# Re-execute each local module (same order as before) so edits made to the
# .py files after their first import are visible without a kernel restart.
for _local_module in (get_data, form_pred, def_clean, ball_movement, TrainTestNFL):
    importlib.reload(_local_module)
del _local_module  # do not leave the loop variable behind in the notebook namespace

# IMPORTING NAMES FROM THE LOCAL MODULES (PLUS os AND pandas)
from get_data import get_assets, get_positional_data
from form_pred import clean_positional
from ball_movement import ball_quadrants, make_quad_chart
from def_clean import DefensiveCleaning
from TrainTestNFL import TrainTestNFL
import os
import pandas as pd

class PrepPipe:
    """Load the positional data and turn it into train/test splits.

    The constructor stores the configuration, calls ``get_assets()`` when the
    ``Kaggle-Data-Files`` path does not exist, and keeps the result of
    ``get_positional_data()`` on ``self.positions``.

    Parameters
    ----------
    first, last :
        Bounds passed to ``TrainTestNFL.split``.
    n_cuts, frameLimit, simMethod :
        Settings forwarded to ``DefensiveCleaning``.
    quad_num :
        Second argument passed to ``ball_quadrants``.
    def_fp :
        CSV path used as the cache for the cleaned defensive data.
    """

    def __init__(self, first=1, last=14, n_cuts=11, frameLimit=11,
                 simMethod='distance', quad_num=4, def_fp='assets/def_clean_output.csv'):
        self.first, self.last = first, last
        self.n_cuts, self.frameLimit = n_cuts, frameLimit
        self.simMethod = simMethod
        self.quad_num = quad_num
        self.def_fp = def_fp

        assets_present = os.path.exists('Kaggle-Data-Files')
        if not assets_present:
            get_assets()
        self.positions = get_positional_data()

    def _read_cached_defense(self):
        """Return the cached defensive frame, or ``None`` when it must be rebuilt.

        ``None`` is returned when the CSV is missing, when looking up
        ``gameId`` raises a ``LookupError`` (which includes ``KeyError``), or
        when gameId 2018123015 is absent; per the printed message that id
        marks a cache holding the full 17-week dataset.
        """
        try:
            cached = pd.read_csv(self.def_fp).reset_index()
            is_complete = 2018123015 in cached['gameId'].to_list()
        except (FileNotFoundError, LookupError):
            return None
        if is_complete:
            return cached
        print('missing full 17 week dataset')
        print('getting dataset now.')
        return None

    def clean_data(self):
        """Build the cleaned inputs and return ``(X_train, X_test, y_train, y_test)``.

        The ``TrainTestNFL`` object that produced the split is kept on
        ``self.train_test``.
        """
        quads = ball_quadrants(self.positions, self.quad_num)
        offense = clean_positional(self.positions)

        defense = self._read_cached_defense()
        if defense is None:
            # No usable cache: regenerate weeks 1-17 and let the cleaner
            # write them to ``def_fp``.
            cleaner = DefensiveCleaning(weeks_data=self.positions, n_cuts=self.n_cuts,
                                        frameLimit=self.frameLimit, simMethod=self.simMethod)
            defense = cleaner.generate_full_df(1, 17, fp=self.def_fp).reset_index()

        self.train_test = TrainTestNFL(offense, defense, quads)
        train_X, test_X, train_y, test_y = self.train_test.split(self.first, self.last)
        return train_X, test_X, train_y, test_y

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import os
import pickle
from def_clust import return_pca_and_clusters
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

class OffensiveFormation(BaseEstimator, TransformerMixin):
    """Replace the formation label column with predictions from a grid-searched model.

    ``fit`` either loads a previously pickled ``GridSearchCV`` from
    ``model_fp`` or fits a new one on ``X`` (features = every column except
    the label, target = the label column) and pickles it to ``model_fp``.
    ``transform`` returns a copy of ``X`` whose label column holds the
    model's predictions.

    Parameters
    ----------
    model : estimator or None
        Estimator to tune. ``None`` selects ``LogisticRegression(max_iter=10000)``.
    model_params : dict or None
        Parameter grid for ``GridSearchCV``. A falsy value selects
        ``{'C': [1e-4, ..., 1e3]}``.
    model_fp : str
        Pickle path used as the model cache.
    cv : int
        Cross-validation setting passed to ``GridSearchCV``.
    scoring : str
        Scorer name passed to ``GridSearchCV``.
    target_col : str or None
        Name of the label column. ``None`` auto-detects it: the first of
        ``'offensiveFormation'`` / ``'offenseFormation'`` present in the frame.
    """

    # Label names tried, in order, when ``target_col`` is None. The first one
    # is the name this class has always used; the second is the spelling that
    # appears in the column listing of the prepared training frame.
    _TARGET_CANDIDATES = ('offensiveFormation', 'offenseFormation')

    def __init__(self, model=None, model_params=None, model_fp='models/off_form.pkl',
                 cv=5, scoring='f1_micro', target_col=None):
        # Compare with None instead of using truthiness: some estimators
        # define __len__ (e.g. unfitted ensembles), so ``not model`` can raise
        # or wrongly discard a valid estimator.
        if model is None:
            self.model = LogisticRegression(max_iter=10000)
        else:
            self.model = model
        if not model_params:
            self.model_params = {'C': [10**x for x in range(-4, 4)]}
        else:
            self.model_params = model_params
        self.model_fp = model_fp
        self.cv = cv
        # The previous default 'fi_micro' was a typo for 'f1_micro' and is
        # not a scorer name GridSearchCV accepts.
        self.scoring = scoring
        self.target_col = target_col

    def _resolve_target(self, X):
        """Return the label column name to use for frame ``X``."""
        if self.target_col is not None:
            return self.target_col
        for candidate in self._TARGET_CANDIDATES:
            if candidate in X.columns:
                return candidate
        # Nothing matched: fall back to the historical name so that fit()
        # raises the same KeyError it always did.
        return self._TARGET_CANDIDATES[0]

    def fit(self, X, y=None):
        """Load the cached grid search, or fit and cache a new one.

        ``y`` is ignored; it exists so the inherited ``fit_transform`` and
        scikit-learn pipelines can call ``fit(X, y)``. Returns ``self``.

        Note that an existing file at ``model_fp`` is used as-is, whatever
        ``model``, ``model_params``, ``cv`` or ``scoring`` are set to.
        """
        if os.path.exists(self.model_fp):
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point model_fp at files this project wrote itself.
            with open(self.model_fp, 'rb') as model_file:
                self.grid = pickle.load(model_file)
        else:
            target = self._resolve_target(X)
            self.grid = GridSearchCV(self.model, param_grid=self.model_params, cv=self.cv,
                                     scoring=self.scoring)
            self.grid.fit(X.drop(target, axis=1), X[target])
            # dirname + makedirs handles nested directories and a bare
            # filename (the old split('/')[0] + mkdir turned a bare filename
            # into a directory).
            model_dir = os.path.dirname(self.model_fp)
            if model_dir:
                os.makedirs(model_dir, exist_ok=True)
            with open(self.model_fp, 'wb') as model_file:
                pickle.dump(self.grid, model_file)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the label column set to the predictions.

        The existing label column, if any, is dropped before predicting, so
        the predicted column ends up as the last column. ``X`` itself is not
        modified.
        """
        target = self._resolve_target(X)
        features = X.drop(columns=target, errors='ignore')
        features[target] = self.grid.predict(features)
        return features


class DefensiveClustering(BaseEstimator, TransformerMixin):
    """Segment defensive plays with StandardScaler -> PCA -> KMeans.

    For every (gameId, playId) the transformer builds a feature row from the
    per-play columns in ``_PLAY_FEATURES`` plus the shares ``%B``, ``%M`` and
    ``%Z`` of the labels found in the ``*_act`` columns. It returns a frame
    with ``gameId``, ``playId``, the PCA scores ``pc_0 .. pc_k`` and the
    KMeans label in ``cluster``.

    Plays without any non-null ``*_act`` value have no share row and are
    dropped by the inner merge, so the output can have fewer rows than ``X``.

    Fixes relative to the previous version:
    * the ids were read from the scaler's ndarray output
      (``X.reset_index()``), which always raised ``AttributeError``;
    * ``KMeans.fit_transform`` / ``transform`` return distances to the
      centres, not labels; ``fit_predict`` / ``predict`` are used instead;
    * the method formerly named ``fit`` applied the fitted steps to new data;
      it is now ``transform``, and ``fit`` fits and returns ``self``;
    * a label (B, M or Z) missing from the data no longer raises ``KeyError``.

    Parameters
    ----------
    columns :
        Stored only; not used by this class.
    n_clusters : int
        Number of KMeans clusters.
    pca_variance :
        Passed to ``PCA(n_components=...)``.
    """

    _KEYS = ('gameId', 'playId')
    # Labels whose per-play shares become the %B / %M / %Z features.
    _ACTIONS = ('B', 'M', 'Z')
    _PLAY_FEATURES = ('defendersInTheBox', 'extra_blitzers', 'on_line_coverage',
                      'numberOfPassRushers', 'DB', 'LB', 'DL',
                      'yardline_first', 'yardline_100')

    def __init__(self, columns='all', n_clusters=5, pca_variance=0.8):
        self.columns = columns
        self.n_clusters = n_clusters
        self.pca_variance = pca_variance

    def _action_shares(self, X):
        """Return one row per play with columns gameId, playId, %B, %M, %Z."""
        keys = list(self._KEYS)
        # reindex() supplies all-NaN columns for action columns seen during
        # fitting but absent from X; dropna() then discards them.
        long_df = X.reindex(columns=self.melt_cols).melt(keys).dropna()
        counts = (long_df.groupby(keys + ['value']).size()
                  .unstack('value', fill_value=0)
                  # Guarantee B, M and Z exist even when a label never occurs.
                  .reindex(columns=list(self._ACTIONS), fill_value=0))
        # 0/0 (a play with none of the three labels) becomes NaN -> 0.
        shares = counts.div(counts.sum(axis=1), axis=0).fillna(0)
        shares.columns = ['%' + label for label in self._ACTIONS]
        return shares.reset_index()

    def _prepare(self, X):
        """Return ``(ids, features)`` as two row-aligned DataFrames."""
        keys = list(self._KEYS)
        merged = X[self.orig_cols].merge(self._action_shares(X), on=keys).fillna(0)
        ids = merged[keys].reset_index(drop=True)
        return ids, merged.drop(columns=keys)

    @staticmethod
    def _assemble(ids, scores_pca, labels):
        """Combine ids, PCA scores and cluster labels into the output frame."""
        pca_df = pd.DataFrame(scores_pca,
                              columns=[f'pc_{i}' for i in range(scores_pca.shape[1])])
        df_seg = pd.concat([ids, pca_df], axis=1)
        df_seg['cluster'] = labels
        return df_seg

    def fit_transform(self, X, y=None):
        """Fit scaler, PCA and KMeans on ``X`` and return its segmented frame.

        ``y`` is ignored. Sets ``melt_cols``, ``orig_cols``, ``scaler``,
        ``pca`` and ``kmeans_pca`` on the instance.
        """
        actions = [col for col in X.columns if '_act' in col]
        self.melt_cols = list(self._KEYS) + actions
        self.orig_cols = list(self._KEYS) + list(self._PLAY_FEATURES)

        ids, features = self._prepare(X)
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(features)
        self.pca = PCA(n_components=self.pca_variance)
        scores_pca = self.pca.fit_transform(scaled)
        self.kmeans_pca = KMeans(n_clusters=self.n_clusters, init='k-means++', random_state=42)
        labels = self.kmeans_pca.fit_predict(scores_pca)
        return self._assemble(ids, scores_pca, labels)

    def fit(self, X, y=None):
        """Fit scaler, PCA and KMeans on ``X``; returns ``self``."""
        self.fit_transform(X, y)
        return self

    def transform(self, X):
        """Apply the fitted scaler, PCA and KMeans to ``X``.

        Raises
        ------
        AttributeError
            If called before ``fit`` / ``fit_transform``.
        """
        if not hasattr(self, 'kmeans_pca'):
            raise AttributeError('DefensiveClustering is not fitted yet; '
                                 'call fit or fit_transform first.')
        ids, features = self._prepare(X)
        scores_pca = self.pca.transform(self.scaler.transform(features))
        labels = self.kmeans_pca.predict(scores_pca)
        return self._assemble(ids, scores_pca, labels)

In [6]:
# Build the preparation pipeline with its default settings; the constructor
# also loads the positional data.
prep_pipe = PrepPipe()
# Clean the data and unpack the four parts of the train/test split.
X_train, X_test, y_train, y_test = prep_pipe.clean_data()

positional data already downloaded.
reading positional data.
returning positional data.


In [9]:
# Display every column name of X_train as a plain list.
list(X_train.columns)

['FBL0_x',
 'FBR0_x',
 'HBL0_x',
 'HBL1_x',
 'HBR0_x',
 'HBR1_x',
 'QB0_x',
 'QB1_x',
 'RBL0_x',
 'RBL1_x',
 'RBL2_x',
 'RBR0_x',
 'RBR1_x',
 'RBR2_x',
 'TEL0_x',
 'TEL1_x',
 'TEL2_x',
 'TER0_x',
 'TER1_x',
 'TER2_x',
 'WRL0_x',
 'WRL1_x',
 'WRL2_x',
 'WRL3_x',
 'WRR0_x',
 'WRR1_x',
 'WRR2_x',
 'WRR3_x',
 'FBL0_y',
 'FBR0_y',
 'HBL0_y',
 'HBL1_y',
 'HBR0_y',
 'HBR1_y',
 'QB0_y',
 'QB1_y',
 'RBL0_y',
 'RBL1_y',
 'RBL2_y',
 'RBR0_y',
 'RBR1_y',
 'RBR2_y',
 'TEL0_y',
 'TEL1_y',
 'TEL2_y',
 'TER0_y',
 'TER1_y',
 'TER2_y',
 'WRL0_y',
 'WRL1_y',
 'WRL2_y',
 'WRL3_y',
 'WRR0_y',
 'WRR1_y',
 'WRR2_y',
 'WRR3_y',
 'FBL0_in',
 'FBR0_in',
 'HBL0_in',
 'HBL1_in',
 'HBR0_in',
 'HBR1_in',
 'QB0_in',
 'QB1_in',
 'RBL0_in',
 'RBL1_in',
 'RBL2_in',
 'RBR0_in',
 'RBR1_in',
 'RBR2_in',
 'TEL0_in',
 'TEL1_in',
 'TEL2_in',
 'TER0_in',
 'TER1_in',
 'TER2_in',
 'WRL0_in',
 'WRL1_in',
 'WRL2_in',
 'WRL3_in',
 'WRR0_in',
 'WRR1_in',
 'WRR2_in',
 'WRR3_in',
 'gameId',
 'playId',
 'offenseFormation',
 'gamePlayI