# Punt Play Analytics - Unsupervised Learning Analysis

In [4]:
import pandas as pd
import numpy as np
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering


In [5]:
## Connect to the Database
import urllib.parse

import psycopg2
import sqlalchemy
from sqlalchemy import create_engine, func
from sqlalchemy.orm import Session

from config import db_password


In [6]:
# Make connection to the database.
# The password is percent-encoded before it is placed in the URL so that
# characters such as "@", ":", "/" or "%" cannot be misread as URL delimiters.
# quote(..., safe="") leaves purely alphanumeric passwords unchanged.
db_string = (
    f"postgresql://postgres:{urllib.parse.quote(db_password, safe='')}"
    "@127.0.0.1:5433/NFL_Punt"
)
engine = create_engine(db_string)


#### Database key info: 
- Game Data:_________GameKey
- Play Information:____GameKey__PlayID
- Player Punt Data:_________________________GSISID
- Play Player Role:_____GameKey__PlayID___GSISID
- Video Review:________GameKey__PlayID___GSISID
- NGS:_________________GameKey___PlayID__GSISID

In [7]:
# Load the five small lookup tables in one pass; each path is annotated with
# its row count and the columns of interest.
games, play_info, punt, play_player, video_review = (
    pd.read_csv(csv_path)
    for csv_path in (
        "NFL_Punt/game_data.csv",  # 666 rows  Week, StadiumType, Turf, GameWeather, Temperature, OutdoorWeather
        "NFL_Punt/play_information.csv",  # 6681 rows  Season_Type, Week, PlayID, Quarter, Play_Type, Score_Home_Visiting
        "NFL_Punt/player_punt_data.csv",  # 3259 rows Position
        "NFL_Punt/play_player_role_data.csv",  # 146,573 rows Play+ID and Role
        "NFL_Punt/video_review.csv",  # 37 rows Player_Activity_Derived, Primary_Impact_Type, Primary_Partner_Activity_Derived
    )
)

## Merge the NGS Data

In the Injury analysis, the NGS data was already merged together, whereas here it is split by year and by part of the season. We are going to initially merge all of it, while removing the unnecessary columns and any rows containing NaN for important values such as the Game and Player Identifiers.

In [8]:
# Read the five 2016 NGS tracking files in season order
# (preseason, three regular-season slices, postseason).
ngs_2016_pre, ngs_2016_early, ngs_2016_mid, ngs_2016_late, ngs_2016_post = (
    pd.read_csv(ngs_path)
    for ngs_path in (
        "NFL_Punt/ngs-2016-pre.csv",  # 1 million rows
        "NFL_Punt/ngs-2016-reg-wk1-6.csv",  # 8.7 million rows
        "NFL_Punt/ngs-2016-reg-wk7-12.csv",  # 8.4 million rows
        "NFL_Punt/ngs-2016-reg-wk13-17.csv",  # 7.6 million rows
        "NFL_Punt/ngs-2016-post.csv",  # 900,000 rows
    )
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# Stack the five 2016 NGS frames (preseason -> postseason) into one frame.
# DataFrame.append is deprecated (removed in pandas 2.0) and every chained call
# re-copied the accumulated rows; pd.concat builds the result in a single pass.
# Row labels are kept as-is (no ignore_index), exactly as append did, and the
# input frames are left untouched.
ngs_2016 = pd.concat(
    [ngs_2016_pre, ngs_2016_early, ngs_2016_mid, ngs_2016_late, ngs_2016_post]
)


In [10]:
# Preview the first five rows of the combined 2016 tracking data.
ngs_2016.head(n=5)

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Time,x,y,dis,o,dir,Event
0,2016,6,3236,28943.0,2016-08-13 01:38:02.900,39.669998,30.690001,0.06,308.0,267.940002,
1,2016,6,3236,32366.0,2016-08-13 01:38:03.000,39.57,28.950001,0.02,24.4,284.350006,
2,2016,6,3236,31810.0,2016-08-13 01:38:03.000,39.740002,47.209999,0.0,15.76,15.42,
3,2016,6,3236,32331.0,2016-08-13 01:38:03.000,40.369999,29.969999,0.02,13.57,246.490005,
4,2016,6,3236,28932.0,2016-08-13 01:38:03.000,39.330002,28.02,0.09,324.890015,230.100006,


In [11]:
# Drop the per-file 2016 frames now that their rows are held in ngs_2016.
del ngs_2016_pre
del ngs_2016_early
del ngs_2016_mid
del ngs_2016_late
del ngs_2016_post

In [12]:
# Range of GameKey values in the 2016 data.
# Series.min()/max() are vectorised; the builtin min()/max() walk the
# multi-million-row column one Python object at a time.
minGame = ngs_2016.GameKey.min()
maxGame = ngs_2016.GameKey.max()

print(f'The lowest game number for 2016 is {minGame}, and the maximum game number is {maxGame}.')

The lowest game number for 2016 is 4, and the maximum game number is 332.


In [13]:
# Read the five 2017 NGS tracking files in season order
# (preseason, three regular-season slices, postseason).
ngs_2017_pre, ngs_2017_early, ngs_2017_mid, ngs_2017_late, ngs_2017_post = (
    pd.read_csv(ngs_path)
    for ngs_path in (
        "NFL_Punt/ngs-2017-pre.csv",  # 6.6 million rows
        "NFL_Punt/ngs-2017-reg-wk1-6.csv",  # 9.4 million rows
        "NFL_Punt/ngs-2017-reg-wk7-12.csv",  # 8.6 million rows
        "NFL_Punt/ngs-2017-reg-wk13-17.csv",  # 8.3 million rows
        "NFL_Punt/ngs-2017-post.csv",  # 1 million rows
    )
)

In [14]:
# Stack the five 2017 NGS frames (preseason -> postseason) into one frame.
# DataFrame.append is deprecated (removed in pandas 2.0) and every chained call
# re-copied the accumulated rows; pd.concat builds the result in a single pass.
# Row labels are kept as-is (no ignore_index), exactly as append did, and the
# input frames are left untouched.
ngs_2017 = pd.concat(
    [ngs_2017_pre, ngs_2017_early, ngs_2017_mid, ngs_2017_late, ngs_2017_post]
)

In [15]:
# Drop the per-file 2017 frames now that their rows are held in ngs_2017.
del ngs_2017_pre
del ngs_2017_early
del ngs_2017_mid
del ngs_2017_late
del ngs_2017_post

In [16]:
# Range of GameKey values in the 2017 data.
# Series.min()/max() are vectorised; the builtin min()/max() walk the
# multi-million-row column one Python object at a time.
minGame = ngs_2017.GameKey.min()
maxGame = ngs_2017.GameKey.max()

# The message names 2017: this cell reports on ngs_2017, not the 2016 frame.
print(f'The lowest game number for 2017 is {minGame}, and the maximum game number is {maxGame}.')


The lowest game number for 2016 is 335, and the maximum game number is 665.


Based on this finding, the game numbers will have to be adjusted so that each season starts at 1: 334 will be subtracted from every 2017 game, and likewise 3 from every 2016 game. This will be done after further merges, to ensure that data isn't lost due to that manipulation.

In [17]:
# Combine both seasons into a single frame.
# pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0)
# and avoids the extra full copy made by .copy() followed by append.
# Row labels are kept as-is (no ignore_index), matching the append behaviour.
ngs = pd.concat([ngs_2016, ngs_2017])

In [18]:
# Drop the per-season frames now that their rows are held in ngs.
del ngs_2016
del ngs_2017

In [19]:
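# NOTE(review): this column drop is disabled, so Season_Year, Event, Time and dis
# remain in `ngs` (they still appear in the preview below) — confirm this is intended.
# ngs.drop(columns=['Season_Year', 'Event', 'Time', 'dis'], inplace=True)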

Remove all rows that are missing a GameKey or a GSISID, since without them we cannot identify the game or the actual player involved.

In [20]:
# Keep only the rows that have a GameKey.
ngs = ngs.loc[ngs["GameKey"].notna()]

In [21]:
# Keep only the rows that have a GSISID (player identifier).
ngs = ngs.loc[ngs["GSISID"].notna()]

In [22]:
# Preview the first five rows of the filtered, combined tracking data.
ngs.head(n=5)

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Time,x,y,dis,o,dir,Event
0,2016,6,3236,28943.0,2016-08-13 01:38:02.900,39.669998,30.690001,0.06,308.0,267.940002,
1,2016,6,3236,32366.0,2016-08-13 01:38:03.000,39.57,28.950001,0.02,24.4,284.350006,
2,2016,6,3236,31810.0,2016-08-13 01:38:03.000,39.740002,47.209999,0.0,15.76,15.42,
3,2016,6,3236,32331.0,2016-08-13 01:38:03.000,40.369999,29.969999,0.02,13.57,246.490005,
4,2016,6,3236,28932.0,2016-08-13 01:38:03.000,39.330002,28.02,0.09,324.890015,230.100006,


Connect and push the combined and cleaned tables to the SQL database

In [23]:
# Write the combined NGS frame to the "ngs_df" table.
# - if_exists="replace": the pandas default ("fail") raises ValueError once the
#   table exists, so the notebook could not be re-run; the table is rebuilt
#   from the CSVs on every run instead.
# - chunksize: by default all rows are written in one batch; writing in
#   batches bounds memory use for this multi-million-row frame.
# The frame's index is still written as a column, as before.
ngs.to_sql(name="ngs_df", con=engine, if_exists="replace", chunksize=100_000)

## Isolate the Parameters from the other Datasets

- Player_Punt_Data:  we need the player's position, based on the GSISID
- Play_Info: We need to extract the Quarter, Play_Type, Week, and Score_Home_Visiting
- Games: We need the Temperature, StadiumType, Turf, GameWeather
- Play_Player: this can give us the role they played during that game
- Video_Review: the Player and Partner Activity as well as primary impact 