# Proof of Concept for Punt_SQL_Cleaner.py

In [1]:
# Import and Cleaning dependencies
from ColumnCapitals import column_capitalizer
import warnings
import pandas as pd
import numpy as np
pd.set_option('mode.chained_assignment', None)
seed = 42

# ML Dependencies

warnings.simplefilter(action='ignore', category=FutureWarning)


## Connect to the Database
import sqlalchemy as db
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import psycopg2
from config import db_password



In [2]:
# Make connection to the database
# Percent-encode the password so that reserved URL characters in it
# (e.g. '@', ':', '/') cannot corrupt the connection URL.
from urllib.parse import quote

db_string = (
    f"postgresql://postgres:{quote(db_password, safe='')}"
    "@127.0.0.1:5433/NFL_Punt"
)
engine = db.create_engine(db_string)
conn = engine.connect()
metadata = db.MetaData()
# Remove the plain-text password from the namespace once the engine exists.
# NOTE(review): db_string still embeds the (encoded) password.
del db_password

In [3]:
# Read in the punt_analytics table.
# autoload_with on its own triggers reflection of the table definition.
table = db.Table('punt_analytics', metadata, autoload_with=engine)
query = db.select(table)
result = conn.execute(query)

# Take the column names from the result itself rather than from the first
# row: this also works when the table is empty and avoids the deprecated
# Row.keys() accessor.
column_names = list(result.keys())
rows = result.fetchall()

# Create the new dataframe with the column names as its header
punt = pd.DataFrame(rows, columns=column_names)
conn.close()
del result, rows, column_names, metadata, conn

In [4]:
# Pass the frame through the project helper column_capitalizer (imported from
# ColumnCapitals).
# NOTE(review): presumably this renames the columns to the capitalised names
# used in the rest of the notebook (GameKey, PlayID, ...) — confirm against
# the helper's source.
punt = column_capitalizer(punt, 'punt')
punt.head()

Unnamed: 0,GameKey,PlayID,GSISID,Role,Season_Type,Quarter,Score_Home_Visiting,Week,StadiumType,Turf,Weather,Temperature,Position
0,414,188,33704,PDL2,Reg,1,0 - 0,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB
1,414,188,33704,PDL2,Reg,1,0 - 0,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB
2,414,1107,33704,PDL2,Reg,2,7 - 7,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB
3,414,1107,33704,PDL2,Reg,2,7 - 7,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB
4,424,1113,33704,PDR3,Reg,2,3 - 3,2,,Grass,Cloudy,71.0,OLB


In [5]:
# Set Dictionaries
# Raw Turf labels grouped by the surface class they collapse to. Labels are
# matched verbatim, so misspellings and trailing spaces must stay as-is.
_turf_labels_by_surface = {
    'Natural': (
        'Grass',
        'Natural Grass',
        'grass',
        'Natural grass',
        'Natural',
        'Natrual Grass',
        'Natural Grass ',
        'Naturall Grass',
    ),
    'Synthetic': (
        'Field Turf',
        'Artificial',
        'FieldTurf',
        'DD GrassMaster',
        'A-Turf Titan',
        'UBU Sports Speed S5-M',
        'UBU Speed Series S5-M',
        'Artifical',
        'UBU Speed Series-S5-M',
        'FieldTurf 360',
        'Field turf',
        'Synthetic',
        'FieldTurf360',
    ),
}

# Invert the grouping into the flat {raw label: surface class} lookup.
turfs = {
    label: surface
    for surface, labels in _turf_labels_by_surface.items()
    for label in labels
}

# Raw StadiumType labels grouped by the class they collapse to. Labels are
# matched verbatim, so misspellings and trailing spaces must stay as-is.
_stadium_labels_by_class = {
    'Outdoor': (
        'Outdoor',
        'outdoor',
        'Oudoor',
        'Outdoors',
        'Outdoors ',
        'Open',
        'Outdoor Retr Roof-Open',
        'Ourdoor',
        'Bowl',
        'Outddors',
        'Retr. Roof-Open',
        'Indoor, Open Roof',
        'Domed, Open',
        'Domed, open',
        'Heinz Field',
        'Cloudy',
        'Retr. Roof - Open',
        'Outdor',
        'Outside',
    ),
    'Indoor': (
        'Indoors',
        'Indoors (Domed)',
        'Closed Dome',
        'Domed, closed',
        'Dome',
        'Indoor',
        'Domed',
        'Retr. Roof-Closed',
        'Retractable Roof',
        'Indoor, Roof Closed',
        'Retr. Roof - Closed',
        'Dome, closed',
        'Retr. Roof Closed',
        'Indoor, non-retractable roof',
        'Retr. roof - closed',
        'Indoor, fixed roof ',
        'Indoor, Non-Retractable Dome',
        'Indoor, Fixed Roof',
        'Indoor, fixed roof',
    ),
}

# Invert the grouping into the flat {raw label: stadium class} lookup.
stadium = {
    label: stadium_class
    for stadium_class, labels in _stadium_labels_by_class.items()
    for label in labels
}

# Raw Weather labels grouped by the condition they collapse to. Labels are
# matched verbatim, so misspellings and odd capitalisation must stay as-is.
_weather_labels_by_condition = {
    'Cloudy': (
        'Mostly Cloudy',
        'cloudy',
        'Partly Cloudy',
        'Cloudy',
        'Mostly cloudy',
        'Partly cloudy',
        'Mostly Coudy',
        'Partly cloudy, lows to upper 50s.',
        'Partly CLoudy',
        # NOTE(review): the other "chance of rain" labels are filed under Rain.
        'Cloudy, chance of rain',
        'Party Cloudy',
        # NOTE(review): 'Cold' says nothing about cloud cover — confirm.
        'Cold',
        'Cloudy and cold',
        'Cloudy and Cold',
        'Sun & clouds',
        'Coudy',
        'Cloudy, steady temps',
        # NOTE(review): mentions accumulating snow but is not filed under Snow.
        'Cloudy, light snow accumulating 1-3"',
        'Mostly CLoudy',
    ),
    'Clear': (
        'Sunny',
        'Clear',
        'Clear skies',
        'Controlled Climate',
        'Clear Skies',
        'Fair',
        'Partly sunny',
        'Sunny and warm',
        'Indoor',
        'Mostly Sunny',
        'CLEAR',
        'Partly Sunny',
        'Clear and Cold',
        'Indoors',
        'Sunny intervals',
        'Clear and cool',
        'Controlled',
        'Sunny and Clear',
        'Clear and warm',
        'Suny',
        'Sunny Skies',
        'Sunny and cool',
        'Sunny, highs to upper 80s',
        'Sunny Intervals',
    ),
    'Rain': (
        'Rain',
        'Showers',
        'Scattered thunderstorms',
        '30% Chance of Rain',
        'Light Rain',
        'Chance of Showers',
        'Cloudy with rain',
        'Cloudy, Humid, Chance of Rain',
        'Cloudy, Rain',
        'Cloudy with Possible Stray Showers/Thundershowers',
        'T-Storms',
        'Partly Cloudy, Chance of Rain 80%',
        'Rain likely, temps in low 40s.',
    ),
    'Snow': (
        'Snow showers',
        'Heavy lake effect snow',
        'Snow',
        'Snow Showers, 3 to 5 inches expected.',
    ),
    'Hazy/Fog': (
        'Cloudy, fog started developing in 2nd quarter',
        'Cloudy with patches of fog',
        'Hazy, hot and humid',
    ),
    'Windy': (
        'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
        'Mostly Clear. Gusting ot 14.',
    ),
}

# Invert the grouping into the flat {raw label: condition} lookup.
weather = {
    label: condition
    for condition, labels in _weather_labels_by_condition.items()
    for label in labels
}



This creates a unique identifier for the player and play, which will be important when merging with the NGS data

In [7]:
# Build the unique player-play key "<GameKey>-<PlayID>-<GSISID>".
# Column-wise string concatenation replaces the original row-wise apply,
# which invoked a Python lambda once per row.
# NOTE: each column is stringified with its own dtype, so an integer column
# keeps its integer formatting even if another key column is float.
punt['Game_Play_ID'] = (
    punt['GameKey'].astype(str)
    + '-' + punt['PlayID'].astype(str)
    + '-' + punt['GSISID'].astype(str)
)

In [8]:
# Preview the first rows to check the new Game_Play_ID column.
punt.head()

Unnamed: 0,GameKey,PlayID,GSISID,Role,Season_Type,Quarter,Score_Home_Visiting,Week,StadiumType,Turf,Weather,Temperature,Position,Game_Play_ID
0,414,188,33704,PDL2,Reg,1,0 - 0,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB,414-188-33704
1,414,188,33704,PDL2,Reg,1,0 - 0,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB,414-188-33704
2,414,1107,33704,PDL2,Reg,2,7 - 7,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB,414-1107-33704
3,414,1107,33704,PDL2,Reg,2,7 - 7,1,Outdoor,Grass,Mostly Cloudy,73.0,OLB,414-1107-33704
4,424,1113,33704,PDR3,Reg,2,3 - 3,2,,Grass,Cloudy,71.0,OLB,424-1113-33704


Regular-season and postseason week numbers each restart at 1. The offsets below shift them so that they follow the preseason weeks and Week increases continuously across the whole season

In [9]:
# Shift the week numbers by season type in a single pass: regular-season
# weeks move up by 5, postseason weeks by 22, all other rows are unchanged.
season_type = punt['Season_Type']
week = punt['Week']
punt['Week'] = np.where(
    season_type == 'Reg',
    week + 5,
    np.where(season_type == 'Post', week + 22, week),
)

Drop rows with NaN values

In [10]:
# Count the missing values in each column.
punt.isna().sum()

GameKey                    0
PlayID                     0
GSISID                     0
Role                       0
Season_Type                0
Quarter                    0
Score_Home_Visiting        0
Week                       0
StadiumType            12999
Turf                     315
Weather                29416
Temperature            17838
Position                  10
Game_Play_ID               0
dtype: int64

In [11]:
# Drop every row with a missing Position, Temperature, StadiumType or Turf
# in one pass (Weather is not filtered here).
punt = punt.dropna(subset=['Position', 'Temperature', 'StadiumType', 'Turf'])
# Discard rows whose StadiumType is the literal 'Turf'; that value has no
# entry in the stadium mapping (presumably a mis-entered field).
punt = punt.loc[punt['StadiumType'] != 'Turf']

Use the Dictionaries to reclassify the stadium types, turfs, and weather

In [12]:
# Collapse the raw stadium labels to 'Indoor' / 'Outdoor'.
# Any label without an entry in `stadium` becomes NaN.
punt['StadiumType'] = punt['StadiumType'].map(stadium)

In [13]:
# Collapse the raw turf labels to 'Natural' / 'Synthetic'.
# Any label without an entry in `turfs` becomes NaN.
punt['Turf'] = punt['Turf'].map(turfs)

In [14]:
# Collapse the raw weather descriptions to the categories in `weather`.
# Missing values and labels without an entry in `weather` come out as NaN.
punt['Weather'] = punt['Weather'].map(weather)

In [15]:
# Preview the first rows after remapping StadiumType, Turf and Weather.
punt.head()

Unnamed: 0,GameKey,PlayID,GSISID,Role,Season_Type,Quarter,Score_Home_Visiting,Week,StadiumType,Turf,Weather,Temperature,Position,Game_Play_ID
0,414,188,33704,PDL2,Reg,1,0 - 0,6,Outdoor,Natural,Cloudy,73.0,OLB,414-188-33704
1,414,188,33704,PDL2,Reg,1,0 - 0,6,Outdoor,Natural,Cloudy,73.0,OLB,414-188-33704
2,414,1107,33704,PDL2,Reg,2,7 - 7,6,Outdoor,Natural,Cloudy,73.0,OLB,414-1107-33704
3,414,1107,33704,PDL2,Reg,2,7 - 7,6,Outdoor,Natural,Cloudy,73.0,OLB,414-1107-33704
18,356,1166,33704,PDR3,Pre,2,0 - 10,3,Outdoor,Natural,Clear,74.0,OLB,356-1166-33704



We will remove the home/visiting score string, since it takes over 500 distinct values.
However, the home team's score and the difference between the two scores may still make a difference.
Because the score difference is derived from both scores, keeping the home score, the visiting score and the difference
together would introduce dependent features and skew the data. We therefore add only two columns: the home score, and
the score difference (home minus visiting), which is negative when the home team is behind.

In [16]:
def _home_score(score_text):
    """Return the first all-digit token of a 'home - visiting' score string."""
    digit_tokens = [int(token) for token in score_text.split() if token.isdigit()]
    return digit_tokens[0]


# The home score is the first number in Score_Home_Visiting.
punt['HomeScore'] = punt['Score_Home_Visiting'].apply(_home_score)

In [17]:
def _score_difference(score_text):
    """Return home score minus visiting score for a 'home - visiting' string.

    The string is tokenised once; the first two all-digit tokens are taken
    as the home and visiting scores respectively.
    """
    scores = [int(token) for token in score_text.split() if token.isdigit()]
    return scores[0] - scores[1]


# Positive when the home team leads, negative when it trails.
punt['HomeAway_Difference'] = punt['Score_Home_Visiting'].apply(_score_difference)

In [18]:
# Season_Type and Score_Home_Visiting are no longer needed once Week,
# HomeScore and HomeAway_Difference have been derived from them.
punt = punt.drop(columns=['Season_Type', 'Score_Home_Visiting'])

Remove the Season_Type and Score_Home_Visiting columns

In [19]:
# Preview the first rows after adding the score columns and dropping the source columns.
punt.head()

Unnamed: 0,GameKey,PlayID,GSISID,Role,Quarter,Week,StadiumType,Turf,Weather,Temperature,Position,Game_Play_ID,HomeScore,HomeAway_Difference
0,414,188,33704,PDL2,1,6,Outdoor,Natural,Cloudy,73.0,OLB,414-188-33704,0,0
1,414,188,33704,PDL2,1,6,Outdoor,Natural,Cloudy,73.0,OLB,414-188-33704,0,0
2,414,1107,33704,PDL2,2,6,Outdoor,Natural,Cloudy,73.0,OLB,414-1107-33704,7,0
3,414,1107,33704,PDL2,2,6,Outdoor,Natural,Cloudy,73.0,OLB,414-1107-33704,7,0
18,356,1166,33704,PDR3,2,3,Outdoor,Natural,Clear,74.0,OLB,356-1166-33704,0,-10


Reorganize the Columns to have all Identifiers in the front

In [20]:
# Target column order, identifiers first.
columns = [
    # identifiers
    'Game_Play_ID',
    'GameKey',
    'PlayID',
    'GSISID',
    # player
    'Position',
    'Role',
    # game state
    'Quarter',
    'Week',
    'HomeScore',
    'HomeAway_Difference',
    # environment
    'StadiumType',
    'Turf',
    'Weather',
    'Temperature',
]

In [21]:
# Reorder the frame so that its columns follow the `columns` list.
punt = punt.reindex(labels=columns, axis='columns')

Remove the Weather NaN values

In [23]:
# Keep only the rows with a Weather value. NaN here covers both originally
# missing entries and labels the weather mapping did not recognise.
punt = punt.loc[punt['Weather'].notna()]
