## NFL Points vs Yards per Game clustering

This is a sandbox notebook to test out some sci-kit learn clustering algorithms on NFL scoring data. We want to see if we can separate teams into "tiers" based on plots of their defensive and ofensive points-per-game (PPG) and yards-per-game (YPG). We collected the data by scraping [Pro-Football Reference](https://www.pro-football-reference.com) with our [scraper](https://github.com/andrewbowen19/football-stats-analysis/blob/main/src/scraper.py) script contained in this repo.

#### Some algorithms to try out:
- [K-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)
- Affinity Propagation
- DBSCAN

In [127]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import plotly.express as px
from sklearn.cluster import KMeans


In [128]:
# Would like to run a k-means clustering on PPG-YPG
csv_path = os.path.join("..", "data", "nfl-stats-by-season.csv")
df = pd.read_csv(csv_path)

df.columns = ['Tm', 'W', 'L', 'W-L%', 'PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS',
              'OSRS', 'DSRS', 'Rk', 'G', 'Yds', 'Ply', 'Y/P', 'TO', 'FL', '1stD',
              'Cmp', 'Att', 'Yds_opp', 'Season']

# df.dropna(axis=0, how='a', inplace=True)
df

Unnamed: 0,Tm,W,L,W-L%,PF,PA,PD,MoV,SoS,SRS,...,Yds,Ply,Y/P,TO,FL,1stD,Cmp,Att,Yds_opp,Season
0,Buffalo Bills,11,6,0.647,483,289,194,11.4,-1.6,9.8,...,6493.0,1143.0,5.7,22.0,6.0,398.0,415.0,655.0,4637.0,2021
1,New England Patriots,10,7,0.588,462,303,159,9.4,-0.9,8.5,...,6008.0,1052.0,5.7,23.0,10.0,362.0,364.0,535.0,5284.0,2021
2,Miami Dolphins,9,8,0.529,341,373,-32,-1.9,-0.8,-2.7,...,5219.0,1097.0,4.8,26.0,12.0,325.0,404.0,615.0,5738.0,2021
3,New York Jets,4,13,0.235,310,504,-194,-11.4,1.0,-10.4,...,5208.0,1036.0,5.0,27.0,7.0,310.0,357.0,603.0,6760.0,2021
4,Cincinnati Bengals,10,7,0.588,460,376,84,4.9,-1.9,3.1,...,6145.0,1046.0,5.9,21.0,7.0,337.0,384.0,555.0,5964.0,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,Atlanta Falcons,5,11,0.313,299,422,-123,-7.7,0.3,-7.4,...,5164.0,1006.0,5.1,26.0,11.0,306.0,377.0,573.0,6194.0,2003
604,Los Angeles Rams,12,4,0.750,447,328,119,7.4,-1.6,5.9,...,6325.0,1058.0,6.0,23.0,5.0,355.0,406.0,607.0,5863.0,2003
605,Seattle Seahawks,10,6,0.625,404,327,77,4.8,-0.7,4.1,...,5506.0,954.0,5.8,13.0,6.0,302.0,324.0,495.0,6445.0,2003
606,San Francisco 49ers,7,9,0.438,384,337,47,2.9,0.1,3.1,...,6387.0,1046.0,6.1,24.0,10.0,361.0,343.0,514.0,5270.0,2003


In [129]:
# Calculating YPG and PPG from season totals
df['PPG'] = df['PF'] / df['G']
df['YPG'] = df['Yds'] / df['G']

df['PPG_opp'] = df['PA'] / df['G']
df['YPG_opp'] = df['Yds_opp'] / df['G'] 

df.head()


Unnamed: 0,Tm,W,L,W-L%,PF,PA,PD,MoV,SoS,SRS,...,FL,1stD,Cmp,Att,Yds_opp,Season,PPG,YPG,PPG_opp,YPG_opp
0,Buffalo Bills,11,6,0.647,483,289,194,11.4,-1.6,9.8,...,6.0,398.0,415.0,655.0,4637.0,2021,28.411765,381.941176,17.0,272.764706
1,New England Patriots,10,7,0.588,462,303,159,9.4,-0.9,8.5,...,10.0,362.0,364.0,535.0,5284.0,2021,27.176471,353.411765,17.823529,310.823529
2,Miami Dolphins,9,8,0.529,341,373,-32,-1.9,-0.8,-2.7,...,12.0,325.0,404.0,615.0,5738.0,2021,20.058824,307.0,21.941176,337.529412
3,New York Jets,4,13,0.235,310,504,-194,-11.4,1.0,-10.4,...,7.0,310.0,357.0,603.0,6760.0,2021,18.235294,306.352941,29.647059,397.647059
4,Cincinnati Bengals,10,7,0.588,460,376,84,4.9,-1.9,3.1,...,7.0,337.0,384.0,555.0,5964.0,2021,27.058824,361.470588,22.117647,350.823529


In [130]:
hover_data = ['Tm', 'Season']

In [131]:
# Plotting PPG vs YPG (Offensive)

px.scatter(df, x='YPG', y='PPG', color='Season',  hover_data=hover_data)

In [132]:
# Plotting opponent PPG vs YpG
px.scatter(df, x='YPG_opp', y='PPG_opp', color='Season',  hover_data=hover_data)

In [133]:
# Let's make these figures a bit more interactive with plotly
def make_scatter_plot(df, x_label='YPG', y_label='PPG', side_of_ball='off', cluster_type='kmeans'):
    '''Create plotly scatter plot to compare YPG and PPG'''
    phase_dict = {'off': 'Offense', 'def': 'Defense'}
    
    color_label = f'labels_{cluster_type}_{side_of_ball}'
    f = px.scatter(df, x=x_label, y=y_label, color=color_label,
               hover_data=hover_data,
               title=f"NFL {phase_dict.get(side_of_ball)} Performance ({cluster_type.capitalize()})"
              )
    return f
    

### Running [K-Means](https://scikit-learn.org/stable/modules/clustering.html#k-means) clustering

using `scikit-learn`'s built-in kmeans algorithm to cluster teams based on their YpG and PPG.

In [134]:
# Fitting K-means to YPG
data = df[['YPG', 'PPG']]
kmeans_off = KMeans(n_clusters=4, random_state=0).fit(df[['YPG', 'PPG']])
df['labels_kmeans_off'] = kmeans_off.labels_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
plt.scatter(df.YPG, df.PPG, c=kmeans_off.labels_ )

In [None]:
# Plotting defensive measures -- colored by cluster

kmeans_def = KMeans(n_clusters=4, random_state=0).fit(df[['YPG_opp', 'PPG_opp']].values)
df['labels_kmeans_def'] = kmeans_def.labels_
plt.scatter(df.YPG_opp, df.PPG_opp, c=kmeans_def.labels_)

In [None]:
f = make_scatter_plot(df)
f.show()

In [None]:
# Plotting defensive performance
f = make_scatter_plot(df, x_label='YPG_opp', y_label='PPG_opp', side_of_ball='def')
f.show()

### Affinity Propagation
Trying out the built-in [Affinity Propagation](https://scikit-learn.org/stable/modules/clustering.html#affinity-propagation) module from `scikit-learn`.

In [None]:
from sklearn.cluster import AffinityPropagation

In [None]:
data_off = df[['YPG', 'PPG']]
data_def = df[['YPG_opp', 'PPG_opp']]


ap = AffinityPropagation(verbose=True).fit(df[['YPG', 'PPG']])

df['labels_ap_off'] = ap.labels_

In [None]:
n_clusters = len(np.unique(ap.labels_))

print(f"Affinity Propagation produced {n_clusters}.")

In [None]:
plt.scatter(data_off['YPG'], data_off['PPG'], c=ap.labels_)
plt.title("NFL Points versus yards per game -- Affinity Propagation Clustering")
plt.xlabel("YPG")
plt.ylabel("PPG")

In [None]:
# Doing the same for defensive stats -- opponent YPG and PPG
ap = AffinityPropagation(verbose=True).fit(df[['YPG_opp', 'PPG_opp']])
df['labels_ap_def'] = ap.labels_

plt.scatter(data_def['YPG_opp'], data_def['PPG_opp'], c=ap.labels_)
plt.title("NFL Opponent Points versus yards per game -- Affinity Propagation")
plt.xlabel("Opp YPG")
plt.ylabel("Opp PPG")

In [None]:
# Plotting AP clustered data with plotly
f = px.scatter(df, x='YPG_opp', y='PPG_opp',
           color='labels_def', title='Defensive Performance (Affinity Propagation)',
           hover_data=['Tm', 'Season'],
           color_discrete_map=px.colors.qualitative.Dark2)

In [None]:
f.show()