In [4]:
import pandas as pd
import plotly.express as px
import os
from scipy.stats import norm
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt

In [5]:
pd.set_option('display.max_rows', 300)

In [7]:
# Create the directory if it doesn't exist

if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    
output_directory = "analysis_images"


NameError: name 'output_directory' is not defined

In [None]:
#import .csvs
reg_season_wins = pd.read_csv('nfl_reg_season_win_loss_records_2011_to_2022.csv')
playoff_wins = pd.read_csv('nfl_playoff_win_loss_records_2011_to_2022.csv')
superbowl_wins = pd.read_csv('Super_Bowl_Winners.csv')
player_salaries = pd.read_csv('nfl_salaries_2011-2022.csv')

In [None]:
player_salaries

In [None]:
#clean and standardize data 

# Make cap % a float
player_salaries['cap_percentage'] = player_salaries['cap_percentage'].astype(float)

# clean and standardize team names 
def clean_team_name(team):
    # Special cases
    if team == 'new-york-giants':
        return 'NY Giants'
    elif team == 'new-york-jets':
        return 'NY Jets'
    elif team == 'los-angeles-rams':
        return 'LA Rams'
    elif team == 'los-angeles-chargers':
        return 'LA Chargers'
    else:
        parts = team.split('-')
    
    city_name = parts[:-1]
    if city_name[-1] == 'football':
        city_name = city_name[:-1]
    return ' '.join(word.title() for word in city_name)

player_salaries['team'] = player_salaries['team'].apply(clean_team_name)

# Standardize cap hit number so it can be manipulated as a number rather than string
player_salaries['cap_hit'] = player_salaries['cap_hit'].replace('-', '0', regex=True)            
player_salaries['cap_hit'] = player_salaries['cap_hit'].str.replace('$', '', regex=True).str.replace(',', '', regex=True).astype(float)

player_salaries


In [None]:
reg_season_wins.rename(columns={'Team': 'team', 'Year': 'year'}, inplace=True)

reg_season_wins

In [None]:
playoff_wins.drop(columns={'Loss', 'Tie'}, inplace=True)

playoff_wins.rename(columns={'Team': 'team', 'Year': 'year', 'Win': 'Playoff Wins'}, inplace=True)

playoff_wins

In [None]:
superbowl_wins

# First analysis: Grab the highest paid player (by percent of cap), bucket them into groups, and determine if theres correlation between the highest paid player and success

In [None]:

# Use groupby with idxmax to get the index of rows with the highest cap_percentage per year per team
idx = player_salaries.groupby(['year', 'team'])['cap_percentage'].idxmax()

# Filter the dataframe based on the index
highest_paid = player_salaries.loc[idx].reset_index(drop=True)

highest_paid

In [None]:
# Plotting a histogram
fig_highest_paid = px.histogram(highest_paid, x='cap_percentage', nbins=50, title='highest_paid')
fig_highest_paid.show()

# Save the graph
file_path = os.path.join(output_directory, "Cap_Hit_for_Most_Expensive_Player")
fig_highest_paid.write_image(file_path, format='png')

In [None]:
# merge dataframes reg season wins losses with the player salaries df
salaries_with_reg_season_wins_df = pd.merge(highest_paid, reg_season_wins, on=['year', 'team'], how='inner')

salaries_with_reg_season_wins_df

In [None]:
# merge salaries_with_reg_season_wins_df with postseason wins
reg_and_postseason_wins_df = pd.merge(salaries_with_reg_season_wins_df, playoff_wins, on=['year', 'team'], how='left')

reg_and_postseason_wins_df['Playoff Wins'] = reg_and_postseason_wins_df['Playoff Wins'].fillna(0)

reg_and_postseason_wins_df



In [None]:
# merge reg_and_postseason_wins_df and superbowl wins
superbowl_wins.columns = ['year', 'team', 'Won Superbowl?']

# merge previous with superbowl wins
full_season_wins = pd.merge(reg_and_postseason_wins_df, superbowl_wins, on=['year', 'team'], how='left')

full_season_wins.fillna(0, inplace=True)

full_season_wins


In [None]:
# plot cap percentage in relation to wins for the highest paid player on each team since 2011
fig_highest_paid_to_wins = px.scatter(full_season_wins, x='cap_percentage', y='Win', hover_data=['name', 'team', 'year'], trendline='ols', trendline_color_override="red", title='Win vs Cap Percentage of Highest Paid Player')
# Set custom x-axis title
fig_highest_paid_to_wins.update_xaxes(title_text="Percent of the cap the most expensive player takes")
fig_highest_paid_to_wins.show()

# Save the graph
file_path = os.path.join(output_directory, "Highest_Paid_to_wins")
fig_highest_paid_to_wins.write_image(file_path, format='png')



In [None]:
expensive_players = full_season_wins[full_season_wins['cap_percentage'] > 13]
total_playoff_wins = expensive_players['Playoff Wins'].sum()

print(f'There have been 49 seasons where a player has made more than 13% of a teams cap. This is a collection of the greatest NFL players of the past decade. But they have combined for only {total_playoff_wins} wins and 1 Superbowl win.')
expensive_players.head()


In [None]:
# plot cap percentage in relation to wins for the highest paid player on 10-win teams since 2011
playoff_teams_df = full_season_wins.loc[full_season_wins['Win'] >= 10].reset_index(drop=True)


fig_highest_paid_to_playoff_wins = px.scatter(playoff_teams_df, x='cap_percentage', y='Playoff Wins', hover_data=['name', 'team', 'year'], title='Playoff Wins vs Cap Percentage')
# Update y-axis to display only whole numbers
max_val = playoff_teams_df['Playoff Wins'].max()
fig_highest_paid_to_playoff_wins.update_yaxes(tickvals=list(range(0, int(max_val) + 1, 1)))
fig_highest_paid_to_playoff_wins.show()

# Save the graph
file_path = os.path.join(output_directory, "fig_highest_paid_to_playoff_wins")
fig_highest_paid_to_playoff_wins.write_image(file_path, format='png')



# Depth of talent in relation to wins

In [None]:
# Create a new dataframe with number of players over a certain % of the cap
# 2% of the cap last year was just under 5 million dollars

# player_salaries.head(60)

# Filter players with cap_percentage over 2.0
players_over_2_percent = player_salaries[player_salaries['cap_percentage'] > 2.0]

# Group by year and team, then count players
players_over_2_percent_by_team = players_over_2_percent.groupby(['year', 'team']).size().reset_index(name='players_over_2%')

# Resetting the index for your desired format
players_over_2_percent_by_team.reset_index(inplace=True)
players_over_2_percent_by_team['index'] = players_over_2_percent_by_team.index + 3

players_over_2_percent_by_team.head(60)



In [None]:
# players_over_2_percent_by_team

# Plotting a histogram
fig = px.histogram(players_over_2_percent_by_team, x='players_over_2%', nbins=50, title='Distribution of Salaries')
fig.show()





In [None]:
# merge dataframes

# merge salaries with reg season wins
roster_depth_with_reg_season_wins_df = pd.merge(players_over_2_percent_by_team, reg_season_wins, on=['year', 'team'], how='inner')

# merge previous with playoff wins
prev_and_postseason_wins_df = pd.merge(roster_depth_with_reg_season_wins_df, playoff_wins, on=['year', 'team'], how='left')

# merge previous with superbowl wins
roster_depth_and_full_season_wins = pd.merge(prev_and_postseason_wins_df, superbowl_wins, on=['year', 'team'], how='left')

roster_depth_and_full_season_wins[['Playoff Wins', 'Won Superbowl?']] = roster_depth_and_full_season_wins[['Playoff Wins', 'Won Superbowl?']].fillna(0)



roster_depth_and_full_season_wins.head(60)



In [None]:
# plot roster depth in relation to wins for the highest paid player on each team since 2011

fig_wins_to_roster_depth = px.scatter(roster_depth_and_full_season_wins, x='players_over_2%', y='Win', hover_data=['team', 'year'], trendline='ols', trendline_color_override="red", title='Wins vs Roster Depth')
fig_wins_to_roster_depth.update_xaxes(title_text="Number of Players on Team Earning 2% of Cap or More")

fig_wins_to_roster_depth.show()

# Save the graph
file_path = os.path.join(output_directory, "fig_wins_to_roster_depth")
fig_wins_to_roster_depth.write_image(file_path, format='png')

# Analyze spending by Offense/Defense


In [None]:
# Define offensive and defensive positions
offensive_positions = ['WR', 'QB', 'LT', 'RB', 'C', 'RT', 'TE', 'G', 'FB']
defensive_positions = ['ILB', 'OLB', 'LB', 'CB', 'S', 'DE', 'DT']

# Map positions to categories
def map_to_category(pos):
    if pos in offensive_positions:
        return 'offense'
    elif pos in defensive_positions:
        return 'defense'
    else:
        return 'other'  # for any positions not listed in either list

player_salaries['category'] = player_salaries['position'].apply(map_to_category)

# Filter out rows that fall into the 'other' category (if any)
player_salaries = player_salaries[player_salaries['category'] != 'other']

# Group by year, team, and category, then sum up the cap_percentage values
offense_defense_spending = player_salaries.groupby(['year', 'team', 'category'])['cap_percentage'].sum().unstack().reset_index()

# Rename the columns
offense_defense_spending.columns.name = None  # remove the top-level category name
offense_defense_spending = offense_defense_spending.rename(columns={'offense': 'offense_spending', 'defense': 'defense_spending'})

offense_defense_spending

In [None]:
# clean data and merge the offense/defense spending df with the wins/playoff wins/sb wins df


# merge salaries with reg season wins
offense_defense_spending_with_reg_season_wins_df = pd.merge(offense_defense_spending, reg_season_wins, on=['year', 'team'], how='inner')

# merge previous with playoff wins
prev_and_postseason_wins_df = pd.merge(offense_defense_spending_with_reg_season_wins_df, playoff_wins, on=['year', 'team'], how='left')

# merge previous with superbowl wins
offense_defense_spending_and_full_season_wins = pd.merge(prev_and_postseason_wins_df, superbowl_wins, on=['year', 'team'], how='left')

offense_defense_spending_and_full_season_wins[['Playoff Wins', 'Won Superbowl?']] = offense_defense_spending_and_full_season_wins[['Playoff Wins', 'Won Superbowl?']].fillna(0)

offense_defense_spending_and_full_season_wins



In [None]:
df = offense_defense_spending_and_full_season_wins

# Create scatter plot
fig_offense_spending_to_wins = px.scatter(df, x="offense_spending", y="Win", hover_data=["team", "year"], 
                 title="Offensive Spending vs. Wins",
                 labels={"offense_spending": "Offensive Spending (%)", "Win": "Wins"},
                 trendline='ols', 
                 trendline_color_override="red",
                )

# Show plot
fig_offense_spending_to_wins.show()

# Save the graph
file_path = os.path.join(output_directory, "fig_offense_spending_to_wins")
fig_offense_spending_to_wins.write_image(file_path, format='png')

In [None]:

# Create scatter plot
fig_defense_spending_to_wins = px.scatter(df, x="defense_spending", y="Win", hover_data=["team", "year"], 
                 title="defense_spending vs. Wins",
                 labels={"defense_spending": "defense_spending (%)", "Win": "Wins"},
                 trendline='ols', 
                 trendline_color_override="red",
                )


# Show plot
fig_defense_spending_to_wins.show()

# Save the graph
file_path = os.path.join(output_directory, "fig_defense_spending_to_wins")
fig_defense_spending_to_wins.write_image(file_path, format='png')

In [None]:

playoff_teams_spending_df = offense_defense_spending_and_full_season_wins.loc[offense_defense_spending_and_full_season_wins['Playoff Wins'] >= 1].reset_index(drop=True)

df = playoff_teams_spending_df

# Create scatter plot
fig_offense_spending_to_playoff_wins = px.scatter(df, x="offense_spending", y="Playoff Wins", hover_data=["team", "year"], 
                 title="Offensive Spending vs. Playoff Wins",
                 labels={"offense_spending": "Offensive Spending (%)", "Playoff Wins": "Playoff Wins"},
                 trendline='ols', 
                 trendline_color_override="red",
                 height=500,
                 width= 800
                )

# Update y-axis to show only whole numbers
fig_offense_spending_to_playoff_wins.update_yaxes(dtick=1)

# Show plot
fig_offense_spending_to_playoff_wins.show()

# Save the graph
file_path = os.path.join(output_directory, "fig_offense_spending_to_playoff_wins")
fig_offense_spending_to_playoff_wins.write_image(file_path, format='png')

In [None]:
playoff_teams_spending_df = offense_defense_spending_and_full_season_wins.loc[offense_defense_spending_and_full_season_wins['Win'] >= 10].reset_index(drop=True)

df = playoff_teams_spending_df

# Create scatter plot
fig_defense_spending_to_playoff_wins = px.scatter(df, x="defense_spending", y="Playoff Wins", hover_data=["team", "year"], 
                 title="defense_spending vs. Playoff Wins",
                 labels={"defense_spending": "defense_spending (%)", "Playoff Wins": "Playoff Wins"},
                 trendline='ols', 
                 trendline_color_override="red",
                )

# Show plot
fig_defense_spending_to_playoff_wins.show()

# Save the graph
file_path = os.path.join(output_directory, "fig_defense_spending_to_playoff_wins")
fig_defense_spending_to_playoff_wins.write_image(file_path, format='png')

# Analyze spending by Position Group, Offense and Defense




In [None]:
# player_salaries

# Step 1: Mapping positions to groups
position_groups = {
    'C': 'Offensive_Line',
    'LT': 'Offensive_Line',
    'RT': 'Offensive_Line',
    'G': 'Offensive_Line',
    'WR': 'Receivers',
    'TE': 'Receivers',
    'RB': 'Running_Backs',
    'FB': 'Running_Backs',
    'S': 'Secondary',
    'CB': 'Secondary',
    'OLB': 'Linebackers',
    'ILB': 'Linebackers',
    'DT': 'Defensive_Line',
    'DE': 'Defensive_Line',
    'QB': 'QB'
}

# Map positions in the dataframe to their respective groups
player_salaries.loc[:, 'position_group'] = player_salaries['position'].map(position_groups)

# Step 2: Aggregate the data and Step 3: Calculate the average
grouped_player_salaries = (
    player_salaries.groupby(['year', 'team', 'position_group'])
    .agg(total_cap_percentage=('cap_percentage', 'sum'), 
         player_count=('position', 'size'))
    .reset_index()
)
grouped_player_salaries['avg_cap_percentage_per_player'] = grouped_player_salaries['total_cap_percentage'] / grouped_player_salaries['player_count']

# Step 4: Reshape the data
pivot_player_salaries = (
    grouped_player_salaries.pivot_table(index=['year', 'team'], 
                           columns='position_group', 
                           values='avg_cap_percentage_per_player')
    .reset_index()
)

pivot_player_salaries.columns.name = None  # Removing the name of columns for cleanliness

# Displaying the result
pivot_player_salaries.head(60)



In [None]:
# Join with wins and losses
pivot_player_salaries

with_reg_season_wins_df = pd.merge(pivot_player_salaries, reg_season_wins, on=['year', 'team'], how='inner')

# merge with playoff wins
position_spending_total_wins = pd.merge(with_reg_season_wins_df, playoff_wins, on=['year', 'team'], how='left')

# normalize the NANs
position_spending_total_wins['Playoff Wins'].fillna(0, inplace=True)


position_spending_total_wins

In [None]:
# Calculate the percentage for each team based on position columns

df = position_spending_total_wins

defense_columns = ['Defensive_Line', 'Linebackers', 'Secondary']
defense_colors = ['red', 'blue', 'green']

# Plotting each position_column separately with its color
for position, color in zip(defense_columns, defense_colors):
    plt.scatter(df[position], df['Win'], c=color, label=position, alpha=0.7)

plt.xlabel('Percent of the cap, per player, per position')
plt.ylabel('Wins')
plt.title('Wins per position spending, per player')
plt.grid(True)
plt.legend()

# Save the graph
plt.savefig(os.path.join(output_directory, "wins_per_position_per_player_chart.png"))

plt.show()




In [None]:
# Calculate the percentage for each team based on offensive columns

offense_columns = ['Running_Backs', 'Receivers', 'Offensive_Line' ]
offense_colors = ['purple', 'orange', 'cyan']

# Plotting each position_column separately with its color
for position, color in zip(offense_columns, offense_colors):
    plt.scatter(df[position], df['Win'], c=color, label=position, alpha=0.5)

plt.xlabel('Percent of the cap, per player, per position')
plt.ylabel('Wins')
plt.title('Wins per position spending, per player')
plt.grid(True)
plt.legend()

# Save the graph
plt.savefig(os.path.join(output_directory, "wins_per_position_per_player_chart_offense.png"))


plt.show()

In [None]:
# Plotting each position_column separately with its color for playoff wins
for position, color in zip(defense_columns, defense_colors):
    plt.scatter(df[position], df['Playoff Wins'], c=color, label=position, alpha=0.7)

plt.xlabel('Percent of the cap, per player, per position')
plt.ylabel('Playoff Wins')
plt.title('Playoff Wins per position spending, per player')
plt.grid(True)
plt.legend()

# Save the graph
plt.savefig(os.path.join(output_directory, "Playoff_wins_per_position_per_player_chart_defense.png"))


plt.show()

In [None]:
# Calculate the percentage for each team based on offensive columns
offense_columns = ['QB','Running_Backs', 'Receivers', 'Offensive_Line' ]
offense_colors = ['purple', 'orange', 'cyan', 'brown']

# Plotting each position_column separately with its color
for position, color in zip(offense_columns, offense_colors):
    plt.scatter(df[position], df['Playoff Wins'], c=color, label=position, alpha=0.5)

plt.xlabel('Percent of the cap, per player, per position')
plt.ylabel('Playoff Wins')
plt.title('Playoff Wins per position spending, per player')
plt.grid(True)
plt.legend()

# Save the graph
plt.savefig(os.path.join(output_directory, "Playoff_wins_per_position_per_player_chart_o.png"))

plt.show()