In [1]:
import requests
from bs4 import BeautifulSoup

session = requests.Session()

def get_team_win_loss_perc(year):
    url = f'https://www.pro-football-reference.com/years/{year}/index.htm'
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    standings = {}
    for conference in ['AFC', 'NFC']:
        table_id = conference
        table = soup.find('table', id=table_id)
        if table:
            for row in table.tbody.find_all('tr'):  # type: ignore
                team_cell = row.find('th', {'data-stat': 'team'})  # type: ignore
                wins_cell = row.find('td', {'data-stat': 'win_loss_perc'})  # type: ignore
                if team_cell and wins_cell:
                    team_name = team_cell.text.strip()  # type: ignore
                    wins = float(wins_cell.text.strip())  # type: ignore
                    standings[team_name] = wins
    return standings

# # Example usage:
# year = 2023
# team_wins = get_team_win_loss_perc(year)
# for team, wins in team_wins.items():
#     print(f"{team}: {wins} win %")

In [2]:
# TEAM IDS
ARI = 1
ATL = 2
BAL = 3
BUF = 4
CAR = 5
CHI = 6
CIN = 7
CLE = 8
DAL = 9
DEN = 10
DET = 11
GB = 12
HOU = 13
IND = 14
JAX = 15
KC = 16
LV = 17
LAC = 18
LAR = 19
MIA = 20
MIN = 21
NE = 22
NO = 23
NYG = 24
NYJ = 25
PHI = 26
PIT = 27
SEA = 28
SF = 29
TB = 30
TEN = 31
WAS = 32

# The following dictionary maps team names to their abbreviations.
FULL_NAME_TO_ID = {
  'Arizona Cardinals': ARI,
  'Atlanta Falcons': ATL,
  'Baltimore Ravens': BAL,
  'Buffalo Bills': BUF,
  'Carolina Panthers': CAR,
  'Chicago Bears': CHI,
  'Cincinnati Bengals': CIN,
  'Cleveland Browns': CLE,
  'Dallas Cowboys': DAL,
  'Denver Broncos': DEN,
  'Detroit Lions': DET,
  'Green Bay Packers': GB,
  'Houston Oilers': TEN,
  'Houston Texans': HOU,
  'Indianapolis Colts': IND,
  'Jacksonville Jaguars': JAX,
  'Kansas City Chiefs': KC,
  'Las Vegas Raiders': LV,
  'Los Angeles Chargers': LAC,
  'Los Angeles Rams': LAR,
  'Miami Dolphins': MIA,
  'Minnesota Vikings': MIN,
  'New England Patriots': NE,
  'New Orleans Saints': NO,
  'New York Giants': NYG,
  'New York Jets': NYJ,
  'Oakland Raiders': LV,
  'Philadelphia Eagles': PHI,
  'Pittsburgh Steelers': PIT,
  'San Diego Chargers': LAC,
  'San Francisco 49ers': SF,
  'Seattle Seahawks': SEA,
  'St. Louis Rams': LAR,
  'St Louis Rams': LAR,
  'Tampa Bay Buccaneers': TB,
  'Tennessee Titans': TEN,
  'Washington Commanders': WAS,
  'Washington Football Team': WAS,
  'Washington Redskins': WAS,
}

# nfl_data_py uses abbreviations for teams. Map those abbreviations to the ids.

ABBREVIATIONS_TO_ID = {
  'ARI': ARI,
  'ATL': ATL,
  'BAL': BAL,
  'BUF': BUF,
  'CAR': CAR,
  'CHI': CHI,
  'CIN': CIN,
  'CLE': CLE,
  'DAL': DAL,
  'DEN': DEN,
  'DET': DET,
  'GB': GB,
  'GNB': GB,
  'HOU': HOU,
  'IND': IND,
  'JAX': JAX,
  'KC': KC,
  'KAN': KC,
  'LV': LV,
  'LAC': LAC,
  'LAR': LAR,
  'MIA': MIA,
  'MIN': MIN,
  'NE': NE,
  'NWE': NE,
  'NO': NO,
  'NOR': NO,
  'NYG': NYG,
  'NYJ': NYJ,
  'OAK': LV,
  'PHI': PHI,
  'PIT': PIT,
  'SEA': SEA,
  'SF': SF,
  'SFO': SF,
  'SDG': LAC,
  'STL': LAR,
  'TAM': TB,
  'TB': TB,
  'TEN': TEN,
  'WAS': WAS
}

In [3]:
import pandas as pd
import random
import time

# 1. Define the range of years to analyze
start_year = 2000
end_year = 2020

# Build a pandas dataframe of mapping the team id and season to the win
# percentage using the get_team_win_loss_perc function.
def build_team_win_loss_df(start, end) -> pd.DataFrame:
  data = []
  for year in range(start, end + 1):
    time.sleep(random.uniform(1.5, 3.5))  # wait 1.5 to 3.5 seconds
    standings = get_team_win_loss_perc(year)
    for team, win_perc in standings.items():
      # Remove any non-alphanumeric characters from the team name, but keep
      # spaces.
      team = ''.join(e for e in team if e.isalnum() or e.isspace())
      # Convert the team name to the full name using the FULL_NAME_TO_ID
      team_id = FULL_NAME_TO_ID.get(team, None)
      if team_id is not None:
        print(f"Team: {team}, ID: {team_id}, Year: {year}, Win %: {win_perc}")
        data.append({'team_id': team_id, 'year': year, 'win_percentage': win_perc})
      else:
        print(f"Team {team} not found in FULL_NAME_TO_ID mapping.")
  return pd.DataFrame(data)

team_win_loss_data = build_team_win_loss_df(start_year - 2, end_year + 4)

Team: New York Jets, ID: 25, Year: 1998, Win %: 0.75
Team: Miami Dolphins, ID: 20, Year: 1998, Win %: 0.625
Team: Buffalo Bills, ID: 4, Year: 1998, Win %: 0.625
Team: New England Patriots, ID: 22, Year: 1998, Win %: 0.563
Team: Indianapolis Colts, ID: 14, Year: 1998, Win %: 0.188
Team: Jacksonville Jaguars, ID: 15, Year: 1998, Win %: 0.688
Team Tennessee Oilers not found in FULL_NAME_TO_ID mapping.
Team: Pittsburgh Steelers, ID: 27, Year: 1998, Win %: 0.438
Team: Baltimore Ravens, ID: 3, Year: 1998, Win %: 0.375
Team: Cincinnati Bengals, ID: 7, Year: 1998, Win %: 0.188
Team: Denver Broncos, ID: 10, Year: 1998, Win %: 0.875
Team: Oakland Raiders, ID: 17, Year: 1998, Win %: 0.5
Team: Seattle Seahawks, ID: 28, Year: 1998, Win %: 0.5
Team: Kansas City Chiefs, ID: 16, Year: 1998, Win %: 0.438
Team: San Diego Chargers, ID: 18, Year: 1998, Win %: 0.313
Team: Dallas Cowboys, ID: 9, Year: 1998, Win %: 0.625
Team: Arizona Cardinals, ID: 1, Year: 1998, Win %: 0.563
Team: New York Giants, ID: 24, 

In [None]:
team_win_loss_data.columns

In [4]:
import pandas as pd
import nfl_data_py as nfl

# 2. Load draft and team win data
# nfl.import_draft_picks() includes draft info
draft_data = nfl.import_draft_picks()

print("Original number of draft picks:", len(draft_data))

# Filter draft data to top 10 picks in the range
top10_drafts = draft_data[
    (draft_data['season'].between(start_year, end_year)) &
    (draft_data['round'] == 1 ) &
    (draft_data['pick'] <= 10)
]

print("Filtered number of draft picks:", len(top10_drafts))

# 3. Calculate win deltas for teams after their draft picks
results = []

for _, row in top10_drafts.sort_values(by='season').iterrows():
  year = row['season']
  team = row['team']
  position = row['position']
  # print(f"Processing team: {team}, year: {year}, position: {position}")
  # if team not in ABBREVIATIONS_TO_ID:
  #   print(f"Team {team} not found in ABBREVIATIONS_TO_ID.")
  #   break

  # Convert team abbreviation to the team id
  team_id = ABBREVIATIONS_TO_ID.get(team, None)
  if team_id is None:
    print(f"Team {team} not found in ABBREVIATIONS_TO_ID.")
    continue

  # Calculate average win percentage for previous 2 years
  prev_wins = team_win_loss_data[
      (team_win_loss_data['team_id'] == team_id) &
      (team_win_loss_data['year'].between(year - 2, year - 1))
  ]['win_percentage'].mean()

  # Average wins for 5 years after draft
  future_wins = team_win_loss_data[
      (team_win_loss_data['team_id'] == team_id) &
      (team_win_loss_data['year'].between(year, year + 4))
  ]['win_percentage'].mean()

  win_diff = future_wins - prev_wins
  print(f"Team: {team}, Year: {year}, Position: {position}, Win Diff: {win_diff}")
  results.append({
      'team_id': team_id,
      'draft_year': year,
      'position': position,
      'win_diff': win_diff
  })

# 4. Aggregate by position
df_results = pd.DataFrame(results)
position_summary = df_results.groupby('position')['win_diff'].mean().sort_values(ascending=False)

print("Average Win Difference by Position (Top 10 Draft Picks):")
print(position_summary)


Original number of draft picks: 12413
Filtered number of draft picks: 210
Team: CLE, Year: 2000, Position: DE, Win Diff: 0.2254
Team: WAS, Year: 2000, Position: LB, Win Diff: -0.07480000000000003
Team: WAS, Year: 2000, Position: T, Win Diff: -0.07480000000000003
Team: CIN, Year: 2000, Position: WR, Win Diff: 0.13099999999999998
Team: BAL, Year: 2000, Position: RB, Win Diff: 0.16269999999999984
Team: PHI, Year: 2000, Position: DT, Win Diff: 0.4873
Team: ARI, Year: 2000, Position: RB, Win Diff: -0.15619999999999995
Team: PIT, Year: 2000, Position: WR, Win Diff: 0.26249999999999996
Team: CHI, Year: 2000, Position: LB, Win Diff: 0.11289999999999994
Team: BAL, Year: 2000, Position: WR, Win Diff: 0.16269999999999984
Team: GNB, Year: 2001, Position: DE, Win Diff: 0.0685
Team: SEA, Year: 2001, Position: WR, Win Diff: 0.13139999999999996
Team: CHI, Year: 2001, Position: WR, Win Diff: 0.15639999999999998
Team: SFO, Year: 2001, Position: DE, Win Diff: 0.12509999999999993
Team: NWE, Year: 2001, Po

In [5]:
# Print the counts per position
position_counts = df_results['position'].value_counts()
print("\nCounts per Position:")
print(position_counts)


Counts per Position:
QB     37
T      30
WR     28
DE     26
DB     20
LB     18
DT     18
RB     16
CB      5
TE      4
G       3
OLB     3
S       1
ILB     1
Name: position, dtype: int64


In [7]:
position_var = df_results.groupby('position')['win_diff'].var().sort_values(ascending=True)
print("\nVariance of Win Difference by Position:")
print(position_var)


Variance of Win Difference by Position:
position
OLB    0.003707
DB     0.016768
WR     0.019346
DE     0.019678
RB     0.020252
QB     0.020294
T      0.024806
LB     0.025072
G      0.025277
TE     0.027342
DT     0.032468
CB     0.041476
ILB         NaN
S           NaN
Name: win_diff, dtype: float64


Average Win Difference by Position (Top 10 Draft Picks):
position
ILB    0.205300
OLB    0.158433
QB     0.140269
CB     0.138840
DE     0.118508
T      0.116633
DT     0.106183
DB     0.090765
WR     0.085561
G      0.084300
LB     0.061806
TE     0.061075
RB     0.052544
S     -0.196800
Name: win_diff, dtype: float64

Counts per Position:
QB     37
T      30
WR     28
DE     26
DB     20
LB     18
DT     18
RB     16
CB      5
TE      4
G       3
OLB     3
S       1
ILB     1
Name: position, dtype: int64


Variance of Win Difference by Position:
position
OLB    0.003707
DB     0.016768
WR     0.019346
DE     0.019678
RB     0.020252
QB     0.020294
T      0.024806
LB     0.025072
G      0.025277
TE     0.027342
DT     0.032468
CB     0.041476
ILB         NaN
S           NaN
Name: win_diff, dtype: float64

In [11]:
import numpy as np
win_perc_diff = np.array([0.140269, 0.118508, 0.116633, 0.106183, 0.090765, 0.085561, 0.084300, 0.061806, 0.061075, 0.052544])
win_diff = np.round(win_perc_diff * 17, decimals=1)
print("\nWin Difference (rounded to nearest integer):")
print(list(win_diff))


Win Difference (rounded to nearest integer):
[2.4, 2.0, 2.0, 1.8, 1.5, 1.5, 1.4, 1.1, 1.0, 0.9]


In [14]:
import matplotlib.pyplot as plt
import pandas as pd

# Your data
data = {
    'position': ['QB', 'DE', 'T', 'DT', 'DB', 'WR', 'LB', 'RB'],
    'avg_win_diff': [2.4, 2.0, 2.0, 1.8, 1.5, 1.5, 1.1, 0.9],
}
df = pd.DataFrame(data)

# Plot
fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(df['position'], df['avg_win_diff'], color='skyblue', edgecolor='black')

# Remove everything but the bars
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('')
ax.set_ylabel('')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# Remove the background
fig.patch.set_alpha(0)
ax.patch.set_alpha(0)

# Export transparent PNG
plt.savefig("draft_bar_chart_clean.png", dpi=300, transparent=True)
plt.close()


In [18]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Scaled data (gain in wins over 5 years, after multiplying by 17 games)
scale = 17
means = {
    'QB': 0.140269,
    'DE': 0.118508,
    'T': 0.116633,
    'DT': 0.106183,
    'DB': 0.090765,
    'WR': 0.085561,
    'LB': 0.061806,
    'RB': 0.052544
}
variances = {
    'DB': 0.016768,
    'WR': 0.019346,
    'DE': 0.019678,
    'RB': 0.020252,
    'QB': 0.020294,
    'T': 0.024806,
    'LB': 0.025072,
    'DT': 0.032468
}

# Apply scaling
scaled_means = {pos: mu * scale for pos, mu in means.items()}
scaled_vars = {pos: var * (scale ** 2) for pos, var in variances.items()}

# Color mapping
colors = {
    'QB': '#FA7268',
    'DE': '#E63946',
    'T':  '#457B9D',
    'DT': '#2E7D6F',
    'DB': '#43AA8B',
    'WR': '#FFA552',
    'LB': '#F4A261',
    'RB': '#588157'
}

# X range
x = np.linspace(0, 10, 1000)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))

for position in scaled_means:
    mu = scaled_means[position]
    sigma = np.sqrt(scaled_vars[position])
    y = norm.pdf(x, mu, sigma)
    ax.plot(x, y, label=position, color=colors[position], linewidth=2.5)

    # Label near peak of the curve
    peak_x = mu
    peak_y = norm.pdf(peak_x, mu, sigma)
    ax.text(peak_x, peak_y + 0.002, position, color=colors[position],
            ha='center', va='bottom', fontsize=10, fontweight='bold')

# Axis settings
ax.set_xlim(0, 10)
ax.set_xticks(range(0, 11))  # includes 0
ax.set_yticks([])

# Clean look
for spine in ax.spines.values():
    spine.set_visible(False)
ax.set_xlabel('')
ax.set_ylabel('')
fig.patch.set_alpha(0)
ax.patch.set_alpha(0)

# Save it
plt.savefig('gaussian_curves_games_labeled.png', dpi=300, transparent=True)
plt.close()
