In [30]:
import ssl
import json
from cmath import nan

# SECURITY: this swaps the default HTTPS context factory for the unverified one,
# so HTTPS requests that rely on the default context after this line skip
# certificate verification for the whole kernel session.
# NOTE(review): presumably a workaround for local certificate errors during the
# nflreadpy downloads; confirm, and prefer fixing the CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context
import nflreadpy as nfl
import pandas as pd
import random
from pathlib import Path


In [2]:
# Load current season play-by-play data
pbp = nfl.load_pbp()

# nflreadpy uses Polars instead of pandas. Convert to pandas if needed:
pbp_pandas = pbp.to_pandas()

# Last expression in the cell, so the notebook renders the first rows as output.
pbp_pandas.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,1.0,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,,,,...,0.0,0.0,-0.0,,,,,,,
1,40.0,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,0.0,0.0,-0.3527,,,,,,,
2,63.0,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,0.0,0.0,-0.190052,,,,,,0.511128,-51.112807
3,85.0,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,1.0,0.0,1.31734,0.939998,4.750889,3.0,0.666726,0.43911,0.66894,33.105969
4,115.0,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,0.0,0.0,-1.69436,,,,,,0.492038,50.796208


In [31]:
# Load player game-level stats for multiple seasons
# NOTE(review): `player_stats` is not referenced again in the visible part of
# this notebook; the name pools below are built from a separate, wider load.
player_stats = nfl.load_player_stats([2022, 2023])
# player_stats.head()

In [32]:
# Load all available team level stats
# NOTE(review): presumably `seasons=True` selects every available season;
# confirm against the nflreadpy documentation.
team_stats = nfl.load_team_stats(seasons=True)
# team_stats.head()


# Query all years of available NFL data and pull in first + last names
- These first/last name pools will eventually be sampled at random to build recruit names

## next level:
- pull in name & demographic data to get a distribution by city/state
- e.g. John Smith is not a common name out of Kapolei, HI

In [34]:
# Seasons to request weekly player stats for (no data before 1999).
# range() already yields each year once, so no manual increment is needed.
nfl_weekly_yrs = list(range(1999, 2024))

nfl_weekly = nfl.load_player_stats(nfl_weekly_yrs).to_pandas()

# Unique display names, with missing values dropped.
o_players_names = nfl_weekly.player_display_name.unique().tolist()
offensive_players = [
    x for x in o_players_names
    if pd.notna(x)
]

# Pools sampled independently later to build random recruit names.
first_names = []
last_names = []

for playaz in offensive_players:
    first_and_last = playaz.split()
    if len(first_and_last) < 2:
        # A blank or single-token name cannot feed both pools; skip it rather
        # than raising IndexError on the missing second token.
        continue
    first_names.append(first_and_last[0])
    # Second whitespace-separated token; suffixes and further tokens are ignored.
    last_names.append(first_and_last[1])

print(first_names[0:5])

['Abdul-Karim', 'Rabih', 'Rahim', 'Donnie', 'Flozell']


# get distribution of player positions
- cfb_positions_simplified aggregates
    - OT+OG+C -> OL
    - REDG, LEDG, NT, DT -> DL
- Adds in more non QB/RB strings to get a more appropriate distribution of positions
- e.g. don't want equal likelihood of a QB vs DL, there will be a bigger pool of DL at the HS level to pick from by nature of there being more DL on a football team
    - next level: see if the above statement holds true, adjust the pct likelihood for a given position

In [35]:
# Distinct position labels found in the weekly stats, with missing values removed.
football_positions = nfl_weekly.position.unique().tolist()
football_positions_clean = [pos for pos in football_positions if pd.notna(pos)]

# Base pool: one entry per simplified college position.
cfb_positions_simplified = ['QB', 'TE', 'WR', 'RB', 'FS', 'OL', 'LB', 'DL', 'CB', 'SS', 'MLB', 'ATH']

# Repeat entries so that random.choice() over this list draws the deeper
# position groups more often: two extra WR/CB/LB, three extra OL/DL/ATH.
cfb_positions_simplified += ["WR", "CB", "LB"] * 2
cfb_positions_simplified += ["OL", "DL", "ATH"] * 3  # lots of high school athletes

print(cfb_positions_simplified)


['QB', 'TE', 'WR', 'RB', 'FS', 'OL', 'LB', 'DL', 'CB', 'SS', 'MLB', 'ATH', 'WR', 'CB', 'LB', 'WR', 'CB', 'LB', 'OL', 'DL', 'ATH', 'OL', 'DL', 'ATH', 'OL', 'DL', 'ATH']


# Randomly choose name and position
- `cfb_positions_simplified` is weighted more heavily towards non-QB/RB positions, since a team needs more players at those positions (e.g. OL, DL) than at QB or RB

In [21]:
def cfb_random_name():
    """Build a random full name.

    Draws one entry from `first_names` and then one from `last_names`,
    independently, and joins them with a single space.

    Returns:
        str: "<first> <last>".
    """
    picked_first = random.choice(first_names)
    picked_last = random.choice(last_names)
    return ' '.join((picked_first, picked_last))


def cfb_random_position():
    """Draw one position label from `cfb_positions_simplified`.

    The pool holds repeated entries, so labels that appear more often in the
    list are drawn proportionally more often.
    """
    position_pool = cfb_positions_simplified
    return random.choice(position_pool)

# Smoke check: print one randomly generated "<first> <last> <position>" line.
print(cfb_random_name() + ' ' + cfb_random_position())

Jamarco Wrotto OL


# Get state abbrevs

In [22]:
## source: https://gist.githubusercontent.com/rogerallen/1583593/raw/2dd598547e1a5680740ebf3d4365f628c1951579/us_state_abbrev.py
# Full name -> two-letter abbreviation. Entries are ordered as: the 50 states
# alphabetically, then the District of Columbia, then the territories.
us_state_to_abbrev = dict(
    [
        ("Alabama", "AL"),
        ("Alaska", "AK"),
        ("Arizona", "AZ"),
        ("Arkansas", "AR"),
        ("California", "CA"),
        ("Colorado", "CO"),
        ("Connecticut", "CT"),
        ("Delaware", "DE"),
        ("Florida", "FL"),
        ("Georgia", "GA"),
        ("Hawaii", "HI"),
        ("Idaho", "ID"),
        ("Illinois", "IL"),
        ("Indiana", "IN"),
        ("Iowa", "IA"),
        ("Kansas", "KS"),
        ("Kentucky", "KY"),
        ("Louisiana", "LA"),
        ("Maine", "ME"),
        ("Maryland", "MD"),
        ("Massachusetts", "MA"),
        ("Michigan", "MI"),
        ("Minnesota", "MN"),
        ("Mississippi", "MS"),
        ("Missouri", "MO"),
        ("Montana", "MT"),
        ("Nebraska", "NE"),
        ("Nevada", "NV"),
        ("New Hampshire", "NH"),
        ("New Jersey", "NJ"),
        ("New Mexico", "NM"),
        ("New York", "NY"),
        ("North Carolina", "NC"),
        ("North Dakota", "ND"),
        ("Ohio", "OH"),
        ("Oklahoma", "OK"),
        ("Oregon", "OR"),
        ("Pennsylvania", "PA"),
        ("Rhode Island", "RI"),
        ("South Carolina", "SC"),
        ("South Dakota", "SD"),
        ("Tennessee", "TN"),
        ("Texas", "TX"),
        ("Utah", "UT"),
        ("Vermont", "VT"),
        ("Virginia", "VA"),
        ("Washington", "WA"),
        ("West Virginia", "WV"),
        ("Wisconsin", "WI"),
        ("Wyoming", "WY"),
        ("District of Columbia", "DC"),
        ("American Samoa", "AS"),
        ("Guam", "GU"),
        ("Northern Mariana Islands", "MP"),
        ("Puerto Rico", "PR"),
        ("United States Minor Outlying Islands", "UM"),
        ("Virgin Islands, U.S.", "VI"),
    ]
)

# Get state population by city
- use each city's share of the population (`pct_of_total`, or `pct_of_state_total` within a state) as the sampling weight when generating recruits

In [25]:
# Project root is taken to be the parent of the working directory
# (move up one level if notebook is inside src/).
BASE_DIR = Path.cwd().parent
CONFIG_PATH = BASE_DIR.joinpath("outside_data", "city_state_population_2023.csv")

# Read the city table and append a `state_abbrev` column derived from the
# full state name; names missing from the mapping become NaN.
us_city_state_population_2023 = pd.read_csv(CONFIG_PATH).assign(
    state_abbrev=lambda cities: cities["state"].map(us_state_to_abbrev)
)
us_city_state_population_2023.head()

Unnamed: 0.1,Unnamed: 0,city,state,Population_2023,pct_of_total,city_state,pct_of_state_total,state_abbrev
0,0,Abbeville,Alabama,2377,1.1e-05,"Abbeville, Alabama",0.000755,AL
1,1,Adamsville,Alabama,4172,2e-05,"Adamsville, Alabama",0.001325,AL
2,2,Addison,Alabama,667,3e-06,"Addison, Alabama",0.000212,AL
3,3,Akron,Alabama,224,1e-06,"Akron, Alabama",7.1e-05,AL
4,4,Alabaster,Alabama,34107,0.000162,"Alabaster, Alabama",0.010835,AL


In [26]:
def get_random_us_city():
    """Pick one `city_state` label at random, weighted by `pct_of_total`.

    Note: a later cell in this notebook defines `get_random_us_city(state=None)`,
    which replaces this version once that cell has run.

    Returns:
        str: A label from the `city_state` column, e.g. "Abbeville, Alabama".
    """
    table = us_city_state_population_2023
    labels = table["city_state"].tolist()
    shares = table["pct_of_total"].tolist()
    (picked,) = random.choices(labels, weights=shares, k=1)
    return picked

# Smoke check: print one population-weighted random city.
print(get_random_us_city())

Port St. Lucie, Florida


In [27]:
def get_random_us_city(state=None, population_table=None):
    """
    Return a random "City, State" label, weighted by population share.

    Args:
        state (str, optional): Two-letter abbreviation matched exactly against
            the `state_abbrev` column (e.g. 'CA'). When omitted or empty, the
            draw is made across the whole table using `pct_of_total`; otherwise
            it is restricted to that state and weighted by `pct_of_state_total`.
        population_table (pandas.DataFrame, optional): Table to sample from.
            Must have `city_state`, `state_abbrev`, `pct_of_total` and
            `pct_of_state_total` columns. Defaults to the notebook-level
            `us_city_state_population_2023`.

    Returns:
        str: One value from the `city_state` column.

    Raises:
        ValueError: If `state` is given but no row has that abbreviation.
    """
    table = us_city_state_population_2023 if population_table is None else population_table

    if state:
        table = table[table['state_abbrev'] == state]
        if table.empty:
            # Without this guard random.choices() fails with an opaque
            # "list index out of range" on the empty weight list.
            raise ValueError(f"No cities found for state abbreviation {state!r}")
        weight_column = 'pct_of_state_total'
    else:
        weight_column = 'pct_of_total'

    city_names = table['city_state'].tolist()
    city_probability = table[weight_column].tolist()

    return random.choices(city_names, weights=city_probability, k=1)[0]


# Smoke check: one draw restricted to California, then one nationwide draw.
print(get_random_us_city('CA'))
print(get_random_us_city())

Pleasant Hill, California
Edmond, Oklahoma


In [28]:
def get_us_state_abbrev(city, state_to_abbrev=None):
    """Return the state abbreviation for a "City, State" label.

    The text after the first comma is looked up as a state name. The whole
    remainder is tried first, so names that themselves contain a comma (such
    as "Virgin Islands, U.S.") resolve. If that misses, only the second
    comma-separated field is tried.

    Args:
        city (str): Label of the form "City, State".
        state_to_abbrev (dict, optional): Mapping of state name to abbreviation.
            Defaults to the notebook-level `us_state_to_abbrev`.

    Returns:
        str | None: The abbreviation, or None when the label has no comma or
        the state name is not in the mapping.
    """
    lookup = us_state_to_abbrev if state_to_abbrev is None else state_to_abbrev

    _, comma, remainder = city.partition(',')
    if not comma:
        # No state part at all; report "unknown" the same way an unmapped
        # state name is reported, instead of raising IndexError.
        return None

    remainder = remainder.strip()
    abbrev = lookup.get(remainder)
    if abbrev is None:
        # Fall back to just the second field, e.g. "Columbus, Ohio, USA".
        abbrev = lookup.get(remainder.split(',')[0].strip())
    return abbrev


# Smoke check: the cell's last expression, so its return value is displayed.
get_us_state_abbrev('Columbus, Ohio')

'OH'

# Generate a recruiting class
- On the 247 Composite there are roughly 30-35 5 stars, 300-330 4 stars, 2000 3 stars, and 400-500 2 stars. There aren't any "1 star" recruits.

## star rankings
- 1 in 7 players is a bust
- 2 in 7 players are gems
- 4 in 7 are normal
    - modular adjustments to recruits

## playa factor
- probably has to do with dev traits

In [47]:
# Project root is taken to be the parent of the working directory
# (move up one level if notebook is inside src/).
BASE_DIR = Path.cwd().parent
player_development_config = BASE_DIR.joinpath("config", "gem_dev_likelihood.json")
# eventually want to adjust the distributions based on 247 Star rating

# Parse the JSON config into a plain dict of likelihood groups.
with player_development_config.open() as f:
    player_development = json.loads(f.read())

# ensure that the totals for the gem/development likelihoods sum to 1
def validate_dev_config(config=None, tolerance=1e-9):
    """Check that every likelihood group in the config sums to 1.

    Args:
        config (dict, optional): Mapping of group name to a
            {outcome: likelihood} dict. Defaults to the notebook-level
            `player_development`.
        tolerance (float, optional): Largest allowed absolute difference
            between a group's total and 1. Fractions such as 0.1 are not
            exactly representable as floats, so an exact `== 1` check can
            reject a correct config.

    Raises:
        ValueError: If any group's total differs from 1 by more than
            `tolerance`.
    """
    groups = player_development if config is None else config
    for key, likelihoods in groups.items():
        total_likelihood = sum(likelihoods.values())
        if abs(total_likelihood - 1) > tolerance:
            raise ValueError(
                f"Likelihoods for '{key}' do not sum to 1 (got {total_likelihood})"
            )
# Fail right after loading if any likelihood group is mis-specified.
validate_dev_config()

# Show the loaded config for a visual check.
print(player_development)

{'gem_bust_rates': {'gem': 0.25, 'bust': 0.175, 'regular': 0.575}, 'dev_trait_distribution': {'elite': 0.1, 'star': 0.2, 'impact': 0.3, 'normal': 0.4}}
