# Optimizing NFL Divisions

## 1.) Setting up Notebook and loading Data

In [1]:
import pandas as pd
import numpy as np
import folium
import math
from shapely.geometry import Point
from shapely.affinity import scale, rotate
from k_means_constrained import KMeansConstrained
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
# Loading Data
latlon = pd.read_csv('StadiumCoords.csv')

## 2.) Defining Functions

### 2.A) Haversine Distance

In [3]:
# Full Haversine distance function (in kilometers)
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6378.137):
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

### 2.B) Average Yearly Distance Formula

In [4]:
def avg_distance(df, stadium_distances):
    df['avg_dist'] = 0
    
    for i in range(len(df)):
        team = df['Team'][i]
        div = df['Division'][i]
        conf = df['Conf'][i]
        
        div_df = df[df['Division'] == div]

        # Division distance is 2x the distance to each other team in the division
        # (4 teams in a division)
        div_dist = 0
        for j in range(4):
            if div_df['Team'].iloc[j] != team:
                team_comp = div_df['Team'].iloc[j]
                div_dist += (2*stadium_distances[team][team_comp])
        
        same_conf_dist = 0
        

        # Confernce distance is 0.5x the distance to each other team in the conference
        # (16 teams in a conference)
        # Conference -> 0.5 * each other in conference
        conf_df = df[df['Conf'] == conf]
        for j in range(16):
            if conf_df['Team'].iloc[j] != team:
                team_comp = conf_df['Team'].iloc[j]
                same_conf_dist += ((stadium_distances[team][team_comp]) * 0.5)
        
        # Other Conference
        # Outer-Conference distance is (5/16)x the distance to each other team in the other conference 
        # (16 teams in the other conference)
        dif_conf_dist = 0
        dif_conf_df = df[df['Conf'] != conf]
        for j in range(16):
                team_comp = dif_conf_df['Team'].iloc[j]
                dif_conf_dist += ((stadium_distances[team][team_comp]) * (5/16))
        
        df['avg_dist'].iloc[i] = (div_dist + same_conf_dist + dif_conf_dist)
        
    return(df)          

### 2.C) Used Fuel Function

In [5]:
def avg_fuel_func(df, fuel_used_df):
    df2 = df.copy()
    df2['avg_fuel'] = 0
    
    for i in range(len(df2)):
        team = df2['Team'][i]
        div = df2['Division'][i]
        conf = df2['Conf'][i]
        
        div_df = df2[df2['Division'] == div]

        
        div_dist = 0
        for j in range(4):
            if div_df['Team'].iloc[j] != team:
                team_comp = div_df['Team'].iloc[j]
                div_dist += fuel_used_df[team][team_comp]
        
        same_conf_dist = 0
        
        # Conference -> 0.5 * each other in conference
        
        conf_df = df2[df2['Conf'] == conf]
        for j in range(16):
            if conf_df['Team'].iloc[j] != team:
                team_comp = conf_df['Team'].iloc[j]
                same_conf_dist += ((fuel_used_df[team][team_comp]) * 0.5)
        
        
        # Other Conference
        dif_conf_dist = 0
        dif_conf_df = df2[df2['Conf'] != conf]
        for j in range(16):
                team_comp = dif_conf_df['Team'].iloc[j]
                dif_conf_dist += ((fuel_used_df[team][team_comp]) * (5/16))
        
        
        df2['avg_fuel'].iloc[i] = (div_dist + same_conf_dist + dif_conf_dist)
        
    return(df2.copy())          

## 3.) Running Analysis

### 3.A) Calculating distances between stadiums

In [6]:
# Create an empty distance DataFrame with team names as both rows and columns
teams = latlon['Team'].values
stadium_distances = pd.DataFrame(index=teams, columns=teams)

# Compute distances between stadiums
for i, team_i in enumerate(teams):
    lat_i, lon_i = latlon.iloc[i][['Latitude', 'Longitude']]
    for j, team_j in enumerate(teams):
        lat_j, lon_j = latlon.iloc[j][['Latitude', 'Longitude']]
        distance = haversine_distance(lat_i, lon_i, lat_j, lon_j)
        stadium_distances.iloc[i, j] = distance

In [7]:
stadium_distances.head()

Unnamed: 0,Raiders,Chiefs,Cowboys,Panthers,Saints,Broncos,Commanders,Browns,Lions,Patriots,...,Titans,Texans,Bengals,Buaccaneers,Rams,Chargers,Bears,Cardinals,Jaguars,Vikings
Raiders,0.0,1851.588605,1699.641647,3090.40189,2432.778445,979.99326,3373.82819,2947.503215,2839.403166,3810.210325,...,2545.349593,1976.526204,2712.631381,3196.603136,373.296604,373.296604,2458.807223,390.695651,3173.356215,2092.670924
Chiefs,1851.588605,0.0,739.762642,1280.794011,1089.963669,909.146259,1522.435882,1118.498565,1031.565366,1986.055855,...,751.69512,1045.872714,861.049161,1656.979466,2201.582638,2201.582638,660.342327,1705.95045,1522.42254,667.269679
Cowboys,1699.641647,739.762642,0.0,1522.393278,735.534369,1054.064527,1945.112054,1674.289058,1632.410402,2496.31499,...,1020.326796,376.708204,1334.067958,1497.026118,1976.603114,1976.603114,1314.883259,1415.383402,1489.649977,1400.564915
Panthers,3090.40189,1280.794011,1522.393278,0.0,1045.135845,2185.989178,541.637822,703.001586,814.41441,1129.474488,...,545.126124,1498.098652,539.496883,822.024429,3417.800798,3417.800798,944.09479,2880.037451,550.575786,1510.944094
Saints,2432.778445,1089.963669,735.534369,1045.135845,0.0,1742.543551,1567.62561,1490.940338,1515.840112,2158.630255,...,757.538971,515.560483,1138.129123,769.914148,2697.222621,2697.222621,1344.271008,2133.042826,813.772635,1695.327964


### 3.B) Running K-means constrained

#### 3.B.i) Conference, then Division
Intra-conference travel is more frequent, so the conferences should be created before the divisions assigned
<br>
First, a dry run to test the code. Then a Loop to find optimal Solutions

In [8]:
# Extract coordinates as a NumPy array
X = latlon[['Latitude', 'Longitude']].values
    

clf = KMeansConstrained(n_clusters= 2, size_min = 16, size_max = 16, random_state = 0)
clf.fit_predict(X)
centroid_array2 = clf.cluster_centers_
print(clf.cluster_centers_)
print(clf.labels_)

[[  37.57532625  -79.11032187]
 [  37.56023    -102.02415688]]
[1 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 1]


In [9]:
# Assigining teams to their clusters
latlon['Conf'] = clf.labels_
latlon_conf0 = latlon[latlon['Conf'] == 0]
latlon_conf1 = latlon[latlon['Conf'] == 1]

In [10]:
# Extract coordinates as NumPy arrays
X0 = latlon_conf0[['Latitude', 'Longitude']].values
X1 = latlon_conf1[['Latitude', 'Longitude']].values

In [11]:
# Creating 4 Divisions in each Conference
# Conference 0:

clf = KMeansConstrained(n_clusters= 4, size_min = 4, size_max = 4, random_state = 0)
clf.fit_predict(X0)
latlon_conf0['Division'] = clf.labels_

In [12]:
# Creating 4 Divisions in each Conference
# Conference 1:

clf = KMeansConstrained(n_clusters= 4, size_min = 4, size_max = 4, random_state = np.random.randint(1, 100_000_000))
clf.fit_predict(X1)
latlon_conf1['Division'] = clf.labels_

In [13]:
# Create a copy of the relevant column (assumed to be column index 4) and assign group labels
latlon_conf0 = latlon_conf0.copy()
latlon_conf0['Division'] = latlon_conf0.iloc[:, 4]

latlon_conf1 = latlon_conf1.copy()
latlon_conf1['Division'] = latlon_conf1.iloc[:, 4] + 4  # Add 4 to shift group label

# Concatenate both conf dataframes
latlon_groups = pd.concat([latlon_conf0[['Team', 'Division']], latlon_conf1[['Team', 'Division']]])

# Merge group info into latlon
latlon2 = latlon.merge(latlon_groups, on='Team', how='left')

In [14]:
# What are thedistances for this allocation?
avg_distance(latlon2, stadium_distances)

Unnamed: 0,Team,Latitude,Longitude,Conf,Division,avg_dist
0,Raiders,36.09084,-115.18326,1,5,34757.672813
1,Chiefs,39.04896,-94.484,1,5,26324.089276
2,Cowboys,32.74777,-97.09346,1,7,24019.975621
3,Panthers,35.22569,-80.85285,0,2,19777.681019
4,Saints,29.95101,-90.08114,1,7,24607.329947
5,Broncos,39.74385,-105.02029,1,5,27389.533269
6,Commanders,38.90778,-76.86461,0,3,17710.732437
7,Browns,41.50596,-81.69969,0,2,17323.084441
8,Lions,42.34001,-83.04556,0,2,17923.685665
9,Patriots,42.09084,-71.26434,0,0,22776.388204


#### 3.B.ii) Loop to mitigate randomness in KMeans, Optimal Conference -> Division Solution

In [15]:
# # Looping through to find the actual best, lowest distance solution:
# best_val = float('inf')

# # Extract coordinates as a NumPy array
# X = latlon[['Latitude', 'Longitude']].values

# for i in tqdm(range(1_000)):
#     # Starting with the conference:
#     latlon_temp = latlon.copy()

#     # Doing the minimization of conference
#     clf = KMeansConstrained(
#         n_clusters= 2, 
#         size_min = 16, 
#         size_max = 16,  
#         random_state = np.random.randint(1, 100_000_000)
#     )
#     clf.fit_predict(X)
#     centroid_array2 = clf.cluster_centers_
    
#     # Assigning teams to conferences
#     latlon['Conf'] = clf.labels_
#     latlon_conf0 = latlon[latlon['Conf'] == 0]
#     latlon_conf1 = latlon[latlon['Conf'] == 1]
    
#     # Setting up data for division creation
#     # Extract coordinates as NumPy arrays
#     X0 = latlon_conf0[['Latitude', 'Longitude']].values
#     X1 = latlon_conf1[['Latitude', 'Longitude']].values
        
    
#     clf = KMeansConstrained(n_clusters= 4, size_min = 4, size_max = 4,  random_state = np.random.randint(1, 100_000_000))
#     clf.fit_predict(X0)
#     latlon_conf0['Division'] = clf.labels_
    
#     clf = KMeansConstrained(n_clusters= 4, size_min = 4, size_max = 4, random_state = np.random.randint(1, 100_000_000))
#     clf.fit_predict(X1)

#     latlon_conf1['Division'] = clf.labels_
    
#     # Create a copy of the relevant column (assumed to be column index 4) and assign group labels
#     latlon_conf0 = latlon_conf0.copy()
#     latlon_conf0['Division'] = latlon_conf0.iloc[:, 4]

#     latlon_conf1 = latlon_conf1.copy()
#     latlon_conf1['Division'] = latlon_conf1.iloc[:, 4] + 4  # Add 4 to shift group label

#     # Concatenate both conf dataframes
#     latlon_groups = pd.concat([latlon_conf0[['Team', 'Division']], latlon_conf1[['Team', 'Division']]])

#     # Merge group info into latlon2
#     latlon2 = latlon.merge(latlon_groups, on='Team', how='left') 
            
#     new_df2 = avg_distance(latlon2, stadium_distances)
    
    
#     if new_df2.avg_dist.sum() < best_val:
#         latlon2['Final_Division'] = latlon2['Division']
#         latlon2['Final_Conference'] = latlon2['Conf']
        
#         best_val = new_df2.avg_dist.sum()
        
# new_df2

In [16]:
# avg_distance(new_df2, stadium_distances).avg_dist.sum()

In [17]:
# # Previous best solution
# avg_distance(latlon2, stadium_distances).avg_dist.sum()

In [18]:
# new_df2.avg_dist.sum()

In [19]:
# Looping through to find the actual best, lowest distance solution:
best_val = float('inf')
best_assignment = None

X = latlon[['Latitude', 'Longitude']].values  # extract once

for _ in tqdm(range(10_000)):
    latlon_temp = latlon.copy()#.drop(columns=['Division', 'Conf']).copy()

    # Conference clustering
    clf_conf = KMeansConstrained(
        n_clusters=2, size_min=16, size_max=16,
        random_state=np.random.randint(1, 100_000_000)
    )
    latlon_temp['Conf'] = clf_conf.fit_predict(X)

    # Division clustering for each conference
    divisions = []

    for conf_label, group_offset in zip([0, 1], [0, 4]):
        group = latlon_temp[latlon_temp['Conf'] == conf_label].copy()
        X_group = group[['Latitude', 'Longitude']].values

        clf_div = KMeansConstrained(
            n_clusters=4, size_min=4, size_max=4,
            random_state=np.random.randint(1, 100_000_000)
        )
        group['Division'] = clf_div.fit_predict(X_group) + group_offset
        divisions.append(group[['Team', 'Division']])

    # Merge divisions back
    division_df = pd.concat(divisions)
    latlon_temp = latlon_temp.merge(division_df, on='Team', how='left')

    # Evaluate configuration
    result_df = avg_distance(latlon_temp, stadium_distances)
    total_distance = result_df.avg_dist.sum()

    # Check if best
    if total_distance < best_val:
        best_val = total_distance
        latlon['Final_Division'] = latlon_temp['Division']
        latlon['Final_Conference'] = latlon_temp['Conf']
        best_assignment = result_df

# Return the best result
best_assignment


100%|██████████| 10000/10000 [15:38<00:00, 10.66it/s]


Unnamed: 0,Team,Latitude,Longitude,Conf,Final_Division,Final_Conference,Division,avg_dist
0,Raiders,36.09084,-115.18326,1,5,1,4,31051.45079
1,Chiefs,39.04896,-94.484,1,7,1,7,27482.662915
2,Cowboys,32.74777,-97.09346,1,6,1,6,24019.975621
3,Panthers,35.22569,-80.85285,0,2,0,0,19091.509869
4,Saints,29.95101,-90.08114,1,6,1,6,24607.329947
5,Broncos,39.74385,-105.02029,1,7,1,7,28708.86869
6,Commanders,38.90778,-76.86461,0,3,0,0,17874.464799
7,Browns,41.50596,-81.69969,0,2,0,2,16474.600127
8,Lions,42.34001,-83.04556,0,2,0,2,16999.804396
9,Patriots,42.09084,-71.26434,0,0,0,3,22776.388204


In [20]:
print(f'Total Distance of new allocation: {avg_distance(best_assignment, stadium_distances).avg_dist.sum():,f}')

Total Distance of new allocation: 767,755.316040


#### 3.B.iii) Plotting New Allocation on a Map

In [21]:
# Function to sort points around their centroid
def sort_points_clockwise(points):
    center = np.mean(points, axis=0)
    angles = np.arctan2(points[:, 1] - center[1], points[:, 0] - center[0])
    return points[np.argsort(angles)]

In [22]:
# Assign division and conference colors using mapping dictionaries
def create_team_map(df):
    division_color_map = {
        0: 'red', 1: 'orange', 2: 'yellow', 3: 'green',
        4: 'blue', 5: 'purple', 6: 'brown', 7: 'black'
    }
    conf_color_map = {0: 'red', 1: 'blue'}

    df['color'] = df['Division'].map(division_color_map)
    df['color_conf'] = df['Conf'].map(conf_color_map)

    # Calculate center for map
    avg_lat = df['Latitude'].mean()
    avg_lon = df['Longitude'].mean()

    # Initialize folium map
    team_map = folium.Map(location=[avg_lat, avg_lon], zoom_start=5, detect_retina=True)

    # Add circle markers
    for _, row in df.iterrows():
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"Team {row.name} -- {row['Team']}",
            radius=5,
            color=row['color_conf'],
            fill=True,
            fill_color=row['color'],
            fill_opacity=0.8
        ).add_to(team_map)


    # Draw polygons for each division
    for division, group in df.groupby('Division'):
        if len(group) != 4:
            continue  # only handling 4-team divisions

        points = group[['Latitude', 'Longitude']].to_numpy()
        ordered_points = sort_points_clockwise(points)
        ordered_points = np.vstack([ordered_points, ordered_points[0]])  # close the polygon

        folium.PolyLine(
            locations=ordered_points.tolist(),
            color=division_color_map.get(division, 'gray')
        ).add_to(team_map)

    return(team_map)

In [23]:
# Plotting the optimal divisions and conferences
create_team_map(best_assignment)

### 3.C) Comparing to the Current Divisions

In [24]:
# Manualy assigning current division and conference breakdown
latlon_current = latlon.copy()

current_div_list = [3,3,4,6,6,3,4,1,5,0,0,1,0,5,7,4,2,7,1,6,4,0,2,2,1,6,7,3,5,7,2,5]
current_conf_list =[0,0,1,1,1,0,1,0,1,0,0,0,0,1,1,1,0,1,0,1,1,0,0,0,0,1,1,0,1,1,0,1]

latlon_current['Division'] = current_div_list
latlon_current['Conf'] = current_conf_list

current_dist = avg_distance(latlon_current, stadium_distances).avg_dist.sum()
print(f'Current NFL Total Distance: {current_dist:,.0f} km')

Current NFL Total Distance: 877,281 km


In [25]:
# What are the current divisions and conferences?
# Messy messy messy
create_team_map(latlon_current)

In [26]:
current_div_distance = avg_distance(latlon_current, stadium_distances).avg_dist.sum()
new_div_distance = best_assignment.avg_dist.sum()

annual_mileage_saved = (current_div_distance-new_div_distance)/current_div_distance
print(f'On average, the new divisions save {annual_mileage_saved:.2%} in travel distance.')

On average, the new divisions save 12.48% in travel distance.


In [27]:
## Do teams take a bus or plane to other teams?
# A team takes a bus if they are within roughly 200km (120 miles) of each other
plane_bus_df = stadium_distances.copy()
bus_dist = 200

for i in range(32):
    for j in range(32):
        if stadium_distances.iloc[i,j] <= bus_dist:
            plane_bus_df.iloc[i,j] = 'bus'
        else:
            plane_bus_df.iloc[i,j] = 'plane'


### 3.D) Calculating Gas/Money Costs

In [28]:
# Passenger jet uses 5 gallons of fuel per passenger mile
# 6 mpg for a bus

In [29]:
# Convert distances to miles
stadium_distances_miles = stadium_distances.copy() * 0.621371

# Calculate fuel used for each team-to-team distance
fuel_used_df = stadium_distances_miles.copy()

for i in range(32):
    for j in range(32):
        if plane_bus_df.iloc[i,j] == 'bus':
            fuel_used_df.iloc[i,j] = stadium_distances_miles.iloc[i,j] / 6
        else:
            fuel_used_df.iloc[i,j] = stadium_distances_miles.iloc[i,j] * 5
            
fuel_used_df

Unnamed: 0,Raiders,Chiefs,Cowboys,Panthers,Saints,Broncos,Commanders,Browns,Lions,Patriots,...,Titans,Texans,Bengals,Buaccaneers,Rams,Chargers,Bears,Cardinals,Jaguars,Vikings
Raiders,0.0,5752.617316,5280.540151,9601.430563,7558.289877,3044.696959,10481.994982,9157.4651,8821.613923,11837.771001,...,7908.03211,6140.78032,8427.752369,9931.382436,1159.778421,1159.778421,7639.157514,1213.834737,9859.157624,6501.625123
Chiefs,5752.617316,0.0,2298.335264,3979.241279,3386.359075,2824.585599,4729.987534,3475.012859,3204.924016,6170.387565,...,2335.407742,3249.37487,2675.154891,5147.994937,6839.998028,6839.998028,2051.58786,5300.140686,4729.946081,2073.110137
Cowboys,5280.540151,2298.335264,0.0,4729.855169,2285.198632,3274.825646,6043.181112,5201.773331,5071.662421,7755.68871,...,3170.007407,1170.377768,4144.755706,4651.043081,6141.019268,6141.019268,4085.151627,4397.390999,4628.126479,4351.352109
Panthers,9601.430563,3979.241279,4729.855169,0.0,3247.085527,6791.551409,1682.790176,2184.123994,2530.267482,3509.113461,...,1693.627823,4654.375287,1676.138589,2553.910708,10618.6115,10618.6115,2933.165618,8947.858756,1710.559133,4694.284214
Saints,7558.289877,3386.359075,2285.198632,3247.085527,0.0,5413.830144,4870.385466,4632.135445,4709.495431,6706.551201,...,2353.563741,1601.771665,3536.002156,2392.011619,8379.879587,8379.879587,4176.455101,6627.054771,2528.27358,5267.138161
Broncos,3044.696959,2824.585599,3274.825646,6791.551409,5413.830144,0.0,7507.198574,6133.136862,5785.399401,8801.744764,...,5118.026793,4417.453641,5470.921796,7607.143964,4188.647459,4188.647459,4606.497027,2941.008568,7349.474893,3503.213578
Commanders,10481.994982,4729.987534,6043.181112,1682.790176,4870.385466,7507.198574,0.0,1561.15286,2009.486493,1838.103878,...,2875.714032,6166.932657,2056.899314,4113.807554,11567.205243,11567.205243,3008.609841,9986.1933,3263.468263,4698.339862
Browns,9157.4651,3475.012859,5201.773331,2184.123994,4632.135445,6133.136862,1561.15286,0.0,15.023885,2696.470071,...,2296.204799,5605.836016,1116.501206,4684.901979,10273.860588,10273.860588,1532.960972,8772.939547,3867.393921,3145.616743
Lions,8821.613923,3204.924016,5071.662421,2530.267482,4709.495431,5785.399401,2009.486493,15.023885,0.0,3016.545729,...,2356.143912,5566.085833,1186.452166,4970.187995,9947.872185,9947.872185,1184.430639,8480.969103,4174.117052,2710.371441
Patriots,11837.771001,6170.387565,7755.68871,3509.113461,6706.551201,8801.744764,1838.103878,2696.470071,3016.545729,0.0,...,4628.028183,7968.040585,3626.745607,5815.746749,12961.558447,12961.558447,4198.74369,11469.310038,4985.61332,5585.866355


In [30]:
# My allocation
new_allocation_fuel = avg_fuel_func(best_assignment, fuel_used_df).avg_fuel.sum()

# Current Allocation
current_fuel = avg_fuel_func(latlon_current, fuel_used_df).avg_fuel.sum()

# Gallons saved * Price per Gallon
fuel_saved = (current_fuel - new_allocation_fuel) * 4
print(f'On average, the new divisions save ${fuel_saved:,.0f} in fuel costs.')

On average, the new divisions save $967,353 in fuel costs.
