# [Research Request - 5311 Apportionment Formula #1710](https://github.com/cal-itp/data-analyses/issues/1710)

- Do the past 3 years of 5311 apportionments to regional bodies passing through Caltrans align with the stated formula? If not, how would apportionments change? How would apportionments change if allocations went directly to each 5311-recipient agency?


- Double check work done by Sara.

- Cross reference all available types of census data.

- Can we use the Census API to read data into a notebook and apply the apportionment formula?

**Possible explanation of how population data was determined**
via email:
>Previous Grant Manager used a combination of population and demographic information she received from DOTP along with these FTA maps to determine the rural population in each of the CA regions
>
>We previously had discussed updating this formula to match how the FTA funds have been apportioned to California, but that conversation was put on hold for a much deeper discussion on the formula. 

Notes from Grant team 5311 apportionment excel sheet
1. Metropolitan Transportation Commission (MTC): 
  - Sonoma, 
  - Napa, 
  - Solano, 
  - Contra Costa, 
  - Alameda, 
  - Marin, 
  - San Francisco, 
  - San Mateo and 
  - Santa Clara
2. Sacramento Area Council of Governments (SACOG): 
  - Yuba, 
  - Sutter, 
  - Yolo and 
  - Sacramento
3. Alpine and Sierra Counties (collectively, 'Minimum Counties') receive a minimum funding amount of $48,000 due to low population (under 7,590)
4. Orange County does not receive Rural funds


In [1]:
import pandas as pd
import altair as alt

### County-level Urban and Rural information for the 2010 & 2020 Census

In [3]:
# Census Bureau reference workbooks with county-level urban/rural population
county_rural_2020_url = "https://www2.census.gov/geo/docs/reference/ua/2020_UA_COUNTY.xlsx"
county_rural_2010_url = "https://www2.census.gov/geo/docs/reference/ua/PctUrbanRural_County.xls"


def _load_county_table(url, year_suffix):
    """Read one workbook, lower-case its headers and append `year_suffix` to every column."""
    table = pd.read_excel(url)
    table.columns = table.columns.str.lower()
    return table.add_suffix(year_suffix)


# the suffix distinguishes the two census years
county_rural_data_2020 = _load_county_table(county_rural_2020_url, "_2020")
county_rural_data_2010 = _load_county_table(county_rural_2010_url, "_2010")

display(county_rural_data_2020.head(), county_rural_data_2010.head())

### filter for just California

In [4]:
# keep only the California rows of each census table
is_ca_2010 = county_rural_data_2010["statename_2010"].eq("California")
is_ca_2020 = county_rural_data_2020["state_name_2020"].eq("California")

ca_county_2010 = county_rural_data_2010.loc[is_ca_2010]
ca_county_2020 = county_rural_data_2020.loc[is_ca_2020]

# sanity check (noted as True by the author): both years list the same number of counties
# ca_county_2010["countyname_2010"].nunique() == ca_county_2020["county_name_2020"].nunique()

### aggregate by county name, sum population columns

In [6]:
# columns totalled per county for each census year
sum_cols_2010 = ["pop_rural_2010", "pop_cou_2010", "poppct_rural_2010"]
sum_cols_2020 = ["pop_rur_2020", "pop_cou_2020", "poppct_rur_2020"]

county_2010 = (
    ca_county_2010
    .groupby("countyname_2010")
    .agg(dict.fromkeys(sum_cols_2010, "sum"))
    .reset_index()
)

county_2020 = (
    ca_county_2020
    .groupby("county_name_2020")
    .agg(dict.fromkeys(sum_cols_2020, "sum"))
    .reset_index()
)

# scale the 2020 rural share by 100 and round it to two decimals
county_2020["poppct_rur_2020"] = county_2020["poppct_rur_2020"].mul(100).round(2)

display(county_2010.head(), county_2020.head())

### merge 2010 and 2020 county census data

In [7]:
# inner join on county name: only counties present in both census years are kept
merged_years = county_2010.merge(
    county_2020,
    how="inner",
    left_on="countyname_2010",
    right_on="county_name_2020",
)

# keep the county name and the two rural population counts; drop the duplicate
# key, the total county populations and the rural percentages
columns_not_needed = [
    "county_name_2020",
    "pop_cou_2010",
    "pop_cou_2020",
    "poppct_rural_2010",
    "poppct_rur_2020",
]
pop_2010_2020 = merged_years.drop(columns=columns_not_needed)

### consolidate specific counties to MTC and SACOG

In [8]:
# member counties of the two regional bodies, per the apportionment worksheet notes
mtc = [
    "Sonoma", "Napa", "Solano",
    "Contra Costa", "Alameda", "Marin",
    "San Francisco", "San Mateo", "Santa Clara",
]

sacog = ["Yuba", "Sutter", "Yolo", "Sacramento"]

rural_pop_cols = ["pop_rural_2010", "pop_rur_2020"]


def _region_total(frame, counties, label):
    """Sum the rural population columns over `counties` and name the resulting row `label`."""
    in_region = frame["countyname_2010"].isin(counties)
    total = frame.loc[in_region, rural_pop_cols].sum()
    total["countyname_2010"] = label
    return total


# append one combined row per region (MTC first, then SACOG)
mtc_row = _region_total(pop_2010_2020, mtc, "MTC")
pop_2010_2020 = pd.concat(
    [pop_2010_2020, pd.DataFrame([mtc_row])],
    ignore_index=True,
)

sacog_row = _region_total(pop_2010_2020, sacog, "SACOG")
pop_2010_2020 = pd.concat(
    [pop_2010_2020, pd.DataFrame([sacog_row])],
    ignore_index=True,
)

# the member counties are now represented by their region row, so drop them
pop_2010_2020 = pop_2010_2020[~pop_2010_2020["countyname_2010"].isin(mtc + sacog)]

## Applying OTGC formula
- via apportionment worksheet

apportionment = (county rural population/ total net population) * total apportionment available to counties - 197.27


In [9]:
# source: https://www.transit.dot.gov/funding/apportionments/table-9-fy-2025-section-5311-and-section-5340-rural-area-formula-0
ca_total_5311_apportionment = 43_540_762

# total after removing (admin, min county, 5311f stuff)
total_available_to_counties = 32_559_572

# counties whose rural population is left out of the net population total
excluded_county_pop = ["Alpine", "Orange", "Sierra"]

# net population = rural population of every row minus that of the excluded counties
excluded_rows = pop_2010_2020[pop_2010_2020["countyname_2010"].isin(excluded_county_pop)]

net_pop_2010 = pop_2010_2020["pop_rural_2010"].sum() - excluded_rows["pop_rural_2010"].sum()
net_pop_2020 = pop_2010_2020["pop_rur_2020"].sum() - excluded_rows["pop_rur_2020"].sum()

# function to replicate formula
def otgc_formula(
    rural_pop: int,
    total_net_pop: int,
    total_county_funds: int,
    adjustment: float = 197.27,
) -> float:
    """Replicate the apportionment worksheet formula for one county/region.

    apportionment = (rural_pop / total_net_pop) * total_county_funds - adjustment

    Args:
        rural_pop: rural population of the county/region.
        total_net_pop: net rural population used as the denominator.
        total_county_funds: total apportionment available to counties.
        adjustment: flat amount subtracted from the population-share result.
            Defaults to 197.27, the value used on the worksheet.
            NOTE(review): the worksheet does not say what this amount represents.

    Returns:
        The calculated apportionment, unrounded.
    """
    population_share = rural_pop / total_net_pop
    return (population_share * total_county_funds) - adjustment

# (population column, matching net population, output column) for each census year
formula_inputs = [
    ("pop_rural_2010", net_pop_2010, "apportionment_calc_2010"),
    ("pop_rur_2020", net_pop_2020, "apportionment_calc_2020"),
]

# run the formula on every row's rural population, rounded to cents
for pop_col, net_pop, result_col in formula_inputs:
    pop_2010_2020[result_col] = (
        pop_2010_2020[pop_col]
        .apply(
            otgc_formula,
            total_net_pop=net_pop,
            total_county_funds=total_available_to_counties,
        )
        .round(2)
    )

## manual overwrite for orange, sierra, alpine
per instructions on apportionment worksheet

In [10]:
# fixed amounts that replace the formula result for these counties
min_county_edit = {"Alpine": 48000.00, "Orange": 0, "Sierra": 48000.00}

calc_cols = ["apportionment_calc_2020", "apportionment_calc_2010"]

# overwrite both calculated columns on each county's row in a single .loc assignment
for county, amount in min_county_edit.items():
    is_county = pop_2010_2020["countyname_2010"] == county
    pop_2010_2020.loc[is_county, calc_cols] = amount

## re-order columns

In [11]:
# put each year's population next to the apportionment calculated from it
final_column_order = [
    "countyname_2010",
    "pop_rural_2010", "apportionment_calc_2010",
    "pop_rur_2020", "apportionment_calc_2020",
]
pop_2010_2020 = pop_2010_2020.loc[:, final_column_order]

In [12]:
# final cleaned dataframe: show its column names to confirm the order set in the previous cell
pop_2010_2020.columns

Index(['countyname_2010', 'pop_rural_2010', 'apportionment_calc_2010',
       'pop_rur_2020', 'apportionment_calc_2020'],
      dtype='object')

## Compare against DLA apportionment

In [13]:
# DLA FFY25 apportionment table, read from a path relative to the working directory
dla_fy25 = pd.read_csv("fy25_apportionment_dla.csv")

# Rename to the `dla_fy25_*` names that the comparison, melt and chart cells
# reference; any other target names make those cells fail with a KeyError.
dla_fy25 = dla_fy25.rename(columns={
    'County/Region':'county',
    'ID':'id',
    'Population (9)': 'dla_fy25_pop',
    'FFY25 Apportionment': 'dla_fy25_apportionment'
})
dla_fy25.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   county                  47 non-null     object 
 1   id                      45 non-null     float64
 2   dla_fy25_pop            47 non-null     int64  
 3   dla_fy25_apportionment  47 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.6+ KB


In [14]:
# pair each calculated county/region row with its DLA row; the inner join keeps
# only names found in both tables, and `_merge` records the match source
compare_merge = pd.merge(
    pop_2010_2020,
    dla_fy25,
    how="inner",
    left_on="countyname_2010",
    right_on="county",
    indicator=True,
)

compare_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47 entries, 0 to 46
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   countyname_2010          47 non-null     object  
 1   pop_rural_2010           47 non-null     int64   
 2   apportionment_calc_2010  47 non-null     float64 
 3   pop_rur_2020             47 non-null     int64   
 4   apportionment_calc_2020  47 non-null     float64 
 5   county                   47 non-null     object  
 6   id                       45 non-null     float64 
 7   dla_fy25_pop             47 non-null     int64   
 8   dla_fy25_apportionment   47 non-null     float64 
 9   _merge                   47 non-null     category
dtypes: category(1), float64(4), int64(3), object(2)
memory usage: 3.8+ KB


In [15]:
# differences are DLA FFY25 value minus the 2020-census-based value
compare_merge["2020_fy25_amt_diff"] = compare_merge["dla_fy25_apportionment"].sub(
    compare_merge["apportionment_calc_2020"]
)
compare_merge["2020_fy25_pop_diff"] = compare_merge["dla_fy25_pop"].sub(
    compare_merge["pop_rur_2020"]
)

In [16]:
# summary statistics of the two difference columns, then a preview of the full table
diff_summary = compare_merge[["2020_fy25_amt_diff", "2020_fy25_pop_diff"]].describe()

display(diff_summary, compare_merge.head())

Unnamed: 0,2020_fy25_amt_diff,2020_fy25_pop_diff
count,47.0,47.0
mean,184.635532,41175.212766
std,277530.592009,41980.759419
min,-996202.87,-7699.0
25%,-99623.91,13275.5
50%,14929.49,30210.0
75%,106611.35,54716.5
max,647155.09,180305.0


Unnamed: 0,countyname_2010,pop_rural_2010,apportionment_calc_2010,pop_rur_2020,apportionment_calc_2020,county,id,dla_fy25_pop,dla_fy25_apportionment,_merge,2020_fy25_amt_diff,2020_fy25_pop_diff
0,Alpine,1175,48000.0,1204,48000.0,Alpine,1.0,1119,48000.0,both,0.0,-85
1,Amador,23016,400197.85,28020,402349.01,Amador,2.0,46118,357467.0,both,-44882.01,18098
2,Butte,41584,723213.86,44478,638791.07,Butte,3.0,136143,1055644.0,both,416852.93,91665
3,Calaveras,34370,597716.36,37128,533198.1,Calaveras,4.0,50990,395251.0,both,-137947.1,13862
4,Colusa,6795,118011.14,9326,133783.69,Colusa,5.0,27483,212945.0,both,79161.31,18157


## Visuals

### melt dataframe

In [17]:
# columns stacked into long format: one (county, variable, metric) row per value
melt_value_vars = [
    "pop_rural_2010",
    "apportionment_calc_2010",
    "pop_rur_2020",
    "apportionment_calc_2020",
    'dla_fy25_pop',
    'dla_fy25_apportionment',
    '2020_fy25_amt_diff',
    '2020_fy25_pop_diff',
]

melt = pd.melt(
    compare_merge,
    id_vars=["countyname_2010"],
    value_vars=melt_value_vars,
    value_name="metric",
)

# preview sorted by county, and confirm every variable has the same number of rows
display(
    melt.sort_values(by="countyname_2010").head(),
    melt["variable"].value_counts(),
)

Unnamed: 0,countyname_2010,variable,metric
0,Alpine,pop_rural_2010,1175.0
141,Alpine,apportionment_calc_2020,48000.0
47,Alpine,apportionment_calc_2010,48000.0
94,Alpine,pop_rur_2020,1204.0
282,Alpine,2020_fy25_amt_diff,0.0


pop_rural_2010             47
apportionment_calc_2010    47
pop_rur_2020               47
apportionment_calc_2020    47
dla_fy25_pop               47
dla_fy25_apportionment     47
2020_fy25_amt_diff         47
2020_fy25_pop_diff         47
Name: variable, dtype: int64

### 5311 Apportionments per County ($)

In [18]:
# apportionment series shown side by side for each county
# ("2020_fy25_amt_diff" is not plotted here)
apportionment_cols = [
    "apportionment_calc_2010",
    "apportionment_calc_2020",
    "dla_fy25_apportionment",
]

apportionment_long = melt[melt["variable"].isin(apportionment_cols)]

(
    alt.Chart(apportionment_long)
    .mark_bar()
    .encode(
        x="countyname_2010:N",
        y="metric:Q",
        xOffset="variable:N",
        color="variable:N",
        tooltip=["countyname_2010", "variable", "metric"],
    )
    .properties(
        width="container",
        title="5311 Apportionments per County ($)",
    )
)


### Rural Populations per County

In [37]:
# rural population series shown side by side for each county
# ("2020_fy25_pop_diff" is not plotted here)
pop_cols = [
    "pop_rural_2010",
    "pop_rur_2020",
    "dla_fy25_pop",
]

population_long = melt[melt["variable"].isin(pop_cols)]

(
    alt.Chart(population_long)
    .mark_bar()
    .encode(
        x="countyname_2010:N",
        y="metric:Q",
        xOffset="variable:N",
        color="variable:N",
        tooltip=["countyname_2010", "variable", "metric"],
    )
    .properties(
        width="container",
        title="Rural Populations per County",
    )
)


In [20]:
diff_cols=[
    "2020_fy25_amt_diff",
    "2020_fy25_pop_diff",
]

# This chart plots the apportionment ($) difference, so the title must describe
# apportionments; it previously carried the population-difference title.
alt.Chart(melt[melt["variable"]=="2020_fy25_amt_diff"]).mark_bar().encode(
    x = "countyname_2010:N",
    y = "metric:Q",
    xOffset="variable:N",
    color = "variable:N",
    tooltip = ["countyname_2010","variable","metric"]
).properties(
    width= "container",
    title= """Difference in Apportionments per County. Apportionment calculated using 2020 census compared to stated FFY25 apportionments"""
)


In [21]:
# This chart plots the rural population difference, so the title must describe
# populations; it previously carried the apportionment-difference title.
alt.Chart(melt[melt["variable"]=="2020_fy25_pop_diff"]).mark_bar().encode(
    x = "countyname_2010:N",
    y = "metric:Q",
    xOffset="variable:N",
    color = "variable:N",
    tooltip = ["countyname_2010","variable","metric"]
).properties(
    width= "container",
    title= "Difference in Rural Populations per County, 2020 rural Census populations compared to stated FFY25 populations"
)


### 5311 Apportionments ($) and Rural Populations per County

In [23]:
# list the distinct variable names in the melted frame (the values the chart cells filter on)
melt["variable"].unique()

array(['pop_rural_2010', 'apportionment_calc_2010', 'pop_rur_2020',
       'apportionment_calc_2020', 'dla_fy25_pop',
       'dla_fy25_apportionment', '2020_fy25_amt_diff',
       '2020_fy25_pop_diff'], dtype=object)

In [41]:
def _county_facet_bars(variables, chart_title):
    """Bar chart of `variables` from `melt`, one row per county, each with its own y-scale."""
    subset = melt[melt["variable"].isin(variables)]
    return (
        alt.Chart(subset)
        .mark_bar()
        .encode(
            x=alt.X("variable"),
            y=alt.Y("metric"),
            row="countyname_2010",
            tooltip=["countyname_2010", "variable", "metric"],
            color="variable",
        )
        .resolve_scale(y="independent")
        .properties(title=chart_title, width=300)
    )


# 2020-census-based calculation vs. DLA FFY25 figure, plus their difference
app_chart = _county_facet_bars(
    ["apportionment_calc_2020", "dla_fy25_apportionment", "2020_fy25_amt_diff"],
    "apportionments",
)

pop_chart = _county_facet_bars(
    ["pop_rur_2020", "dla_fy25_pop", "2020_fy25_pop_diff"],
    "populations",
)

# apportionments on the left, populations on the right
app_pop_chart = alt.hconcat(app_chart, pop_chart)

app_pop_chart