# ACS Exploratory Data Analysis

This script queries 2023 American Community Survey (ACS) 5-Year Estimates at the census tract level for California. It retrieves selected demographic and socioeconomic variables using the `Census Bureau’s ACS API`, processes and derives custom income categories, merges the data with tract-level geographic boundaries from the `pygris` package, and prepares the merged dataset ridership analysis.

- [ACS variables info](https://api.census.gov/data/2023/acs/acs5/variables.html)
- Requires a valid `API key` from the U.S. Census Bureau. Request for API key [here](https://api.census.gov/data/key_signup.html). The API key must be stored in a local file named `ACS_apikey`. 



In [1]:
pip install pygris

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
from pygris import tracts 
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()

pd.set_option('display.max_columns', None)

In [3]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

In [4]:
with open("ACS_apikey", "r") as file:
    api_key = file.read().strip()

In [5]:
variables = ["B01003_001E", "B09005_001E", "B16008_037E", "B01001B_001E",  "B01001I_001E", "B17001_002E", "B06012_001E", "B08014_002E", "B08201_002E", "B01001_003E", "B01001_004E",
            "B01001_005E", "B01001_006E", "B01001_007E", "B01001_008E", "B01001_009E", "B01001_010E", "B01001_020E", "B01001_021E", "B01001_022E", 
            "B01001_023E", "B01001_024E", "B01001_025E", "B01001_027E", "B01001_028E", "B01001_029E",  "B01001_030E", "B01001_031E", "B01001_032E",
            "B01001_033E", "B01001_034E", "B01001_044E", "B01001_045E", "B01001_046E", "B01001_047E", "B01001_048E", "B01001_049E", "B06010_004E", "B06010_005E",
            "B06010_006E", "B06010_007E", "B06010_008E", "B06010_009E", "B06010_010E" ]

In [6]:
variable_str = "NAME," + ",".join(variables)

In [7]:
url = f"https://api.census.gov/data/2023/acs/acs5?get={variable_str}&for=tract:*&in=state:06&key={api_key}"

In [8]:
response = requests.get(url)

In [9]:
if response.status_code == 200:
    data = response.json()
    census_data = pd.DataFrame(data[1:], columns=data[0])
    
    # Create GEOID column
    census_data["GEOID"] = census_data["state"] + census_data["county"] + census_data["tract"]

In [10]:
#Renaming variables 
census_data = census_data.rename(columns={
    'B01003_001E': 'total_pop', 'B09005_001E': 'households', 'B16008_037E': 'not_us_citizen_pop', 'B01001B_001E': 'black_pop', 'B01001I_001E': 'hispanic_pop', 'B17001_002E': 'poverty',
    'B06012_001E': 'pop_determined_poverty_status','B08014_002E': 'workers_with_no_car', 'B08201_002E': 'households_with_no_cars', 'B01001_003E': 'male_under_5', 
    'B01001_004E': 'male_5_to_9', 'B01001_005E': 'male_10_to_14', 'B01001_006E': 'male_15_to_17', 'B01001_007E': 'male_18_to_19', 'B01001_008E': 'male_20', 'B01001_009E': 'male_21', 
    'B01001_010E': 'male_22_to_24', 'B01001_020E': 'male_65_to_66', 'B01001_021E': 'male_67_to_69', 'B01001_022E': 'male_70_to_74', 'B01001_023E': 'male_75_to_79', 
    'B01001_024E': 'male_80_to_84', 'B01001_025E': 'male_85_and_over', 'B01001_027E': 'female_under_5', 'B01001_028E': 'female_5_to_9', 'B01001_029E': 'female_10_to_14', 
    'B01001_030E': 'female_15_to_17', 'B01001_031E': 'female_18_to_19', 'B01001_032E': 'female_20', 'B01001_033E': 'female_21', 'B01001_034E': 'female_22_to_24', 
    'B01001_044E': 'female_65_to_66', 'B01001_045E': 'female_67_to_69', 'B01001_046E': 'female_70_to_74', 'B01001_047E': 'female_75_to_79', 'B01001_048E': 'female_80_to_84', 
    'B01001_049E': 'female_85_and_over', 'B06010_004E': 'income_less_10000', 'B06010_005E': 'income_10000_14999', 'B06010_006E': 'income_15000_24999', 'B06010_007E': 'income_25000_34999', 
    'B06010_008E': 'income_35000_49999', 'B06010_009E': 'income_50000_64999', 'B06010_010E': 'income_65000_74999'

})

In [11]:
keep_cols = [
    'total_pop', 'households', 'not_us_citizen_pop', 'black_pop', 'hispanic_pop', 'poverty',
    'pop_determined_poverty_status', 'workers_with_no_car', 'households_with_no_cars', 'male_under_5',
    'male_5_to_9', 'male_10_to_14', 'male_15_to_17', 'male_18_to_19', 'male_20', 'male_21',
    'male_22_to_24', 'male_65_to_66', 'male_67_to_69', 'male_70_to_74', 'male_75_to_79',
    'male_80_to_84', 'male_85_and_over', 'female_under_5', 'female_5_to_9', 'female_10_to_14',
    'female_15_to_17', 'female_18_to_19', 'female_20', 'female_21', 'female_22_to_24',
    'female_65_to_66', 'female_67_to_69', 'female_70_to_74', 'female_75_to_79', 'female_80_to_84',
    'female_85_and_over', 'income_less_10000', 'income_10000_14999', 'income_15000_24999',
    'income_25000_34999', 'income_35000_49999', 'income_50000_64999', 'income_65000_74999', 'GEOID'
]

census_data = census_data[keep_cols]

In [12]:
census_data = census_data[keep_cols]
cols_to_numeric = [col for col in keep_cols if col != 'GEOID']
census_data[cols_to_numeric] = census_data[cols_to_numeric].apply(pd.to_numeric, errors='coerce')

In [13]:
census_data.head(5)

Unnamed: 0,total_pop,households,not_us_citizen_pop,black_pop,hispanic_pop,poverty,pop_determined_poverty_status,workers_with_no_car,households_with_no_cars,male_under_5,male_5_to_9,male_10_to_14,male_15_to_17,male_18_to_19,male_20,male_21,male_22_to_24,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_under_5,female_5_to_9,female_10_to_14,female_15_to_17,female_18_to_19,female_20,female_21,female_22_to_24,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,GEOID
0,3094,582,264,137,200,134,3075,28,85,69,106,160,19,0,0,0,26,47,84,119,49,46,78,39,110,53,26,13,0,14,19,52,70,72,85,105,107,188,75,134,157,87,129,70,6001400100
1,2093,361,96,71,196,164,2093,92,95,63,60,13,33,0,3,0,10,18,60,59,58,28,26,77,22,65,28,9,0,0,0,40,35,67,96,34,13,75,70,89,12,207,77,32,6001400200
2,5727,978,306,524,497,310,5707,157,416,78,185,171,94,88,0,6,93,23,47,113,100,24,25,115,196,84,55,33,0,26,43,108,62,194,158,13,142,383,201,300,251,400,148,291,6001400300
3,4395,1027,185,457,604,343,4351,134,204,125,115,169,30,41,0,0,0,31,70,89,19,26,36,216,137,208,27,9,0,0,38,55,105,104,43,23,30,187,105,287,215,207,178,87,6001400400
4,3822,535,231,919,557,397,3816,74,169,128,115,111,26,81,9,0,115,41,32,56,41,4,0,64,55,29,7,0,0,98,46,19,47,51,50,60,203,256,91,244,213,385,387,244,6001400500


In [14]:
#Creating custom income variables 
census_data['inc_extremelylow'] = census_data['income_less_10000'] + census_data['income_10000_14999'] + census_data['income_15000_24999']
census_data['inc_verylow'] = census_data['income_25000_34999'] + census_data['income_35000_49999']
census_data['inc_low'] = census_data['income_50000_64999'] + census_data['income_65000_74999']

In [15]:
# Aggregating age variables 
census_data['male_youth'] = census_data.loc[:,"male_under_5":"male_22_to_24"].sum(axis=1)
census_data['female_youth'] = census_data.loc[:,"female_under_5":"female_22_to_24"].sum(axis=1)

census_data['male_seniors'] = census_data.loc[:, "male_65_to_66":"male_85_and_over"].sum(axis=1)
census_data['female_seniors'] = census_data.loc[:, "female_65_to_66":"female_85_and_over"].sum(axis=1)

In [16]:
# Creating youth and senior population categories 
census_data['youth_pop'] = census_data['male_youth'] + census_data['female_youth']
census_data['seniors_pop'] = census_data['male_seniors'] + census_data['female_seniors']

In [17]:
census_data.columns

Index(['total_pop', 'households', 'not_us_citizen_pop', 'black_pop',
       'hispanic_pop', 'poverty', 'pop_determined_poverty_status',
       'workers_with_no_car', 'households_with_no_cars', 'male_under_5',
       'male_5_to_9', 'male_10_to_14', 'male_15_to_17', 'male_18_to_19',
       'male_20', 'male_21', 'male_22_to_24', 'male_65_to_66', 'male_67_to_69',
       'male_70_to_74', 'male_75_to_79', 'male_80_to_84', 'male_85_and_over',
       'female_under_5', 'female_5_to_9', 'female_10_to_14', 'female_15_to_17',
       'female_18_to_19', 'female_20', 'female_21', 'female_22_to_24',
       'female_65_to_66', 'female_67_to_69', 'female_70_to_74',
       'female_75_to_79', 'female_80_to_84', 'female_85_and_over',
       'income_less_10000', 'income_10000_14999', 'income_15000_24999',
       'income_25000_34999', 'income_35000_49999', 'income_50000_64999',
       'income_65000_74999', 'GEOID', 'inc_extremelylow', 'inc_verylow',
       'inc_low', 'male_youth', 'female_youth', 'male_senior

In [18]:
#Retrieving Tract Geometries for California
ca_tracts = tracts(state = "CA", cb = True,
                    year = 2023, cache = True)

Using FIPS code '06' for input 'CA'


In [19]:
ca_tracts.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 9109 entries, 0 to 9108
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   STATEFP     9109 non-null   object  
 1   COUNTYFP    9109 non-null   object  
 2   TRACTCE     9109 non-null   object  
 3   GEOIDFQ     9109 non-null   object  
 4   GEOID       9109 non-null   object  
 5   NAME        9109 non-null   object  
 6   NAMELSAD    9109 non-null   object  
 7   STUSPS      9109 non-null   object  
 8   NAMELSADCO  9109 non-null   object  
 9   STATE_NAME  9109 non-null   object  
 10  LSAD        9109 non-null   object  
 11  ALAND       9109 non-null   int64   
 12  AWATER      9109 non-null   int64   
 13  geometry    9109 non-null   geometry
dtypes: geometry(1), int64(2), object(11)
memory usage: 996.4+ KB


In [20]:
# Merging the census tract geometries with the census data based on the GEOID
tracts_ca_acs = ca_tracts.merge(census_data, how="inner", on="GEOID")

In [21]:
#Selecting Relevant Columns:
tracts_ca_acs = tracts_ca_acs[['ALAND', 'geometry', 'GEOID', 'total_pop', 'households', 'not_us_citizen_pop', 'black_pop', 'hispanic_pop', 'inc_extremelylow', 'inc_verylow',
                               'inc_low', 'poverty', 'pop_determined_poverty_status', 'workers_with_no_car', 'households_with_no_cars', 'male_youth', 'female_youth', 'male_seniors',
                               'female_seniors', 'youth_pop', 'seniors_pop']]

In [22]:
tracts_ca_acs.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 9109 entries, 0 to 9108
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   ALAND                          9109 non-null   int64   
 1   geometry                       9109 non-null   geometry
 2   GEOID                          9109 non-null   object  
 3   total_pop                      9109 non-null   int64   
 4   households                     9109 non-null   int64   
 5   not_us_citizen_pop             9109 non-null   int64   
 6   black_pop                      9109 non-null   int64   
 7   hispanic_pop                   9109 non-null   int64   
 8   inc_extremelylow               9109 non-null   int64   
 9   inc_verylow                    9109 non-null   int64   
 10  inc_low                        9109 non-null   int64   
 11  poverty                        9109 non-null   int64   
 12  pop_determined_poverty_sta

In [23]:
def export_gdf(gdf, filename: str):
    
    gdf.to_parquet(f"{filename}.parquet")
    
    fs.put(
        f"{filename}.parquet",
        f"{GCS_FILE_PATH}/{filename}.parquet",
        token = credentials.token
    )
    
    os.remove(f"{filename}.parquet")
    print(f"saved {GCS_FILE_PATH}/{filename}.parquet")
    
    return

In [24]:
export_gdf(tracts_ca_acs, "tracts_ca_acs")

saved gs://calitp-analytics-data/data-analyses/ahsc_grant//tracts_ca_acs.parquet
