In [1]:
# Prerequisite: pygris must be installed in the kernel environment (e.g. `%pip install pygris`).

In [20]:
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
from pygris import tracts 
from calitp_data_analysis.sql import get_engine
# Warehouse engine used by the SQL queries further down.
db_engine = get_engine()

# Google credentials and project resolved from the environment.
credentials, project = google.auth.default()

# Google Cloud Storage filesystem handle.
fs = gcsfs.GCSFileSystem()

# Show every column when a wide frame is displayed.
pd.set_option("display.max_columns", None)

In [3]:
# The Census API key is read from a local file so it is not hardcoded in the notebook.
with open("ACS_apikey") as key_file:
    api_key = key_file.read().strip()

In [4]:
# ACS variable codes requested from the Census API.
# Metrics required: "Total Population", "Total Veteran Population",
# "Total Senior Population", "Total Low Income Population".
# NOTE(review): no veteran-population variable is requested below — confirm whether one is needed.
variables = [
    # Total population
    "B01003_001E",
    # Population with income in the past 12 months below poverty level
    "B17001_002E",
    # Non-US-citizen population
    "B16008_037E",
    # Male senior population: 65 and above
    "B01001_020E",
    "B01001_021E",
    "B01001_022E",
    "B01001_023E",
    "B01001_024E",
    "B01001_025E",
    # Female senior population: 65 and above
    "B01001_044E",
    "B01001_045E",
    "B01001_046E",
    "B01001_047E",
    "B01001_048E",
    "B01001_049E",
    # Population with extremely low income
    "B06010_004E",
    "B06010_005E",
    "B06010_006E",
    # Population with very low income
    "B06010_007E",
    "B06010_008E",
    # Population with low income
    "B06010_009E",
    "B06010_010E",
    # Workers and households with no cars
    "B08014_002E",
    "B08201_002E",
    # Intended as the population with a disability.
    # NOTE(review): _001E looks like the table's total row rather than the disabled
    # count — in the displayed output this column equals total_pop for most tracts.
    # Verify against the ACS variable list.
    "B18101_001E",
    # Intended as households with public assistance income or Food Stamps/SNAP
    # in the past 12 months.
    # NOTE(review): _001E may likewise be the table's universe total — verify.
    "B19058_001E",
]

             

In [5]:
# Comma-separated field list for the API's `get` parameter, with NAME first.
acs_fields = ",".join(variables)
variable_str = "NAME," + acs_fields

In [6]:
# 2023 ACS 5-year request for every tract in state 06.
# The API key is part of the query string, so avoid displaying or logging `url`.
url = (
    "https://api.census.gov/data/2023/acs/acs5"
    f"?get={variable_str}"
    "&for=tract:*&in=state:06"
    f"&key={api_key}"
)

In [7]:
# Without a timeout, requests waits indefinitely if the server never answers,
# which would hang the notebook. Allow 10 s to connect and 120 s to read.
response = requests.get(url, timeout=(10, 120))

In [8]:
# Fail loudly on anything other than HTTP 200. Silently skipping the parse would
# leave `census_data` undefined (or stale from an earlier run of this cell) and
# the failure would only surface in a later cell.
if response.status_code != 200:
    raise RuntimeError(
        f"Census API request failed with HTTP {response.status_code} ({response.reason})"
    )

data = response.json()

# The first row of the payload holds the column names; the rest are data rows.
census_data = pd.DataFrame(data[1:], columns=data[0])

# Create GEOID column by concatenating the state, county and tract codes.
census_data["GEOID"] = census_data["state"] + census_data["county"] + census_data["tract"]

In [9]:
# Capture the county name from NAME: the letters/spaces that follow a ';' and
# precede ' County;'.
county_pattern = r';\s*([A-Za-z\s]+) County;'
census_data['county_name'] = census_data['NAME'].str.extract(county_pattern, expand=False)

In [10]:
# NAME is no longer needed once county_name has been extracted from it.
census_data = census_data.drop('NAME', axis='columns')

In [11]:
# ACS variable code -> readable column name.
acs_column_names = {
    'B01003_001E': 'total_pop',
    'B17001_002E': 'poverty_pop',
    'B16008_037E': 'non_us_citizen',
    # Male age brackets, 65 and over
    'B01001_020E': 'male_65_to_66',
    'B01001_021E': 'male_67_to_69',
    'B01001_022E': 'male_70_to_74',
    'B01001_023E': 'male_75_to_79',
    'B01001_024E': 'male_80_to_84',
    'B01001_025E': 'male_85_and_over',
    # Female age brackets, 65 and over
    'B01001_044E': 'female_65_to_66',
    'B01001_045E': 'female_67_to_69',
    'B01001_046E': 'female_70_to_74',
    'B01001_047E': 'female_75_to_79',
    'B01001_048E': 'female_80_to_84',
    'B01001_049E': 'female_85_and_over',
    # Individual income brackets
    'B06010_004E': 'income_less_10000',
    'B06010_005E': 'income_10000_14999',
    'B06010_006E': 'income_15000_24999',
    'B06010_007E': 'income_25000_34999',
    'B06010_008E': 'income_35000_49999',
    'B06010_009E': 'income_50000_64999',
    'B06010_010E': 'income_65000_74999',
    # No-vehicle counts
    'B08014_002E': 'workers_with_no_car',
    'B08201_002E': 'households_with_no_cars',
    # NOTE(review): both codes below end in _001E, which may be the table total
    # rather than the subgroup the column name suggests — verify.
    'B18101_001E': 'disabled_pop',
    'B19058_001E': 'public_asst_pop',
}
census_data = census_data.rename(columns=acs_column_names)

In [12]:
# Identifier and label columns are left as they are; every other column is
# converted to a numeric dtype, with unparseable values becoming NaN.
exclude = ['state', 'county', 'tract', 'county_name', 'GEOID']
cols_to_numeric = list(filter(lambda col: col not in exclude, census_data.columns))
for col in cols_to_numeric:
    census_data[col] = pd.to_numeric(census_data[col], errors='coerce')

In [13]:
# Preview the first five rows (the default for head()).
census_data.head()

Unnamed: 0,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,disabled_pop,public_asst_pop,state,county,tract,GEOID,county_name
0,3094,134,264,47,84,119,49,46,78,52,70,72,85,105,107,188,75,134,157,87,129,70,28,85,3094,1316,6,1,400100,6001400100,Alameda
1,2093,164,96,18,60,59,58,28,26,40,35,67,96,34,13,75,70,89,12,207,77,32,92,95,2093,861,6,1,400200,6001400200,Alameda
2,5727,310,306,23,47,113,100,24,25,108,62,194,158,13,142,383,201,300,251,400,148,291,157,416,5727,2713,6,1,400300,6001400300,Alameda
3,4395,343,185,31,70,89,19,26,36,55,105,104,43,23,30,187,105,287,215,207,178,87,134,204,4376,1803,6,1,400400,6001400400,Alameda
4,3822,397,231,41,32,56,41,4,0,19,47,51,50,60,203,256,91,244,213,385,387,244,74,169,3822,1655,6,1,400500,6001400500,Alameda


In [14]:
# Custom income bands, each the sum of adjacent ACS income brackets.
# A missing value in any bracket makes the band missing for that tract.

# Under $25,000
census_data['inc_extremelylow'] = (
    census_data['income_less_10000']
    .add(census_data['income_10000_14999'])
    .add(census_data['income_15000_24999'])
)

# $25,000 to $49,999
census_data['inc_verylow'] = census_data['income_25000_34999'].add(
    census_data['income_35000_49999']
)

# $50,000 to $74,999
census_data['inc_low'] = census_data['income_50000_64999'].add(
    census_data['income_65000_74999']
)

In [15]:
# Senior (65+) totals per sex. The columns are listed explicitly rather than
# taken with a label slice, so the result does not depend on the physical
# order of columns in the frame.
male_senior_cols = [
    "male_65_to_66", "male_67_to_69", "male_70_to_74",
    "male_75_to_79", "male_80_to_84", "male_85_and_over",
]
female_senior_cols = [
    "female_65_to_66", "female_67_to_69", "female_70_to_74",
    "female_75_to_79", "female_80_to_84", "female_85_and_over",
]

# sum(axis=1) skips missing values, so a tract with a missing bracket still gets a total.
census_data['male_seniors'] = census_data[male_senior_cols].sum(axis=1)
census_data['female_seniors'] = census_data[female_senior_cols].sum(axis=1)

In [16]:
# Retrieve the 2023 census tract geometries for California via pygris.
# cb=True and cache=True are passed through unchanged.
ca_tracts = tracts(state="CA", year=2023, cb=True, cache=True)

Using FIPS code '06' for input 'CA'


In [18]:
# Attach the ACS columns to the tract geometries by GEOID.
# The inner join keeps only tracts present in both frames.
tracts_ca_acs = ca_tracts.merge(
    census_data,
    on="GEOID",
    how="inner",
)

In [21]:
# Query the dim_organizations table for organization identifiers, NTD keys,
# the operating flag and the _is_current/_valid_from/_valid_to columns.
# The query has no WHERE clause, so every row in the table is returned.
# NOTE(review): presumably that includes non-current record versions — filter
# on _is_current downstream if only current records are wanted.
with db_engine.connect() as connection:
    query = """
        SELECT
            source_record_id, organization_type, ntd_id, ntd_agency_info_key, 
            public_currently_operating, _is_current,_valid_from,  _valid_to
        FROM 
            cal-itp-data-infra.mart_transit_database.dim_organizations
    """
    # Run the query on the open connection and load the result into a DataFrame.
    dim_orgs= pd.read_sql(query, connection)