In [79]:
from dotenv import load_dotenv
import os
import requests
import pandas as pd
from typing import Dict

In [80]:
class PlacesClient:
    def __init__(self, token):
        self.base_url = 'https://data.cdc.gov/api/v3/views/'
        self.session = requests.Session()
        self.session.headers.update({
            'X-App-Token': token
        })

    def _make_request(self, url: str, params=None):
        """
        Make a get request to the API and return responses in JSON
        """
        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            raise RuntimeError(f"API Error: {e}")
    
    def _json_to_df(self, data) -> pd.DataFrame:
        """
        Transform JSON data into pandas DataFrame.
        """
        df = pd.DataFrame(data)
        # remove the API's metadata
        df = df.drop(
            [':id', ':version', ':created_at', ':updated_at', 'data_value_footnote_symbol', 'data_value_footnote'], 
            axis=1, errors='ignore'
            )
        # convert numeric variables
        numeric_cols = ['data_value', 'low_confidence_limit', 'high_confidence_limit', 'totalpopulation']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        return df

    def get_measure_list(self) -> pd.DataFrame:
        """
        Display all Health Outcomes and Health Risk Behaviors Measures 

        Returns
        -------
        measures_df : pandas Data Frame
            A dataframe displaying the following the information of filtered measures:
            - id: measure identifier
            - short_name: short label
            - full_name: full descriptive name
            - catgory: measure category (Health Outcomes or Health Risk Behaviors)

        Examples
        --------
        >>> measures = client.get_measure_list()
        >>> measures.head()
        """
        data_dictionary_id = 'm35w-spkz'
        url = self.base_url + data_dictionary_id + '/query.json'

        data = self._make_request(url)
        measures_df = self._json_to_df(data)
        measures_df = measures_df[measures_df['categoryid'].isin(['HLTHOUT', 'RISKBEH'])]
        measures_df = measures_df[['measureid', 'measure_short_name', 'measure_full_name', 'category_name']]
        measures_df.columns = pd.Index(['id', 'short_name', 'full_name', 'category'])
        return measures_df
    
    def get_county_data(self, release: str ='2025') -> pd.DataFrame:
        """
        Retrieve county-level health-risk behaviors and health outcomes data from The CDC PLACES API.
        
        Parameters
        ----------
        release : string
            The version of release to retrieve from.

        Returns
        -------
        county_df : pandas DataFrame
            A dataframe containing information of county-level PLACES data
        
        Examples
        --------
        >>> df = client.get_county_data('2023')
        >>> df.head()
        """
        release_ids = {
            '2025': 'swc5-untb',
            '2024': 'fu4u-a9bh',
            '2023': 'h3ej-a9ec',
            '2022': 'duw2-7jbt',
            '2021': 'pqpp-u99h',
            '2020': 'dv4u-3x3q'
        }
        
        if not isinstance(release, str):
            raise TypeError("The release must be a string.")
        if release not in release_ids:
            raise ValueError("This release version is not supported.")

        url = self.base_url + release_ids[release] + '/query.json'

        data = self._make_request(url)
        county_df = self._json_to_df(data)
        
        # Only keep measures categorized as health outcomes and health risk behaviors
        county_df = county_df[county_df['categoryid'].isin(['HLTHOUT', 'RISKBEH'])]
        county_df = county_df.reset_index(drop=True)

        # Drop rows missing the key data
        county_df = county_df.dropna(subset=["data_value"]).reset_index(drop=True)
        return county_df

    def filter_by_measures(self, df: pd.DataFrame, measures: str=None, categories: str=None) -> pd.DataFrame:
        """
        Get a subset of a PLACES DataFrame by measures or categories. 
        Both the short names and ids of measures are supported.
        
        Parameters
        ----------
        df : pandas DataFrame
            The county-level PLACES dataset.
        measures : list of strings
            Short names or measureids of measures to keep.
        categories : list of strings
            Short namse or categoryids of categories to keep.

        Returns
        -------
        sub_df : pandas DataFrame
            A dataframe containing only data of selected measures and/or categories.
        
        Examples
        --------
        >>> new_df = client.filter_by_measures(df, measures=['Physical Inactivity','Current Asthma'])
        >>> new_df = client.filter_by_measures(df, categories=['Health Outcomes'])
        """
        sub_df = df
        if measures:
            sub_df = sub_df[sub_df['short_question_text'].isin(measures) | sub_df['measureid'].isin(measures)]
        if categories:
            sub_df = sub_df[sub_df['category'].isin(categories) | sub_df['categoryid'].isin(categories)]
        return sub_df
    
    def filter_by_regions(self, df: pd.DataFrame, states: str=None, counties: str=None) -> pd.DataFrame:
        """
        Get a subset of a PLACES DataFrame by states or counties. 
        Both the names and abbreviations of states are supported.
        
        Parameters
        ----------
        df : pandas DataFrame
            The county-level PLACES dataset.
        states : list of strings
            Names or stateabbr of states to keep.
        counties : list of strings
            Names of counties to keep.

        Returns
        -------
        sub_df : pandas DataFrame
            A dataframe containing only data of selected counties and/or states.
        
        Examples
        --------
        >>> new_df = client.filter_by_measures(df, measures=['Physical Inactivity','Current Asthma'])
        >>> new_df = client.filter_by_measures(df, categories=['Health Outcomes'])
        """
        sub_df = df
        if states:
            sub_df = sub_df[sub_df['stateabbr'].isin(states) | sub_df['statedesc'].isin(states)]
        if counties:
            sub_df = sub_df[sub_df['locationname'].isin(counties)]
        return sub_df
    
    def create_pivot_table(self, df: pd.DataFrame, level='county') -> pd.DataFrame:
        """
        Create a wide pivot table that shows all measure values for each county or for each state.
        
        Parameters
        ----------
        df : pandas DataFrame
            The county-level PLACES dataset.
        level : str, optional
            Aggregation level, county or state.

        Returns
        -------
        table : pandas DataFrame
            A pivot table with columns representing measure IDs and rows representing counties or states.
        
        Examples
        --------
        >>> state_table = client.create_pivot_table(df, level='state')
        >>> state_table.head()
        """
        if level not in ['county', 'state']:
            raise ValueError("Level must be 'county' or 'state'.")
    
        # convert df into wide format
        table = df.pivot_table(
            index='locationname',
            columns='measureid',
            values='data_value',
        )

        if level == 'state':
            # aggregate county-level data into state-level data
            counties_states = df[['locationname', 'statedesc']].drop_duplicates().set_index('locationname')
            table = table.join(counties_states, how='left')
            table = table.groupby('statedesc').mean()

        return table

    def get_correlation(self, df: pd.DataFrame, x:str, y:str) -> Dict[str, float]:
        """
        Calculate the correlation between 2 measures
        
        Parameters
        ----------
        df : pandas DataFrame
            The county-level PLACES dataset.
        x : str
        The measure ID of the first variable.
        y : str
        The measure ID of the second variable.


        Returns
        -------
        result : dict
            A dictionary containing:
            - corr_coef: the correlation coefficient (r)
            - sample_size: number of counties included in calculation
            - mean_x, mean_y: means of measure x and y
        
        Examples
        --------
        >>> client.get_correlation(places_2024, 'LPA', 'DEPRESSION')
        {'corr_coef': 0.20321713670955188, 'sample_size': 1838, 'mean_x': 26.86089867640032, 'mean_y': 23.600384332489686}
        """
        if x is None or y is None:
            raise ValueError("Two measures (x and y) must be provided.")
        if not isinstance(x, str) or not isinstance(y, str):
            raise TypeError("x and y must be strings.")
        measures = df['measureid'].unique()
        if x not in measures or y not in measures:
            raise ValueError("Invalid measureid.")
        
        sub_df = self.filter_by_measures(df, measure_ids=[x, y])

        table = sub_df.pivot_table(values='data_value', index='locationname', columns='measureid')
        table = table.dropna()
        r = table[x].corr(table[y], method='pearson')

        result = {
            'corr_coef': float(r), 
            'sample_size': len(table),
            'mean_x': float(table[x].mean()),
            'mean_y': float(table[y].mean())
        }
        return result
    
    def summarize_measure(self, df: pd.DataFrame, measureid: str) -> Dict[str, float]:
        """
        Offer basic descriptive statistics for a given PLACES measure.

        Parameters
        ----------
        df : pandas DataFrame
            The county-level PLACES dataset.
        measureid : str
            The measure ID of the measure to summarize.

        Returns
        -------
        summary : dict
            Dictionary with mean, median, min, max, and missing value count.
        """
        if measureid not in df['measureid'].unique():
            raise ValueError("Invalid measureid.")

        data = df[df['measureid'] == measureid]['data_value']

        summary_dict =  {
            'mean': float(data.mean()),
            'median': float(data.median()),
            'min': float(data.min()),
            'max': float(data.max()),
            'std': float(data.std()),
            'count': float(data.count()),
            'missing_values_count': float(data.isna().sum())
        }
        return summary_dict


In [81]:
load_dotenv()
token = os.getenv('CDC_API_TOKEN')

client = PlacesClient(token=token)

In [82]:
# test get_county_data
places_2024 = client.get_county_data('2024')

In [83]:
places_WI = client.filter_by_regions(places_2024, states=['WI'])
places_WI

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,totalpop18plus,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation
438,2022,WI,Wisconsin,Dane,BRFSS,Health Outcomes,Obesity among adults,%,Crude prevalence,30.8,27.2,34.5,568203,458016,55025,HLTHOUT,OBESITY,CrdPrv,Obesity,"{'type': 'Point', 'coordinates': [-89.41829876..."
439,2022,WI,Wisconsin,Jackson,BRFSS,Health Outcomes,Stroke among adults,%,Crude prevalence,4.1,3.7,4.5,20836,16396,55053,HLTHOUT,STROKE,CrdPrv,Stroke,"{'type': 'Point', 'coordinates': [-90.80499871..."
443,2022,WI,Wisconsin,Portage,BRFSS,Health Outcomes,Arthritis among adults,%,Crude prevalence,25.4,24.5,26.2,70718,57657,55097,HLTHOUT,ARTHRITIS,CrdPrv,Arthritis,"{'type': 'Point', 'coordinates': [-89.50146342..."
444,2022,WI,Wisconsin,Racine,BRFSS,Health Outcomes,Arthritis among adults,%,Crude prevalence,30.9,30.3,31.5,195846,151617,55101,HLTHOUT,ARTHRITIS,CrdPrv,Arthritis,"{'type': 'Point', 'coordinates': [-88.06151104..."
445,2022,WI,Wisconsin,Rusk,BRFSS,Health Outcomes,Stroke among adults,%,Crude prevalence,5.0,4.5,5.5,14186,11360,55107,HLTHOUT,STROKE,CrdPrv,Stroke,"{'type': 'Point', 'coordinates': [-91.13345533..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100366,2022,WI,Wisconsin,Manitowoc,BRFSS,Health Outcomes,Diagnosed diabetes among adults,%,Crude prevalence,11.5,10.0,13.2,81172,64819,55071,HLTHOUT,DIABETES,CrdPrv,Diabetes,"{'type': 'Point', 'coordinates': [-87.80959450..."
100367,2022,WI,Wisconsin,Fond du Lac,BRFSS,Health Outcomes,Stroke among adults,%,Crude prevalence,3.7,3.3,4.0,103836,82265,55039,HLTHOUT,STROKE,CrdPrv,Stroke,"{'type': 'Point', 'coordinates': [-88.48834337..."
100368,2021,WI,Wisconsin,Iron,BRFSS,Health Outcomes,High cholesterol among adults who have ever be...,%,Age-adjusted prevalence,30.1,25.4,35.0,6224,5259,55051,HLTHOUT,HIGHCHOL,AgeAdjPrv,High Cholesterol,"{'type': 'Point', 'coordinates': [-90.24211599..."
100369,2021,WI,Wisconsin,Polk,BRFSS,Health Outcomes,High blood pressure among adults,%,Crude prevalence,34.3,29.9,38.8,45709,36755,55095,HLTHOUT,BPHIGH,CrdPrv,High Blood Pressure,"{'type': 'Point', 'coordinates': [-92.44127559..."


In [77]:
summary = client.summarize_measure(places_2024, 'COPD')
summary

{'mean': 8.275484896661368,
 'median': 8.0,
 'min': 3.1,
 'max': 19.4,
 'std': 2.1930258416977533,
 'count': 6290.0,
 'missing_values_count': 0.0}

In [63]:
places_2024_pivot = client.create_pivot_table(places_2024)
places_2024_pivot.head()

measureid,ARTHRITIS,BINGE,BPHIGH,CANCER,CASTHMA,CHD,COPD,CSMOKING,DEPRESSION,DIABETES,HIGHCHOL,LPA,OBESITY,SLEEP,STROKE,TEETHLOST
locationname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Abbeville,29.65,16.55,39.2,7.9,10.25,7.4,8.45,17.25,21.8,13.45,34.25,26.75,38.55,37.6,4.4,23.35
Acadia,28.95,18.05,40.85,7.45,10.9,7.75,9.55,20.7,28.1,13.35,36.1,31.95,36.8,37.65,4.4,18.7
Accomack,33.2,14.7,41.25,8.7,11.4,8.1,9.95,20.4,23.9,15.7,36.85,30.05,43.65,36.85,4.75,24.05
Ada,23.6,17.5,26.9,7.85,10.0,5.4,4.85,11.55,24.95,8.2,28.3,17.95,28.75,30.45,2.65,17.25
Adair,29.7,16.825,35.9625,7.85,11.4875,8.1125,9.4625,19.7375,25.0625,12.975,33.45,29.8875,40.425,38.825,4.175,21.2625


In [50]:
client.get_correlation(places_2024, 'LPA', 'DEPRESSION')

{'corr_coef': 0.20321713670955188,
 'sample_size': 1838,
 'mean_x': 26.86089867640032,
 'mean_y': 23.600384332489686}

In [5]:
places_2024.columns

Index(['year', 'stateabbr', 'statedesc', 'locationname', 'datasource',
       'category', 'measure', 'data_value_unit', 'data_value_type',
       'data_value', 'low_confidence_limit', 'high_confidence_limit',
       'totalpopulation', 'totalpop18plus', 'locationid', 'categoryid',
       'measureid', 'datavaluetypeid', 'short_question_text', 'geolocation'],
      dtype='object')

In [6]:
places_2024.tail(3)

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,totalpop18plus,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation
100369,2021,WI,Wisconsin,Polk,BRFSS,Health Outcomes,High blood pressure among adults,%,Crude prevalence,34.3,29.9,38.8,45709,36755,55095,HLTHOUT,BPHIGH,CrdPrv,High Blood Pressure,"{'type': 'Point', 'coordinates': [-92.44127559..."
100370,2022,WI,Wisconsin,Trempealeau,BRFSS,Health Outcomes,Depression among adults,%,Age-adjusted prevalence,24.5,20.9,28.2,30899,23116,55121,HLTHOUT,DEPRESSION,AgeAdjPrv,Depression,"{'type': 'Point', 'coordinates': [-91.35842148..."
100371,2022,WY,Wyoming,Fremont,BRFSS,Health Risk Behaviors,Binge drinking among adults,%,Crude prevalence,16.4,13.5,19.5,39472,29818,56013,RISKBEH,BINGE,CrdPrv,Binge Drinking,"{'type': 'Point', 'coordinates': [-108.6304546..."


In [11]:
# test get_measure_list()
client.get_measure_list()

Unnamed: 0,id,short_name,full_name,category
0,ARTHRITIS,Arthritis,Arthritis among adults,Health Outcomes
1,BPHIGH,High Blood Pressure,High blood pressure among adults,Health Outcomes
2,CANCER,Cancer (non-skin) or melanoma,Cancer (non-skin) or melanoma among adults,Health Outcomes
3,CASTHMA,Current Asthma,Current asthma among adults,Health Outcomes
4,CHD,Coronary Heart Disease,Coronary heart disease among adults,Health Outcomes
5,COPD,COPD,Chronic obstructive pulmonary disease among ad...,Health Outcomes
6,DEPRESSION,Depression,Depression among adults,Health Outcomes
7,DIABETES,Diabetes,Diagnosed diabetes among adults,Health Outcomes
8,HIGHCHOL,High Cholesterol,High cholesterol among adults who have ever be...,Health Outcomes
9,KIDNEY,Chronic Kidney Disease,Chronic kidney disease among adults aged >=18 ...,Health Outcomes


In [84]:
sub_df = client.filter_by_measures(places_2024, measures=['COPD', 'CASTHMA'])
sub_df.tail()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,totalpop18plus,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation
100316,2022,WI,Wisconsin,Winnebago,BRFSS,Health Outcomes,Current asthma among adults,%,Age-adjusted prevalence,11.1,9.8,12.4,170718,136750,55139,HLTHOUT,CASTHMA,AgeAdjPrv,Current Asthma,"{'type': 'Point', 'coordinates': [-88.64469436..."
100318,2022,WI,Wisconsin,Iron,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.0,5.3,6.7,6224,5259,55051,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-90.24211599..."
100346,2022,WI,Wisconsin,Juneau,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,9.0,8.1,10.0,26866,21737,55057,HLTHOUT,COPD,CrdPrv,COPD,"{'type': 'Point', 'coordinates': [-90.11373312..."
100352,2022,WI,Wisconsin,Milwaukee,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,6.6,5.9,7.2,918661,702428,55079,HLTHOUT,COPD,CrdPrv,COPD,"{'type': 'Point', 'coordinates': [-87.96694912..."
100363,2022,WI,Wisconsin,Milwaukee,BRFSS,Health Outcomes,Current asthma among adults,%,Crude prevalence,11.7,10.5,12.9,918661,702428,55079,HLTHOUT,CASTHMA,CrdPrv,Current Asthma,"{'type': 'Point', 'coordinates': [-87.96694912..."
