In [2]:
import requests
import pandas as pd
import json

url = "https://chronicdata.cdc.gov/resource/cwsq-ngmh.json"
params = {"stateabbr": "MA", "$limit": "10"}
response = requests.get(url, params=params)
print(f"Status: {response.status_code}")
ma_data = response.json()
print(f"Retrieved {len(ma_data)} Massachusetts records")

Status: 200
Retrieved 10 Massachusetts records


In [3]:
if ma_data:
    first_record = ma_data[0]
    print("Available fields:")
    for key in first_record.keys():
        print(f"  {key}")

Available fields:
  year
  stateabbr
  statedesc
  countyname
  countyfips
  locationname
  datasource
  category
  measure
  data_value_unit
  data_value_type
  data_value
  low_confidence_limit
  high_confidence_limit
  totalpopulation
  totalpop18plus
  geolocation
  locationid
  categoryid
  measureid
  datavaluetypeid
  short_question_text


In [4]:
print("Sample record structure:")
if ma_data:
    sample = ma_data[0]
    print(f"Location: {sample['locationname']} ({sample['countyname']} County)")
    print(f"Measure: {sample['measure']}")
    print(f"Value: {sample['data_value']}% ({sample['data_value_type']})")
    print(f"Confidence Interval: {sample['low_confidence_limit']}-{sample['high_confidence_limit']}%")
    print(f"Population: {sample['totalpopulation']} (18+: {sample['totalpop18plus']})")
    print(f"Year: {sample['year']}, Source: {sample['datasource']}")

Sample record structure:
Location: 25003900400 (Berkshire County)
Measure: Stroke among adults
Value: 3.3% (Crude prevalence)
Confidence Interval: 3.0-3.7%
Population: 4742 (18+: 3813)
Year: 2022, Source: BRFSS


In [5]:
categories = {}
for record in ma_data:
    cat = record.get('category', 'Unknown')
    categories[cat] = categories.get(cat, 0) + 1

print("Health Categories in MA data:")
for category, count in sorted(categories.items()):
    print(f"  {category}: {count} records")

Health Categories in MA data:
  Health Outcomes: 10 records


In [6]:
measures = {}
for record in ma_data:
    measure = record.get('measure', 'Unknown')
    measures[measure] = measures.get(measure, 0) + 1

print(f"Found {len(measures)} unique health measures:")
for measure, count in sorted(measures.items()):
    print(f"  {measure}: {count} records")


Found 1 unique health measures:
  Stroke among adults: 10 records


In [7]:
#Ah let's get more
large_params = {"stateabbr": "MA", "$limit": "500"}
large_response = requests.get(url, params=large_params)
large_ma_data = large_response.json()
print(f"Retrieved {len(large_ma_data)} MA records for analysis")

Retrieved 500 MA records for analysis


In [8]:
counties = {}
for record in large_ma_data:
    county = record.get('countyname', 'Unknown')
    counties[county] = counties.get(county, 0) + 1

print("Massachusetts Counties in dataset:")
for county, count in sorted(counties.items()):
    print(f"  {county}: {count} records")

Massachusetts Counties in dataset:
  Barnstable: 68 records
  Berkshire: 52 records
  Bristol: 159 records
  Dukes: 7 records
  Essex: 134 records
  Franklin: 1 records
  Hampden: 9 records
  Hampshire: 2 records
  Middlesex: 13 records
  Norfolk: 21 records
  Plymouth: 9 records
  Suffolk: 23 records
  Worcester: 2 records


In [9]:
years = {}
for record in large_ma_data:
    year = record.get('year', 'Unknown')
    years[year] = years.get(year, 0) + 1
print("Data years available:")
for year, count in sorted(years.items()):
    print(f"  {year}: {count} records")

Data years available:
  2021: 33 records
  2022: 467 records


In [10]:
sources = {}
for record in large_ma_data:
    source = record.get('datasource', 'Unknown')
    sources[source] = sources.get(source, 0) + 1

print("Data sources:")
for source, count in sorted(sources.items()):
    print(f"  {source}: {count} records")

Data sources:
  BRFSS: 500 records


In [11]:
equity_measures = []
for record in large_ma_data:
    measure = record.get('measure', '').lower()
    if any(keyword in measure for keyword in ['diabetes', 'obesity', 'blood pressure', 'heart', 'stroke']):
        equity_measures.append(record)

print(f"Healthcare Equity relevant measures: {len(equity_measures)}")
equity_conditions = {}
for record in equity_measures:
    measure = record.get('measure', 'Unknown')
    equity_conditions[measure] = equity_conditions.get(measure, 0) + 1

for condition, count in sorted(equity_conditions.items()):
    print(f"  {condition}: {count} records")

Healthcare Equity relevant measures: 158
  Coronary heart disease among adults: 9 records
  Diagnosed diabetes among adults: 9 records
  High blood pressure among adults: 8 records
  Obesity among adults: 28 records
  Stroke among adults: 96 records
  Taking medicine to control high blood pressure among adults with high blood pressure: 8 records


In [12]:
geo_records = []
for record in large_ma_data:
    geo = record.get('geolocation', {})
    if geo and isinstance(geo, dict) and 'coordinates' in geo:
        coords = geo['coordinates']
        if len(coords) >= 2:
            geo_records.append({
                'location': record.get('locationname', ''),
                'county': record.get('countyname', ''),
                'longitude': coords[0],
                'latitude': coords[1],
                'measure': record.get('measure', ''),
                'value': record.get('data_value', ''),
                'year': record.get('year', '')
            })

print(f"Records with geographic coordinates: {len(geo_records)}")
if geo_records:
    print("Sample geographic data:")
    for item in geo_records[:5]:
        print(f"  {item['county']}: {item['latitude']:.3f}, {item['longitude']:.3f}")

Records with geographic coordinates: 500
Sample geographic data:
  Berkshire: 42.479, -73.255
  Berkshire: 42.480, -73.154
  Bristol: 42.041, -71.136
  Bristol: 42.035, -71.223
  Bristol: 41.886, -71.068


In [13]:
print("Data Quality Analysis:")
total_records = len(large_ma_data)
missing_values = 0
valid_ranges = 0

for record in large_ma_data:
    value = record.get('data_value', '')
    if not value or value == '':
        missing_values += 1
    else:
        try:
            val = float(value)
            if 0 <= val <= 100:
                valid_ranges += 1
        except:
            missing_values += 1

print(f"Total records: {total_records}")
print(f"Missing/invalid values: {missing_values} ({missing_values/total_records*100:.1f}%)")
print(f"Valid percentage values: {valid_ranges} ({valid_ranges/total_records*100:.1f}%)")

Data Quality Analysis:
Total records: 500
Missing/invalid values: 0 (0.0%)
Valid percentage values: 500 (100.0%)


In [14]:
df_data = []
for record in large_ma_data:
    try:
        df_data.append({
            'year': record.get('year', ''),
            'county': record.get('countyname', ''),
            'location_id': record.get('locationname', ''),
            'measure': record.get('measure', ''),
            'category': record.get('category', ''),
            'value': float(record.get('data_value', 0)) if record.get('data_value') else None,
            'confidence_low': float(record.get('low_confidence_limit', 0)) if record.get('low_confidence_limit') else None,
            'confidence_high': float(record.get('high_confidence_limit', 0)) if record.get('high_confidence_limit') else None,
            'population': int(record.get('totalpopulation', 0)) if record.get('totalpopulation') else None,
            'data_source': record.get('datasource', ''),
            'data_type': record.get('data_value_type', '')
        })
    except:
        pass

df = pd.DataFrame(df_data)
print(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
print(f"Columns: {df.columns.tolist()}")


Created DataFrame with 500 rows and 11 columns
Columns: ['year', 'county', 'location_id', 'measure', 'category', 'value', 'confidence_low', 'confidence_high', 'population', 'data_source', 'data_type']


In [15]:
diabetes_df = df[df['measure'].str.contains('Diabetes', case=False, na=False)]
print(f"Diabetes records: {len(diabetes_df)}")

if not diabetes_df.empty:
    print(f"Diabetes data years: {sorted(diabetes_df['year'].unique())}")
    print(f"Counties with diabetes data: {len(diabetes_df['county'].unique())}")
    
    
    print("\nSample diabetes prevalence data:")
    for _, row in diabetes_df.head().iterrows():
        print(f"  {row['county']} ({row['year']}): {row['value']}%")


Diabetes records: 9
Diabetes data years: ['2022']
Counties with diabetes data: 4

Sample diabetes prevalence data:
  Barnstable (2022): 10.7%
  Essex (2022): 9.1%
  Essex (2022): 7.4%
  Essex (2022): 9.4%
  Bristol (2022): 13.5%


In [16]:
diabetes_df = df[df['measure'].str.contains('Diabetes', case=False, na=False)]
print(f"Diabetes records: {len(diabetes_df)}")

if not diabetes_df.empty:
    print(f"Diabetes data years: {sorted(diabetes_df['year'].unique())}")
    print(f"Counties with diabetes data: {len(diabetes_df['county'].unique())}")
    
    
    print("\nSample diabetes prevalence data:")
    for _, row in diabetes_df.head().iterrows():
        print(f"  {row['county']} ({row['year']}): {row['value']}%")


Diabetes records: 9
Diabetes data years: ['2022']
Counties with diabetes data: 4

Sample diabetes prevalence data:
  Barnstable (2022): 10.7%
  Essex (2022): 9.1%
  Essex (2022): 7.4%
  Essex (2022): 9.4%
  Bristol (2022): 13.5%


In [17]:
conditions = ['Diabetes', 'Obesity', 'High blood pressure', 'Stroke', 'Cancer']
condition_summary = {}

for condition in conditions:
    subset = df[df['measure'].str.contains(condition, case=False, na=False)]
    if not subset.empty:
        condition_summary[condition] = {
            'records': len(subset),
            'counties': len(subset['county'].unique()),
            'years': sorted(subset['year'].unique()),
            'avg_value': subset['value'].mean() if subset['value'].notna().any() else 0
        }

print("Multi-condition summary for analytics:")
for condition, stats in condition_summary.items():
    print(f"\n{condition}:")
    print(f"  Records: {stats['records']}")
    print(f"  Counties: {stats['counties']}")
    print(f"  Years: {stats['years']}")
    print(f"  Average prevalence: {stats['avg_value']:.1f}%")

Multi-condition summary for analytics:

Diabetes:
  Records: 9
  Counties: 4
  Years: ['2022']
  Average prevalence: 10.3%

Obesity:
  Records: 28
  Counties: 9
  Years: ['2022']
  Average prevalence: 28.8%

High blood pressure:
  Records: 16
  Counties: 5
  Years: ['2021']
  Average prevalence: 56.1%

Stroke:
  Records: 96
  Counties: 12
  Years: ['2022']
  Average prevalence: 3.2%

Cancer:
  Records: 12
  Counties: 4
  Years: ['2022']
  Average prevalence: 48.7%


In [18]:
latest_year = df['year'].max()
latest_data = df[df['year'] == latest_year]

print(f"Latest year analysis ({latest_year}):")
print(f"Records: {len(latest_data)}")


county_summary = latest_data.groupby('county').agg({
    'value': ['count', 'mean'],
    'population': 'max'
}).round(2)

if not county_summary.empty:
    print(f"\nCounty health metrics for {latest_year}:")
    print(county_summary.head())

Latest year analysis (2022):
Records: 467

County health metrics for 2022:
           value        population
           count   mean        max
county                            
Barnstable    58  18.66       7201
Berkshire     48  19.92       6330
Bristol      147  22.46       8101
Dukes          6  18.57       5341
Essex        128  17.75       9069


In [19]:
sample_export = df.head(20)[['year', 'county', 'measure', 'value', 'confidence_low', 'confidence_high']]
print("Sample data ready for export:")
print(sample_export)

print(f"\nData pipeline summary:")
print(f"- From the CDC PLACES API")
print(f"- Retrieved {len(large_ma_data)} Massachusetts health records")
print(f"- Processed {len(df)} valid records into analysis format")
print(f"- Identified {len(counties)} counties with health data")
print(f"- Found data spanning {len(years)} years")


Sample data ready for export:
    year     county                 measure  value  confidence_low  \
0   2022  Berkshire     Stroke among adults    3.3             3.0   
1   2022  Berkshire     Stroke among adults    3.3             3.0   
2   2022    Bristol     Stroke among adults    3.0             2.7   
3   2022    Bristol     Stroke among adults    2.3             2.1   
4   2022    Bristol     Stroke among adults    3.2             3.0   
5   2022    Bristol     Stroke among adults    3.0             2.7   
6   2022    Bristol     Stroke among adults    5.4             4.9   
7   2022    Bristol     Stroke among adults    4.4             4.0   
8   2022    Bristol     Stroke among adults    4.0             3.6   
9   2022    Bristol     Stroke among adults    3.5             3.2   
10  2022    Bristol     Stroke among adults    3.9             3.6   
11  2022    Bristol     Stroke among adults    3.7             3.4   
12  2022    Bristol     Stroke among adults    4.0          