In [None]:
import requests
import pandas as pd
import plotly.express as px
from census_api import CENSUS_API_KEY
import json
import duckdb as db
from ydata_profiling import ProfileReport
from IPython.display import HTML

#### Get state names and state codes from the Census Bureau API

In [None]:
# Query the 2020 decennial census `pl` dataset for every state's name,
# total population and state code.
api_key = CENSUS_API_KEY
base_url = "https://api.census.gov/data/2020/dec/pl"
params = {
    'get': 'NAME,P1_001N',  # NAME = state name, P1_001N = total population
    'for': 'state:*',       # one row per state
    'key': api_key
}

response = requests.get(base_url, params=params)

# Anything other than HTTP 200 is reported and treated as "no data".
if response.status_code != 200:
    print(f"Error: {response.status_code}, {response.text}")
    data = []
else:
    data = response.json()

# The payload is a list of rows; the first row holds the column names.
if not data:
    print("No data available.")
else:
    headers, rows = data[0], data[1:]
    df1 = pd.DataFrame(rows, columns=headers)
    # Keep the code returned in the 'state' column as a categorical text column.
    df1['State Code'] = df1['state'].astype('string').astype('category')
    df1 = df1.rename(columns={'NAME': 'State', 'P1_001N': 'Population'})
    # The API returns the population as strings; make it numeric.
    df1['Population'] = pd.to_numeric(df1['Population'])

##### Keep only the state name and state code columns

In [None]:
# Narrow the state table to the two columns the per-state downloads read.
df_state = df1.loc[:, ['State', 'State Code']]

In [None]:

def get_census_zipcode_data(year, dsource, cols, state, for_loop):
    """
    Fetch one state's rows from the Census API and return them as a DataFrame.

    Args:
        year (str): Dataset year used in the URL path, e.g. '2019'.
        dsource (str): Dataset path, e.g. 'acs/acs5'.
        cols (str): Comma-separated variable names for the `get=` parameter.
        state (str): State code used in the `in=state:` filter.
        for_loop (str): URL-encoded geography for the `for=` parameter,
            e.g. 'zip%20code%20tabulation%20area'.

    Returns:
        pandas.DataFrame: One row per returned geography. The
        'zip code tabulation area' column is renamed to 'zipcode' and
        'B19013_001E' (when present) is converted to numeric. If the
        request does not return HTTP 200, the error is printed and an empty
        frame with the requested columns plus 'zipcode' is returned.

    Note:
        Reads the module-level `api_key`.
    """
    url = f'https://api.census.gov/data/{year}/{dsource}?get={cols}&for={for_loop}:*&in=state:{state}&key={api_key}'
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        # Return an empty frame with the expected columns so the caller can
        # still add columns and read 'zipcode' instead of hitting an
        # UnboundLocalError on an unassigned `df`.
        return pd.DataFrame(columns=[*cols.split(','), 'zipcode'])

    data = response.json()
    # The first row of the payload holds the column names.
    df = pd.DataFrame(data[1:], columns=data[0])
    # Only convert the column when it was actually requested.
    if 'B19013_001E' in df.columns:
        df['B19013_001E'] = pd.to_numeric(df['B19013_001E'])
    return df.rename(columns={'zip code tabulation area': 'zipcode'})

In [None]:
def new_func():
    """
    Download the 2019 ACS 5-year 'B19013_001E' values per ZCTA for every state.

    Iterates over the module-level `df_state` (columns 'State' and
    'State Code') and calls `get_census_zipcode_data` once per state.

    Returns:
        dict: {state name: {state code: DataFrame}}, where each DataFrame has
        the downloaded columns plus 'state', 'state_code' and a categorical
        'zipcode' column.
    """
    year = '2019'
    dsource = 'acs/acs5'
    cols = 'B19013_001E'
    for_loop = 'zip%20code%20tabulation%20area'  # alternative: 'block%20group'

    frames_by_state = {}
    for state_name, state_code in zip(df_state['State'], df_state['State Code']):
        # Skip a repeated state name before paying for the network request.
        if state_name in frames_by_state:
            continue
        df = get_census_zipcode_data(year, dsource, cols, state_code, for_loop)
        df['state'] = state_name
        df['state_code'] = state_code
        df['zipcode'] = df['zipcode'].astype('string').astype('category')
        frames_by_state[f'{state_name}'] = {f'{state_code}': df}
    return frames_by_state

# NOTE: this rebinds the builtin name `dict` for the rest of the notebook.
# Later cells read the result through this name, so it is kept as is.
dict = new_func()

In [None]:
# List the state names that were downloaded (the outer keys of the mapping).
dict.keys()

In [None]:
# Download the nationwide ZCTA GeoJSON.
# `Response.json()` already parses the body, so calling `json.loads` on its
# result fails with TypeError when the file is a plain JSON object. A second
# decode is only needed if the file is a JSON-encoded string (double-encoded),
# so both cases are handled here.
geojson_url = 'https://raw.githubusercontent.com/loganpowell/census-js-examples/master/data/ZCTAs-acs-acs5-B19083_001E-GINI.json'
geojson_response = requests.get(geojson_url)
# Fail here with a clear HTTP error rather than later on a malformed payload.
geojson_response.raise_for_status()
geojson_data_USA = geojson_response.json()
if isinstance(geojson_data_USA, str):
    geojson_data_USA = json.loads(geojson_data_USA)

In [None]:
def geojson_func():
    """
    Write one GeoJSON file per state containing only that state's ZCTA features.

    Reads the module-level mapping `dict` ({state name: {state code: DataFrame}})
    and `geojson_data_USA`. For every state/code pair it keeps the features
    whose 'ZCTA5CE10' property appears in that frame's 'zipcode' column and
    writes them to ./geojson_file/<State_Name>_StateCode_<code>.json.
    """
    import os

    # The output directory must exist before open(..., mode="w") is called.
    os.makedirs("./geojson_file", exist_ok=True)
    all_features = geojson_data_USA['features']

    for state in dict.keys():
        for statecode in dict[state]:
            # Build the lookup once per state; a set gives constant-time
            # membership instead of rebuilding a list for every feature.
            state_zips = set(dict[state][statecode]['zipcode'])
            new_zip_json = {
                'type': 'FeatureCollection',
                'features': [
                    ft for ft in all_features
                    if ft['properties']['ZCTA5CE10'] in state_zips
                ],
            }
            # Write inside the inner loop so every state/code pair gets its
            # own file and no loop variable is used after its loop ended.
            with open(f"./geojson_file/{state.replace(' ' , '_')}_StateCode_{statecode}.json", mode="w", encoding="utf-8") as write_file:
                json.dump(new_zip_json, write_file)

geojson_func()

##### Conduct data quality checks

In [None]:
# Stack every state's frame into one table. ignore_index gives the combined
# frame a fresh 0..n-1 index instead of repeating each state's own row labels.
df_zipcode = pd.concat(
    [frame for frames_by_code in dict.values() for frame in frames_by_code.values()],
    ignore_index=True,
)

In [None]:
# Persist the combined table; index=False keeps the row index out of the file.
df_zipcode.to_csv("zipcode.csv", index=False)

In [None]:
# Reload the saved table with explicit dtypes. The code columns are read as
# 'category' so they are not inferred as numbers.
df_zipcode = pd.read_csv(
    "zipcode.csv",
    dtype={
        'zipcode': 'category',
        'state_code': 'category',
        'state': 'category',
        'B19013_001E': 'float',
    },
)


In [None]:
# Show the column dtypes after the CSV round trip.
df_zipcode.dtypes

In [None]:
# Data quality check: build a ydata-profiling report object for the combined
# zipcode table.
profile = ProfileReport(df_zipcode, title=" Census Zipcode Data Profiling Report")

In [None]:
# Write the profiling report to an HTML file.
profile.to_file("qa_report.html")

In [None]:
# Spot check: dtypes of the rows with state_code '34' and a positive
# B19013_001E value.
df_zipcode.loc[(df_zipcode['state_code']=='34') & (df_zipcode['B19013_001E']>0)].dtypes

In [None]:
# Rebuild df_state from the zipcode table: one row per (state_code, state).
# NOTE(review): this reuses the name `df_state`, which earlier held the
# 'State' / 'State Code' columns read by new_func(); re-running that cell
# after this one would fail on the missing columns.
df_state = df_zipcode[['state_code', 'state']].drop_duplicates()

In [None]:
# Renumber the rows 0..n-1 in place, discarding the old index labels.
df_state.reset_index(drop=True, inplace=True)

In [None]:
# NOTE(review): as far as this notebook shows, `fig` is first assigned in a
# later cell, so this cell raises NameError when run top to bottom on a
# fresh kernel.
fig

In [None]:
from plotly.subplots import make_subplots as msubplot
import plotly.graph_objects as go

# Nationwide tile-map choropleth of B19013_001E by ZCTA.

# Filter once so `locations` and `z` have the same length and refer to the
# same rows, and pass the value column itself rather than a whole DataFrame.
df_positive = df_zipcode[df_zipcode['B19013_001E'] > 0]

# The `map_*` layout keys below configure a tile map, so the matching trace
# is Choroplethmap. `featureidkey` tells plotly which GeoJSON property holds
# the id that `locations` refers to (the same key the per-state maps use).
fig = go.Figure(go.Choroplethmap(
    geojson=geojson_data_USA,
    featureidkey="properties.ZCTA5CE10",
    locations=df_positive['zipcode'].astype(str),
    z=df_positive['B19013_001E'],
    colorscale="Viridis",
    marker_opacity=0.5,
    marker_line_width=0,
))
fig.update_layout(map_style="carto-positron",
                  map_zoom=3, map_center={"lat": 37.0902, "lon": -95.7129})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
# Show the map
fig.show()

In [None]:
from plotly.subplots import make_subplots as msubplot
import plotly.graph_objects as go

# Draw one choropleth of B19013_001E by ZCTA for each of the first four
# states in df_state, using the per-state GeoJSON files written earlier.
for _, state_row in df_state.head(4).iterrows():
    state_name = state_row['state']
    state_code = state_row['state_code']

    # Build the path outside the open() call: reusing the f-string's own
    # quote character inside the braces is a SyntaxError before Python 3.12.
    geojson_path = f"./geojson_file/{state_name.replace(' ' , '_')}_StateCode_{state_code}.json"
    # Only the load needs the file handle; close it before plotting.
    with open(geojson_path, encoding="utf-8") as f:
        geojson_state = json.load(f)

    # Rows for this state with a positive value.
    df = df_zipcode.loc[(df_zipcode['state_code'] == state_code) & (df_zipcode['B19013_001E'] > 0)]

    fig = px.choropleth(
        df,
        geojson=geojson_state,
        featureidkey="properties.ZCTA5CE10",  # GeoJSON property matched against `locations`
        locations='zipcode',
        title=f'{state_name} - Zipcode Income',
        color="B19013_001E",
        color_continuous_scale="Viridis",
    )
    fig.update_geos(
        fitbounds="locations",  # Zoom to fit the data
        visible=False           # Hide the base map
    )
    # Show the map
    fig.show()