In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json
import time
import pandas as pd
# from tqdm.notebook import tqdm  # Remove notebook version
from tqdm import tqdm  # Use standard tqdm
import logging
import re

# Send every log record (DEBUG and above) to a file next to the script so
# failed requests can be inspected after a long run.
logging.basicConfig(
    level=logging.DEBUG,
    filename='census_metadata_errors.log',
    format='%(asctime)s:%(levelname)s:%(message)s',
)

# Census API key appended to every variables request; replace the placeholder
# with a real key before running.
API_KEY = 'YOUR API KEY HERE'

# Function to determine the decade
def get_decade(year):
    """Return the decade label (e.g. ``'1990s'``) for *year*.

    Args:
        year: Anything ``int()`` accepts, typically a four-digit year given
            as a string or an integer.

    Returns:
        The decade as ``'<YYY0>s'``, or ``'Unknown'`` when *year* cannot be
        converted to an integer (non-numeric text, ``None``, NaN/inf, ...).
    """
    try:
        return f"{(int(year) // 10) * 10}s"
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or other non-numeric objects; ValueError:
        # non-numeric text or NaN; OverflowError: infinite floats.
        return 'Unknown'

# Function to extract the year from a string
def extract_year(text):
    """Return the first 19xx/20xx year found in *text*.

    Args:
        text: Free-form string to scan (for example a dataset title).

    Returns:
        The first four-digit run starting with ``19`` or ``20`` as a string,
        or ``'Unknown'`` when no such run exists or *text* is not a string.
    """
    # Titles come from remote JSON and may be null or missing; treat anything
    # that is not text as "no year" instead of letting re.search raise.
    if not isinstance(text, str):
        return 'Unknown'
    match = re.search(r'(19|20)\d{2}', text)
    return match.group() if match else 'Unknown'

# Retry policy: up to five attempts with a short exponential backoff on
# transient 5xx answers. raise_on_status=False means that once retries are
# exhausted the last response is returned instead of an exception.
retries = Retry(
    total=5,
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504],
    raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retries)

# One shared session (and adapter) for both URL schemes.
session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

# Census API discovery endpoint listing all available datasets
discovery_url = 'https://api.census.gov/data.json'

# Download the discovery document that lists every dataset.
try:
    response = session.get(discovery_url, timeout=30)
    # The retry policy is configured with raise_on_status=False, so an HTTP
    # error still comes back as an ordinary response. Check it explicitly so
    # the failure is reported as an HTTP error rather than a JSON parse error.
    response.raise_for_status()
    datasets_json = response.json()
except (requests.RequestException, ValueError) as e:
    # RequestException: connection/timeout/HTTP errors.
    # ValueError: the body was not valid JSON.
    logging.error(f"Failed to retrieve datasets: {e}")
    raise SystemExit(f"Failed to retrieve datasets: {e}") from e

# The dataset entries are expected under the top-level 'dataset' key; any
# other payload shape is treated the same as an empty listing.
datasets = datasets_json.get('dataset', []) if isinstance(datasets_json, dict) else []
if not datasets:
    logging.error("No datasets found in the API response.")
    raise SystemExit("No datasets found in the API response.")

print(f"Total datasets found: {len(datasets)}")

def _as_list(value):
    """Normalize a JSON field to a list: missing/empty -> [], scalar -> [scalar]."""
    if value is None or value == '':
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _join_field(value):
    """Render a scalar-or-list JSON field as one comma-separated string."""
    # Going through _as_list keeps a plain string from being joined
    # character by character.
    return ', '.join(str(item) for item in _as_list(value))


# Accumulators used by the rest of the script:
#   all_metadata      - one dict per variable (filled by the variables loop)
#   datasets_metadata - one dict per (dataset name, vintage) pair
#   variables_links   - (variables URL, dataset title) pairs still to fetch
all_metadata = []
datasets_metadata = []
variables_links = []

# Progress bar over the discovered datasets
dataset_iter = tqdm(datasets, desc='Extracting Dataset Metadata', unit='dataset')

for dataset in dataset_iter:
    title = dataset.get('title', '')

    # Fields shared by every row produced for this dataset. `or {}` guards
    # against contactPoint/publisher being present but null.
    shared_fields = {
        'title': title,
        'description': dataset.get('description', ''),
        'identifier': dataset.get('identifier', ''),
        'contact': (dataset.get('contactPoint') or {}).get('fn', ''),
        'access_level': dataset.get('accessLevel', ''),
        'modified': dataset.get('modified', ''),
        'publisher': (dataset.get('publisher') or {}).get('name', ''),
        'references': _join_field(dataset.get('references')),
        'keywords': _join_field(dataset.get('keyword')),
    }

    # A dataset lacking 'c_vintage' or 'c_dataset' used to yield an empty
    # list here, so the nested loop below produced no rows and the dataset
    # silently disappeared from the CSV. Fall back to the same 'Unknown'
    # sentinel the script uses for unparseable years.
    # NOTE(review): every element of 'c_dataset' becomes its own row; if the
    # elements are really segments of one dataset path they should be joined
    # instead - confirm against the Census discovery schema.
    c_dataset = _as_list(dataset.get('c_dataset')) or ['Unknown']
    c_vintage = _as_list(dataset.get('c_vintage')) or ['Unknown']

    # One row per (dataset name, vintage) combination
    for dataset_name in c_dataset:
        for year in c_vintage:
            datasets_metadata.append(
                {'dataset_name': dataset_name, 'year': year, **shared_fields}
            )

    # Remember where this dataset's variable definitions live, paired with
    # the title so a year can be recovered later if the URL has none.
    c_variablesLink = dataset.get('c_variablesLink', '')
    if c_variablesLink:
        variables_links.extend((link, title) for link in _as_list(c_variablesLink))

# Create a DataFrame from the datasets metadata
datasets_metadata_df = pd.DataFrame(datasets_metadata)

# Save datasets metadata to CSV
datasets_metadata_df.to_csv('datasets_metadata.csv', index=False)

print("Datasets metadata has been saved to 'datasets_metadata.csv'")

# Compiled once, outside the loop. Captures the vintage year and the dataset
# path from URLs shaped like .../data/<year>/<dataset path>/variables.json
VARIABLES_URL_PATTERN = re.compile(
    r'.*/data/((?:19|20)\d{2})/([^/]+(?:/[^/]+)*)/variables\.json'
)


def _redact_key(text):
    """Return ``str(text)`` with the API key masked so it never reaches the log."""
    text = str(text)
    return text.replace(API_KEY, '<API_KEY>') if API_KEY else text


# Fetch the variable definitions of every dataset collected above.
variables_iter = tqdm(variables_links, desc='Processing Variables Metadata', unit='dataset')

for variables_url, title in variables_iter:
    # Derive dataset name and year from the URL when it has the expected
    # shape; otherwise fall back to a year found in the title and use the
    # URL itself as the dataset name.
    match = VARIABLES_URL_PATTERN.match(variables_url)
    if match:
        year, dataset_name = match.groups()
    else:
        year = extract_year(title)
        dataset_name = variables_url

    # Log the bare URL only: the key is sent via `params` below, so the
    # credential is not written to the log file.
    logging.debug(f"Processing variables URL: {variables_url}")
    logging.debug(f"Dataset Name: {dataset_name}, Year: {year}, Title: {title}")

    try:
        # requests merges `params` into the URL, using '?' or '&' as needed
        # and URL-encoding the value.
        variables_response = session.get(
            variables_url, params={'key': API_KEY}, timeout=30
        )
        content_type = variables_response.headers.get('Content-Type', '')

        if variables_response.status_code == 200 and 'application/json' in content_type.lower():
            try:
                variables_json = variables_response.json()
            except ValueError as e:
                # ValueError is the common base of the JSON decode errors
                # requests can raise.
                logging.error(f"JSON decode error for variables URL '{variables_url}': {e}")
                logging.error(f"Response content: {_redact_key(variables_response.text)}")
            else:
                variables = variables_json.get('variables', {})

                for var_name, var_info in variables.items():
                    all_metadata.append({
                        'dataset_name': dataset_name,
                        'year': year,
                        'title': title,
                        'variable_name': var_name,
                        'label': var_info.get('label', ''),
                        'concept': var_info.get('concept', ''),
                        'predicateType': var_info.get('predicateType', ''),
                        'group': var_info.get('group', ''),
                        'limit': var_info.get('limit', ''),
                        'attributes': var_info.get('attributes', ''),
                    })
                logging.info(f"Successfully processed variables for dataset '{dataset_name}', year '{year}'.")
        else:
            logging.error(f"Failed to retrieve valid JSON for variables URL '{variables_url}': HTTP {variables_response.status_code}")
            logging.error(f"Content-Type: {content_type}")
            logging.error(f"Response content: {_redact_key(variables_response.text)}")
    except requests.Timeout as e:
        # Exception text can embed the full request URL (query string
        # included), so it is redacted before logging.
        logging.error(f"Timeout error for variables URL '{variables_url}': {_redact_key(e)}")
    except requests.RequestException as e:
        logging.error(f"Request error for variables URL '{variables_url}': {_redact_key(e)}")
    except Exception as e:
        # Best effort: one malformed dataset must not abort the whole run.
        logging.error(f"Unexpected error for variables URL '{variables_url}': {_redact_key(e)}")

    # Respectful delay to avoid hitting rate limits. It now runs after every
    # request, including ones that failed unexpectedly.
    time.sleep(0.5)

# Write the collected variable metadata out, one CSV file per decade.
if not all_metadata:
    print("No metadata was collected.")
else:
    metadata_df = pd.DataFrame(all_metadata)

    # Years are normalised to strings so get_decade sees one uniform type;
    # values it cannot parse are grouped under the 'Unknown' decade.
    metadata_df['year'] = metadata_df['year'].astype(str)
    metadata_df['decade'] = metadata_df['year'].map(get_decade)

    for decade, group_df in metadata_df.groupby('decade'):
        filename = f'census_metadata_{decade}.csv'
        group_df.to_csv(filename, index=False)
        print(f"Metadata for {decade} has been saved to '{filename}'")


Total datasets found: 1636


Extracting Dataset Metadata: 100%|█████████████████████████████████████████| 1636/1636 [00:00<00:00, 216545.11dataset/s]


Datasets metadata has been saved to 'datasets_metadata.csv'


Processing Variables Metadata: 100%|███████████████████████████████████████████| 1636/1636 [57:06<00:00,  2.09s/dataset]


Metadata for 1980s has been saved to 'census_metadata_1980s.csv'
Metadata for 1990s has been saved to 'census_metadata_1990s.csv'
Metadata for 2000s has been saved to 'census_metadata_2000s.csv'
Metadata for 2010s has been saved to 'census_metadata_2010s.csv'
Metadata for 2020s has been saved to 'census_metadata_2020s.csv'
Metadata for Unknown has been saved to 'census_metadata_Unknown.csv'
