In [3]:
import numpy as np
import pandas as pd

def generate_sample_data(n_blocks=1000, n_providers=15, locations_per_block=100, seed=42):
    """Generate a synthetic broadband-availability table.

    The result has one row per (location, provider) pair. Locations are grouped
    into blocks; every block is served by a random subset of the provider pool,
    and every location is covered by 40-80% of its block's providers.

    Parameters
    ----------
    n_blocks : int
        Number of blocks to generate. Values <= 0 yield an empty frame that
        still carries the full column schema.
    n_providers : int
        Size of the provider pool (IDs ``PROV_001`` .. ``PROV_<n>``). Must be
        >= 1. Each block draws 2-6 providers, clamped to the pool size.
    locations_per_block : int
        Inclusive upper bound on locations per block. Must be >= 1. The lower
        bound is 20, clamped down when this value is smaller than 20.
    seed : int or None
        Seed handed to ``np.random.seed``. The default (42) reproduces the
        historical output. NOTE: this reseeds NumPy's *global* legacy RNG, so
        ``np.random.*`` calls made after this function are affected too.

    Returns
    -------
    pandas.DataFrame
        Columns: ``location_id``, ``block_geoid``, ``provider_id``,
        ``technology`` (10 = copper, 40 = cable, 50 = fiber),
        ``max_advertised_download_speed``, ``max_advertised_upload_speed``.

    Raises
    ------
    ValueError
        If ``n_providers`` or ``locations_per_block`` is smaller than 1.
    """
    if n_providers < 1:
        raise ValueError("n_providers must be at least 1")
    if locations_per_block < 1:
        raise ValueError("locations_per_block must be at least 1")

    np.random.seed(seed)

    # Fixed schema so that an empty result still exposes the expected columns.
    columns = [
        "location_id",
        "block_geoid",
        "provider_id",
        "technology",
        "max_advertised_download_speed",
        "max_advertised_upload_speed",
    ]

    # Loop-invariant: the provider pool does not depend on the block.
    all_providers = [f"PROV_{i:03d}" for i in range(1, n_providers + 1)]

    # Clamp the lower bounds so small parameter values no longer make
    # randint() fail with "low >= high". With the default arguments these are
    # 20, 2 and 6, i.e. exactly the ranges used historically.
    min_locations = min(20, locations_per_block)
    min_providers_in_block = min(2, n_providers)
    max_providers_in_block = min(6, n_providers)

    data = []
    location_counter = 1

    # NOTE: the order of the random draws below is deliberately unchanged so
    # that a given seed keeps producing the same data set.
    for block in range(1, n_blocks + 1):
        # Pennsylvania FIPS code is 42, followed by a 13-digit zero-padded
        # block number, giving a 15-character GEOID.
        block_geoid = f"42{block:013d}"

        n_locations = np.random.randint(min_locations, locations_per_block + 1)

        # First, assign which providers serve this block.
        n_providers_in_block = np.random.randint(
            min_providers_in_block, max_providers_in_block + 1
        )
        providers_in_block = np.random.choice(
            all_providers,
            size=n_providers_in_block,
            replace=False,
        )

        for _ in range(n_locations):
            location_id = f"LOC_{location_counter:06d}"
            location_counter += 1

            # Each location gets a subset of the providers in this block.
            # Coverage ranges from 40% to 80% to create edge-out opportunities.
            coverage_pct = np.random.uniform(0.4, 0.8)
            # coverage_pct < 1 and the block has at least one provider, so this
            # count can never exceed len(providers_in_block).
            n_providers_at_loc = max(1, int(len(providers_in_block) * coverage_pct))

            providers_at_loc = np.random.choice(
                providers_in_block,
                size=n_providers_at_loc,
                replace=False,
            )

            for provider in providers_at_loc:
                # Slow technology is kept rare (5% copper) so upgrade
                # opportunities are limited and the model has to fall back to
                # edge-out builds once upgrades are exhausted.
                technology = np.random.choice([10, 40, 50], p=[0.05, 0.25, 0.7])

                if technology == 10:  # Copper
                    # Copper is always slow, so it is always an upgrade candidate.
                    download = 25
                    upload = download // 10
                elif technology == 40:  # Cable
                    # Mostly fast cable (80% at 600+), asymmetric upload.
                    download = np.random.choice([200, 600, 1000], p=[0.2, 0.5, 0.3])
                    upload = download // 10
                else:  # Fiber
                    # Fiber is symmetric.
                    download = np.random.choice([500, 1000, 2000], p=[0.3, 0.5, 0.2])
                    upload = download

                data.append(
                    {
                        "location_id": location_id,
                        "block_geoid": block_geoid,
                        "provider_id": provider,
                        "technology": technology,
                        "max_advertised_download_speed": download,
                        "max_advertised_upload_speed": upload,
                    }
                )

    return pd.DataFrame(data, columns=columns)

In [4]:
# Destination for the CSV: written next to this notebook
# (the notebook itself lives in '12 - Inputs').
output_path = 'sample_data.csv'

# Build the synthetic data set with the default parameters.
df = generate_sample_data()

# Write the frame without the pandas index column.
df.to_csv(path_or_buf=output_path, index=False)

print(f"DataFrame saved to {output_path}")

DataFrame saved to sample_data.csv
