In [1]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium
import numpy as np
from folium.plugins import HeatMap

# -------------------------
# 1. Load and Prepare Marker Data from data.csv
# -------------------------
df = pd.read_csv("data.csv", encoding="ISO-8859-1")
df.columns = df.columns.str.strip()  # drop stray whitespace around header names

# Keep only operating startups in CA / SF Bay Area.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below cannot raise SettingWithCopyWarning.
df = df[
    (df['state_code'] == 'CA') &
    (df['region'] == 'SF Bay Area') &
    (df['status'].str.lower() == 'operating')
].copy()

# Strip "$" and "," from the funding column, then convert it to numbers.
# The pattern is a raw string: '\$' in a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
# Values that still cannot be parsed become NaN (errors='coerce').
df['funding_total_usd'] = pd.to_numeric(
    df['funding_total_usd'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

# Top-funded startup per city.
# One whole row is picked per city (highest funding first; NaN funding sorts
# last), so the reported name and amount always belong to the same company.
# groupby('city').first() takes the first non-null value of each column
# independently and could pair one startup's name with another's funding.
top_funded = (
    df.dropna(subset=['city', 'name'])
    .sort_values('funding_total_usd', ascending=False)
    .drop_duplicates(subset='city', keep='first')
    .set_index('city')[['name', 'funding_total_usd']]
    .rename(columns={'name': 'top_startup', 'funding_total_usd': 'top_funding'})
)


def _most_common_market(markets):
    """Return the most frequent value in ``markets``, or 'Unknown' if there is none."""
    modes = markets.mode()  # computed once instead of twice per group
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Per-city summary: number of startups, mean funding, and dominant market
summary = df.groupby('city').agg(
    startup_count=('name', 'count'),
    avg_funding=('funding_total_usd', 'mean'),
    common_market=('market', _most_common_market),
).reset_index()
# Left merge keeps every summarized city even if it has no top-funded entry
summary = summary.merge(top_funded, left_on='city', right_index=True, how='left')

# Geocode cities using Nominatim with a rate limiter (at least 1 s between calls)
geolocator = Nominatim(user_agent="sf_mapper")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def safe_geocode(city):
    """Look up a city's coordinates without ever raising.

    Calls the module-level ``geocode`` callable with
    ``"<city>, California, USA"`` and returns a two-element ``pd.Series``
    holding ``[latitude, longitude]``. If the lookup returns a falsy result
    or raises any ``Exception``, the series is ``[None, None]`` instead.
    """
    try:
        hit = geocode(f"{city}, California, USA")
        if not hit:
            return pd.Series([None, None])
        return pd.Series([hit.latitude, hit.longitude])
    except Exception:
        # Best-effort lookup: a failed city is reported as missing coordinates
        return pd.Series([None, None])

# Resolve every city to coordinates; safe_geocode yields a [lat, lon] pair per city
city_latlon = summary['city'].apply(safe_geocode)
summary[['lat', 'lon']] = city_latlon
# Discard cities lacking coordinates or an average funding value
summary = summary.dropna(subset=['lat', 'lon', 'avg_funding'])

# Build point geometries (x = longitude, y = latitude) and tag them as EPSG:4326
geometry = [Point(lon, lat) for lon, lat in zip(summary['lon'], summary['lat'])]
gdf = gpd.GeoDataFrame(summary, geometry=geometry).set_crs(epsg=4326)

# -------------------------------
# 2. Define Funding Color Scale for Markers
# -------------------------------
num_bins = 6  # number of color classes
avg_funding_values = gdf['avg_funding']
fund_min, fund_max = avg_funding_values.min(), avg_funding_values.max()
# Equal-width class boundaries over the observed range (num_bins + 1 edges)
bin_edges = np.linspace(fund_min, fund_max, num_bins + 1)
# Blue ramp ordered from lightest to darkest
colors = ["#deebf7", "#9ecae1", "#4292c6", "#2171b5", "#08519c", "#08306b"]

# -------------------------------
# 3. Create the Map and Marker Layer
# -------------------------------
# Base map: CartoDB Positron tiles, initial view at [37.77, -122.42], zoom 9
m = folium.Map(location=[37.77, -122.42], zoom_start=9, tiles='CartoDB positron')

# All bubble markers live in one toggleable layer ("Startup Markers")
marker_fg = folium.FeatureGroup(name="Startup Markers", show=True)

# Bubble radius grows with the log of the startup count:
# log(count + 1) avoids log(0); the result is scaled by 5 and offset by 2.
gdf['scaled_size'] = np.log(gdf['startup_count'] + 1) * 5 + 2


def _color_for_funding(avg_funding):
    """Return the palette color of the funding bin that ``avg_funding`` falls into."""
    raw_index = np.digitize(avg_funding, bin_edges, right=True) - 1
    # Clamp to the palette: with right=True the lowest edge itself maps to -1
    return colors[min(max(raw_index, 0), len(colors) - 1)]


# One circle per city, colored by average funding and sized by startup count
for _, city_row in gdf.iterrows():
    tooltip_html = f"""
        <div style="font-size: 14px;">
            <b>{city_row['city']}</b><br>
            Startups: {city_row['startup_count']}<br>
            Avg. Funding: ${city_row['avg_funding']:,.0f}<br>
            Top Market: {city_row['common_market']}<br><br>
            <b>Top-Funded Startup:</b><br>
            {city_row['top_startup']} (${city_row['top_funding']:,.0f})
        </div>
    """
    marker_color = _color_for_funding(city_row['avg_funding'])

    bubble = folium.CircleMarker(
        location=[city_row['lat'], city_row['lon']],
        radius=city_row['scaled_size'],
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.85,
        tooltip=folium.Tooltip(tooltip_html)
    )
    bubble.add_to(marker_fg)

# Attach the finished marker layer to the map
marker_fg.add_to(m)

# --------------------------------------------------------
# 4. Build a Custom Combined Legend (Funding + Bubble Size)
# --------------------------------------------------------
def format_funding(amount):
    """Render a dollar amount as a short legend label.

    Amounts of at least one million are shown in millions with one decimal
    (``$2.5M``); anything else is shown as whole dollars with thousands
    separators (``$750,000``).
    """
    in_millions = amount >= 1e6
    return f"${amount/1e6:.1f}M" if in_millions else f"${amount:,.0f}"

# Build Funding Legend HTML items: one color swatch plus value range per bin
funding_legend_rows = []
for lower, upper, color_sw in zip(bin_edges[:-1], bin_edges[1:], colors):
    label = f"{format_funding(lower)} - {format_funding(upper)}"
    funding_legend_rows.append(f'''
    <div style="display: flex; align-items: center; margin-bottom: 2px;">
        <div style="width: 20px; height: 20px; background:{color_sw}; margin-right: 5px;"></div>
        <div>{label}</div>
    </div>
    ''')
funding_legend_items = "".join(funding_legend_rows)

# Define bubble size legend ranges (adjust these as needed).
# "rep" is the representative startup count used to size the sample circle.
bubble_ranges = [
    {"range": "1–5", "rep": 3},
    {"range": "6–20", "rep": 13},
    {"range": "21–50", "rep": 35.5},
    {"range": "51+", "rep": 75},
]

# Build Bubble Size Legend HTML items
bubble_legend_rows = []
for item in bubble_ranges:
    rep = item["rep"]
    # Must mirror the marker radius formula used for gdf['scaled_size'] in this
    # cell (log(count + 1) * 5 + 2); a different formula here would draw legend
    # circles that do not match the bubbles on the map.
    radius = np.log(rep + 1) * 5 + 2
    svg_size = int(radius * 2 + 10)  # circle diameter plus 5 px padding per side
    bubble_legend_rows.append(f'''
    <div style="display: flex; align-items: center; margin-bottom: 5px;">
        <svg width="{svg_size}" height="{svg_size}">
            <circle cx="{radius+5}" cy="{radius+5}" r="{radius}"
                style="fill:#3182bd; opacity:0.85; stroke:#000; stroke-width:0.5"/>
        </svg>
        <div style="margin-left: 8px;">{item["range"]} Startups</div>
    </div>
    ''')
bubble_legend_items = "".join(bubble_legend_rows)

# Fixed-position panel (top right) holding both legends
legend_html = f'''
<div id="legend" style="
    position: fixed;
    top: 20px;
    right: 20px;
    z-index: 9999;
    background: white;
    padding: 10px;
    font-size: 14px;
    border: 2px solid grey;
    opacity: 0.9;
    width: 260px;
">
    <div style="font-weight: bold; margin-bottom: 5px;">Average Funding (USD)</div>
    {funding_legend_items}
    <hr style="margin:10px 0;">
    <div style="font-weight: bold; margin-bottom: 5px;">Startup Count</div>
    {bubble_legend_items}
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# --------------------------------------------------------
# 5. Add Optional Heatmap Layer (startup density by city)
# --------------------------------------------------------
# The heatmap currently re-uses "data.csv"; change the path below to draw it
# from a different file with the same columns.
df_heat = pd.read_csv("data.csv", encoding="ISO-8859-1")
df_heat.columns = df_heat.columns.str.strip()

# .copy() so the column assignments further down work on an independent frame
# rather than a slice of the raw data (avoids SettingWithCopyWarning).
df_heat = df_heat[
    (df_heat['state_code'] == 'CA') &
    (df_heat['region'] == 'SF Bay Area') &
    (df_heat['status'].str.lower() == 'operating')
].copy()

# Raw-string pattern: '\$' in a plain string literal is an invalid escape sequence
df_heat['funding_total_usd'] = pd.to_numeric(
    df_heat['funding_total_usd'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

geolocator_heat = Nominatim(user_agent="sf_startup_heatmap")
geocode_heat = RateLimiter(geolocator_heat.geocode, min_delay_seconds=1)

# Start from the coordinates already resolved for the marker layer so the same
# "<city>, California, USA" query is not sent to Nominatim a second time
# (every lookup waits at least one second).
city_coords = {
    city: (lat, lon)
    for city, lat, lon in zip(summary['city'], summary['lat'], summary['lon'])
    if pd.notna(lat) and pd.notna(lon)
}
for city in df_heat['city'].dropna().unique():
    if city in city_coords:
        continue
    location = geocode_heat(f"{city}, California, USA")
    if location:
        city_coords[city] = (location.latitude, location.longitude)

# Cities without a geocoding result get (None, None) and are dropped below
df_heat['lat'] = df_heat['city'].map(lambda x: city_coords.get(x, (None, None))[0])
df_heat['lon'] = df_heat['city'].map(lambda x: city_coords.get(x, (None, None))[1])
df_heat = df_heat.dropna(subset=['lat', 'lon'])

# One heat point per distinct coordinate, weighted by the number of startups there
heat_data = df_heat.groupby(['lat', 'lon']).size().reset_index(name='count')
heatmap_points = heat_data[['lat', 'lon', 'count']].values.tolist()

# Create a FeatureGroup for the heatmap layer ("Startup Heatmap")
heatmap_fg = folium.FeatureGroup(name="Startup Heatmap", show=True)
custom_gradient = {"0.4": "blue", "0.65": "lime", "1": "red"}
# NOTE(review): the weights are raw counts; if the heat plugin expects
# intensities on a 0-1 scale, larger cities may all saturate at "red".
# Consider normalizing by the maximum count — confirm against the rendered map.
HeatMap(
    heatmap_points,
    radius=18,
    blur=15,
    max_zoom=12,
    gradient=custom_gradient
).add_to(heatmap_fg)
heatmap_fg.add_to(m)

# --------------------------------------------------------
# 6. Heatmap legend and layer-control styling
# --------------------------------------------------------
# Fixed panel (bottom left) with a blue -> lime -> red bar labelled Low / High
heatmap_legend = '''
<div id="heatmap-legend" style="
    position: fixed;
    bottom: 20px;
    left: 20px;
    z-index: 9998;
    background: white;
    padding: 10px;
    font-size: 14px;
    border: 2px solid grey;
    opacity: 0.9;
    width: 200px;
">
    <div style="font-weight: bold; margin-bottom: 5px;">Heatmap Intensity</div>
    <div style="display: flex; align-items: center;">
        <div style="width: 100%; height: 15px; background: linear-gradient(to right, blue, lime, red);"></div>
    </div>
    <div style="display: flex; justify-content: space-between;">
        <span>Low</span>
        <span>High</span>
    </div>
</div>
'''

# CSS override that pins the layer control to the bottom-right corner
layer_control_style = """
<style>
.leaflet-control-layers {
    position: fixed !important;
    top: auto !important;
    bottom: 20px !important;
    left: auto !important;
    right: 20px !important;
    z-index: 1100 !important;
}
</style>
"""

# Inject both snippets into the page, legend first, then the CSS override
page_html = m.get_root().html
for snippet in (heatmap_legend, layer_control_style):
    page_html.add_child(folium.Element(snippet))

# --------------------------------------------------------
# 7. Add the (expanded) layer control
# --------------------------------------------------------
layer_control = folium.LayerControl(collapsed=False)
layer_control.add_to(m)

# --------------------------------------------------------
# 8. Save the Map to HTML
# --------------------------------------------------------
m.save("CA_startups.html")

In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium
import numpy as np
from folium.plugins import HeatMap

# -------------------------
# 1. Load and Prepare Marker Data from data.csv
# -------------------------
df = pd.read_csv("data.csv", encoding="ISO-8859-1")
df.columns = df.columns.str.strip()  # drop stray whitespace around header names

# Keep only operating startups in CA / SF Bay Area.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below cannot raise SettingWithCopyWarning.
df = df[
    (df['state_code'] == 'CA') &
    (df['region'] == 'SF Bay Area') &
    (df['status'].str.lower() == 'operating')
].copy()

# Strip "$" and "," from the funding column, then convert it to numbers.
# The pattern is a raw string: '\$' in a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
# Values that still cannot be parsed become NaN (errors='coerce').
df['funding_total_usd'] = pd.to_numeric(
    df['funding_total_usd'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

# Top-funded startup per city.
# One whole row is picked per city (highest funding first; NaN funding sorts
# last), so the reported name and amount always belong to the same company.
# groupby('city').first() takes the first non-null value of each column
# independently and could pair one startup's name with another's funding.
top_funded = (
    df.dropna(subset=['city', 'name'])
    .sort_values('funding_total_usd', ascending=False)
    .drop_duplicates(subset='city', keep='first')
    .set_index('city')[['name', 'funding_total_usd']]
    .rename(columns={'name': 'top_startup', 'funding_total_usd': 'top_funding'})
)


def _most_common_market(markets):
    """Return the most frequent value in ``markets``, or 'Unknown' if there is none."""
    modes = markets.mode()  # computed once instead of twice per group
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Per-city summary: number of startups, mean funding, and dominant market
summary = df.groupby('city').agg(
    startup_count=('name', 'count'),
    avg_funding=('funding_total_usd', 'mean'),
    common_market=('market', _most_common_market),
).reset_index()
# Left merge keeps every summarized city even if it has no top-funded entry
summary = summary.merge(top_funded, left_on='city', right_index=True, how='left')

# Geocode cities using Nominatim with a rate limiter (at least 1 s between calls)
geolocator = Nominatim(user_agent="sf_mapper")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def safe_geocode(city):
    """Look up a city's coordinates without ever raising.

    Calls the module-level ``geocode`` callable with
    ``"<city>, California, USA"`` and returns a two-element ``pd.Series``
    holding ``[latitude, longitude]``. If the lookup returns a falsy result
    or raises any ``Exception``, the series is ``[None, None]`` instead.
    """
    try:
        hit = geocode(f"{city}, California, USA")
        if not hit:
            return pd.Series([None, None])
        return pd.Series([hit.latitude, hit.longitude])
    except Exception:
        # Best-effort lookup: a failed city is reported as missing coordinates
        return pd.Series([None, None])

# Resolve every city to coordinates; safe_geocode yields a [lat, lon] pair per city
city_latlon = summary['city'].apply(safe_geocode)
summary[['lat', 'lon']] = city_latlon
# Discard cities lacking coordinates or an average funding value
summary = summary.dropna(subset=['lat', 'lon', 'avg_funding'])

# Build point geometries (x = longitude, y = latitude) and tag them as EPSG:4326
geometry = [Point(lon, lat) for lon, lat in zip(summary['lon'], summary['lat'])]
gdf = gpd.GeoDataFrame(summary, geometry=geometry).set_crs(epsg=4326)

# Bubble radius: square root of the startup count, scaled by 2 and offset by 2
gdf['scaled_size'] = np.sqrt(gdf['startup_count']) * 2 + 2

# -------------------------------
# 2. Define Funding Color Scale for Markers
# -------------------------------
num_bins = 6  # number of color classes
avg_funding_values = gdf['avg_funding']
fund_min, fund_max = avg_funding_values.min(), avg_funding_values.max()
# Equal-width class boundaries over the observed range (num_bins + 1 edges)
bin_edges = np.linspace(fund_min, fund_max, num_bins + 1)
# Blue ramp ordered from lightest to darkest
colors = ["#deebf7", "#9ecae1", "#4292c6", "#2171b5", "#08519c", "#08306b"]

# -------------------------------
# 3. Create the Map and Marker Layer
# -------------------------------
# Base map: CartoDB Positron tiles, initial view at [37.77, -122.42], zoom 9
m = folium.Map(location=[37.77, -122.42], zoom_start=9, tiles='CartoDB positron')

# All bubble markers live in one toggleable layer ("Startup Markers")
marker_fg = folium.FeatureGroup(name="Startup Markers", show=True)


def _color_for_funding(avg_funding):
    """Return the palette color of the funding bin that ``avg_funding`` falls into."""
    raw_index = np.digitize(avg_funding, bin_edges, right=True) - 1
    # Clamp to the palette: with right=True the lowest edge itself maps to -1
    return colors[min(max(raw_index, 0), len(colors) - 1)]


# One circle per city, colored by average funding and sized by startup count
for _, city_row in gdf.iterrows():
    tooltip_html = f"""
        <div style="font-size: 14px;">
            <b>{city_row['city']}</b><br>
            Startups: {city_row['startup_count']}<br>
            Avg. Funding: ${city_row['avg_funding']:,.0f}<br>
            Top Market: {city_row['common_market']}<br><br>
            <b>Top-Funded Startup:</b><br>
            {city_row['top_startup']} (${city_row['top_funding']:,.0f})
        </div>
    """
    marker_color = _color_for_funding(city_row['avg_funding'])

    bubble = folium.CircleMarker(
        location=[city_row['lat'], city_row['lon']],
        radius=city_row['scaled_size'],
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.85,
        tooltip=folium.Tooltip(tooltip_html)
    )
    bubble.add_to(marker_fg)

# Attach the finished marker layer to the map
marker_fg.add_to(m)

# --------------------------------------------------------
# 4. Build a Custom Combined Legend (Funding + Bubble Size)
# --------------------------------------------------------
def format_funding(amount):
    """Render a dollar amount as a short legend label.

    Amounts of at least one million are shown in millions with one decimal
    (``$2.5M``); anything else is shown as whole dollars with thousands
    separators (``$750,000``).
    """
    in_millions = amount >= 1e6
    return f"${amount/1e6:.1f}M" if in_millions else f"${amount:,.0f}"

# Build Funding Legend HTML items: one color swatch plus value range per bin
funding_legend_rows = []
for lower, upper, color_sw in zip(bin_edges[:-1], bin_edges[1:], colors):
    label = f"{format_funding(lower)} - {format_funding(upper)}"
    funding_legend_rows.append(f'''
    <div style="display: flex; align-items: center; margin-bottom: 2px;">
        <div style="width: 20px; height: 20px; background:{color_sw}; margin-right: 5px;"></div>
        <div>{label}</div>
    </div>
    ''')
funding_legend_items = "".join(funding_legend_rows)

# Define bubble size legend ranges (adjust these as needed).
# "rep" is the representative startup count used to size the sample circle.
bubble_ranges = [
    {"range": "1–5", "rep": 3},
    {"range": "6–20", "rep": 13},
    {"range": "21–50", "rep": 35.5},
    {"range": "51+", "rep": 75},
]

# Build Bubble Size Legend HTML items
bubble_legend_rows = []
for item in bubble_ranges:
    rep = item["rep"]
    # Same sqrt-based formula as the marker radius (gdf['scaled_size'])
    radius = np.sqrt(rep) * 2 + 2
    svg_size = int(radius * 2 + 10)  # circle diameter plus 5 px padding per side
    bubble_legend_rows.append(f'''
    <div style="display: flex; align-items: center; margin-bottom: 5px;">
        <svg width="{svg_size}" height="{svg_size}">
            <circle cx="{radius+5}" cy="{radius+5}" r="{radius}"
                style="fill:#3182bd; opacity:0.85; stroke:#000; stroke-width:0.5"/>
        </svg>
        <div style="margin-left: 8px;">{item["range"]} Startups</div>
    </div>
    ''')
bubble_legend_items = "".join(bubble_legend_rows)

# Fixed-position panel (top right) holding both legends
legend_html = f'''
<div id="legend" style="
    position: fixed;
    top: 20px;
    right: 20px;
    z-index: 9999;
    background: white;
    padding: 10px;
    font-size: 14px;
    border: 2px solid grey;
    opacity: 0.9;
    width: 260px;
">
    <div style="font-weight: bold; margin-bottom: 5px;">Average Funding (USD)</div>
    {funding_legend_items}
    <hr style="margin:10px 0;">
    <div style="font-weight: bold; margin-bottom: 5px;">Startup Count</div>
    {bubble_legend_items}
</div>
'''
legend_element = folium.Element(legend_html)
m.get_root().html.add_child(legend_element)

# --------------------------------------------------------
# 5. Add Optional Heatmap Layer (startup density by city)
# --------------------------------------------------------
# The heatmap currently re-uses "data.csv"; change the path below to draw it
# from a different file with the same columns.
df_heat = pd.read_csv("data.csv", encoding="ISO-8859-1")
df_heat.columns = df_heat.columns.str.strip()

# .copy() so the column assignments further down work on an independent frame
# rather than a slice of the raw data (avoids SettingWithCopyWarning).
df_heat = df_heat[
    (df_heat['state_code'] == 'CA') &
    (df_heat['region'] == 'SF Bay Area') &
    (df_heat['status'].str.lower() == 'operating')
].copy()

# Raw-string pattern: '\$' in a plain string literal is an invalid escape sequence
df_heat['funding_total_usd'] = pd.to_numeric(
    df_heat['funding_total_usd'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

geolocator_heat = Nominatim(user_agent="sf_startup_heatmap")
geocode_heat = RateLimiter(geolocator_heat.geocode, min_delay_seconds=1)

# Start from the coordinates already resolved for the marker layer so the same
# "<city>, California, USA" query is not sent to Nominatim a second time
# (every lookup waits at least one second).
city_coords = {
    city: (lat, lon)
    for city, lat, lon in zip(summary['city'], summary['lat'], summary['lon'])
    if pd.notna(lat) and pd.notna(lon)
}
for city in df_heat['city'].dropna().unique():
    if city in city_coords:
        continue
    location = geocode_heat(f"{city}, California, USA")
    if location:
        city_coords[city] = (location.latitude, location.longitude)

# Cities without a geocoding result get (None, None) and are dropped below
df_heat['lat'] = df_heat['city'].map(lambda x: city_coords.get(x, (None, None))[0])
df_heat['lon'] = df_heat['city'].map(lambda x: city_coords.get(x, (None, None))[1])
df_heat = df_heat.dropna(subset=['lat', 'lon'])

# One heat point per distinct coordinate, weighted by the number of startups there
heat_data = df_heat.groupby(['lat', 'lon']).size().reset_index(name='count')
heatmap_points = heat_data[['lat', 'lon', 'count']].values.tolist()

# Create a FeatureGroup for the heatmap layer ("Startup Heatmap")
heatmap_fg = folium.FeatureGroup(name="Startup Heatmap", show=True)
custom_gradient = {"0.4": "blue", "0.65": "lime", "1": "red"}
# NOTE(review): the weights are raw counts; if the heat plugin expects
# intensities on a 0-1 scale, larger cities may all saturate at "red".
# Consider normalizing by the maximum count — confirm against the rendered map.
HeatMap(
    heatmap_points,
    radius=18,
    blur=15,
    max_zoom=12,
    gradient=custom_gradient
).add_to(heatmap_fg)
heatmap_fg.add_to(m)

# --------------------------------------------------------
# 6. Heatmap legend and layer-control styling
# --------------------------------------------------------
# Fixed panel (bottom left) with a blue -> lime -> red bar labelled Low / High
heatmap_legend = '''
<div id="heatmap-legend" style="
    position: fixed;
    bottom: 20px;
    left: 20px;
    z-index: 9998;
    background: white;
    padding: 10px;
    font-size: 14px;
    border: 2px solid grey;
    opacity: 0.9;
    width: 200px;
">
    <div style="font-weight: bold; margin-bottom: 5px;">Heatmap Intensity</div>
    <div style="display: flex; align-items: center;">
        <div style="width: 100%; height: 15px; background: linear-gradient(to right, blue, lime, red);"></div>
    </div>
    <div style="display: flex; justify-content: space-between;">
        <span>Low</span>
        <span>High</span>
    </div>
</div>
'''

# CSS override that pins the layer control to the bottom-right corner
layer_control_style = """
<style>
.leaflet-control-layers {
    position: fixed !important;
    top: auto !important;
    bottom: 20px !important;
    left: auto !important;
    right: 20px !important;
    z-index: 1100 !important;
}
</style>
"""

# Inject both snippets into the page, legend first, then the CSS override
page_html = m.get_root().html
for snippet in (heatmap_legend, layer_control_style):
    page_html.add_child(folium.Element(snippet))

# --------------------------------------------------------
# 7. Add the (expanded) layer control
# --------------------------------------------------------
layer_control = folium.LayerControl(collapsed=False)
layer_control.add_to(m)

# --------------------------------------------------------
# 8. Save the Map to HTML
# --------------------------------------------------------
m.save("CA_startups_map_plot.html")