# Venues & Vibrancy — Neighborhoods and Districts
Self-contained notebook that computes national-cuisine diversity and vibrancy labels for neighborhoods and districts. It exports CSVs with these columns:
- neighborhoods: [district, neighborhood, hashtags, source]
- districts: [district, hashtags, source]

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
import folium

# Config: find the project root (the directory that contains 'data') so the
# paths below resolve when the notebook runs from the root or one level below it.
ROOT = Path.cwd()
if not (ROOT / 'data').exists():
    # Try the parent directory once; give up if 'data' is not there either.
    ROOT = ROOT.parent
    if not (ROOT / 'data').exists():
        raise FileNotFoundError(f"Couldn't locate 'data' directory from {Path.cwd()}")

# Inputs live under ROOT/data.
RAW_DIR = ROOT / 'data' / 'raw'
VENUES_CSV = RAW_DIR / 'venues.csv'
NEI_PATH = ROOT / 'data' / 'neighborhoods.geojson'

# Outputs are written relative to the current working directory (not ROOT);
# the directory is created on first use.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

def ensure_wgs84(gdf):
    """Return ``gdf`` with its CRS set to EPSG 4326.

    - No CRS at all: the result of ``gdf.set_crs(4326)`` is returned.
    - CRS already reporting EPSG 4326: ``gdf`` itself is returned.
    - Any other CRS: the result of ``gdf.to_crs(4326)`` is returned.
    """
    current = gdf.crs
    if current is None:
        return gdf.set_crs(4326)
    if current.to_epsg() == 4326:
        return gdf
    return gdf.to_crs(4326)
def compute_area_km2(gdf, utm_epsg=25833, min_area_km2=0.20):
    """Add area columns to ``gdf`` in place and return the same object.

    The geometries are reprojected to ``utm_epsg`` only to measure them; the
    geometry stored on ``gdf`` is left untouched.

    Parameters
    ----------
    gdf : GeoDataFrame-like
        Frame whose geometries are measured. It is mutated: the columns
        ``area_km2`` and ``area_eff_km2`` are (over)written.
    utm_epsg : int, default 25833
        EPSG code of the CRS used for the area measurement. The result is
        divided by 1e6, so the CRS is assumed to use metres -- confirm before
        passing a different code.
    min_area_km2 : float, default 0.20
        Lower bound applied to ``area_km2`` to obtain ``area_eff_km2``
        (prevents very small polygons from producing extreme densities when
        the effective area is used as a denominator).

    Returns
    -------
    The input ``gdf`` with the two area columns set.
    """
    projected = ensure_wgs84(gdf).to_crs(utm_epsg)
    # .values drops the index so assignment is positional, not label-aligned.
    gdf['area_km2'] = (projected.geometry.area / 1e6).values
    gdf['area_eff_km2'] = gdf['area_km2'].clip(lower=min_area_km2)
    return gdf
def percentile_score(s, lo=10, hi=90):
    """Rescale ``s`` to a 0-100 score between its ``lo`` and ``hi`` percentiles.

    Values at or below the ``lo`` percentile map to 0, values at or above the
    ``hi`` percentile map to 100, and values in between scale linearly.
    Percentiles ignore NaN; NaN inputs stay NaN in the result.
    """
    values = s.astype(float)
    lower = np.nanpercentile(values, lo)
    upper = np.nanpercentile(values, hi)
    width = upper - lower
    if width < 1e-9:
        # Degenerate spread: avoid dividing by (almost) zero.
        width = 1e-9
    scaled = (values - lower) / width
    return scaled.clip(0, 1) * 100.0
def hashtags(theme, label):
    """Build the ``'#<theme>;#<label>'`` hashtag string.

    ``label`` is stringified, stripped, lower-cased and has its spaces replaced
    by hyphens; ``theme`` is used as given.
    """
    slug = str(label).strip().lower().replace(' ', '-')
    tags = ('#' + theme, '#' + slug)
    return ';'.join(tags)

# Vocabulary of national/regional cuisine tokens that count towards diversity.
NATIONALS = set((
    'italian,french,spanish,portuguese,greek,turkish,german,polish,russian,'
    'ukrainian,balkan,hungarian,romanian,bulgarian,georgian,'
    'mexican,argentinian,peruvian,brazilian,colombian,venezuelan,caribbean,'
    'american,texmex,'
    'lebanese,israeli,palestinian,syrian,iraqi,iranian,afghan,'
    'moroccan,tunisian,algerian,ethiopian,eritrean,egyptian,southafrican,nigerian,'
    'indian,pakistani,bangladeshi,srilankan,nepali,'
    'chinese,japanese,korean,thai,vietnamese,laotian,cambodian,indonesian,'
    'malaysian,singaporean,filipino'
).split(','))
# Dish / venue-type tokens that are never treated as a national cuisine.
EXCLUDE = set((
    'pizza,pasta,sushi,ramen,doner,döner,kebab,burger,bbq,grill,steak,noodles,'
    'dumpling,dumplings,sandwich,bakery,cafe,coffee,bubbletea,boba,falafel'
).split(','))
def tokenize(cuisines):
    """Return the national-cuisine tokens found in a ';'-separated string.

    Each part is lower-cased, stripped and reduced to its alphanumeric
    characters (so 'Sri Lankan' becomes 'srilankan'); only tokens that are in
    NATIONALS and not in EXCLUDE are kept, in input order and with repeats.
    A missing value (``pd.isna``) yields an empty list.
    """
    if pd.isna(cuisines):
        return []
    kept = []
    for part in str(cuisines).split(';'):
        token = ''.join(filter(str.isalnum, str(part).lower().strip()))
        if token and token in NATIONALS and token not in EXCLUDE:
            kept.append(token)
    return kept

# Load neighborhood polygons (with area columns added) and the venue table.
GDF = gpd.read_file(NEI_PATH)
GDF = compute_area_km2(ensure_wgs84(GDF))
V = pd.read_csv(VENUES_CSV)
V['tokens'] = V['cuisine'].apply(tokenize)

# Per neighborhood: venue count and number of distinct national-cuisine tokens.
nei = (
    V.groupby(['district_id', 'neighborhood'], dropna=False)
    .agg(
        n_venues=('cuisine', 'size'),
        n_types=('tokens', lambda s: len(set().union(*s)) if len(s) else 0),
    )
    .reset_index()
)
# Left-join onto the polygons so neighborhoods without venues are kept (as zeros).
nei = (
    GDF[['district_id', 'district', 'neighborhood', 'area_eff_km2']]
    .merge(nei, on=['district_id', 'neighborhood'], how='left')
    .fillna({'n_venues': 0, 'n_types': 0})
)

# Density, percentile scores and the weighted vibrancy index.
nei['venues_per_km2'] = (nei['n_venues'] / nei['area_eff_km2']).replace([np.inf, -np.inf], np.nan)
nei['V_score'] = percentile_score(nei['venues_per_km2'])
nei['C_score'] = percentile_score(nei['n_types'])
nei['VV_index'] = 0.65 * nei['V_score'] + 0.35 * nei['C_score']

# Tercile cut-offs: top third -> 'vibrant', bottom third -> 'sparse', the rest
# (including NaN, for which both comparisons are False) -> 'average'.
q1 = np.nanpercentile(nei['VV_index'], 33.333)
q2 = np.nanpercentile(nei['VV_index'], 66.666)
nei['vibrancy_label'] = [
    'vibrant' if v >= q2 else ('sparse' if v <= q1 else 'average')
    for v in nei['VV_index']
]

# Long-format label export for this notebook.
nei_out = pd.DataFrame({
    'district': nei['district'],
    'neighborhood': nei['neighborhood'],
    'hashtags': [hashtags('venues', label) for label in nei['vibrancy_label']],
    'source': 'venues.ipynb:rule-based',
})
nei_out.to_csv(OUT_DIR / 'neighborhood_labels_venues.csv', index=False)
# Merge into the combined neighborhoods long file (idempotent).
# Rows previously written under this notebook's source are replaced rather than
# kept: otherwise a neighborhood whose label changed between runs would carry
# both its stale and its current hashtag.
nei_long_path = OUT_DIR/'berlin_neighborhoods_labels_long.csv'
if nei_long_path.exists():
    _old = pd.read_csv(nei_long_path)
    # Keep only rows that come from other sources.
    _old = _old[~_old['source'].isin(nei_out['source'].unique())]
    _new = pd.concat([_old, nei_out], ignore_index=True)
else:
    _new = nei_out.copy()
_new = _new.drop_duplicates(subset=['district','neighborhood','hashtags','source'])
_new.to_csv(nei_long_path, index=False)
# Update the combined neighborhoods wide file (idempotent).
# Columns computed by this notebook are refreshed with the current values;
# columns and rows that exist only in the file on disk are preserved.
nei_wide_cols = ['district','neighborhood','area_eff_km2','n_venues','n_types','venues_per_km2','V_score','C_score','VV_index','vibrancy_label']
nei_wide = nei[nei_wide_cols].copy()
nei_wide_path = OUT_DIR/'berlin_neighborhoods_labels_wide.csv'
if nei_wide_path.exists():
    _nw = pd.read_csv(nei_wide_path)
    _nw = _nw.merge(nei_wide, on=['district','neighborhood'], how='outer', suffixes=('', '_new'))
    for c in ['area_eff_km2','n_venues','n_types','venues_per_km2','V_score','C_score','VV_index','vibrancy_label']:
        # A '<col>_new' column exists only when the file already had '<col>'.
        if c+'_new' in _nw.columns:
            # Freshly computed value wins; the stored value is only a fallback
            # (the previous order kept stale values and never updated them).
            _nw[c] = _nw[c+'_new'].combine_first(_nw[c])
            _nw = _nw.drop(columns=[c+'_new'])
else:
    _nw = nei_wide
_nw.to_csv(nei_wide_path, index=False)

# Per district: venue count and number of distinct national-cuisine tokens.
dist = (
    V.groupby(['district_id'], dropna=False)
    .agg(
        n_venues=('cuisine', 'size'),
        n_types=('tokens', lambda s: len(set().union(*s)) if len(s) else 0),
    )
    .reset_index()
)
# District area is the SUM of its neighborhoods' areas. (Taking the first
# neighborhood row per district, as before, divided district-wide venue counts
# by a single neighborhood's area and inflated the density.)
_dist_area = (
    pd.DataFrame(GDF[['district_id', 'district', 'area_km2']])
    .groupby('district_id', sort=False, dropna=False, as_index=False)
    .agg(district=('district', 'first'), area_km2=('area_km2', 'sum'))
)
# Same 0.20 km² floor as used for the neighborhood effective area.
_dist_area['area_eff_km2'] = _dist_area['area_km2'].clip(lower=0.20)
# Left-join so districts without venues are kept (as zeros).
dist = (
    _dist_area[['district_id', 'district', 'area_eff_km2']]
    .merge(dist, on='district_id', how='left')
    .fillna({'n_venues': 0, 'n_types': 0})
)

# Density, percentile scores and the weighted vibrancy index.
dist['venues_per_km2'] = (dist['n_venues']/dist['area_eff_km2']).replace([np.inf,-np.inf], np.nan)
dist['V_score'] = percentile_score(dist['venues_per_km2']); dist['C_score'] = percentile_score(dist['n_types'])
dist['VV_index'] = 0.65*dist['V_score'] + 0.35*dist['C_score']

# Tercile cut-offs: top third -> 'vibrant', bottom third -> 'sparse', rest (and NaN) -> 'average'.
q1d,q2d = np.nanpercentile(dist['VV_index'], 33.333), np.nanpercentile(dist['VV_index'], 66.666)
dist['vibrancy_label'] = dist['VV_index'].apply(lambda v: 'vibrant' if (not np.isnan(v) and v>=q2d) else ('sparse' if (not np.isnan(v) and v<=q1d) else 'average'))

# Long-format label export for this notebook.
dist_out = pd.DataFrame({
    'district': dist['district'],
    'hashtags': [hashtags('venues', v) for v in dist['vibrancy_label']],
    'source': 'venues.ipynb:rule-based'
})
dist_out.to_csv(OUT_DIR/'district_labels_venues.csv', index=False)
# Merge into the combined districts long file (idempotent).
# Rows previously written under this notebook's source are replaced rather than
# kept: otherwise a district whose label changed between runs would carry both
# its stale and its current hashtag.
dist_long_path = OUT_DIR/'berlin_districts_labels_long.csv'
if dist_long_path.exists():
    _old = pd.read_csv(dist_long_path)
    # Keep only rows that come from other sources.
    _old = _old[~_old['source'].isin(dist_out['source'].unique())]
    _new = pd.concat([_old, dist_out], ignore_index=True)
else:
    _new = dist_out.copy()
_new = _new.drop_duplicates(subset=['district','hashtags','source'])
_new.to_csv(dist_long_path, index=False)
# Update the combined districts wide file (idempotent).
# Columns computed by this notebook are refreshed with the current values;
# columns and rows that exist only in the file on disk are preserved.
dist_wide_cols = ['district','area_eff_km2','n_venues','n_types','venues_per_km2','V_score','C_score','VV_index','vibrancy_label']
dist_wide = dist[dist_wide_cols].copy()
dist_wide_path = OUT_DIR/'berlin_districts_labels_wide.csv'
if dist_wide_path.exists():
    _wd = pd.read_csv(dist_wide_path)
    _wd = _wd.merge(dist_wide, on=['district'], how='outer', suffixes=('', '_new'))
    for c in ['area_eff_km2','n_venues','n_types','venues_per_km2','V_score','C_score','VV_index','vibrancy_label']:
        # A '<col>_new' column exists only when the file already had '<col>'.
        if c+'_new' in _wd.columns:
            # Freshly computed value wins; the stored value is only a fallback
            # (the previous order kept stale values and never updated them).
            _wd[c] = _wd[c+'_new'].combine_first(_wd[c])
            _wd = _wd.drop(columns=[c+'_new'])
else:
    _wd = dist_wide
_wd.to_csv(dist_wide_path, index=False)

# Maps
# Fill colour per vibrancy label.
CAT = {
    'vibrant': '#1a9641',
    'average': '#a6d96a',
    'sparse': '#fee08b',
}
def style_cat(f, col):
    """Return the style dict for GeoJSON feature ``f``, coloured by property ``col``.

    String property values are lower-cased before the CAT lookup; values with
    no entry in CAT (including a missing property) get the grey '#cccccc' fill.
    """
    value = f['properties'].get(col)
    if isinstance(value, str):
        key = str(value).lower()
    else:
        key = value
    return {
        'fillColor': CAT.get(key, '#cccccc'),
        'color': '#555',
        'weight': 0.5,
        'fillOpacity': 0.75,
    }
# Neighborhood map: polygons joined with their vibrancy label, saved as HTML.
g_nei = GDF.merge(
    nei[['district', 'neighborhood', 'vibrancy_label']],
    on=['district', 'neighborhood'],
)
m_nei = folium.Map(location=[52.52, 13.405], zoom_start=10, tiles='cartodbpositron')
folium.GeoJson(
    g_nei,
    style_function=lambda feature: style_cat(feature, 'vibrancy_label'),
    tooltip=folium.GeoJsonTooltip(fields=['neighborhood', 'district', 'vibrancy_label']),
).add_to(m_nei)
m_nei.save(str(OUT_DIR / 'venues_map_neighborhoods.html'))

# District map: neighborhood polygons dissolved per district, then labelled.
dist_polys = GDF.dissolve(by=['district'], as_index=False)
g_dist = dist_polys.merge(dist[['district', 'vibrancy_label']], on='district')
m_dist = folium.Map(location=[52.52, 13.405], zoom_start=10, tiles='cartodbpositron')
folium.GeoJson(
    g_dist,
    style_function=lambda feature: style_cat(feature, 'vibrancy_label'),
    tooltip=folium.GeoJsonTooltip(fields=['district', 'vibrancy_label']),
).add_to(m_dist)
m_dist.save(str(OUT_DIR / 'venues_map_districts.html'))
