# Documentation — Hashtags and Folium Map

Computes mobility for neighborhoods and districts using U-Bahn stations and bus/tram stops and produces:
- CSVs with a `hashtags` column such as `#mobility;#well-connected` (the district CSV has no `neighborhood` column).
- Folium maps at both levels.

## Hashtag logic
- Count features per neighborhood:
  - `ubahn_stations` and `bus_tram_stops` from raw CSVs.
  - Effective area `area_eff_km2` is the polygon area in km² clipped to a minimum of 0.20 km², so very small polygons do not yield inflated densities.
- Compute `connectivity_density` = (0.7 · `ubahn_stations` + 0.3 · `bus_tram_stops`) / `area_eff_km2`.
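- Convert to `mobility_score` via percentile scaling: the 10th percentile maps to 0 and the 90th percentile to 100, and values outside that range are clipped to 0 or 100.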
- Classify by terciles of `mobility_score`: top → `well-connected`, mid → `moderate`, bottom → `remote`.
- Export hashtags as `#mobility;#<label>` for neighborhoods and the aggregated district table.

## Folium visualization
- Build a `folium.Map` and add a `GeoJson` layer colored by `mobility_label`.
- Tooltips show connection density and score where available; maps saved to `labels_with_visualization/outputs/mobility_map_*.html`.


# Mobility Labels — Neighborhoods and Districts
Self-contained notebook to compute rule-based mobility labels (U-Bahn + Bus/Tram) and export CSVs:
- neighborhoods: columns [district, neighborhood, hashtags, source]
- districts: columns [district, hashtags, source]
Also saves two Folium maps (neighborhoods and districts).

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import folium
import branca

# Config: the project root is the first of (current working directory, its
# parent) that contains a 'data' directory, so paths resolve from this notebook.
ROOT = next(
    (base for base in (Path.cwd(), Path.cwd().parent) if (base / 'data').exists()),
    None,
)
if ROOT is None:
    raise FileNotFoundError(f"Couldn't locate 'data' directory from {Path.cwd()}")

# Inputs live under ROOT/data.
RAW_DIR = ROOT / 'data' / 'raw'
NEI_PATH = ROOT / 'data' / 'neighborhoods.geojson'
UBAHN_CSV = RAW_DIR / 'ubahns.csv'
BUS_TRAM_CSV = RAW_DIR / 'bus_tram_stops.csv'

# Outputs are written relative to the working directory (not ROOT); the
# directory is created on first use.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Helpers
def ensure_wgs84(gdf):
    """Return ``gdf`` expressed in EPSG:4326.

    * No CRS set: the frame is tagged as EPSG:4326 via ``set_crs`` (the
      coordinates themselves are not transformed).
    * CRS whose EPSG code is not 4326 (including a CRS with no EPSG match):
      the frame is reprojected with ``to_crs``.
    * Already EPSG:4326: the same object is returned unchanged.
    """
    crs = gdf.crs
    if crs is None:
        return gdf.set_crs(4326)
    if crs.to_epsg() == 4326:
        return gdf
    return gdf.to_crs(4326)

def compute_area_km2(gdf):
    """Add ``area_km2`` and ``area_eff_km2`` columns to ``gdf`` and return it.

    The geometry is projected to EPSG:25833 before measuring; the resulting
    area is divided by 1e6 (i.e. treated as square metres) to obtain km².
    ``area_eff_km2`` is ``area_km2`` with a lower bound of 0.20.

    Note: the columns are written onto the ``gdf`` object passed in, and that
    same object is returned.
    """
    projected = ensure_wgs84(gdf).to_crs(25833)
    area_km2 = projected.geometry.area / 1e6
    # Assign raw values so the result is positional, independent of index alignment.
    gdf['area_km2'] = area_km2.values
    gdf['area_eff_km2'] = gdf['area_km2'].clip(lower=0.20)
    return gdf

def to_points(df):
    """Build an EPSG:4326 point GeoDataFrame from a table with coordinate columns.

    The latitude column is the first column whose lower-cased name is one of
    ``lat``/``latitude``/``y``; the longitude column is the first whose
    lower-cased name is one of ``lon``/``lng``/``long``/``longitude``/``x``.

    Args:
        df: table holding one latitude-like and one longitude-like column.

    Returns:
        A GeoDataFrame with a copy of ``df``'s columns plus a point geometry
        built from (longitude, latitude), with CRS EPSG:4326.

    Raises:
        ValueError: if no latitude or no longitude column can be found.
    """
    def _pick(aliases, kind):
        # str() keeps non-string column labels (e.g. integers) from crashing the lookup.
        for col in df.columns:
            if str(col).lower() in aliases:
                return col
        raise ValueError(
            f"No {kind} column found; expected one of {sorted(aliases)}, "
            f"got columns {list(df.columns)}"
        )

    lat = _pick({'lat', 'latitude', 'y'}, 'latitude')
    lon = _pick({'lon', 'lng', 'long', 'longitude', 'x'}, 'longitude')
    # The frame is created directly in EPSG:4326, so no further CRS handling is needed.
    return gpd.GeoDataFrame(
        df.copy(),
        geometry=gpd.points_from_xy(df[lon], df[lat]),
        crs=4326,
    )

def percentile_score(s, lo=10, hi=90):
    """Rescale ``s`` to 0-100 between its ``lo``-th and ``hi``-th percentiles.

    Values at or below the ``lo`` percentile become 0, values at or above the
    ``hi`` percentile become 100, and values in between scale linearly.
    Percentiles ignore NaN. When both percentiles coincide, a span of 1e-9 is
    used so the division stays defined.
    """
    values = s.astype(float)
    low_cut = np.nanpercentile(values, lo)
    high_cut = np.nanpercentile(values, hi)
    span = max(high_cut - low_cut, 1e-9)
    unit = (values - low_cut) / span
    return unit.clip(0, 1) * 100.0

def tercile_label(v, q1, q2, hi_label, mid_label, lo_label):
    """Map ``v`` to one of three labels using the cut points ``q1`` and ``q2``.

    ``v >= q2`` gives ``hi_label`` (checked first, so it wins when the cut
    points coincide), ``v <= q1`` gives ``lo_label``, anything else —
    including NaN — gives ``mid_label``.
    """
    if not np.isnan(v):
        if v >= q2:
            return hi_label
        if v <= q1:
            return lo_label
    return mid_label

def compute_labels_neighborhoods(gdf_nei, df_ubahn, df_bus):
    """Count stops per polygon and derive density, score and mobility label.

    Works on a copy of ``gdf_nei``. Both stop tables are converted to points
    and spatially joined to the polygons with ``predicate='within'``. The
    returned frame holds the polygon columns plus ``poly_index``,
    ``ubahn_stations``, ``bus_tram_stops``, ``total_stops``,
    ``connectivity_density``, ``mobility_score`` and ``mobility_label``.
    """
    polygons = compute_area_km2(ensure_wgs84(gdf_nei.copy()))
    stop_layers = {
        'ubahn_stations': to_points(df_ubahn),
        'bus_tram_stops': to_points(df_bus),
    }

    # Expose the row index as a 'poly_index' column so point counts can be merged back.
    base = polygons.reset_index().rename(columns={'index': 'poly_index'})
    join_target = base[['poly_index', 'geometry']]

    per_layer_counts = []
    for count_col, points in stop_layers.items():
        matched = gpd.sjoin(points, join_target, how='inner', predicate='within')
        per_layer_counts.append(
            matched.groupby('poly_index').size().rename(count_col).reset_index()
        )

    out = base
    for counts in per_layer_counts:
        out = out.merge(counts, on='poly_index', how='left')

    # Polygons without any matched point come out of the left merge as NaN: count them as 0.
    count_cols = list(stop_layers)
    out[count_cols] = out[count_cols].fillna(0).astype(int)
    out['total_stops'] = out['ubahn_stations'] + out['bus_tram_stops']

    weighted_stops = 0.7 * out['ubahn_stations'] + 0.3 * out['bus_tram_stops']
    out['connectivity_density'] = weighted_stops / out['area_eff_km2']
    out['mobility_score'] = percentile_score(out['connectivity_density'])

    q1 = np.nanpercentile(out['mobility_score'], 33.333)
    q2 = np.nanpercentile(out['mobility_score'], 66.666)
    out['mobility_label'] = [
        tercile_label(score, q1, q2, 'well-connected', 'moderate', 'remote')
        for score in out['mobility_score']
    ]
    return out

def hashtags_from_label(theme, label):
    """Return ``'#<theme>;#<label>'``.

    ``label`` is converted to a string, stripped, lower-cased and its spaces
    are replaced with hyphens; ``theme`` is used as given.
    """
    slug = str(label).strip().lower().replace(' ', '-')
    return ';'.join('#' + part for part in (theme, slug))

# Load data: neighborhood polygons (in EPSG:4326, with area columns) and the raw stop tables.
GDF = compute_area_km2(ensure_wgs84(gpd.read_file(NEI_PATH)))
df_ubahn = pd.read_csv(UBAHN_CSV)
df_bus = pd.read_csv(BUS_TRAM_CSV)

nei_labels = compute_labels_neighborhoods(GDF, df_ubahn, df_bus)

# Neighborhood output table: columns [district, neighborhood, hashtags, source].
# When there is no 'district' column, the stringified 'district_id' stands in for it.
if 'district' in nei_labels.columns:
    _nei_district = nei_labels['district']
else:
    _nei_district = nei_labels['district_id'].astype(str)
_nei_hashtags = [hashtags_from_label('mobility', lbl) for lbl in nei_labels['mobility_label']]
nei_out = pd.DataFrame({
    'district': _nei_district,
    'neighborhood': nei_labels['neighborhood'],
    'hashtags': _nei_hashtags,
    'source': 'mobility.ipynb:rule-based',
})
nei_out.to_csv(OUT_DIR / 'neighborhood_labels_mobility.csv', index=False)
# Append to the combined neighborhoods long table. Dropping exact duplicate
# rows afterwards keeps re-runs from growing the file.
nei_long_path = OUT_DIR / 'berlin_neighborhoods_labels_long.csv'
if not nei_long_path.exists():
    _new = nei_out.copy()
else:
    _old = pd.read_csv(nei_long_path)
    _new = pd.concat([_old, nei_out], ignore_index=True)
_new = _new.drop_duplicates(subset=['district', 'neighborhood', 'hashtags', 'source'])
_new.to_csv(nei_long_path, index=False)
# Update neighborhoods wide (idempotent)
# Metric columns owned by this notebook; 'district' and 'neighborhood' are the merge keys.
nei_metric_cols = ['ubahn_stations','bus_tram_stops','area_eff_km2','connectivity_density','mobility_score','mobility_label']
nei_wide_cols = ['district','neighborhood'] + nei_metric_cols
if 'district' in nei_labels.columns:
    nei_wide = nei_labels[nei_wide_cols].copy()
else:
    # Same fallback as the neighborhood output table: use the stringified
    # district_id as the 'district' key instead of failing with a KeyError.
    nei_wide = nei_labels[nei_wide_cols[1:]].copy()
    nei_wide.insert(0, 'district', nei_labels['district_id'].astype(str))
nei_wide_path = OUT_DIR/'berlin_neighborhoods_labels_wide.csv'
if nei_wide_path.exists():
    _nw = pd.read_csv(nei_wide_path)
    # Columns already present in the stored file clash with the new ones; the
    # new side of each clash gets the '_new' suffix.
    _nw = _nw.merge(nei_wide, on=['district','neighborhood'], how='outer', suffixes=('', '_new'))
    for c in nei_metric_cols:
        if c+'_new' in _nw.columns:
            # Freshly computed values take precedence so a re-run refreshes the
            # file; stored values are kept only where this run has no value.
            _nw[c] = _nw[c+'_new'].combine_first(_nw[c])
            _nw = _nw.drop(columns=[c+'_new'])
else:
    _nw = nei_wide
_nw.to_csv(nei_wide_path, index=False)

# District aggregation: sum stop counts and effective areas per district, then
# apply the same density -> score -> tercile label steps as for neighborhoods.
_dist_keys = ['district', 'district_id'] if 'district' in nei_labels.columns else ['district_id']
_dist_sum_cols = ['ubahn_stations', 'bus_tram_stops', 'area_eff_km2']
grp = nei_labels.groupby(_dist_keys, dropna=False)[_dist_sum_cols].sum().reset_index()

_dist_weighted = 0.7 * grp['ubahn_stations'] + 0.3 * grp['bus_tram_stops']
grp['connectivity_density'] = _dist_weighted / grp['area_eff_km2']
grp['mobility_score'] = percentile_score(grp['connectivity_density'])
q1d = np.nanpercentile(grp['mobility_score'], 33.333)
q2d = np.nanpercentile(grp['mobility_score'], 66.666)
grp['mobility_label'] = [
    tercile_label(score, q1d, q2d, 'well-connected', 'moderate', 'remote')
    for score in grp['mobility_score']
]

# District output table: columns [district, hashtags, source]; the stringified
# 'district_id' stands in when there is no 'district' column.
if 'district' in grp.columns:
    _dist_name = grp['district']
else:
    _dist_name = grp['district_id'].astype(str)
dist_out = pd.DataFrame({
    'district': _dist_name,
    'hashtags': [hashtags_from_label('mobility', lbl) for lbl in grp['mobility_label']],
    'source': 'mobility.ipynb:rule-based',
})
dist_out.to_csv(OUT_DIR / 'district_labels_mobility.csv', index=False)
# Append to the combined districts long table. Dropping exact duplicate rows
# afterwards keeps re-runs from growing the file.
dist_long_path = OUT_DIR / 'berlin_districts_labels_long.csv'
if not dist_long_path.exists():
    _new = dist_out.copy()
else:
    _old = pd.read_csv(dist_long_path)
    _new = pd.concat([_old, dist_out], ignore_index=True)
_new = _new.drop_duplicates(subset=['district', 'hashtags', 'source'])
_new.to_csv(dist_long_path, index=False)
# Update districts wide (idempotent)
# Metric columns owned by this notebook; 'district' is the merge key.
dist_metric_cols = ['ubahn_stations','bus_tram_stops','area_eff_km2','connectivity_density','mobility_score','mobility_label']
dist_wide_cols = ['district'] + dist_metric_cols
grp2 = grp.copy()
if 'district' not in grp2.columns and 'district_id' in grp2.columns:
    # If no district name column, derive a string id for key consistency
    grp2['district'] = grp2['district_id'].astype(str)
dist_wide = grp2[dist_wide_cols].copy()
dist_wide_path = OUT_DIR/'berlin_districts_labels_wide.csv'
if dist_wide_path.exists():
    _wd = pd.read_csv(dist_wide_path)
    # Columns already present in the stored file clash with the new ones; the
    # new side of each clash gets the '_new' suffix.
    _wd = _wd.merge(dist_wide, on=['district'], how='outer', suffixes=('', '_new'))
    for c in dist_metric_cols:
        if c+'_new' in _wd.columns:
            # Freshly computed values take precedence so a re-run refreshes the
            # file; stored values are kept only where this run has no value.
            _wd[c] = _wd[c+'_new'].combine_first(_wd[c])
            _wd = _wd.drop(columns=[c+'_new'])
else:
    _wd = dist_wide
_wd.to_csv(dist_wide_path, index=False)

# Folium maps
# Fill colour per mobility label.
CAT_PALETTE = {
    'well-connected': '#1a9641',
    'moderate': '#a6d96a',
    'remote': '#fee08b',
}


def style_cat(feature, column):
    """Return the style dict for one GeoJSON feature.

    The fill colour is looked up in ``CAT_PALETTE`` using the lower-cased
    string form of the feature property ``column``; unknown or missing values
    fall back to ``'#cccccc'``. Outline colour, weight and opacity are fixed.
    """
    label = str(feature['properties'].get(column)).lower()
    return {
        'fillColor': CAT_PALETTE.get(label, '#cccccc'),
        'color': '#555555',
        'weight': 0.5,
        'fillOpacity': 0.75,
    }

# Neighborhood map
nei_map = folium.Map(location=[52.52,13.405], zoom_start=10, tiles='cartodbpositron')
# Join labels to polygons on (neighborhood, district) when district names
# exist, otherwise on neighborhood alone.
nei_join_keys = ['neighborhood','district'] if 'district' in GDF.columns else ['neighborhood']
# Carry density and score along with the label so the tooltip can show them;
# they are rounded to 2 decimals for display only.
nei_tooltip_metrics = ['mobility_label','connectivity_density','mobility_score']
nei_attrs = nei_labels[nei_join_keys + nei_tooltip_metrics].copy()
nei_attrs[['connectivity_density','mobility_score']] = nei_attrs[['connectivity_density','mobility_score']].round(2)
g_nei = GDF.merge(nei_attrs, on=nei_join_keys)
# Only request tooltip fields that actually exist in the merged frame.
nei_tooltip_fields = [c for c in ['neighborhood','district'] + nei_tooltip_metrics if c in g_nei.columns]
folium.GeoJson(
    g_nei,
    style_function=lambda f, col='mobility_label': style_cat(f,col),
    tooltip=folium.GeoJsonTooltip(fields=nei_tooltip_fields),
).add_to(nei_map)
nei_map.save(str(OUT_DIR/'mobility_map_neighborhoods.html'))

# District map
# Dissolve neighborhood polygons into districts, keyed by (district, district_id)
# when district names exist, otherwise by district_id alone.
dist_join_keys = ['district','district_id'] if 'district' in GDF.columns else ['district_id']
dist_polys = GDF.dissolve(by=dist_join_keys, as_index=False)
# Carry density and score along with the label so the tooltip can show them;
# they are rounded to 2 decimals for display only.
dist_tooltip_metrics = ['mobility_label','connectivity_density','mobility_score']
dist_attrs = grp[dist_join_keys + dist_tooltip_metrics].copy()
dist_attrs[['connectivity_density','mobility_score']] = dist_attrs[['connectivity_density','mobility_score']].round(2)
g_dist = dist_polys.merge(dist_attrs, on=dist_join_keys)
dist_map = folium.Map(location=[52.52,13.405], zoom_start=10, tiles='cartodbpositron')
# Only request tooltip fields that actually exist in the merged frame.
dist_tooltip_fields = [c for c in ['district'] + dist_tooltip_metrics if c in g_dist.columns]
folium.GeoJson(
    g_dist,
    style_function=lambda f, col='mobility_label': style_cat(f,col),
    tooltip=folium.GeoJsonTooltip(fields=dist_tooltip_fields),
).add_to(dist_map)
dist_map.save(str(OUT_DIR/'mobility_map_districts.html'))


DataSourceError: outputs\neighborhoods.geojson: No such file or directory