# Documentation — Hashtags and Folium Map

This notebook loads district-level attributes (income, safety, unemployment, density, diversity, plus composites) and produces:
- per-theme CSVs with hashtags (e.g. `outputs/district_labels_income.csv`)
- a Folium map with one layer per label group (toggle via LayerControl).

## Hashtag logic
- Input columns like `income_label`, `safety_label`, `unemployment_label`, `density_label`, `diversity_label` are expected to contain category strings (e.g. `low_income`, `average_income`, `high_income`).
- They are normalized to hashtags using `to_hashtag()` (lowercased, spaces/dashes → `_`, prefixed with `#`).
- Composite labels are derived upstream (or merged in from the wide labels table) and each takes one of the following values:
  - `income_safety_label` → one of `#affluent_and_safe`, `#affluent_but_risky`, `#affordable_and_safe`, `#disadvantaged_and_risky`, or `#mixed_income_safety`.
  - `urbanity_label` → one of `#vibrant_and_diverse`, `#dense_less_diverse`, `#spread_diverse`, `#quiet_and_homogeneous`, or `#moderately_urban`.
- When exporting per-theme CSVs, each row carries a combined `hashtags` field like `#income;#high_income` where the first tag is the theme and the second is the normalized category.

## Folium visualization
- District polygons are obtained by dissolving neighborhoods (`neighborhoods.geojson`) by district.
- A color palette per label group is defined in `LABEL_COLORS` (Okabe–Ito colors plus a neutral grey for most groups; the vibrancy and mobility groups use a yellow-to-green ramp).
- For each present label column, a `FeatureGroup` is added to the map with a `GeoJson` layer:
  - `style_function` looks up the feature's category → hashtag → color.
  - `GeoJsonTooltip` shows human-friendly aliases (e.g. median income, crimes per 1,000).
- `LayerControl` lets you toggle layers for side-by-side comparison.

## Outputs
- CSVs under `labels_with_visualization/outputs/` (the code writes to `outputs/` relative to the notebook's working directory): one per theme present in the data.
- HTML map: `labels_with_visualization/outputs/districts_enriched_map.html`.


# District Labels — Enriched Dataset
Loads district-level labels from `data/raw/berlin_districts_enriched.csv`,
writes per-theme district label CSVs to `labels_with_visualization/outputs`,
and creates a Folium map with layer toggles for each label group.

In [None]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
import folium

# Config: locate the project root, i.e. the directory that contains 'data'.
# Only the current working directory and its direct parent are tried.
for ROOT in (Path.cwd(), Path.cwd().parent):
    if (ROOT / 'data').exists():
        break
else:
    raise FileNotFoundError(f"Couldn't locate 'data' directory from {Path.cwd()}")

# Input locations, all anchored at the resolved root.
RAW_DIR = ROOT / 'data' / 'raw'
CSV_PATH = RAW_DIR / 'berlin_districts_enriched.csv'
NEI_PATH = ROOT / 'data' / 'neighborhoods.geojson'  # neighborhoods, dissolved to districts later

# Output directory is relative to the working directory and created on demand.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Minimal geo helpers
def ensure_wgs84(gdf):
    """Return *gdf* expressed in EPSG:4326.

    A frame without a CRS is tagged as EPSG:4326 via ``set_crs``; a frame whose
    CRS does not report EPSG code 4326 is converted with ``to_crs``; a frame
    that already reports 4326 is returned unchanged.
    """
    current = gdf.crs
    if current is None:
        return gdf.set_crs(4326)
    if current.to_epsg() == 4326:
        return gdf
    return gdf.to_crs(4326)
def compute_area_km2(gdf):
    """Add ``area_km2`` and ``area_eff_km2`` columns to *gdf* in place and return it.

    Areas are measured on a copy projected to EPSG:25833 and divided by 1e6.
    ``area_eff_km2`` is ``area_km2`` with a lower bound of 0.20.
    """
    projected = ensure_wgs84(gdf).to_crs(25833)
    area_scaled = projected.geometry.area / 1e6
    # .values hands over a plain array, so the assignment is positional
    # rather than aligned on the index.
    gdf['area_km2'] = area_scaled.values
    floored = gdf['area_km2'].clip(lower=0.20)
    gdf['area_eff_km2'] = floored
    return gdf

# Color maps for Folium, keyed by label column and then by normalized hashtag.
# Most groups use Okabe–Ito colors plus a neutral grey; the vibrancy and
# mobility groups use a yellow-to-green ramp instead.
# Keys must match the output of to_hashtag(), which lowercases and turns
# spaces and dashes into underscores.
LABEL_COLORS = {
    'income_label': {
        '#low_income':      '#D55E00',
        '#average_income':  '#B9B9B9',
        '#high_income':     '#009E73',
    },
    'safety_label': {
        '#low_safety':      '#D55E00',
        '#average_safety':  '#B9B9B9',
        '#high_safety':     '#0072B2',
    },
    'unemployment_label': {
        '#high_unemployment':   '#D55E00',
        '#average_unemployment': '#B9B9B9',
        '#low_unemployment':     '#009E73',
    },
    'density_label': {
        '#low_density':     '#56B4E9',
        '#average_density': '#B9B9B9',
        '#high_density':    '#0072B2',
    },
    'diversity_label': {
        '#low_diversity':       '#56B4E9',
        '#average_diversity':   '#B9B9B9',
        '#high_diversity':      '#CC79A7',
    },
    # composites
    'income_safety_label': {
        '#affluent_and_safe':        '#009E73',
        '#affluent_but_risky':       '#D55E00',
        '#affordable_and_safe':      '#56B4E9',
        '#disadvantaged_and_risky':  '#CC79A7',
        '#mixed_income_safety':      '#B9B9B9',
    },
    'urbanity_label': {
        '#vibrant_and_diverse':  '#0072B2',
        '#dense_less_diverse':   '#E69F00',
        '#spread_diverse':       '#009E73',
        '#quiet_and_homogeneous':'#CC79A7',
        '#moderately_urban':     '#B9B9B9',
    },
    # parks (districts)
    'green_share_label': {
        '#low_green_share':     '#56B4E9',
        '#average_green_share': '#B9B9B9',
        '#high_green_share':    '#0072B2',
    },
    # venues vibrancy
    'vibrancy_label': {
        '#sparse':  '#fee08b',
        '#average': '#a6d96a',
        '#vibrant': '#1a9641',
    },
    # mobility
    'mobility_label': {
        '#remote':         '#fee08b',
        '#moderate':       '#a6d96a',
        # to_hashtag() rewrites '-' as '_', so the underscore form is the key
        # that normalized values actually produce.
        '#well_connected': '#1a9641',
        # Dashed spelling kept as an alias for direct lookups.
        '#well-connected': '#1a9641',
    },
    # playgrounds density
    'playgrounds_density_label': {
        '#low_playground_density':     '#56B4E9',
        '#average_playground_density': '#B9B9B9',
        '#high_playground_density':    '#0072B2',
    },
}

def to_hashtag(label: str) -> str:
    """Normalize a category string to a hashtag.

    The text is stripped, lowercased, and has spaces and dashes replaced by
    underscores; a leading ``#`` is added unless one is already present.
    Missing values (per ``pd.isna``) become ``'#unknown'``.
    """
    if pd.isna(label):
        return '#unknown'
    cleaned = str(label).strip().lower()
    slug = cleaned.translate(str.maketrans({' ': '_', '-': '_'}))
    if slug.startswith('#'):
        return slug
    return '#' + slug
def tag_for_value(v, col):
    """Return the hashtag for raw label value *v* of label column *col*.

    Missing values give ``'#unknown'``. For the playground-density and
    green-share columns, known spellings (compared after strip + lowercase)
    are translated to that column's hashtag; every other value, and every
    other column, is passed through ``to_hashtag``.
    """
    if pd.isna(v):
        return '#unknown'
    normalized = str(v).strip().lower()
    # Per-column translations from raw spellings to hashtags.
    overrides = {
        'playgrounds_density_label': {
            'below average': '#low_playground_density',
            'average': '#average_playground_density',
            'above average': '#high_playground_density',
        },
        'green_share_label': {
            'below average': '#low_green_share',
            'average': '#average_green_share',
            'above average': '#high_green_share',
            'low_green_share': '#low_green_share',
            'average_green_share': '#average_green_share',
            'high_green_share': '#high_green_share',
        },
    }
    column_map = overrides.get(col, {})
    if normalized in column_map:
        return column_map[normalized]
    return to_hashtag(normalized)

def color_for(label: str, category: str, default: str='#666666') -> str:
    """Return the color for hashtag *label* within label group *category*.

    *category* is a key of ``LABEL_COLORS`` (a label column name). *default*
    is returned when either the group or the hashtag is not registered.
    """
    palette = LABEL_COLORS.get(category)
    if palette is None:
        return default
    return palette.get(label, default)

# Load neighborhood polygons (with per-neighborhood areas) and dissolve to districts
GDF = compute_area_km2(ensure_wgs84(gpd.read_file(NEI_PATH)))
by_cols = ['district','district_id'] if 'district' in GDF.columns else ['district_id']
# dissolve() keeps only the first neighborhood's value for non-geometry columns,
# so the area columns are recomputed from the merged district geometry.
DIST = compute_area_km2(GDF.dissolve(by=by_cols, as_index=False))

# Load enriched district labels
D = pd.read_csv(CSV_PATH)
# Fallback: if labels are not present in the enriched CSV, try merging from wide labels output
labels = ['income_label','safety_label','unemployment_label','density_label','diversity_label','income_safety_label','urbanity_label']
if not any(c in D.columns for c in labels):
    try:
        W = pd.read_csv(OUT_DIR/'berlin_districts_labels_wide.csv')
        use_cols = ['district'] + [c for c in labels if c in W.columns]
        D = D.merge(W[use_cols], on='district', how='left')
    except FileNotFoundError:
        # best-effort: the wide labels file is optional
        pass

# Determine join key: use every identifier column shared by both frames.
# Joining on only one while both frames carry the other would leave
# 'district_x'/'district_y' behind instead of a usable 'district' column.
key = [c for c in ('district_id', 'district') if c in D.columns and c in DIST.columns]
if not key:
    raise KeyError(
        "Expected 'district_id' or 'district' in enriched CSV to join to polygons."
    )

# Merge labels into district polygons for mapping
DIST_L = DIST.merge(D, on=key, how='left')
# Ensure GeoDataFrame type after merge
DIST_L = gpd.GeoDataFrame(DIST_L, geometry='geometry', crs=DIST.crs)
# Merge district-wide computed labels/metrics from outputs if available
DW_PATH = OUT_DIR/'berlin_districts_labels_wide.csv'
if DW_PATH.exists():
    W = pd.read_csv(DW_PATH)
    # Both sides need 'district' for this join; skip it otherwise.
    if 'district' in W.columns and 'district' in DIST_L.columns:
        # Only bring in columns that are not already present, to avoid suffixed duplicates.
        cols = ['district'] + [c for c in W.columns if c != 'district' and c not in DIST_L.columns]
        DIST_L = DIST_L.merge(W[cols], on='district', how='left')
# ===== Compute formatted value fields for tooltips (if source columns exist) =====
def fmt_eur(v):
    """Format *v* as whole euros with space-separated thousands, e.g. ``'52 340 €'``.

    Missing values (per ``pd.isna``) give ``'n/a'``. The amount is rounded to
    the nearest euro rather than truncated, so 1999.9 renders as ``'2 000 €'``.
    """
    if pd.isna(v):
        return 'n/a'
    # float() keeps numeric strings and numpy scalars working with the 'f' format.
    return f"{float(v):,.0f} €".replace(',', ' ')
def rate_per(numer, denom, per=1000.0):
    """Return *numer* divided by *denom*, scaled by *per* (default: per 1,000)."""
    share = numer / denom
    return share * per
def _na_or(render):
    """Wrap *render* so that missing values are shown as 'n/a'."""
    return lambda v: render(v) if pd.notna(v) else 'n/a'

# One formatter per metric; each produces the text shown in the tooltip.
_fmt_crimes = _na_or(lambda v: f"{v:.1f} crimes / 1,000")
_fmt_unemployed = _na_or(lambda v: f"{v:.1f} unemployed / 1,000")
_fmt_density = _na_or(lambda v: f"{v:,.0f} people / km²".replace(',', ' '))
_fmt_share = _na_or(lambda v: f"{v*100:.1f}".replace('.', ',') + '%')

# income
if 'weighted_median_income' in DIST_L.columns:
    DIST_L['income_value_eur_fmt'] = DIST_L['weighted_median_income'].apply(fmt_eur)
# safety
if {'total_crimes','population'}.issubset(DIST_L.columns):
    crime_rate = rate_per(DIST_L['total_crimes'], DIST_L['population'], 1000.0)
    DIST_L['crimes_per_1000_fmt'] = crime_rate.apply(_fmt_crimes)
# unemployment
if {'unemployment_2023','population'}.issubset(DIST_L.columns):
    unemployment_rate = rate_per(DIST_L['unemployment_2023'], DIST_L['population'], 1000.0)
    DIST_L['unemployment_per_1000_fmt'] = unemployment_rate.apply(_fmt_unemployed)
# density: prefer the precomputed column, otherwise derive it from population and area
density = None
if 'population_density_per_km2' in DIST_L.columns:
    density = DIST_L['population_density_per_km2']
elif {'population','total_area_km2'}.issubset(DIST_L.columns):
    density = DIST_L['population'] / DIST_L['total_area_km2']
if density is not None:
    DIST_L['density_per_km2_fmt'] = density.apply(_fmt_density)
# diversity
if {'migration_background','population'}.issubset(DIST_L.columns):
    migration_share = DIST_L['migration_background'] / DIST_L['population']
    DIST_L['diversity_share_pct_fmt'] = migration_share.apply(_fmt_share)

# Prepare outputs per label group (if present)
def write_district_labels(label_col: str, theme: str) -> bool:
    """Write ``district_labels_<theme>.csv`` into ``OUT_DIR`` for one label column.

    Reads the module-level ``D`` (enriched CSV) and ``DIST_L`` (merged district
    frame). Each output row holds the district identifier, a ``hashtags`` field
    of the form ``#<theme>;#<category>``, and a fixed ``source`` marker.

    The category tag comes from ``tag_for_value``, the same normalization the
    map styling uses, so the CSV and the map share one hashtag vocabulary.

    Returns ``False`` without writing when *label_col* is missing from the
    enriched CSV or from the merged frame, ``True`` after the file is written.
    """
    if label_col not in D.columns or label_col not in DIST_L.columns:
        return False
    # Fall back to the id when the merged frame carries no district name.
    id_col = 'district' if 'district' in DIST_L.columns else 'district_id'
    theme_tag = '#' + theme
    category_tags = DIST_L[label_col].map(lambda v: tag_for_value(v, label_col))
    dist_out = pd.DataFrame({
        'district': DIST_L[id_col],
        'hashtags': theme_tag + ';' + category_tags,
        'source': 'districts_enriched.ipynb:imported',
    })
    dist_out.to_csv(OUT_DIR/f'district_labels_{theme}.csv', index=False)
    return True

# Collect the label columns to map, and export a CSV for each one that
# originates from the enriched CSV.
present_labels = []
for col, theme in [
    ('income_label','income'),
    ('safety_label','safety'),
    ('unemployment_label','unemployment'),
    ('density_label','density'),
    ('diversity_label','diversity'),
    ('income_safety_label','income_safety'),
    ('urbanity_label','urbanity'),
    ('green_share_label','green_share'),
    ('vibrancy_label','vibrancy'),
    ('mobility_label','mobility'),
    ('playgrounds_density_label','playgrounds_density'),
]:
    # expose any layer present in the merged dataset; write per-source CSV only if the label comes from enriched CSV
    if col in DIST_L.columns:
        present_labels.append(col)
    if col in D.columns:
        try:
            write_district_labels(col, theme)
        except Exception as exc:
            # Best-effort export: a failed CSV must not block the map,
            # but the failure should be visible instead of silently dropped.
            print(f"[districts_enriched] could not write district_labels_{theme}.csv: {exc!r}")

# Build Folium map with layer toggles for each present label
m = folium.Map(location=[52.52, 13.405], zoom_start=10, tiles='cartodbpositron')
# Map label -> value fields and aliases for tooltip
value_fields = {
    'income_label': ['income_value_eur_fmt'],
    'safety_label': ['crimes_per_1000_fmt'],
    'unemployment_label': ['unemployment_per_1000_fmt'],
    'density_label': ['density_per_km2_fmt'],
    'diversity_label': ['diversity_share_pct_fmt'],
    'income_safety_label': ['income_value_eur_fmt','crimes_per_1000_fmt'],
    'urbanity_label': ['density_per_km2_fmt','diversity_share_pct_fmt'],
    'green_share_label': ['green_share'],
    'vibrancy_label': ['VV_index','venues_per_km2'],
    'mobility_label': ['connectivity_density','mobility_score'],
    'playgrounds_density_label': ['playgrounds_per_km2','n_playgrounds'],
}
aliases_map = {
    'income_value_eur_fmt': 'Median income',
    'crimes_per_1000_fmt': 'Crimes per 1,000',
    'unemployment_per_1000_fmt': 'Unemployed per 1,000',
    'density_per_km2_fmt': 'Population density',
    'diversity_share_pct_fmt': 'Migrant background or non-German citizenship',
    'green_share': 'Green share',
    'VV_index': 'Vibrancy index',
    'venues_per_km2': 'Venues per km^2',
    'connectivity_density': 'Transit connectivity density',
    'mobility_score': 'Mobility score',
    'playgrounds_per_km2': 'Playgrounds per km^2',
    'n_playgrounds': 'Playgrounds (count)',
}
# The tooltip may only name fields that exist in the layer's data, so use the
# district name when available and fall back to the id otherwise.
id_field = 'district' if 'district' in DIST_L.columns else 'district_id'
first = True  # only the first layer is shown initially
for col in present_labels:
    theme = col.replace('_label','')
    fg = folium.FeatureGroup(name=theme, show=first)
    # col is bound as a default argument so each layer styles by its own
    # column rather than by the loop variable's final value.
    def style_func(feat, col_=col):
        v = feat['properties'].get(col_)
        tag = tag_for_value(v, col_)
        return {
            'fillColor': color_for(tag, col_),
            'color': '#333333',
            'weight': 1,
            'fillOpacity': 0.7,
        }
    # Tooltip shows district, label and whichever metric columns are available
    metric_fields = [f for f in value_fields.get(col, []) if f in DIST_L.columns]
    fields = [id_field, col] + metric_fields
    label_alias = col.replace('_', ' ').replace('label','').strip().title() + ' label'
    aliases = ['District', label_alias] + [aliases_map.get(f, f) for f in metric_fields]
    # The layer data carries exactly the tooltip fields (the label column
    # among them, which style_func reads) plus the geometry.
    gj = folium.GeoJson(
        DIST_L[['geometry'] + fields],
        style_function=style_func,
        tooltip=folium.GeoJsonTooltip(fields=fields, aliases=aliases)
    )
    gj.add_to(fg)
    fg.add_to(m)
    first = False

folium.LayerControl(collapsed=False).add_to(m)
m.save(str(OUT_DIR/'districts_enriched_map.html'))
