# Documentation — Base and Composite Labels + Map

Builds district-level base labels (income, safety, unemployment, density, diversity), then composites, and renders a Folium map.

## Hashtag logic
- Base labels use tertile classification over districts:
  - `income_label`: `#low_income` / `#average_income` / `#high_income` from `weighted_median_income`.
  - `safety_label`: invert crime rate so fewer crimes ⇒ `#high_safety` (rate per 1,000 residents).
  - `unemployment_label`: `#low_unemployment` / `#average_unemployment` / `#high_unemployment`.
  - `density_label`: from population density (precomputed or `population/area`).
  - `diversity_label`: share with migrant background or non-German citizenship.
- Composites:
  - `income_safety_label`: combines `income_label` and `safety_label` into one of
    `#affluent_and_safe`, `#affluent_but_risky`, `#affordable_and_safe`, `#disadvantaged_and_risky`, `#mixed_income_safety`.
  - `urbanity_label`: combines `density_label` and `diversity_label` into
    `#vibrant_and_diverse`, `#dense_less_diverse`, `#spread_diverse`, `#quiet_and_homogeneous`, `#moderately_urban`.
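- The exported long-format CSV has a `hashtags` column that joins all of a district's labels with single spaces, e.g. `#high_income #low_safety #low_unemployment …`; both the wide and the long CSV also carry a `source` column.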

## Folium visualization
- Dissolve neighborhoods to district polygons.
- Create one layer per label group with a color palette for categories; tooltips show formatted values (e.g., income, crimes/1,000).
- Add a LayerControl and save the map HTML.


In [2]:
# Path config matching local notebooks
from pathlib import Path
import pandas as pd
import geopandas as gpd
import folium

# Look for the 'data' folder in the working directory first, then one level up.
for ROOT in (Path.cwd(), Path.cwd().parent):
    if (ROOT / 'data').exists():
        break
else:
    raise FileNotFoundError(f"Couldn't locate 'data' directory from {Path.cwd()}")

RAW_DIR = ROOT / 'data' / 'raw'
NEI_PATH = ROOT / 'data' / 'neighborhoods.geojson'

# Outputs go to an 'outputs' folder relative to the current working directory.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

def ensure_wgs84(gdf):
    """Return ``gdf`` with its CRS set to EPSG:4326.

    - No CRS defined: the frame is tagged as EPSG:4326 via ``set_crs``.
    - CRS differs from EPSG:4326: the frame is reprojected via ``to_crs``.
    - Already EPSG:4326: the same object is returned unchanged.
    """
    crs = gdf.crs
    if crs is None:
        return gdf.set_crs(4326)
    if crs.to_epsg() == 4326:
        return gdf
    return gdf.to_crs(4326)
def compute_area_km2(gdf):
    """Add ``area_km2`` and ``area_eff_km2`` columns to ``gdf`` and return it.

    The input frame is modified in place (both columns are assigned on ``gdf``
    itself). ``area_eff_km2`` is ``area_km2`` with a lower bound of 0.20.
    """
    # Areas are measured on a copy reprojected to EPSG:25833.
    # NOTE(review): the division by 1e6 assumes that CRS reports areas in m² — confirm.
    projected = ensure_wgs84(gdf).to_crs(25833)
    area_m2 = projected.geometry.area
    # `.values` drops the index so the assignment is positional.
    gdf['area_km2'] = (area_m2 / 1e6).values
    gdf['area_eff_km2'] = gdf['area_km2'].clip(lower=0.20)
    return gdf


Mounted at /content/drive


In [3]:
# ==== Utils: classification & label building ====
import pandas as pd
from pathlib import Path

# Provenance marker written to the `source` column of the exported wide and long tables.
SOURCE_TAG = "districts_with_income:rule-based"

def rate_per(numer, denom, per=1000.0):
    """Return ``numer / denom`` scaled to a rate per ``per`` units (default 1,000)."""
    ratio = numer / denom
    return ratio * per

def tertile_labels(series, tags_low_mid_high, invert=False):
    """
    Split values into three (near-)equal groups by rank and label each value.

    Parameters
    ----------
    series : values to classify (anything ``pd.Series`` accepts).
    tags_low_mid_high : tuple/list with 3 hashtags for low / middle / high values.
    invert : if True the meaning is flipped (lowest values get the third tag),
        e.g. for 'safety', where a low crime rate means ``#high_safety``.

    Returns
    -------
    pd.Series of tags with the same index (and name) as the input. Missing
    input values get ``None`` instead of a tag.

    Notes
    -----
    - Ties are broken by order of appearance (``rank(method="first")``).
    - Only non-missing values are counted when sizing the groups.
    - When the count is not divisible by 3, the extra items go to the lower
      groups first (e.g. 5 -> 2/2/1, 12 -> 4/4/4).
    """
    values = pd.Series(series)
    tags = list(tags_low_mid_high)
    if invert:  # e.g. safety: low rate => #high_safety
        ordered = [tags[2], tags[1], tags[0]]
    else:
        ordered = [tags[0], tags[1], tags[2]]

    # NaN inputs keep a NaN rank, so they can be skipped below instead of
    # crashing an integer cast.
    ranks = values.rank(method="first", ascending=True)
    n_valid = int(ranks.notna().sum())

    # (rank - 1) * 3 // n_valid maps ranks 1..n onto bins 0, 1, 2 in integer
    # arithmetic; the result never exceeds 2, so no clipping is needed.
    labels = [
        ordered[(int(r) - 1) * 3 // n_valid] if pd.notna(r) else None
        for r in ranks
    ]
    return pd.Series(labels, index=values.index, dtype=object, name=values.name)

def combine_tags(*tags):
    """Join hashtags with single spaces, keeping only the first occurrence of each.

    Arguments that are not strings, or that are empty / whitespace-only, are
    ignored. The original order of the remaining tags is preserved.
    """
    unique = []
    for tag in tags:
        if not isinstance(tag, str) or not tag.strip():
            continue
        if tag not in unique:
            unique.append(tag)
    return " ".join(unique)

def build_base_labels(df_enriched: pd.DataFrame) -> pd.DataFrame:
    """
    Build the base labels: income, safety, unemployment, density, diversity.

    Expects columns: district, weighted_median_income, total_crimes, population,
                     unemployment_2023, total_area_km2 (or population_density_per_km2),
                     migration_background.

    Returns a new frame holding `district` and, for each theme, the raw value,
    a display string (`*_fmt`, 'n/a' for missing values) and the tertile label.
    """
    # --- display formatters (missing values always render as 'n/a') ---
    def fmt_income(v):
        if pd.notna(v):
            return f"{int(v):,} €".replace(',', ' ')
        return 'n/a'

    def fmt_crimes(v):
        if pd.notna(v):
            return f"{v:.1f} crimes / 1,000"
        return 'n/a'

    def fmt_unemployed(v):
        if pd.notna(v):
            return f"{v:.1f} unemployed / 1,000"
        return 'n/a'

    def fmt_density(v):
        if pd.notna(v):
            return f"{v:,.0f} people / km²".replace(',', ' ')
        return 'n/a'

    def fmt_share(v):
        if pd.notna(v):
            return f"{v*100:.1f}".replace('.', ',') + '%'
        return 'n/a'

    labels = pd.DataFrame({"district": df_enriched["district"]})

    # Income
    labels["income_value_eur"] = df_enriched["weighted_median_income"]
    labels["income_value_eur_fmt"] = labels["income_value_eur"].apply(fmt_income)
    income_tags = ("#low_income", "#average_income", "#high_income")
    labels["income_label"] = tertile_labels(labels["income_value_eur"], income_tags)

    # Safety (a lower crime rate => #high_safety, hence invert=True)
    crimes_per_1000 = rate_per(df_enriched["total_crimes"], df_enriched["population"], per=1000)
    labels["crimes_per_1000"] = crimes_per_1000
    labels["crimes_per_1000_fmt"] = labels["crimes_per_1000"].apply(fmt_crimes)
    safety_tags = ("#low_safety", "#average_safety", "#high_safety")
    labels["safety_label"] = tertile_labels(crimes_per_1000, safety_tags, invert=True)

    # Unemployment
    unemployed_per_1000 = rate_per(df_enriched["unemployment_2023"], df_enriched["population"], per=1000)
    labels["unemployment_per_1000"] = unemployed_per_1000
    labels["unemployment_per_1000_fmt"] = labels["unemployment_per_1000"].apply(fmt_unemployed)
    unemployment_tags = ("#low_unemployment", "#average_unemployment", "#high_unemployment")
    labels["unemployment_label"] = tertile_labels(unemployed_per_1000, unemployment_tags)

    # Density: use the precomputed column when present, else population / area
    if "population_density_per_km2" in df_enriched.columns:
        density = df_enriched["population_density_per_km2"]
    else:
        density = df_enriched["population"] / df_enriched["total_area_km2"]
    labels["density_per_km2"] = density
    labels["density_per_km2_fmt"] = labels["density_per_km2"].apply(fmt_density)
    density_tags = ("#low_density", "#average_density", "#high_density")
    labels["density_label"] = tertile_labels(density, density_tags)

    # Diversity
    diversity_share = df_enriched["migration_background"] / df_enriched["population"]
    labels["diversity_share"] = diversity_share
    labels["diversity_share_pct_fmt"] = labels["diversity_share"].apply(fmt_share)
    diversity_tags = ("#low_diversity", "#average_diversity", "#high_diversity")
    labels["diversity_label"] = tertile_labels(diversity_share, diversity_tags)

    return labels

def build_composites(df_wide: pd.DataFrame) -> pd.DataFrame:
    """Add only the requested composite labels: income_safety_label & urbanity_label.

    Returns a copy of ``df_wide`` with both columns appended. Any pair of base
    labels not listed in the rule tables (including 'average' or missing
    labels) falls back to the neutral tag of its group.
    """
    income_safety_rules = {
        ("#high_income", "#high_safety"): "#affluent_and_safe",
        ("#high_income", "#low_safety"): "#affluent_but_risky",
        ("#low_income", "#high_safety"): "#affordable_and_safe",
        ("#low_income", "#low_safety"): "#disadvantaged_and_risky",
    }
    urbanity_rules = {
        ("#high_density", "#high_diversity"): "#vibrant_and_diverse",
        ("#high_density", "#low_diversity"): "#dense_less_diverse",
        ("#low_density", "#high_diversity"): "#spread_diverse",
        ("#low_density", "#low_diversity"): "#quiet_and_homogeneous",
    }

    def resolve(rules, first, second, fallback):
        # Only genuine strings can match a rule; NaN/None go to the fallback.
        if isinstance(first, str) and isinstance(second, str):
            return rules.get((first, second), fallback)
        return fallback

    result = df_wide.copy()
    result["income_safety_label"] = result.apply(
        lambda row: resolve(
            income_safety_rules, row["income_label"], row["safety_label"],
            "#mixed_income_safety",
        ),
        axis=1,
    )
    result["urbanity_label"] = result.apply(
        lambda row: resolve(
            urbanity_rules, row["density_label"], row["diversity_label"],
            "#moderately_urban",
        ),
        axis=1,
    )
    return result

def to_long(df_wide: pd.DataFrame, include_cols=None) -> pd.DataFrame:
    """
    Build the long format (district, hashtags, source) from selected label columns.

    Parameters
    ----------
    df_wide : wide label table; must contain a ``district`` column.
    include_cols : label columns to combine; defaults to all base and
        composite label columns. Columns missing from ``df_wide`` are skipped.

    Returns
    -------
    DataFrame with ``district``, ``hashtags`` (the row's labels joined by
    ``combine_tags``) and ``source`` (``SOURCE_TAG``); the index of
    ``df_wide`` is kept.
    """
    if include_cols is None:
        include_cols = [
            "income_label", "safety_label", "unemployment_label",
            "density_label", "diversity_label",
            "income_safety_label", "urbanity_label",
        ]
    label_cols = [c for c in include_cols if c in df_wide.columns]
    df = df_wide[["district"] + label_cols].copy()
    # Iterate rows by position. The previous `df.loc[i, c]` over `range(len(df))`
    # treated positions as index labels, which breaks (KeyError or tags attached
    # to the wrong district) whenever the index is not 0..n-1, e.g. after
    # sorting or filtering.
    df["hashtags"] = [
        combine_tags(*row) for row in df[label_cols].to_numpy(dtype=object)
    ]
    return df[["district", "hashtags"]].assign(source=SOURCE_TAG)

def save_outputs(df_wide: pd.DataFrame, path: Path):
    """Write the wide and long label tables as CSV files into ``path``.

    A ``source`` column (``SOURCE_TAG``) is added to a copy of ``df_wide``
    before writing; the caller's frame is left untouched.

    Returns the tuple ``(wide_with_source, long)``.
    """
    wide = df_wide.assign(source=SOURCE_TAG)
    wide_file = path / "berlin_districts_labels_wide.csv"
    wide.to_csv(wide_file, index=False)

    long_table = to_long(wide)
    long_file = path / "berlin_districts_labels_long.csv"
    long_table.to_csv(long_file, index=False)

    return wide, long_table


In [4]:
# ==== Pipeline: load -> base labels -> composites -> save & map ====
# 1) Load the enriched district table
enriched_file = RAW_DIR / 'berlin_districts_enriched.csv'
df_enriched = pd.read_csv(enriched_file)

# 2) Base labels & composites
df_wide = build_base_labels(df_enriched)
df_wide = build_composites(df_wide)

# 3) Save (wide & long) in local outputs; df_wide now also carries the `source` column
df_wide, df_long = save_outputs(df_wide, OUT_DIR)

# 4) Quick look at the results
display(df_wide.sort_values("district").head(12))
display(df_long.sort_values("district").head(12))
print("Saved:", OUT_DIR / "berlin_districts_labels_wide.csv")
print("Saved:", OUT_DIR / "berlin_districts_labels_long.csv")

# 5) Folium map with toggles
# Fill colors per label column: {label column -> {hashtag -> hex color}}.
# The 'average' / neutral category of every group uses the same grey (#B9B9B9).
LABEL_COLORS = {
    'income_label': {
        '#low_income':      '#D55E00',
        '#average_income':  '#B9B9B9',
        '#high_income':     '#009E73',
    },
    'safety_label': {
        '#low_safety':      '#D55E00',
        '#average_safety':  '#B9B9B9',
        '#high_safety':     '#0072B2',
    },
    'unemployment_label': {
        '#high_unemployment':   '#D55E00',
        '#average_unemployment': '#B9B9B9',
        '#low_unemployment':     '#009E73',
    },
    'density_label': {
        '#low_density':     '#56B4E9',
        '#average_density': '#B9B9B9',
        '#high_density':    '#0072B2',
    },
    'diversity_label': {
        '#low_diversity':       '#56B4E9',
        '#average_diversity':   '#B9B9B9',
        '#high_diversity':      '#CC79A7',
    },
    'income_safety_label': {
        '#affluent_and_safe':        '#009E73',
        '#affluent_but_risky':       '#D55E00',
        '#affordable_and_safe':      '#56B4E9',
        '#disadvantaged_and_risky':  '#CC79A7',
        '#mixed_income_safety':      '#B9B9B9',
    },
    'urbanity_label': {
        '#vibrant_and_diverse':  '#0072B2',
        '#dense_less_diverse':   '#E69F00',
        '#spread_diverse':       '#009E73',
        '#quiet_and_homogeneous':'#CC79A7',
        '#moderately_urban':     '#B9B9B9',
    },
}
def color_for(label: str, category: str, default: str='#666666') -> str:
    """Look up the fill color for a hashtag within one label column.

    ``category`` is the label column name (a key of ``LABEL_COLORS``) and
    ``label`` the hashtag value inside it. ``default`` is returned when either
    the column or the hashtag is unknown.
    """
    palette = LABEL_COLORS.get(category, {})
    return palette.get(label, default)

# Read the neighborhood polygons, normalise them to EPSG:4326 and add the area columns.
G = compute_area_km2(ensure_wgs84(gpd.read_file(NEI_PATH)))
# Dissolve neighborhoods into one polygon per district.
# NOTE(review): the merge below always joins on 'district', so the
# ['district_id']-only fallback would fail at the merge — confirm the
# GeoJSON always has a 'district' column.
by_cols = ['district','district_id'] if 'district' in G.columns else ['district_id']
DIST = G.dissolve(by=by_cols, as_index=False)
# Left join keeps every district polygon even when it has no labels.
DIST_L = DIST.merge(df_wide, on='district', how='left')
# Ensure GeoDataFrame type after merge
DIST_L = gpd.GeoDataFrame(DIST_L, geometry='geometry', crs=DIST.crs)

m = folium.Map(location=[52.52, 13.405], zoom_start=10, tiles='cartodbpositron')
# Label columns that actually exist in df_wide; one map layer is built per column.
present = [c for c in ['income_label','safety_label','unemployment_label','density_label','diversity_label','income_safety_label','urbanity_label'] if c in df_wide.columns]
# Map label -> value fields and aliases for tooltip
value_fields = {
    'income_label': ['income_value_eur_fmt'],
    'safety_label': ['crimes_per_1000_fmt'],
    'unemployment_label': ['unemployment_per_1000_fmt'],
    'density_label': ['density_per_km2_fmt'],
    'diversity_label': ['diversity_share_pct_fmt'],
    'income_safety_label': ['income_value_eur_fmt','crimes_per_1000_fmt'],
    'urbanity_label': ['density_per_km2_fmt','diversity_share_pct_fmt'],
}
# Human-readable tooltip captions for the formatted value columns.
aliases_map = {
    'income_value_eur_fmt': 'Median income',
    'crimes_per_1000_fmt': 'Crimes per 1,000',
    'unemployment_per_1000_fmt': 'Unemployed per 1,000',
    'density_per_km2_fmt': 'Population density',
    'diversity_share_pct_fmt': 'Migrant background or non-German citizenship',
}
for col in present:
    # One toggleable layer per label column, created with show=False.
    fg = folium.FeatureGroup(name=col.replace('_label',''), show=False)
    # `col_=col` binds the current column as a default argument, so each
    # layer's style function keeps its own column rather than the loop's last one.
    def style_func(feat, col_=col):
        v = feat['properties'].get(col_)
        return {
            'fillColor': color_for(v, col_),
            'color': '#333333',
            'weight': 1,
            'fillOpacity': 0.7,
        }
    # Tooltip shows label + metric(s)
    fields = ['district', col] + [f for f in value_fields.get(col, []) if f in DIST_L.columns]
    # e.g. 'income_safety_label' -> 'Income Safety label'
    label_alias = col.replace('_', ' ').replace('label','').strip().title() + ' label'
    aliases = ['District', label_alias] + [aliases_map.get(f, f) for f in fields[2:]]
    # include geometry first; fields already include the label column
    folium.GeoJson(
        DIST_L[[c for c in (['geometry'] + fields) if c in DIST_L.columns]],
        style_function=style_func,
        tooltip=folium.GeoJsonTooltip(fields=fields, aliases=aliases)
    ).add_to(fg)
    fg.add_to(m)
# Layer switcher, then write the map as HTML into OUT_DIR.
folium.LayerControl(collapsed=False).add_to(m)
m.save(str(OUT_DIR/'districts_income_map.html'))


Unnamed: 0,district,income_label,safety_label,unemployment_label,density_label,diversity_label,income_safety_label,urbanity_label,source
3,Charlottenburg-Wilmersdorf,#high_income,#low_safety,#low_unemployment,#average_density,#high_diversity,#affluent_but_risky,#moderately_urban,districts_with_income:rule-based
9,Friedrichshain-Kreuzberg,#high_income,#low_safety,#high_unemployment,#high_density,#high_diversity,#affluent_but_risky,#vibrant_and_diverse,districts_with_income:rule-based
5,Lichtenberg,#low_income,#average_safety,#average_unemployment,#average_density,#average_diversity,#mixed_income_safety,#moderately_urban,districts_with_income:rule-based
8,Marzahn-Hellersdorf,#low_income,#high_safety,#average_unemployment,#average_density,#low_diversity,#affordable_and_safe,#moderately_urban,districts_with_income:rule-based
1,Mitte,#high_income,#low_safety,#high_unemployment,#high_density,#high_diversity,#affluent_but_risky,#vibrant_and_diverse,districts_with_income:rule-based
4,Neukölln,#low_income,#low_safety,#high_unemployment,#high_density,#high_diversity,#disadvantaged_and_risky,#vibrant_and_diverse,districts_with_income:rule-based
0,Pankow,#high_income,#high_safety,#low_unemployment,#average_density,#low_diversity,#affluent_and_safe,#moderately_urban,districts_with_income:rule-based
10,Reinickendorf,#average_income,#average_safety,#average_unemployment,#low_density,#average_diversity,#mixed_income_safety,#moderately_urban,districts_with_income:rule-based
11,Spandau,#low_income,#average_safety,#high_unemployment,#low_density,#average_diversity,#mixed_income_safety,#moderately_urban,districts_with_income:rule-based
6,Steglitz-Zehlendorf,#average_income,#high_safety,#low_unemployment,#low_density,#low_diversity,#mixed_income_safety,#quiet_and_homogeneous,districts_with_income:rule-based


Unnamed: 0,district,hashtags,source
3,Charlottenburg-Wilmersdorf,#high_income #low_safety #low_unemployment #av...,districts_with_income:rule-based
9,Friedrichshain-Kreuzberg,#high_income #low_safety #high_unemployment #h...,districts_with_income:rule-based
5,Lichtenberg,#low_income #average_safety #average_unemploym...,districts_with_income:rule-based
8,Marzahn-Hellersdorf,#low_income #high_safety #average_unemployment...,districts_with_income:rule-based
1,Mitte,#high_income #low_safety #high_unemployment #h...,districts_with_income:rule-based
4,Neukölln,#low_income #low_safety #high_unemployment #hi...,districts_with_income:rule-based
0,Pankow,#high_income #high_safety #low_unemployment #a...,districts_with_income:rule-based
10,Reinickendorf,#average_income #average_safety #average_unemp...,districts_with_income:rule-based
11,Spandau,#low_income #average_safety #high_unemployment...,districts_with_income:rule-based
6,Steglitz-Zehlendorf,#average_income #high_safety #low_unemployment...,districts_with_income:rule-based


Saved: /content/drive/My Drive/webeet/tables_from_db_csv/berlin_districts_labels_wide.csv
Saved: /content/drive/My Drive/webeet/tables_from_db_csv/berlin_districts_labels_long.csv
