# Documentation — Hashtags and Folium Map

Computes green share for neighborhoods (parks only) and districts (parks + forest) and produces:
- CSVs with `hashtags` like `#low_green_share` / `#average_green_share` / `#high_green_share`.
- Folium maps visualizing the categories.

## Hashtag logic
- Neighborhoods:
  - Sum park area from `parks.csv` (`size_sqm`, converted from m² to km²) per neighborhood → `green_area_km2`.
  - Compute `green_share` = `green_area_km2` / neighborhood `area_km2`.
  - Label by a median band: `< median-0.03` → `below average`, `> median+0.03` → `above average`, else `average`.
  - Map to tags via `parks_tag_nei()` → `#low_green_share` / `#average_green_share` / `#high_green_share`.
- Districts:
  - Merge parks area (sum of neighborhoods) with regional statistics forest area (ha) and total area (ha).
  - `green_share` = (parks_ha + forest_ha) / total_area_ha.
  - Apply the same median±band labeling and map to the same three tags.

## Folium visualization
- Build a `folium.Map` and add a `GeoJson` layer colored by the green-share class.
- Tooltips show district/neighborhood and class; saved to `labels_with_visualization/outputs/parks_map_*.html`.


# Parks Labels — Neighborhoods and Districts
Self-contained notebook to compute green share labels and export CSVs with schemas:
- neighborhoods: [district, neighborhood, hashtags, source]
- districts: [district, hashtags, source]

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
import folium

# Config: the project root is the first of (cwd, cwd's parent) that contains a
# 'data' directory, so the paths below work from this notebook's location.
for ROOT in (Path.cwd(), Path.cwd().parent):
    if (ROOT / 'data').exists():
        break
else:
    raise FileNotFoundError(f"Couldn't locate 'data' directory from {Path.cwd()}")

# Input files
RAW_DIR = ROOT / 'data' / 'raw'
NEI_PATH = ROOT / 'data' / 'neighborhoods.geojson'
PARKS_CSV = RAW_DIR / 'parks.csv'
REG_STATS_CSV = RAW_DIR / 'regional_statistics.csv'

# Output directory (relative to the current working directory); created on demand.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

def ensure_wgs84(gdf):
    """Return ``gdf`` with EPSG:4326 as its CRS.

    - no CRS set: 4326 is assigned with ``set_crs``;
    - CRS already resolves to EPSG 4326: ``gdf`` is returned as-is;
    - any other CRS: the frame is converted with ``to_crs(4326)``.
    """
    crs = gdf.crs
    if crs is None:
        return gdf.set_crs(4326)
    if crs.to_epsg() == 4326:
        return gdf
    return gdf.to_crs(4326)
def compute_area_km2(gdf):
    """Add ``area_km2`` and ``area_eff_km2`` columns to ``gdf`` and return it.

    Areas are measured on a copy converted to EPSG:25833 and divided by 1e6.
    ``area_eff_km2`` is the same area with a floor of 0.20.
    The columns are written onto the frame that was passed in (in place).
    """
    projected = ensure_wgs84(gdf).to_crs(25833)
    area_km2 = projected.geometry.area / 1e6
    # .values drops the index so assignment is positional, not index-aligned.
    gdf['area_km2'] = area_km2.values
    gdf['area_eff_km2'] = gdf['area_km2'].clip(lower=0.20)
    return gdf
def hashtags(theme, label):
    """Build a ``'#<theme>;#<label>'`` string.

    ``label`` is stringified, stripped, lower-cased and its spaces are
    replaced by hyphens; ``theme`` is used verbatim.
    """
    slug = str(label).strip().lower().replace(' ', '-')
    return '#' + theme + ';' + '#' + slug
# Translate a neighborhood green-share label into its single hashtag.
def parks_tag_nei(lbl: str) -> str:
    """Return the hashtag for a green-share label.

    The label is stringified, stripped and lower-cased before lookup.
    ``'below average'`` and ``'above average'`` map to the low/high tags;
    every other value falls back to ``'#average_green_share'``.
    """
    tag_by_label = {
        'below average': '#low_green_share',
        'above average': '#high_green_share',
    }
    return tag_by_label.get(str(lbl).strip().lower(), '#average_green_share')

# Neighborhood polygons in WGS84, annotated with their area in km².
GDF = gpd.read_file(NEI_PATH)
GDF = compute_area_km2(ensure_wgs84(GDF))

# Park sizes come in m² (`size_sqm`); convert to km² and total per neighborhood.
parks_raw = pd.read_csv(PARKS_CSV)
parks_raw['green_area_km2'] = parks_raw['size_sqm'].astype(float).div(1e6)
agg = (
    parks_raw
    .groupby(['district_id', 'neighborhood'], dropna=False)['green_area_km2']
    .sum()
    .reset_index()
)

# Left join: neighborhoods without any park row get a green area of 0.
nei = GDF[['district_id', 'district', 'neighborhood', 'area_km2']].merge(
    agg, on=['district_id', 'neighborhood'], how='left'
)
nei = nei.fillna({'green_area_km2': 0.0})
# Infinite ratios (zero area) are treated as missing.
nei['green_share'] = nei['green_area_km2'].div(nei['area_km2']).replace([np.inf, -np.inf], np.nan)

# "Average" band: median share ± 0.03 (NaN shares ignored for the median).
med = np.nanmedian(nei['green_share'])
lower = med - 0.03
upper = med + 0.03
def label_g(v):
    """Classify a green share against the module-level ``lower``/``upper`` band.

    Returns ``'below average'`` when ``v < lower``, ``'above average'`` when
    ``v > upper`` and ``'average'`` otherwise. NaN is also labelled ``'average'``.
    """
    if not np.isnan(v):
        if v < lower:
            return 'below average'
        if v > upper:
            return 'above average'
    return 'average'
# Classify every neighborhood against the median band.
nei['green_share_label'] = nei['green_share'].apply(label_g)

# Export schema: [district, neighborhood, hashtags, source]
nei_out = nei[['district', 'neighborhood']].assign(
    hashtags=list(map(parks_tag_nei, nei['green_share_label'])),
    source='parks.ipynb:rule-based',
)
nei_out.to_csv(OUT_DIR / 'neighborhood_labels_parks.csv', index=False)
# Append to combined neighborhoods long (idempotent)
# Rows already in the file are kept; rows produced by this run are appended.
nei_long_path = OUT_DIR/'berlin_neighborhoods_labels_long.csv'
if nei_long_path.exists():
    _old = pd.read_csv(nei_long_path)
    _new = pd.concat([_old, nei_out], ignore_index=True)
else:
    _new = nei_out.copy()
# This notebook emits exactly one tag per neighborhood, so the identity of a row
# is (district, neighborhood, source). De-duplicating on that key with
# keep='last' lets the tag from this run replace an older one from the same
# source. Including 'hashtags' in the key would leave the stale tag in the file
# whenever a neighborhood's label changes between runs.
_new = _new.drop_duplicates(subset=['district','neighborhood','source'], keep='last')
_new.to_csv(nei_long_path, index=False)
# Update/append to neighborhoods wide table (idempotent)
# Columns owned by this notebook are refreshed from the current run; any other
# columns already present in the file are left untouched.
nei_wide_cols = ['district','neighborhood','green_area_km2','area_km2','green_share','green_share_label']
nei_wide = nei[nei_wide_cols].copy()
nei_wide_path = OUT_DIR/'berlin_neighborhoods_labels_wide.csv'
if nei_wide_path.exists():
    _nw = pd.read_csv(nei_wide_path)
    # Outer merge keeps rows that exist on only one side; columns present on
    # both sides arrive twice, the fresh copy carrying the '_new' suffix.
    _nw = _nw.merge(nei_wide, on=['district','neighborhood'], how='outer', suffixes=('', '_new'))
    for c in ['green_area_km2','area_km2','green_share','green_share_label']:
        if c+'_new' in _nw.columns:
            # Fresh value wins; the stored value only fills gaps. The reverse
            # order would freeze the first value ever written, so the table
            # would never pick up changed inputs.
            _nw[c] = _nw[c+'_new'].combine_first(_nw[c])
            _nw = _nw.drop(columns=[c+'_new'])
else:
    _nw = nei_wide
_nw.to_csv(nei_wide_path, index=False)

# District-level green share: parks (summed from neighborhoods) + forest
# (taken from the regional statistics further below).
# Step 1: total park area per district in km².
parks_area_km2_by_dist = (
    nei.groupby('district', dropna=False)['green_area_km2']
    .sum()
    .rename('parks_area_km2')
    .reset_index()
)
# Step 2: same totals in hectares (km² × 100), keeping only the two needed columns.
parks_area_ha_by_dist = parks_area_km2_by_dist[['district']].copy()
parks_area_ha_by_dist['green_area_parks_ha'] = parks_area_km2_by_dist['parks_area_km2'] * 100
# Regional statistics: forest and total area (prefer 2024 if present; else latest)
rs = pd.read_csv(REG_STATS_CSV)
# Normalise headers so the column detection below ignores case and padding.
rs.columns = [c.strip().lower() for c in rs.columns]
# Each logical column may appear under one of several names; None = not present.
dcol = next((c for c in ('district', 'bezirk') if c in rs.columns), None)
ycol = next((c for c in ('year', 'jahr') if c in rs.columns), None)
fcol = next((c for c in ('forest_area_ha', 'forest_ha') if c in rs.columns), None)
tcol = next((c for c in ('total_area_ha', 'area_total_ha') if c in rs.columns), None)
sfcol = 'share_forest' if 'share_forest' in rs.columns else None
if dcol is None or (fcol is None and sfcol is None):
    raise ValueError('regional_statistics.csv must provide district and either forest_area_ha or share_forest')
if ycol:
    # Parse the year column once and keep the result index-aligned with `rs`.
    # A mask built from a dropna()'d copy no longer matches rs's index and
    # makes boolean indexing fail as soon as one year value is unparseable.
    years = pd.to_numeric(rs[ycol], errors='coerce')
    if years.notna().any():
        target_year = 2024 if years.eq(2024).any() else years.max()
        # Rows with an unparseable year compare False and are dropped.
        rs = rs[years.eq(target_year)]
# Reduce the statistics to the detected columns and standardise the district name.
keep = [dcol] + [c for c in (fcol, tcol, sfcol) if c]
rs_slim = rs[keep].rename(columns={dcol: 'district'})

# Total area (ha): from the statistics when available, otherwise from the polygons.
if tcol:
    rs_slim['total_area_ha'] = pd.to_numeric(rs_slim[tcol], errors='coerce')
else:
    # Sum the neighborhood areas per district. Reading `area_km2` off a
    # dissolved frame would return only the first neighborhood's value,
    # because dissolve aggregates attribute columns with 'first' by default.
    total_km2 = (
        GDF.groupby('district', as_index=False)['area_km2'].sum()
        .rename(columns={'area_km2': 'total_area_km2'})
    )
    rs_slim = rs_slim.merge(total_km2, on='district', how='left')
    rs_slim['total_area_ha'] = rs_slim['total_area_km2'] * 100

# Forest area (ha): read it from whichever column name was detected
# ('forest_area_ha' or 'forest_ha'); otherwise derive it as share × total area.
# NOTE(review): the derivation assumes share_forest is a fraction (0–1), as the
# original multiplication did — confirm against the data.
if fcol:
    rs_slim['forest_area_ha'] = pd.to_numeric(rs_slim[fcol], errors='coerce')
elif sfcol:
    rs_slim['forest_area_ha'] = pd.to_numeric(rs_slim[sfcol], errors='coerce') * rs_slim['total_area_ha']
else:
    rs_slim['forest_area_ha'] = np.nan
# Join park totals onto the regional statistics; districts without a park
# total get 0 ha of parks.
dist2 = rs_slim.merge(parks_area_ha_by_dist, on='district', how='left')
dist2 = dist2.fillna({'green_area_parks_ha': 0.0})
# Total green area = parks + forest (both in ha).
dist2['green_area_total_ha'] = pd.to_numeric(dist2['green_area_parks_ha'], errors='coerce').add(
    pd.to_numeric(dist2['forest_area_ha'], errors='coerce')
)
# Share of the district's total area; infinite ratios are treated as missing.
dist2['green_share'] = (
    dist2['green_area_total_ha']
    .div(pd.to_numeric(dist2['total_area_ha'], errors='coerce'))
    .replace([np.inf, -np.inf], np.nan)
)
# Three buckets around the district median (± 0.03); missing shares count as average.
medd = np.nanmedian(dist2['green_share'])
l, u = medd - 0.03, medd + 0.03


def _district_share_label(v):
    """Return the low/average/high label for one district green share."""
    if pd.notna(v) and v < l:
        return 'low_green_share'
    if pd.notna(v) and v > u:
        return 'high_green_share'
    return 'average_green_share'


dist2['green_share_label'] = dist2['green_share'].apply(_district_share_label)
def to_tag(lbl):
    """Return the hashtag for a district label.

    The three known labels map to themselves prefixed with ``#``; anything
    else falls back to ``'#average_green_share'``.
    """
    known_labels = {'low_green_share', 'average_green_share', 'high_green_share'}
    if lbl in known_labels:
        return '#' + lbl
    return '#average_green_share'
# Export schema: [district, hashtags, source]
dist_out = dist2[['district']].copy()
dist_out['hashtags'] = dist2['green_share_label'].map(to_tag)
dist_out['source'] = 'parks.ipynb:rule-based'
dist_out.to_csv(OUT_DIR / 'district_labels_parks.csv', index=False)
# Append to combined districts long (idempotent)
# Rows already in the file are kept; rows produced by this run are appended.
dist_long_path = OUT_DIR/'berlin_districts_labels_long.csv'
if dist_long_path.exists():
    _old = pd.read_csv(dist_long_path)
    _new = pd.concat([_old, dist_out], ignore_index=True)
else:
    _new = dist_out.copy()
# This notebook emits exactly one tag per district, so the identity of a row is
# (district, source). De-duplicating on that key with keep='last' lets the tag
# from this run replace an older one from the same source. Including 'hashtags'
# in the key would leave the stale tag in the file whenever a district's label
# changes between runs.
_new = _new.drop_duplicates(subset=['district','source'], keep='last')
_new.to_csv(dist_long_path, index=False)
# Update districts wide table
# Columns owned by this notebook are refreshed from the current run; any other
# columns already present in the file are left untouched.
dist_wide_cols = ['district','green_share','green_share_label','green_area_total_ha','forest_area_ha','green_area_parks_ha','total_area_ha']
dist_wide = dist2[dist_wide_cols].copy()
dist_wide_path = OUT_DIR/'berlin_districts_labels_wide.csv'
if dist_wide_path.exists():
    _wd = pd.read_csv(dist_wide_path)
    # Outer merge keeps districts that exist on only one side; columns present
    # on both sides arrive twice, the fresh copy carrying the '_new' suffix.
    _wd = _wd.merge(dist_wide, on=['district'], how='outer', suffixes=('', '_new'))
    for c in ['green_share','green_share_label','green_area_total_ha','forest_area_ha','green_area_parks_ha','total_area_ha']:
        if c+'_new' in _wd.columns:
            # Fresh value wins; the stored value only fills gaps. The reverse
            # order would freeze the first value ever written, so the table
            # would never pick up changed inputs.
            _wd[c] = _wd[c+'_new'].combine_first(_wd[c])
            _wd = _wd.drop(columns=[c+'_new'])
else:
    _wd = dist_wide
_wd.to_csv(dist_wide_path, index=False)

# Maps
# One palette serves both label vocabularies: neighborhood labels
# (above/below/average) and district labels (high/average/low_green_share).
CAT = {
    # neighborhood labels
    'above average': '#1a9641',
    'average': '#a6d96a',
    'below average': '#fee08b',
    # district labels
    'high_green_share': '#1a9641',
    'average_green_share': '#a6d96a',
    'low_green_share': '#fee08b',
}


def style_cat(f, col):
    """Return the folium style dict for GeoJSON feature ``f``.

    The fill colour is looked up in ``CAT`` from ``f['properties'][col]``
    (string values are lower-cased first); values without an entry, including
    a missing property, are filled with ``'#cccccc'``.
    """
    value = f['properties'].get(col)
    key = value.lower() if isinstance(value, str) else value
    return {
        'fillColor': CAT.get(key, '#cccccc'),
        'color': '#555',
        'weight': 0.5,
        'fillOpacity': 0.75,
    }
# Neighborhood map: polygons coloured by green-share label, saved as HTML.
m_nei = folium.Map(location=[52.52, 13.405], zoom_start=10, tiles='cartodbpositron')
g_nei = GDF.merge(
    nei[['district', 'neighborhood', 'green_share_label']],
    on=['district', 'neighborhood'],
)
folium.GeoJson(
    g_nei,
    style_function=lambda feature: style_cat(feature, 'green_share_label'),
    tooltip=folium.GeoJsonTooltip(fields=['neighborhood', 'district', 'green_share_label']),
).add_to(m_nei)
m_nei.save(str(OUT_DIR / 'parks_map_neighborhoods.html'))
# District map: neighborhoods dissolved into district polygons, coloured by
# the district green-share label, saved as HTML.
m_dist = folium.Map(location=[52.52, 13.405], zoom_start=10, tiles='cartodbpositron')
dist_polys = GDF.dissolve(by=['district'], as_index=False)
g_dist = dist_polys.merge(
    dist2[['district', 'green_share_label']],
    on='district',
)
folium.GeoJson(
    g_dist,
    style_function=lambda feature: style_cat(feature, 'green_share_label'),
    tooltip=folium.GeoJsonTooltip(fields=['district', 'green_share_label']),
).add_to(m_dist)
m_dist.save(str(OUT_DIR / 'parks_map_districts.html'))
