In [140]:
import polars as pl
import json

# Create table of places

## Load data for places

In [141]:
places = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/places.csv"
)
places_to_types = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/places_to_types.csv"
)
feature_types = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/feature_types.csv"
)
event_types = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/events_to_types.csv"
)
locs = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/locations.csv"
)
places_to_attestations = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/places_to_attestation.csv"
)
places_to_establishment = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/places_to_establishment.csv",
    ignore_errors=True,
)
feature_class = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/feature_class.csv"
)
# Remove duplicate place entries
locs = locs.unique(subset=["place_id"], keep="first")

## Join tables to create places table

In [142]:
from re import sub


pt_df = places.join(places_to_types, on="place_id", how="left")
pt_df = pt_df.join(locs, on="place_id", how="left").unique(
    subset=["place_id"], keep="first"
)
pt_df = pt_df.rename({"ft_id": "yft_id"})
pt_df = pt_df.join(feature_types, on="yft_id", how="left", suffix="_ft_tp")
pt_df = pt_df.join(places_to_attestations, on="place_id", how="left")
pt_df = pt_df.join(places_to_establishment, on="place_id", how="left")
pt_df = pt_df.join(feature_class, on="fct_id", how="left")
pt_df = pt_df.select(
    [
        "place_id",
        "tr_title",
        "ch_pinyin",
        "latitude",
        "longitude",
        "attestation",
        "ch_title",
        "feature_class",
        "en_title",
        "est_year",
    ]
)
pt_df = pt_df.unique()
pt_df.head()

place_id,tr_title,ch_pinyin,latitude,longitude,attestation,ch_title,feature_class,en_title,est_year
str,str,str,f64,f64,str,str,str,str,i64
"""yrdb2440""","""細柳""","""xiliu""",108.656791,34.316499,"""us_20083""",,,,
"""yrdb2917""","""邵寨鎮""","""shaozhai zhen""",107.830991,35.006979,"""us_80560""","""鎮""","""habitations""","""town""",1111.0
"""yrdb3162""","""陳家林""","""chenjialin""",116.866667,36.75,"""ds_64""",,,,
"""yrdb2""","""交河""","""jiao he""",116.284383,38.018758,"""ds_1019""","""河""","""natural features""","""river""",
"""yrdb838""","""官菜園""","""guancaiyuan""",110.633429,37.411316,"""us_70351""",,,,1582.0


In [143]:
print(places.shape)
print(pt_df.shape)
pt_df = pt_df.sort("est_year", nulls_last=True).unique(
    subset=["place_id"], keep="first"
)
print(pt_df.shape)

(4225, 3)
(4738, 10)
(4225, 10)


## Split into Upstream and Downstream data

In [144]:
upstream = pt_df.filter(pl.col("attestation").str.starts_with("us"))
downstream = pt_df.filter(pl.col("attestation").str.starts_with("ds"))

In [145]:
import requests

# Join Upstream places with infromation
eras = requests.get(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/eras.geojson"
).json()["features"]
dyns = requests.get(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/dynasties.geojson"
).json()["features"]
dyns_df = pl.DataFrame([f["properties"] for f in dyns], orient="row")
eras_df = pl.DataFrame([f["properties"] for f in eras], orient="row")
dyns_df.head()

id,name_en,name_ch,start_cert,start_date,end_cert,end_date
i64,str,str,str,i64,str,i64
1,"""Xia dynasty ""","""夏朝""","""n""",-2070,"""n""",-1600
2,"""Shang dynasty ""","""商朝 ""","""n""",-1600,"""n""",-1046
3,"""Zhou dynasty ""","""周朝 ""","""n""",-1046,"""y""",-256
4,"""Qin dynasty ""","""秦朝 ""","""y""",-221,"""y""",-207
5,"""Han dynasty ""","""漢朝 ""","""y""",-202,"""y""",220


In [146]:
eras_df.head()

id,monarch,era_name_en,era_name_ch,era_start,era_end
str,str,str,str,i64,i64
"""I0019""","""I0001""","""Jianlong""","""建隆""",960,963
"""I0020""","""I0001""","""Qiande""","""乾德""",963,968
"""I0021""","""I0001""","""Kaibao""","""開寶""",968,976
"""I0022""","""I0002""","""Taipingxingguo""","""太平興國""",976,984
"""I0023""","""I0002""","""Yongxi""","""雍熙""",984,987


In [147]:
dyns_df = dyns_df.select(["name_en", "name_ch", "start_date", "end_date"]).rename(
    {"name_en": "dynasty_en", "name_ch": "dynasty_ch"}
)

In [148]:
dyn_en = []
dyn_ch = []
for row in upstream.iter_rows():
    est_year = row[9]
    if est_year is None:
        dyn_en.append(None)
        dyn_ch.append(None)
        continue
    matched_dyn = dyns_df.filter(
        (pl.col("start_date") <= est_year) & (pl.col("end_date") >= est_year)
    )
    if matched_dyn.is_empty():
        dyn_en.append(None)
        dyn_ch.append(None)
    else:
        dyn_en.append(matched_dyn[0, "dynasty_en"].strip())
        dyn_ch.append(matched_dyn[0, "dynasty_ch"].strip())

upstream = upstream.with_columns(
    [pl.Series("dynasty_en", dyn_en), pl.Series("dynasty_ch", dyn_ch)]
)

upstream.head()

place_id,tr_title,ch_pinyin,latitude,longitude,attestation,ch_title,feature_class,en_title,est_year,dynasty_en,dynasty_ch
str,str,str,f64,f64,str,str,str,str,i64,str,str
"""yrdb583""","""夏官營""","""xiaguan ying""",104.164822,35.948885,"""us_81117""","""營""","""habitations""","""encampment""",1820,"""Qing dynasty""","""清朝"""
"""yrdb2440""","""細柳""","""xiliu""",108.656791,34.316499,"""us_20083""",,,,497,"""Kings of Wuxing""","""武興王"""
"""yrdb717""","""始昌城""","""shichang cheng""",105.528745,34.199403,"""us_40192""","""城""","""habitations""","""walled settlement""",497,"""Kings of Wuxing""","""武興王"""
"""yrdb2705""","""街亭""","""jie ting""",106.187724,35.158936,"""us_30026""","""亭""","""habitations""","""neighborhood""",262,"""Three Kingdoms""","""三國"""
"""yrdb905""","""小峽""","""xiaoxia""",101.773278,36.51608,"""us_80325""",,,,1820,"""Qing dynasty""","""清朝"""


In [149]:
# convert upstream to json and write to upstream-data.json

upstream_data = []

for row in upstream.iter_rows():
    id = row[0]
    hz = row[1]
    py = row[2]
    x_coor = row[4]
    y_coor = row[3]
    date = row[9]
    regime = row[10]
    regime_ch = row[11]
    name_type = row[6]
    name_type_en = row[8]
    name_class_en = row[7]

    place_json = {
        "id": id,
        "hz": hz,
        "py": py,
        "x_coor": x_coor,
        "y_coor": y_coor,
        "date": date,
        "regime": regime,
        "regime_ch": regime_ch,
        "name_type": name_type,
        "name_type_en": name_type_en,
        "name_class_en": name_class_en,
    }
    upstream_data.append(place_json)

# with open("upstream-data.json", "w", encoding="utf-8") as f:
#     json.dump(upstream_data, f, ensure_ascii=False, indent=4)

# Create table for events

## Load data for events

In [150]:
import numpy as np

events = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/events.csv"
)
event_types = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/event_types.csv"
)
event_cats = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/event_categories.csv"
)

events_to_places = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/events_to_places.csv"
).rename({"event": "event_id"})
events_to_types = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/events_to_types.csv"
)
events_to_sources = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/sources_to_events.csv",
    ignore_errors=True,
)
sources = pl.read_csv(
    "https://raw.githubusercontent.com/YellowRiverDatabase/geodata/refs/heads/main/relational-data/sources.csv",
    null_values=["", "nan", "NaN", "NA", "null"],
    ignore_errors=True,
)
events_to_sources.head()

src_to_evt_id,source_id,event_id
str,i64,str
"""srev_1""",11001001,"""ev_1"""
"""srev_2""",11001002,"""ev_2"""
"""srev_3""",11001003,"""ev_3"""
"""srev_4""",11001004,"""ev_4"""
"""srev_5""",11001005,"""ev_5"""


In [151]:
events.head(1)

event_id,ch_date,western_date,description,notes
str,str,f64,str,str
"""ev_1""","""史前时代""",-2356.0,,


In [152]:
event_types.head(1)

event_type_id,zh_ch_title,en_title,en_type,evc_id,description
str,str,str,str,str,str
"""evtype_1""","""溢""","""yi""","""Flood""","""evc_1""","""Any time there is a zhang 漲(ra…"


In [153]:
events_to_places.head(1)

evtp_id,place_id,event_id,attestation
str,str,str,str
"""evtp_1""","""yrdb2""","""ev_858""","""ds_1019"""


In [154]:
events_to_types.head(1)

evetotyp_id,event_id,event_type_id
str,str,str
"""evetotyp_1""","""ev_1""","""evtype_1"""


In [155]:
event_cats.head(1)

evc_id,zh_cn_category,en_category
str,str,str
"""evc_1""","""水災""","""Disasters"""


In [156]:
sources.head(1)

source_id,source,page,chinese_date,western_date,old_placename_chinese,modern_placename_chinese,event_type_chinese,event_name,event_description,primary_source_1,primary_source_2,notes
i64,str,str,str,str,str,str,str,str,str,str,str,str
10100001,"""HDSJ""",,"""传说时代""","""约-21世纪初""",,,,"""大禹治水""","""传说中的尧舜时代，黄河流域发生大洪水，为制止洪水泛滥，尧召集…",,,


In [157]:
print(f"events length: {events.shape}")
ev_df = events.join(events_to_types, on="event_id", how="left", suffix="_ett")
ev_df = ev_df.join(event_types, on="event_type_id", how="left")
print(f"ev_df length: {ev_df.shape}")
# wanted columsn from event_types: zh_ch_title,	en_title, en_type
# wanted columns from events: event_id	ch_date	western_date	description	notes
ev_df = ev_df.select(
    [
        "event_id",
        "ch_date",
        "western_date",
        "description",
        "notes",
        "zh_ch_title",
        "en_title",
        "evc_id",
        "en_type",
    ]
)
ev_df = ev_df.rename(
    {
        "ch_date": "event_date_ch",
        "western_date": "event_date_western",
        "description": "event_description",
        "notes": "event_notes",
        "zh_ch_title": "event_type_ch",
        "en_title": "event_type_py",
        "en_type": "event_type_en",
    }
)
print(f"ev_df length: {ev_df.shape}")
ev_df.head()

events length: (3754, 5)
ev_df length: (5349, 12)
ev_df length: (5349, 9)


event_id,event_date_ch,event_date_western,event_description,event_notes,event_type_ch,event_type_py,evc_id,event_type_en
str,str,f64,str,str,str,str,str,str
"""ev_1""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood"""
"""ev_2""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood"""
"""ev_3""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood"""
"""ev_3""","""史前时代""",-2356.0,,,"""災""","""zai""","""evc_1""","""Disaster"""
"""ev_4""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood"""


In [158]:
ev_df = ev_df.join(event_cats, on="evc_id", how="left")
ev_df.head()

event_id,event_date_ch,event_date_western,event_description,event_notes,event_type_ch,event_type_py,evc_id,event_type_en,zh_cn_category,en_category
str,str,f64,str,str,str,str,str,str,str,str
"""ev_1""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood""","""水災""","""Disasters"""
"""ev_2""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood""","""水災""","""Disasters"""
"""ev_3""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood""","""水災""","""Disasters"""
"""ev_3""","""史前时代""",-2356.0,,,"""災""","""zai""","""evc_1""","""Disaster""","""水災""","""Disasters"""
"""ev_4""","""史前时代""",-2356.0,,,"""溢""","""yi""","""evc_1""","""Flood""","""水災""","""Disasters"""


In [159]:
ev_df = ev_df.rename(
    {"zh_cn_category": "type_category_ch", "en_category": "type_category_en"}
)

In [160]:
# group by event_id and aggregate types columns into lists
ev_df = ev_df.group_by(["event_id", "event_date_western"]).agg(
    pl.col("event_date_ch").first(),
    pl.col("event_description").first(),
    pl.col("event_notes").first(),
    pl.col("event_type_ch").implode(),
    pl.col("event_type_py").implode(),
    pl.col("event_type_en").implode(),
    pl.col("evc_id").implode(),
    pl.col("type_category_ch").implode(),
    pl.col("type_category_en").implode(),
)
print(f"original events length: {events.shape}")
print(f"ev_df length after groupby: {ev_df.shape}")
ev_df.head()

original events length: (3754, 5)
ev_df length after groupby: (3754, 11)


event_id,event_date_western,event_date_ch,event_description,event_notes,event_type_ch,event_type_py,event_type_en,evc_id,type_category_ch,type_category_en
str,f64,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str]
"""ev_2064""",1599.0,"""明神宗万历二十七年""",,,"[""決"", ""絕""]","[""jue"", ""jue""]","[""Breach"", ""Extinction (river dries up)""]","[""evc_1"", ""evc_1""]","[""水災"", ""水災""]","[""Disasters"", ""Disasters""]"
"""ev_2136""",1622.0,"""明熹宗天启二年""",,,"[""溢""]","[""yi""]","[""Flood""]","[""evc_1""]","[""水災""]","[""Disasters""]"
"""ev_1083""",1115.0,"""宋徽宗政和四年""",,,"[""修"", ""治""]","[""xiu"", ""zhi""]","[""Repair of Structures"", ""Management""]","[""evc_2"", ""evc_2""]","[""水利"", ""水利""]","[""Management"", ""Management""]"
"""ev_2354""",1672.0,"""清圣祖康熙十一年""",,,"[""決"", ""溢""]","[""jue"", ""yi""]","[""Breach"", ""Flood""]","[""evc_1"", ""evc_1""]","[""水災"", ""水災""]","[""Disasters"", ""Disasters""]"
"""ev_2860""",1762.0,"""清高宗乾隆二十七年""",,,"[""治""]","[""zhi""]","[""Management""]","[""evc_2""]","[""水利""]","[""Management""]"


In [161]:
# Join Sources
print(f"original events length: {events.shape}")
ev_df = ev_df.join(events_to_sources, on="event_id", how="left")
ev_df = ev_df.join(sources, on="source_id", how="left")
# ev_df columns: event_id, event_date_western, event_date_ch	event_description, event_notes, event_type_ch, event_type_py, event_type_en
# source columns: source, page, chinese_date, western_date, old_placename_chinese, modern_placename_chinese, event_type_chinese, event_name, event_description, primary_source_1, primary_source_2, notes
ev_df = ev_df.rename(
    {
        "page": "source_page",
        "chinese_date": "source_ch_date",
        "western_date": "source_western_date",
        "event_type_chinese": "source_event_type_chinese",
        "event_name": "source_event_name",
        "event_description_right": "source_event_description",
    }
)
ev_df.head()

original events length: (3754, 5)


event_id,event_date_western,event_date_ch,event_description,event_notes,event_type_ch,event_type_py,event_type_en,evc_id,type_category_ch,type_category_en,src_to_evt_id,source_id,source,source_page,source_ch_date,source_western_date,old_placename_chinese,modern_placename_chinese,source_event_type_chinese,source_event_name,source_event_description,primary_source_1,primary_source_2,notes
str,f64,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],str,i64,str,str,str,str,str,str,str,str,str,str,str,str
"""ev_2064""",1599.0,"""明神宗万历二十七年""",,,"[""決"", ""絕""]","[""jue"", ""jue""]","[""Breach"", ""Extinction (river dries up)""]","[""evc_1"", ""evc_1""]","[""水災"", ""水災""]","[""Disasters"", ""Disasters""]","""srev_3252""",10409382,"""HHNB""",,"""明神宗万历二十七年""","""1599""",,,"""决""",,"""河决坚城集，故道涸绝。（徐州志）""","""坚城集在砀山县西。（淮系年表）""",,
"""ev_2136""",1622.0,"""明熹宗天启二年""",,,"[""溢""]","[""yi""]","[""Flood""]","[""evc_1""]","[""水災""]","[""Disasters""]","""srev_3355""",10409441,"""HHNB""",,"""明熹宗天启二年""","""1622""",,,"""大水""",,"""秋八月大水。（安东县志）""",,,
"""ev_1083""",1115.0,"""宋徽宗政和四年""",,,"[""修"", ""治""]","[""xiu"", ""zhi""]","[""Repair of Structures"", ""Management""]","[""evc_2"", ""evc_2""]","[""水利"", ""水利""]","[""Management"", ""Management""]","""srev_1653""",10406253,"""HHNB""",,"""宋徽宗政和四年""","""1115""",,,"""修""",,"""置提举修系永桥所。（宋史河渠志）""",,,
"""ev_2354""",1672.0,"""清圣祖康熙十一年""",,,"[""決"", ""溢""]","[""jue"", ""yi""]","[""Breach"", ""Flood""]","[""evc_1"", ""evc_1""]","[""水災"", ""水災""]","[""Disasters"", ""Disasters""]","""srev_3634""",10410112,"""HHNB""",,"""清圣祖康熙十一年""","""1672""",,,"""决""",,"""七八月黄河四溢，两河口堤决。山西坡大水，各处村屋倒塌。（萧县…","""塘池在羊山西（淮系年表）""",,
"""ev_2860""",1762.0,"""清高宗乾隆二十七年""",,,"[""治""]","[""zhi""]","[""Management""]","[""evc_2""]","[""水利""]","[""Management""]","""srev_4203""",10411183,"""HHNB""",,"""清高宗乾隆二十七年""","""1762""",,,"""治""",,"""驾幸徐州阅河。定唐家湾倒钩引河水志涨至一丈一尺五寸乃开，水落…",,,


In [162]:
ev_df = ev_df.select(
    [
        "event_id",
        "event_date_western",
        "event_date_ch",
        "event_description",
        "event_notes",
        "event_type_ch",
        "event_type_py",
        "event_type_en",
        "evc_id",
        "source",
        "source_page",
        "source_ch_date",
        "source_western_date",
        "source_event_type_chinese",
        "source_event_name",
        "source_event_description",
        "type_category_ch",
        "type_category_en",
    ]
)
print(f"ev_df length after join2: {ev_df.shape}")
ev_df.head()

ev_df length after join2: (5235, 18)


event_id,event_date_western,event_date_ch,event_description,event_notes,event_type_ch,event_type_py,event_type_en,evc_id,source,source_page,source_ch_date,source_western_date,source_event_type_chinese,source_event_name,source_event_description,type_category_ch,type_category_en
str,f64,str,str,str,list[str],list[str],list[str],list[str],str,str,str,str,str,str,str,list[str],list[str]
"""ev_2064""",1599.0,"""明神宗万历二十七年""",,,"[""決"", ""絕""]","[""jue"", ""jue""]","[""Breach"", ""Extinction (river dries up)""]","[""evc_1"", ""evc_1""]","""HHNB""",,"""明神宗万历二十七年""","""1599""","""决""",,"""河决坚城集，故道涸绝。（徐州志）""","[""水災"", ""水災""]","[""Disasters"", ""Disasters""]"
"""ev_2136""",1622.0,"""明熹宗天启二年""",,,"[""溢""]","[""yi""]","[""Flood""]","[""evc_1""]","""HHNB""",,"""明熹宗天启二年""","""1622""","""大水""",,"""秋八月大水。（安东县志）""","[""水災""]","[""Disasters""]"
"""ev_1083""",1115.0,"""宋徽宗政和四年""",,,"[""修"", ""治""]","[""xiu"", ""zhi""]","[""Repair of Structures"", ""Management""]","[""evc_2"", ""evc_2""]","""HHNB""",,"""宋徽宗政和四年""","""1115""","""修""",,"""置提举修系永桥所。（宋史河渠志）""","[""水利"", ""水利""]","[""Management"", ""Management""]"
"""ev_2354""",1672.0,"""清圣祖康熙十一年""",,,"[""決"", ""溢""]","[""jue"", ""yi""]","[""Breach"", ""Flood""]","[""evc_1"", ""evc_1""]","""HHNB""",,"""清圣祖康熙十一年""","""1672""","""决""",,"""七八月黄河四溢，两河口堤决。山西坡大水，各处村屋倒塌。（萧县…","[""水災"", ""水災""]","[""Disasters"", ""Disasters""]"
"""ev_2860""",1762.0,"""清高宗乾隆二十七年""",,,"[""治""]","[""zhi""]","[""Management""]","[""evc_2""]","""HHNB""",,"""清高宗乾隆二十七年""","""1762""","""治""",,"""驾幸徐州阅河。定唐家湾倒钩引河水志涨至一丈一尺五寸乃开，水落…","[""水利""]","[""Management""]"


In [163]:
# Group and aggregate sources into lists
ev_df = ev_df.group_by(
    [
        "event_id",
        "event_date_western",
        "event_date_ch",
        "event_description",
        "event_notes",
        "evc_id",
        "event_type_ch",
        "event_type_py",
        "event_type_en",
        "type_category_ch",
        "type_category_en",
    ]
).agg(
    [
        pl.col("source").implode(),
        pl.col("source_page").implode(),
        pl.col("source_ch_date").implode(),
        pl.col("source_western_date").implode(),
        pl.col("source_event_type_chinese").implode(),
        pl.col("source_event_name").implode(),
        pl.col("source_event_description").implode(),
    ]
)

In [166]:
# Create formatted citation from paired lists
ev_df1 = ev_df.with_columns(
    [
        pl.struct(
            [
                "source",
                "source_western_date",
                "source_ch_date",
                "source_event_description",
                "source_page",
            ]
        )
        .map_elements(
            lambda x: "; ".join(
                [
                    f"""{name}{', in ' + west if west is not None else ''} {'('+ch+')' if ch is not None else ''} {'describes it as ' + description if description is not None else ''} {'(p. ' + page + ')' if page is not None else ''}""".strip()
                    for name, west, ch, description, page in zip(
                        x["source"],
                        x["source_western_date"],
                        x["source_ch_date"],
                        x["source_event_description"],
                        x["source_page"],
                    )
                    if name is not None
                ]
            ),
            return_dtype=pl.String,
        )
        .alias("citation")
    ]
)
print(f"ev_df length after citations: {ev_df1.shape}")
ev_df1.head()

ev_df length after citations: (3754, 19)


event_id,event_date_western,event_date_ch,event_description,event_notes,evc_id,event_type_ch,event_type_py,event_type_en,type_category_ch,type_category_en,source,source_page,source_ch_date,source_western_date,source_event_type_chinese,source_event_name,source_event_description,citation
str,f64,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str
"""ev_2873""",1766.0,"""清高宗乾隆三十一年""",,,"[""evc_1"", ""evc_2"", ""evc_2""]","[""決"", ""救"", ""修""]","[""jue"", ""jiu"", ""xiu""]","[""Breach"", ""Emergency Repair"", ""Repair of Structures""]","[""水災"", ""水利"", ""水利""]","[""Disasters"", ""Management"", ""Management""]","[""HHNB"", ""SLSY"", ""SLSY""]","[null, ""334"", ""334""]","[""清高宗乾隆三十一年"", ""乾隆三十一年"", ""乾隆三十一年""]","[""1766"", ""1766"", ""1766""]","[""决/修"", null, null]","[null, null, null]","[""八月铜沛临属南岸黄工以下无工之韩家堂堤溜势汕刷，陆续漫宽一百四十余丈。（南河成案）漫水由陵子孟山等湖注洪泽湖，正河仍有三四分水、江督高晋会同河督李宏督办施工，未五旬，堵筑竣事。（河渠纪闻）"", ""决铜山沛县厅之韩家堂，旋塞。 《清史稿·河渠志》"", ""决铜山沛县厅之韩家堂，旋塞。 《清史稿·河渠志》""]","""HHNB, in 1766 (清高宗乾隆三十一年) desc…"
"""ev_1065""",1107.0,"""徽宗大观元年夏""",,,"[""evc_1""]","[""溢""]","[""yi""]","[""Flood""]","[""水災""]","[""Disasters""]","[""ZHTS""]","[""60""]","[""徽宗大观元年夏""]","[""1107""]",[null],[null],"[""大水""]","""ZHTS, in 1107 (徽宗大观元年夏) descri…"
"""ev_2312""",1666.0,"""清圣祖康熙五年""",,,"[""evc_2""]","[""建""]","[""jian""]","[""New Construction""]","[""水利""]","[""Management""]","[""HHNB""]",[null],"[""清圣祖康熙五年""]","[""1666""]","[""修""]",[null],"[""筑祥符魁星楼堤黑冈堤，封邱于家店月堤，仪封石家楼月堤。（河南通志）""]","""HHNB, in 1666 (清圣祖康熙五年) descri…"
"""ev_2387""",1677.0,"""清圣祖康熙十六年""",,,"[""evc_1"", ""evc_2""]","[""決"", ""修""]","[""jue"", ""xiu""]","[""Breach"", ""Repair of Structures""]","[""水災"", ""水利""]","[""Disasters"", ""Management""]","[""HHNB""]",[null],"[""清圣祖康熙十六年""]","[""1677""]","[""决/修""]",[null],"[""河决毛城铺，萧砀及宿州大水。（淮系年表）""]","""HHNB, in 1677 (清圣祖康熙十六年) descr…"
"""ev_1811""",1517.0,"""明武宗正德十二年""",,,"[""evc_2""]","[""建""]","[""jian""]","[""New Construction""]","[""水利""]","[""Management""]","[""HHNB""]",[null],"[""明武宗正德十二年""]","[""1517""]","[""修""]",[null],"[""总河垄宏筑堤，起自长垣，由黄冈冈抵山东杨家口，延袤二百余里，广百尺，高十有五尺。（明逝宗实录）""]","""HHNB, in 1517 (明武宗正德十二年) descr…"


In [168]:
print(f"ev_df length: {ev_df.shape}")
ev_df2 = ev_df1.join(events_to_places, on="event_id", how="left", suffix="_etp")

print(f"ev_df2 len after join places: {ev_df2.shape}")

ev_df length: (10350, 21)
ev_df2 len after join places: (10350, 22)


In [171]:
ev_df2.filter(pl.col("place_id").is_not_null()).count()

event_id,event_date_western,event_date_ch,event_description,event_notes,evc_id,event_type_ch,event_type_py,event_type_en,type_category_ch,type_category_en,source,source_page,source_ch_date,source_western_date,source_event_type_chinese,source_event_name,source_event_description,citation,evtp_id,place_id,attestation
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
9955,9955,9955,995,379,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955,9955


In [173]:
ev_df2.filter(pl.col("attestation").str.starts_with("us")).count()

event_id,event_date_western,event_date_ch,event_description,event_notes,evc_id,event_type_ch,event_type_py,event_type_en,type_category_ch,type_category_en,source,source_page,source_ch_date,source_western_date,source_event_type_chinese,source_event_name,source_event_description,citation,evtp_id,place_id,attestation
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [186]:
print(downstream.shape)
downstream = downstream.filter(
    (pl.col("latitude").is_not_null()).and_(pl.col("longitude").is_not_null())
)

(1556, 10)


In [187]:
ev_df2.head(1)

event_id,event_date_western,event_date_ch,event_description,event_notes,evc_id,event_type_ch,event_type_py,event_type_en,type_category_ch,type_category_en,source,source_page,source_ch_date,source_western_date,source_event_type_chinese,source_event_name,source_event_description,citation,evtp_id,place_id,attestation
str,f64,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,str,str
"""ev_2873""",1766.0,"""清高宗乾隆三十一年""",,,"[""evc_1"", ""evc_2"", ""evc_2""]","[""決"", ""救"", ""修""]","[""jue"", ""jiu"", ""xiu""]","[""Breach"", ""Emergency Repair"", ""Repair of Structures""]","[""水災"", ""水利"", ""水利""]","[""Disasters"", ""Management"", ""Management""]","[""HHNB"", ""SLSY"", ""SLSY""]","[null, ""334"", ""334""]","[""清高宗乾隆三十一年"", ""乾隆三十一年"", ""乾隆三十一年""]","[""1766"", ""1766"", ""1766""]","[""决/修"", null, null]","[null, null, null]","[""八月铜沛临属南岸黄工以下无工之韩家堂堤溜势汕刷，陆续漫宽一百四十余丈。（南河成案）漫水由陵子孟山等湖注洪泽湖，正河仍有三四分水、江督高晋会同河督李宏督办施工，未五旬，堵筑竣事。（河渠纪闻）"", ""决铜山沛县厅之韩家堂，旋塞。 《清史稿·河渠志》"", ""决铜山沛县厅之韩家堂，旋塞。 《清史稿·河渠志》""]","""HHNB, in 1766 (清高宗乾隆三十一年) desc…","""evtp_9246""","""yrdb3031""","""ds_202"""


In [188]:
yrdb_events = []

for entry in downstream.iter_rows():
    yrdb_id = entry[0]
    tr_title = entry[1]
    ch_pinyin = entry[2]
    lat = entry[3]
    long = entry[4]
    class_en = entry[7]
    type_ch = entry[6]
    type_en = entry[8]
    events = []
    for event in ev_df2.filter(pl.col("place_id") == yrdb_id).iter_rows():
        event_id = event[0]
        en_date_start = event[1]
        ch_date = event[2]
        en_cat = event[10]
        en_type = event[8]
        en_title = event[7]
        ch_cat = event[9]
        ch_title = event[6]
        description = event[17]
        citation = event[18]
        source = event[11]
        src_page = event[12]
        src_ch_date = event[13]
        src_west_date = event[14]

        event_dict = {
            "event_id": event_id,
            "en_date_start": en_date_start,
            "ch_date": ch_date,
            "en_cat": en_cat,
            "en_type": en_type,
            "en_title": en_title,
            "ch_cat": ch_cat,
            "ch_title": ch_title,
            "description": description,
            "citation": citation,
            "source": source,
            "src_page": src_page,
            "src_ch_date": src_ch_date,
            "src_west_date": src_west_date,
        }
        events.append(event_dict)

    # Build place dictionary with nested events
    place_dict = {
        "yrdb_id": yrdb_id,
        "tr_title": tr_title,
        "ch_pinyin": ch_pinyin,
        "lat": lat,
        "long": long,
        "class_en": class_en,
        "type_ch": type_ch,
        "type_en": type_en,
        "events": events,
    }
    yrdb_events.append(place_dict)

# Write to JSON file
with open("yrdb_events.json", "w", encoding="utf-8") as f:
    json.dump(yrdb_events, f, ensure_ascii=False, indent=2)