## 1 Setting

In [1]:
import ast
import os
from collections import Counter, defaultdict
from datetime import date, datetime

import pandas as pd
from bson import DBRef
from tqdm import tqdm

In [2]:
import sys

# Location of the project's helper packages (acde / ausstage / general).
# Written as a raw string: "\P", "\A" and "\c" are not valid escape sequences,
# so the plain literal only produced the right path by accident and emits an
# invalid-escape warning on current Python versions.
codefolder = r"C:\ProjectCollections\Programs\Australia_Cultural_Data_Engine\codes"
# Guarded so that re-running this cell does not keep appending duplicates.
if codefolder not in sys.path:
    sys.path.append(codefolder)

from acde import MongoDBManipulation as acde_manip
from ausstage import DBExtraction as adb_manip
from general import GeneralFunctions as gen_gf
from general import JsonProcessing as jp
from general import MongoDBManipulation as gen_manip

## 2 Database Connection

In [3]:
# Source side: engine object handed to every pd.read_sql call below
# (built by the project's DBExtraction helper).
ausstage_engine = adb_manip.CreateMySQLEngine()
# Target side: project helper object; its `acde_db` attribute is the handle
# that is indexed by collection name (e.g. acde_db["place"]) when loading.
acde_opr = acde_manip.ACDE_Manipulation()
acde_db = acde_opr.acde_db

## 3 Data Extraction

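Brief introduction: each subsection below reads one AusStage entity (place, resource, work, event, person, organisation) from the `ausstage.acde_*` relations, reshapes every row into a nested record, and loads it into the ACDE collection of the same level after clearing the existing AusStage records there.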

### 3.1 Place (venue)

In [4]:
# Read all venue rows from ausstage.acde_venue into a DataFrame.
place_df = pd.read_sql("select * from ausstage.acde_venue", ausstage_engine)

In [5]:
# Schema check: column names, non-null counts and dtypes of the venue frame.
place_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11864 entries, 0 to 11863
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ori_id                 11864 non-null  int64  
 1   ori_dbid               11864 non-null  int64  
 2   ori_dbid_unf           11864 non-null  int64  
 3   display_name           11864 non-null  object 
 4   alternative_names      11864 non-null  object 
 5   postcode               11463 non-null  object 
 6   street                 11034 non-null  object 
 7   suburb                 11837 non-null  object 
 8   state                  11864 non-null  object 
 9   country                11864 non-null  object 
 10  longitude              10517 non-null  object 
 11  latitude               10520 non-null  object 
 12  capacity               2137 non-null   float64
 13  contact                7917 non-null   object 
 14  phone                  8224 non-null   object 
 15  fa

In [6]:
# Flat source columns that are folded into nested fields below and then
# dropped from the record before loading.
removeFields_place = [
    "country",
    "state",
    "suburb",
    "street",
    "postcode",
    "longitude",
    "latitude",
    "contributors_modified",
    "contributors_created",
    "yyyyfirst_date",
    "mmfirst_date",
    "ddfirst_date",
    "yyyylast_date",
    "mmlast_date",
    "ddlast_date",
]


def _to_float(value):
    """Return ``value`` as a float, or ``None`` if it is missing or not numeric.

    The longitude/latitude columns arrive with ``object`` dtype (see the
    ``place_df.info()`` output), so an ``isinstance(value, float)`` test never
    matches and every coordinate was previously stored as ``None``.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself; treat it as missing.
    return None if number != number else number


# ori_dbid -> loaded place record; later cells pass this to lookup_place.
collection_places = {}
curr_lvl = "place"
# Clear previously loaded AusStage places so the cell can be re-run.
acde_db[curr_lvl].delete_many({"data_source": "AusStage"})

#####
# Construct fields and load to acde.place
#####
with tqdm(total=place_df.shape[0], desc=f"Loading {curr_lvl} level") as pbar:
    for record in place_df.to_dict("records"):
        # "a; b; c" -> ["a", "b", "c"]
        if record.get("alternative_names"):
            record["alternative_names"] = record["alternative_names"].split("; ")
        record["address"] = {
            "country": record.get("country"),
            "state": record.get("state"),
            "suburb": record.get("suburb"),
            "street": record.get("street"),
            "postcode": record.get("postcode"),
        }
        # Coordinates are converted from whatever the driver returned
        # (string, Decimal, float); unparsable or missing values become None.
        record["geo_coord"] = {
            "longitude": _to_float(record.get("longitude")),
            "latitude": _to_float(record.get("latitude")),
        }
        # Month/day parts are stored without zero padding ("07" -> "7").
        record["date_range"] = {
            "date_start": {
                "year": record.get("yyyyfirst_date"),
                "month": record.get("mmfirst_date").lstrip("0")
                if record.get("mmfirst_date")
                else None,
                "day": record.get("ddfirst_date").lstrip("0")
                if record.get("ddfirst_date")
                else None,
            },
            "date_end": {
                "year": record.get("yyyylast_date"),
                "month": record.get("mmlast_date").lstrip("0")
                if record.get("mmlast_date")
                else None,
                "day": record.get("ddlast_date").lstrip("0")
                if record.get("ddlast_date")
                else None,
            },
        }
        # Replace each date-like value with a {"year", "month", "day"} dict of strings.
        for date_field in ["date_modified", "date_created"]:
            stamp = record.get(date_field)
            record[date_field] = {
                "year": str(stamp.year) if stamp else None,
                "month": str(stamp.month) if stamp else None,
                "day": str(stamp.day) if stamp else None,
            }
        # Distinct, non-empty editors of the record.
        record["contributors"] = list(
            set([record["contributors_modified"], record["contributors_created"]])
            - set([None, ""])
        )
        for f in removeFields_place:
            record.pop(f)
        # Project helper; presumably strips empty/None values from the nested record.
        record = jp.clean_empty_values(record)
        collection_places[record.get("ori_dbid")] = record
        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            # Best effort: report the failing record and carry on with the rest.
            print(e)
            print(record.get("ori_dbid"))
        pbar.update(1)

del place_df

Loading place level: 100%|█████████████████████████████████████████████████████| 11864/11864 [00:09<00:00, 1296.29it/s]


### 3.2 Resource (Item & datasource)

#### item

In [7]:
def _parse_list_literal(raw):
    """Evaluate a stringified Python literal; a ``None`` cell becomes ``[]``."""
    if raw is None:
        return []
    return ast.literal_eval(raw)


def _int_unless_blank(value):
    """Cast truthy values to ``int``; falsy ones (e.g. ``""``) pass through as-is."""
    return int(value) if value else value


# Read all item rows, then normalise the three columns that need parsing.
item_df = pd.read_sql("select * from ausstage.acde_item;", ausstage_engine)

# Missing source ids become "" so the remaining ones can be cast to int.
item_df["sourceid"] = item_df["sourceid"].fillna("").apply(_int_unless_blank)
for list_column in ("secondary_genres", "content_indicators"):
    item_df[list_column] = item_df[list_column].apply(_parse_list_literal)

In [8]:
# Schema check: column names, non-null counts and dtypes of the item frame.
item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73344 entries, 0 to 73343
Data columns (total 54 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ori_id                 73344 non-null  int64 
 1   ori_dbid               73344 non-null  int64 
 2   catalogue              32458 non-null  object
 3   institution            19243 non-null  object
 4   description            27829 non-null  object
 5   description_abstract   30035 non-null  object
 6   item_condition         8147 non-null   object
 7   detail_comments        27520 non-null  object
 8   donated_purchased      26195 non-null  object
 9   aquired_from           27729 non-null  object
 10  storage                27536 non-null  object
 11  provenance             27191 non-null  object
 12  significance           26729 non-null  object
 13  comments               27439 non-null  object
 14  item_url               29605 non-null  object
 15  format_extent      

In [9]:
#####
# Construct fields and load to acde.resource
#####

# Flat source columns that are folded into nested fields below and then
# dropped from the record before loading.
removeFields_item = [
    "catalogue",
    "institution",
    "donated_purchased",
    "storage",
    "aquired_from",
    "provenance",
    "significance",
    "item_condition",
    "comments",
    "detail_comments",
    "language",
    "format_extent",
    "format_medium",
    "format_mimetype",
    "format",
    "ident_isbn",
    "ident_ismn",
    "ident_issn",
    "ident_sici",
    "volume",
    "issue",
    "page",
    "citation",
    "publisher",
    "publisher_location",
    "item_url",
    "created_date",
    "copyright_date",
    "issued_date",
    "accessioned_date",
    "terminated_date",
    "date_notes",
    "rights_access_rights",
    "rights",
    "rights_holder",
    "primary_type",
    "secondary_type",
    "contributors_created",
    "contributors_modified",
]


def _date_parts(value):
    """Split a date-like value into string year/month/day parts.

    Falsy input (``None`` or the ``""`` left by ``fillna("")``) yields a dict
    whose three parts are all ``None``.
    """
    if not value:
        return {"year": None, "month": None, "day": None}
    return {
        "year": str(value.year),
        "month": str(value.month),
        "day": str(value.day),
    }


# ori_dbid -> item record; turned into a plain list once sources are attached.
collection_items = {}

for record in item_df.fillna("").to_dict("records"):
    record["types"] = [
        {
            "primary_type": record.get("primary_type"),
            "secondary_type": record.get("secondary_type"),
        }
    ]
    # Three lookups/keys here were misspelt ("storgae", "comments " and the
    # output key "significance "), which dropped storage and comments and
    # wrote significance under a key with a trailing space.
    record["acquisition_info"] = {
        "catalogue": record.get("catalogue"),
        "institution": record.get("institution"),
        "donated_purchased": record.get("donated_purchased"),
        "storage": record.get("storage"),
        # "aquired_from" is the column's actual spelling in the source data.
        "from": record.get("aquired_from"),
        "provenance": record.get("provenance"),
        "significance": record.get("significance"),
        "condition": record.get("item_condition"),
        "comments": record.get("comments"),
        "detail_comments": record.get("detail_comments"),
    }
    record["format_info"] = {
        "language": record.get("language"),
        "medium": record.get("format_medium"),
        "extent": record.get("format_extent"),
        "mimetype": record.get("format_mimetype"),
        "format": record.get("format"),
    }
    record["identifier_info"] = {
        "isbn": record.get("ident_isbn"),
        "ismn": record.get("ident_ismn"),
        "issn": record.get("ident_issn"),
        "sici": record.get("ident_sici"),
    }
    record["date_info"] = {
        "note": record.get("date_notes"),
    }
    for date_field in [
        "created_date",
        "copyright_date",
        "issued_date",
        "accessioned_date",
        "terminated_date",
    ]:
        record["date_info"][date_field] = _date_parts(record.get(date_field))
    record["source_info"] = {
        "publisher": {
            "name": record.get("publisher"),
            "place": {"display_name": record.get("publisher_location")},
        },
        "volume": record.get("volume"),
        "issue": record.get("issue"),
        "page": record.get("page"),
        "citation": record.get("citation"),
        "url": record.get("item_url"),
    }
    record["right_info"] = {
        "description": record.get("rights"),
        "owner": record.get("rights_holder"),
        "access_rights": record.get("rights_access_rights"),
    }
    for date_field in ["date_modified", "date_created"]:
        record[date_field] = _date_parts(record.get(date_field))
    # "a; b; c" -> ["a", "b", "c"]
    if record.get("alternative_titles"):
        record["alternative_titles"] = record["alternative_titles"].split("; ")
    # Distinct, non-empty editors of the record.
    record["contributors"] = list(
        set([record["contributors_modified"], record["contributors_created"]])
        - set([None, ""])
    )
    for f in removeFields_item:
        record.pop(f)
    # Project helper; presumably strips empty/None values from the nested record.
    record = jp.clean_empty_values(record)
    collection_items[record.get("ori_dbid")] = record


# Add Source, only take title, alternative_titles, types, description
for ori_dbid, record in collection_items.items():
    source_id = record.get("sourceid")
    if not source_id:
        continue
    # .get() so that a dangling sourceid is reported instead of raising KeyError.
    source_record = collection_items.get(source_id)
    if source_record is None:
        print(f"item {ori_dbid}: source item {source_id} not found; keeping sourceid")
        continue
    # The brief must describe the *source* item. It was previously built from
    # `record`, so every item listed itself as its own source.
    record["source"] = jp.clean_empty_values(
        {
            "ori_dbid": source_record.get("ori_dbid"),
            "title": source_record.get("title"),
            "alternative_titles": source_record.get("alternative_titles"),
            "description": source_record.get("description"),
            "types": source_record.get("types"),
            "content_indicators": source_record.get("content_indicators"),
            "secondary_genres": source_record.get("secondary_genres"),
        }
    )
    record.pop("sourceid")

collection_items = list(collection_items.values())

curr_lvl = "resource"
# Clear previously loaded AusStage items so the cell can be re-run.
acde_db[curr_lvl].delete_many({"data_source": "AusStage", "_class_ori": "item"})

#####
# Load item (resource)
#####
with tqdm(total=len(collection_items), desc=f"Loading {curr_lvl} level") as pbar:
    for record in collection_items:
        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            # Best effort: report the failing record and carry on with the rest.
            print(e)
            print(record.get("ori_dbid"))
        pbar.update(1)

del item_df

Loading resource level: 100%|██████████████████████████████████████████████████| 73344/73344 [00:50<00:00, 1458.45it/s]


#### datasource

In [10]:
#####
# For Data Source
#####

# Original author's note: DATASOURCE is really a kind of type.
#   DATASOURCE            >> type/title/categorization
#   DATASOURCEDESCRIPTION >> title/url/date
event_link_columns = ["EVENTID", "DATASOURCEEVLINKID"]

datasrc_df = pd.read_sql("select * from ausstage.acde_datasrc;", ausstage_engine)
# Without the two link columns, rows that differed only by those columns
# collapse to a single row.
datasrc_df = datasrc_df.drop(columns=event_link_columns).drop_duplicates()

In [11]:
# Schema check: column names, non-null counts and dtypes of the datasource frame.
datasrc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21890 entries, 0 to 149820
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ori_id         21890 non-null  int64 
 1   ori_dbid       21890 non-null  int64 
 2   title          21841 non-null  object
 3   is_collection  21890 non-null  object
 4   description    612 non-null    object
 5   _class         21890 non-null  object
 6   _class_ori     21890 non-null  object
 7   data_source    21890 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.5+ MB


In [12]:
curr_lvl = "resource"
# Remove the datasource records of a previous run before reloading them.
acde_db[curr_lvl].delete_many({"data_source": "AusStage", "_class_ori": "datasource"})

#####
# Load to acde.resource
#####
for record in tqdm(datasrc_df.to_dict("records"), desc=f"Loading {curr_lvl} level"):
    try:
        acde_db[curr_lvl].insert_one(record)
    except Exception as e:
        # Report the failing record and move on to the next one.
        print(e)
        print(record.get("ori_dbid"))

del datasrc_df

Loading resource level: 100%|██████████████████████████████████████████████████| 21890/21890 [00:13<00:00, 1629.96it/s]


### 3.3 Work

In [13]:
# Read all work rows from ausstage.acde_work into a DataFrame.
work_df = pd.read_sql("select * from ausstage.acde_work;", ausstage_engine)

In [14]:
# Schema check: column names, non-null counts and dtypes of the work frame.
work_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19972 entries, 0 to 19971
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ori_id                 19972 non-null  int64 
 1   ori_dbid               19972 non-null  int64 
 2   title                  19972 non-null  object
 3   alternative_titles     13390 non-null  object
 4   country                13882 non-null  object
 5   yyyydate_first_known   6884 non-null   object
 6   mmdate_first_known     6752 non-null   object
 7   dddate_first_known     6740 non-null   object
 8   date_created           19972 non-null  object
 9   contributors_created   19972 non-null  object
 10  date_modified          4448 non-null   object
 11  contributors_modified  4448 non-null   object
 12  ori_url                19972 non-null  object
 13  data_source            19972 non-null  object
 14  _class                 19972 non-null  object
 15  _class_ori         

In [15]:
# Flat source columns that are folded into nested fields and then dropped.
removeFields_work = [
    "country",
    "contributors_modified",
    "contributors_created",
    "yyyydate_first_known",
    "mmdate_first_known",
    "dddate_first_known",
]

# Not filled in this cell; records are inserted one by one instead.
collection_works = []

curr_lvl = "work"
acde_db[curr_lvl].delete_many({"data_source": "AusStage"})


def _unpad(text):
    """Strip leading zeros from a month/day string; falsy input gives ``None``."""
    return text.lstrip("0") if text else None


#####
# Construct fields and load to acde.work level
#####
for record in tqdm(work_df.to_dict("records"), desc=f"Loading {curr_lvl} level"):
    alt_titles = record.get("alternative_titles")
    if alt_titles:
        record["alternative_titles"] = alt_titles.split("; ")

    first_known = {
        "year": record.get("yyyydate_first_known"),
        "month": _unpad(record.get("mmdate_first_known")),
        "day": _unpad(record.get("dddate_first_known")),
    }
    record["coverage_range"] = {
        "place": {"address": {"country": record.get("country")}},
        "date_range": {"date_start": first_known},
    }

    # Swap each date-like value for a dict of string year/month/day parts.
    for date_field in ("date_modified", "date_created"):
        stamp = record.get(date_field)
        if stamp:
            record[date_field] = {
                "year": str(stamp.year),
                "month": str(stamp.month),
                "day": str(stamp.day),
            }
        else:
            record[date_field] = {"year": None, "month": None, "day": None}

    editors = {record["contributors_modified"], record["contributors_created"]}
    record["contributors"] = list(editors - {None, ""})

    for field_name in removeFields_work:
        del record[field_name]
    record = jp.clean_empty_values(record)

    try:
        acde_db[curr_lvl].insert_one(record)
    except Exception as e:
        # Report the failing record and move on to the next one.
        print(e)
        print(record.get("ori_dbid"))

del work_df

Loading work level: 100%|██████████████████████████████████████████████████████| 19972/19972 [00:15<00:00, 1317.22it/s]


### 3.4 Event

In [16]:
# Read all event rows; the two derived year columns are not needed downstream.
event_df = pd.read_sql("select * from ausstage.acde_event;", ausstage_engine)
event_df = event_df.drop(columns=["event_start_year", "event_end_year"])

# These columns hold stringified Python lists; a None cell becomes [].
for list_column in ("production_nationality", "text_nationality", "content_indicators"):
    event_df[list_column] = event_df[list_column].apply(
        lambda raw: ast.literal_eval(raw) if raw is not None else []
    )

In [17]:
#####
# For genres in event
#####


def _type_pair(row):
    """Return the (primary_type, secondary_type) tuple of one genre row."""
    return (row["primary_type"], row["secondary_type"])


def _unique_pairs(pairs):
    """De-duplicate the type pairs of one event."""
    return list(set(pairs))


def _pairs_to_dicts(pairs):
    """Turn (primary, secondary) tuples into the nested ``types`` dicts."""
    return [{"primary_type": primary, "secondary_type": secondary} for primary, secondary in pairs]


event_genres_df = pd.read_sql(
    "select * from ausstage.acde_event_genres;", ausstage_engine
)
event_genres_df["types"] = event_genres_df.apply(_type_pair, axis=1)

# One list of distinct type dicts per EVENTID.
pairs_by_event = event_genres_df.groupby("EVENTID")["types"]
event_genres = pairs_by_event.apply(_unique_pairs).apply(_pairs_to_dicts).fillna("")
del pairs_by_event

#####
# For contributor functions in event
#####


def _summarise_contributor_functions(contfunc_rows):
    """Aggregate contributor-function rows into one row per EVENTID.

    Returns a frame indexed by EVENTID with two columns:
    ``function_names`` (list of {"function_name", "function_num"} dicts) and
    ``primary_creators`` (sorted function names of rows flagged
    PRIMARY_CREATOR == 'yes'; "" where an event has none).
    """

    def _to_function_list(counts):
        return [
            {"function_name": name, "function_num": num}
            for name, num in counts.items()
            if name is not None
        ]

    def _sorted_non_null(names):
        return sorted(name for name in names if name is not None)

    function_names = (
        contfunc_rows.groupby("EVENTID")["contr_functions"]
        .agg(lambda names: dict(Counter(names)))
        .apply(_to_function_list)
        .to_frame("function_names")
    )
    primary_creators = (
        contfunc_rows.query("PRIMARY_CREATOR == 'yes'")
        .groupby("EVENTID")["contr_functions"]
        .agg(lambda names: list(names))
        .apply(_sorted_non_null)
        .to_frame("primary_creators")
    )
    combined = function_names.merge(
        primary_creators, how="left", left_index=True, right_index=True
    )
    return combined.fillna("")


event_contfunc_df = _summarise_contributor_functions(
    pd.read_sql("select * from ausstage.acde_event_contfunc;", ausstage_engine)
)

#####
# For organisation functions in event
#####
event_orgfunc_df = pd.read_sql(
    "select ORGANISATIONID, eventid, function_title, function_type from ausstage.acde_org_evfunc;",
    ausstage_engine,
).drop_duplicates()


def _org_function_counts(orgfunc_rows, function_type, column_name):
    """Count function titles per event for one ``function_type``.

    Returns a one-column frame indexed by ``eventid`` whose cells are lists of
    {"function_name", "function_num"} dicts (null titles are skipped).
    """
    rows_of_type = orgfunc_rows[orgfunc_rows["function_type"] == function_type]
    return (
        rows_of_type.groupby("eventid")["function_title"]
        .agg(lambda titles: dict(Counter(titles)))
        .apply(
            lambda counts: [
                {"function_name": name, "function_num": num}
                for name, num in counts.items()
                if name is not None
            ]
        )
        .to_frame(column_name)
    )


def _list_or_empty(value):
    """Return ``value`` if it is a list, else ``[]``.

    After the merge a missing side is NaN, and NaN is truthy, so a plain
    ``if value`` test would let NaN through as the stored value.
    """
    return value if isinstance(value, list) else []


eof_prod_df = _org_function_counts(event_orgfunc_df, "production", "production_functions")
eof_art_df = _org_function_counts(event_orgfunc_df, "artistic", "artistic_functions")

# Outer join: a left join on the production frame silently dropped every
# event that has artistic functions but no production functions.
event_orgfunc_df = (
    eof_prod_df.merge(eof_art_df, left_index=True, right_index=True, how="outer")
    .apply(
        lambda row: {
            "production_functions": _list_or_empty(row["production_functions"]),
            "artistic_functions": _list_or_empty(row["artistic_functions"]),
        },
        axis=1,
    )
    .to_frame("all_functions_org")
)

#####
# Merging functions and genres(types) to event
#####
# Three left joins on the event id, applied in the same order as before:
# genres, contributor functions, organisation functions.
event_df = (
    event_df.merge(event_genres, left_on="ori_dbid", right_index=True, how="left")
    .merge(event_contfunc_df, left_on="ori_dbid", right_index=True, how="left")
    .merge(event_orgfunc_df, left_on="ori_dbid", right_index=True, how="left")
)

In [18]:
# Schema check: column names, non-null counts and dtypes of the merged event frame.
event_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124916 entries, 0 to 124915
Data columns (total 43 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   ori_id                  124916 non-null  int64 
 1   ori_dbid                124916 non-null  int64 
 2   ori_dbid_unf            124916 non-null  int64 
 3   same_name_as_org        124916 non-null  object
 4   title                   124916 non-null  object
 5   umbrella                92731 non-null   object
 6   description             110986 non-null  object
 7   description_source      65243 non-null   object
 8   part_of_a_tour          124916 non-null  object
 9   world_premier           124916 non-null  object
 10  YYYYFIRST_DATE          124916 non-null  object
 11  MMFIRST_DATE            122783 non-null  object
 12  DDFIRST_DATE            121092 non-null  object
 13  YYYYLAST_DATE           115823 non-null  object
 14  MMLAST_DATE             114681 non-n

In [19]:
# Flat source columns that are folded into nested fields below and then
# dropped from the record before loading.
removeFields_event = [
    "YYYYFIRST_DATE",
    "MMFIRST_DATE",
    "DDFIRST_DATE",
    "YYYYLAST_DATE",
    "MMLAST_DATE",
    "DDLAST_DATE",
    "YYYYOPENING_NIGHT",
    "MMOPENING_NIGHT",
    "DDOPENING_NIGHT",
    "YYYYDATE_ENTERED",
    "MMDATE_ENTERED",
    "DDDATE_ENTERED",
    "YYYYDATE_UPDATED",
    "MMDATE_UPDATED",
    "DDDATE_UPDATED",
    "function_names",
    "primary_creators",
    "venueid",
    "contributors_created",
    "contributors_modified",
    "all_functions_org",
]

# Not filled in this cell; records are inserted one by one instead.
collection_events = []

curr_lvl = "event"
# Clear previously loaded AusStage events so the cell can be re-run.
acde_db[curr_lvl].delete_many({"data_source": "AusStage"})


def _event_ymd(record, suffix):
    """Build a year/month/day dict from the YYYY/MM/DD-prefixed ``suffix`` columns.

    Month and day lose their zero padding ("07" -> "7"); falsy values become None.
    """
    month = record.get(f"MM{suffix}")
    day = record.get(f"DD{suffix}")
    return {
        "year": record.get(f"YYYY{suffix}"),
        "month": month.lstrip("0") if month else None,
        "day": day.lstrip("0") if day else None,
    }


#####
# Construct fields and load to acde.event level
#####
with tqdm(total=event_df.shape[0], desc=f"Loading {curr_lvl} level") as pbar:

    for record in event_df.to_dict("records"):
        # Project helper; presumably resolves record["venueid"] against the
        # places loaded earlier - confirm against DBExtraction.lookup_place.
        event_venue = adb_manip.lookup_place("venueid", record, collection_places)
        record["coverage_ranges"] = [
            {
                "date_range": {
                    "date_start": _event_ymd(record, "FIRST_DATE"),
                    "date_end": _event_ymd(record, "LAST_DATE"),
                },
                "place": event_venue,
            }
        ]
        record["opening_night"] = _event_ymd(record, "OPENING_NIGHT")
        record["functions"] = {
            "staff_functions": {
                "functions": record.get("function_names"),
                "primary_creators": record.get("primary_creators"),
            },
            "organization_functions": {"functions": record.get("all_functions_org")},
        }
        # Only the misspelt key "altertative_titles" used to be checked. Both
        # spellings are handled so the titles are split whichever name the
        # source query exposes.
        for titles_key in ("alternative_titles", "altertative_titles"):
            if record.get(titles_key):
                record[titles_key] = record[titles_key].split("; ")
        record["date_created"] = _event_ymd(record, "DATE_ENTERED")
        record["date_modified"] = _event_ymd(record, "DATE_UPDATED")
        # Distinct, non-empty editors of the record.
        record["contributors"] = list(
            set([record["contributors_modified"], record["contributors_created"]])
            - set([None, ""])
        )
        for f in removeFields_event:
            record.pop(f)
        # Project helper; presumably strips empty/None values from the nested record.
        record = jp.clean_empty_values(record)
        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            # Report and continue, like every other loader in this notebook.
            # A `break` here used to abandon all remaining events after the
            # first failed insert.
            print(e)
            print(record.get("ori_dbid"))
        pbar.update(1)

del event_df, event_orgfunc_df, event_genres, event_contfunc_df

Loading event level: 100%|███████████████████████████████████████████████████| 124916/124916 [02:01<00:00, 1024.54it/s]


### 3.5 Person (contributor)

In [20]:
# Read all contributor (person) rows from ausstage.acde_cont.
person_df = pd.read_sql("select * from ausstage.acde_cont;", ausstage_engine)

# Read the contributor/event/function rows; the next cells group them by
# CONTRIBUTORID into per-person career periods.
career_df = pd.read_sql("select * from ausstage.acde_cont_evfunc;", ausstage_engine)

In [21]:
# Schema check: column names, non-null counts and dtypes of the person frame.
person_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184586 entries, 0 to 184585
Data columns (total 37 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ori_id                 184586 non-null  int64  
 1   ori_dbid               184586 non-null  int64  
 2   ori_dbid_unf           184586 non-null  int64  
 3   display_name           184586 non-null  object 
 4   family_name            184586 non-null  object 
 5   middle_name            150934 non-null  object 
 6   first_name             184549 non-null  object 
 7   given_names            184168 non-null  object 
 8   prefix                 150934 non-null  object 
 9   suffix                 150934 non-null  object 
 10  gender                 183595 non-null  object 
 11  nationality            160020 non-null  object 
 12  alternative_names      151276 non-null  object 
 13  ori_address            150975 non-null  object 
 14  country                172068 non-nu

In [22]:
# Schema check: column names, non-null counts and dtypes of the career frame.
career_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016553 entries, 0 to 1016552
Data columns (total 14 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   CONTRIBUTORID       1016553 non-null  int64 
 1   eventid             1016553 non-null  int64 
 2   event_title         1016553 non-null  object
 3   productionid        1016553 non-null  int64 
 4   is_primary_creator  40860 non-null    object
 5   note                750375 non-null   object
 6   title               1010749 non-null  object
 7   YYYYFIRST_DATE      1016553 non-null  object
 8   MMFIRST_DATE        1006463 non-null  object
 9   DDFIRST_DATE        997353 non-null   object
 10  YYYYLAST_DATE       971638 non-null   object
 11  MMLAST_DATE         963391 non-null   object
 12  DDLAST_DATE         958101 non-null   object
 13  venueid             1016553 non-null  int64 
dtypes: int64(4), object(10)
memory usage: 108.6+ MB


In [23]:
#####
# Construct career periods
#####


def _career_period(row):
    """Reshape one contributor/event row into a nested career-period dict."""
    occupation = {"title": row["title"]}
    venue = adb_manip.lookup_place("venueid", row, collection_places)

    first_month, first_day = row["MMFIRST_DATE"], row["DDFIRST_DATE"]
    last_month, last_day = row["MMLAST_DATE"], row["DDLAST_DATE"]
    # Month/day parts are stored without zero padding; falsy parts become None.
    date_range = {
        "date_start": {
            "year": row["YYYYFIRST_DATE"],
            "month": first_month.lstrip("0") if first_month else None,
            "day": first_day.lstrip("0") if first_day else None,
        },
        "date_end": {
            "year": row["YYYYLAST_DATE"],
            "month": last_month.lstrip("0") if last_month else None,
            "day": last_day.lstrip("0") if last_day else None,
        },
    }
    return {
        "occupation": occupation,
        "coverage_range": {"place": venue, "date_range": date_range},
        "event_contributed": {
            "ori_dbid": row["eventid"],
            "ori_dbid_unf": row["productionid"],
            "title": row["event_title"],
        },
        "is_primary_creator": row["is_primary_creator"],
        "note": row["note"],
    }


# One list of career-period dicts per CONTRIBUTORID.
career_df = (
    career_df.set_index(["CONTRIBUTORID"])
    .apply(_career_period, axis=1)
    .to_frame("career_periods")
    .groupby("CONTRIBUTORID")["career_periods"]
    .agg(list)
)

# Attach the periods to each person; people without any get "" after fillna.
person_df = person_df.merge(
    career_df, left_on="ori_dbid", right_index=True, how="left"
).fillna("")

In [24]:
#####
# Construct fields and load to acde.person
#####

# Flat source columns that are folded into nested fields and then dropped.
removeFields_person = [
    "first_name",
    "middle_name",
    "given_names",
    "family_name",
    "suffix",
    "prefix",
    "country",
    "state",
    "ori_address",
    "suburb",
    "postcode",
    "place_of_birth_id",
    "place_of_death_id",
    "YYYYDATE_OF_BIRTH",
    "MMDATE_OF_BIRTH",
    "DDDATE_OF_BIRTH",
    "YYYYDATE_OF_DEATH",
    "MMDATE_OF_DEATH",
    "DDDATE_OF_DEATH",
    "career_periods",
    "contributors_modified",
    "contributors_created",
]

# Not filled in this cell; records are inserted one by one instead.
collection_persons = {}
curr_lvl = "person"
acde_db[curr_lvl].delete_many({"data_source": "AusStage"})

# Source columns copied verbatim (same key, same order) into the nested dicts.
name_parts = ("first_name", "middle_name", "given_names", "family_name", "suffix", "prefix")
address_parts = ("country", "state", "suburb", "postcode", "ori_address")

#####
# Construct and load person level
#####
for record in tqdm(person_df.to_dict("records"), desc=f"Loading {curr_lvl} level"):
    alt_names = record.get("alternative_names")
    if alt_names:
        record["alternative_names"] = alt_names.split("; ")

    record["primary_name"] = {part: record.get(part) for part in name_parts}

    home_address = {part: record.get(part) for part in address_parts}
    record["residences"] = [{"coverage_range": {"place": {"address": home_address}}}]

    record["career"] = {"career_periods": record.get("career_periods")}

    # Birth and death share one shape: a looked-up place plus a date whose
    # month/day parts are stored without zero padding.
    for attr in ("death", "birth"):
        place_record = adb_manip.lookup_place(
            f"place_of_{attr}_id", record, collection_places
        )
        month = record.get(f"MMDATE_OF_{attr.upper()}")
        day = record.get(f"DDDATE_OF_{attr.upper()}")
        record[attr] = {
            "coverage": {
                "place": place_record,
                "date": {
                    "year": record.get(f"YYYYDATE_OF_{attr.upper()}"),
                    "month": month.lstrip("0") if month else None,
                    "day": day.lstrip("0") if day else None,
                },
            }
        }

    # Swap each date-like value for a dict of string year/month/day parts.
    for date_field in ("date_modified", "date_created"):
        stamp = record.get(date_field)
        if stamp:
            record[date_field] = {
                "year": str(stamp.year),
                "month": str(stamp.month),
                "day": str(stamp.day),
            }
        else:
            record[date_field] = {"year": None, "month": None, "day": None}

    editors = {record["contributors_modified"], record["contributors_created"]}
    record["contributors"] = list(editors - {None, ""})

    for field_name in removeFields_person:
        del record[field_name]
    record = jp.clean_empty_values(record)

    try:
        acde_db[curr_lvl].insert_one(record)
    except Exception as e:
        # Report the failing record and move on to the next one.
        print(e)
        print(record.get("ori_dbid"))

del person_df, career_df

Loading person level: 100%|███████████████████████████████████████████████████| 184586/184586 [03:54<00:00, 788.35it/s]


### 3.6 Organization (organisation)

In [25]:
#####
# Data Extraction
#####

# Organisation master records from the AusStage `acde_org` view.
org_df = pd.read_sql(sql="select * from ausstage.acde_org;", con=ausstage_engine)

# One row per organisation/event/function combination (`acde_org_evfunc` view).
org_func_df = pd.read_sql(
    sql="select * from ausstage.acde_org_evfunc;", con=ausstage_engine
)

In [26]:
# Show column names, non-null counts and dtypes of the organisation extract.
org_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19890 entries, 0 to 19889
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ori_id                 19890 non-null  int64  
 1   ori_dbid               19890 non-null  int64  
 2   ori_dbid_unf           19890 non-null  int64  
 3   primary_name           19890 non-null  object 
 4   alternative_names      19890 non-null  object 
 5   ori_address            15646 non-null  object 
 6   country                19324 non-null  object 
 7   state                  19890 non-null  object 
 8   suburb                 16894 non-null  object 
 9   postcode               14437 non-null  object 
 10  contact                15495 non-null  object 
 11  phones                 19890 non-null  object 
 12  fax                    15380 non-null  object 
 13  email                  15408 non-null  object 
 14  web_links              15569 non-null  object 
 15  no

In [27]:
# Show column names, non-null counts and dtypes of the organisation/event/function extract.
org_func_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226592 entries, 0 to 226591
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ORGANISATIONID  226592 non-null  int64 
 1   eventid         226592 non-null  int64 
 2   event_title     226592 non-null  object
 3   productionid    226592 non-null  int64 
 4   YYYYFIRST_DATE  226592 non-null  object
 5   MMFIRST_DATE    223721 non-null  object
 6   DDFIRST_DATE    221705 non-null  object
 7   YYYYLAST_DATE   215057 non-null  object
 8   MMLAST_DATE     213282 non-null  object
 9   DDLAST_DATE     212276 non-null  object
 10  function_title  226592 non-null  object
 11  function_type   226592 non-null  object
 12  venueid         226592 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 22.5+ MB


In [28]:
#####
# Construct operation_periods
#####


def _trim_zero_padding(text):
    """Strip leading zeros from a date component; a falsy value becomes None."""
    if text:
        return text.lstrip("0")
    return None


def _build_operation_period(row):
    """Turn one organisation/event/function row into an operation-period dict."""
    function_part = {"title": row["function_title"], "type": row["function_type"]}
    venue = adb_manip.lookup_place("venueid", row, collection_places)
    first_date = {
        "year": row["YYYYFIRST_DATE"],
        "month": _trim_zero_padding(row["MMFIRST_DATE"]),
        "day": _trim_zero_padding(row["DDFIRST_DATE"]),
    }
    last_date = {
        "year": row["YYYYLAST_DATE"],
        "month": _trim_zero_padding(row["MMLAST_DATE"]),
        "day": _trim_zero_padding(row["DDLAST_DATE"]),
    }
    event_part = {
        "ori_dbid": row["eventid"],
        "ori_dbid_unf": row["productionid"],
        "title": row["event_title"],
    }
    return {
        "function": function_part,
        "coverage_range": {
            "place": venue,
            "date_range": {"date_start": first_date, "date_end": last_date},
        },
        "event_contributed": event_part,
    }


# Build one dict per row, then collect the dicts of each organisation into a
# list, giving a Series indexed by ORGANISATIONID.
org_func_df = (
    org_func_df.set_index(["ORGANISATIONID"])
    .apply(_build_operation_period, axis=1)
    .to_frame("operation_periods")
    .groupby("ORGANISATIONID")["operation_periods"]
    .agg(list)
)

# Attach the period lists to the organisations (left join keeps organisations
# without any period) and blank out every missing value.
org_df = org_df.merge(
    org_func_df, left_on="ori_dbid", right_index=True, how="left"
).fillna("")

In [29]:
#####
# Construct fields and load to acde.organization
#####

# Flat source columns that are folded into nested sub-documents below and are
# therefore removed from the top level of each record before insertion.
# NOTE(review): "place_of_demise" / "place_of_origin" are not listed, so they
# also stay on the record as top-level fields - confirm this is intended.
removeFields_org = [
    "country",
    "state",
    "ori_address",
    "suburb",
    "postcode",
    "organisation_type",
    "yyyyfirst_date",
    "mmfirst_date",
    "ddfirst_date",
    "yyyylast_date",
    "mmlast_date",
    "ddlast_date",
    "operation_periods",
    "contributors_modified",
    "contributors_created",
]

# Not populated in this cell; the name is kept in case other cells refer to it.
collection_orgs = {}
curr_lvl = "organization"
# Remove the AusStage organisations from a previous run so the cell can be re-run.
acde_db[curr_lvl].delete_many({"data_source": "AusStage"})


def _org_partial_date(record, which):
    """Build a year/month/day dict from the `yyyy|mm|dd{which}_date` columns.

    `which` is "first" or "last". Month and day lose their zero padding
    ("03" -> "3"); empty or missing components become None. A new dict is
    returned on every call so the location entries never share an object.
    """
    month = record.get(f"mm{which}_date")
    day = record.get(f"dd{which}_date")
    return {
        "year": record.get(f"yyyy{which}_date"),
        "month": month.lstrip("0") if month else None,
        "day": day.lstrip("0") if day else None,
    }


#####
# Construct and load organization level
#####
with tqdm(total=org_df.shape[0], desc=f"Loading {curr_lvl} level") as pbar:
    for record in org_df.to_dict("records"):
        record["locations"] = [
            # Registered address, valid over the organisation's whole life span.
            {
                "coverage_range": {
                    "place": {
                        "address": {
                            address_part: record.get(address_part)
                            for address_part in (
                                "country",
                                "state",
                                "suburb",
                                "postcode",
                                "ori_address",
                            )
                        }
                    },
                    "date_range": {
                        "date_start": _org_partial_date(record, "first"),
                        "date_end": _org_partial_date(record, "last"),
                    },
                },
            },
            # Place of demise, paired with the last date.
            {
                "coverage_range": {
                    "place": {"display_name": record.get("place_of_demise")},
                    "date_range": {"date_end": _org_partial_date(record, "last")},
                },
            },
            # Place of origin, paired with the first date.
            {
                "coverage_range": {
                    "place": {"display_name": record.get("place_of_origin")},
                    "date_range": {"date_start": _org_partial_date(record, "first")},
                },
            },
        ]
        record["operation"] = {"operation_periods": record.get("operation_periods")}
        record["types"] = [record.get("organisation_type")]

        # "a; b; c" -> ["a", "b", "c"]; empty values are left untouched.
        if record.get("alternative_names"):
            record["alternative_names"] = record["alternative_names"].split("; ")

        # Audit timestamps are stored as string year/month/day parts.
        for date_field in ["date_modified", "date_created"]:
            stamp = record.get(date_field)
            record[date_field] = {
                "year": str(stamp.year) if stamp else None,
                "month": str(stamp.month) if stamp else None,
                "day": str(stamp.day) if stamp else None,
            }

        # De-duplicate while keeping a stable order (modified, then created);
        # a plain set() would make the stored order vary between runs.
        record["contributors"] = [
            contributor
            for contributor in dict.fromkeys(
                [
                    record.get("contributors_modified"),
                    record.get("contributors_created"),
                ]
            )
            if contributor not in (None, "")
        ]

        # pop() with a default: a column missing from the source view must not
        # abort the whole load with a KeyError.
        for f in removeFields_org:
            record.pop(f, None)
        record = jp.clean_empty_values(record)

        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            print(e)
            print(record.get("ori_dbid"))
            # NOTE(review): the first failed insert stops the load, leaving the
            # remaining organisations unloaded, whereas the person loader logs
            # and continues - confirm which behaviour is wanted.
            break
        pbar.update(1)

# Free the large frames; they are not needed after loading.
del org_df, org_func_df

Loading organization level: 100%|███████████████████████████████████████████████| 19890/19890 [00:31<00:00, 636.49it/s]


### 3.7 Relationship

* venue - venue venuevenuelink

* *resource - resource acde_item.sourceid*

* resource - work itemworklink

* resource - place itemvenuelink

* resource - event datasourceevlink

* resource - person itemconlink

* resource - organization item.institutionid

* work - contributor workconlink

* work - organization workorglink

* work - work workworklink

* *event - venue acde_event.venueid*

* *event - contributor - function acde_event_contfunc*

* event - work eventworklink

* event - event eventeventlink

* *person - event - occupation(function)*

* person - organization conorglink

* person - person contribcontriblink

* organisation - event orgevlink

* organisation - organisation orgorglink

##### 3.7.1 Load AusStage Relationships into ACDE Database

In [30]:
# Every key of the class -> related-field-name mapping, except "recognition".
loading_objs = list(
    filter(
        lambda obj: obj != "recognition", acde_opr.Class_RelatedFN_mapping.keys()
    )
)

# Fetch the AusStage records of those classes, restricted to the two fields
# that identify a record in the source database.
mapping_options = {
    "db": acde_db,
    "data_source": "AusStage",
    "loading_objs": loading_objs,
    "selected_fields": ["_class_ori", "ori_dbid"],
}
curr_dbid_mapping = acde_opr.acde_extRcd2dict(**mapping_options)

person_fetching: 100%|█████████████████████████████████████████████████████| 184586/184586 [00:01<00:00, 105648.93it/s]
organization_fetching: 100%|██████████████████████████████████████████████████| 19890/19890 [00:00<00:00, 99716.13it/s]
work_fetching: 100%|██████████████████████████████████████████████████████████| 19972/19972 [00:00<00:00, 30857.83it/s]
event_fetching: 100%|███████████████████████████████████████████████████████| 124916/124916 [00:01<00:00, 84000.15it/s]
resource_fetching: 100%|█████████████████████████████████████████████████████| 95234/95234 [00:00<00:00, 142526.96it/s]
place_fetching: 100%|█████████████████████████████████████████████████████████| 11864/11864 [00:00<00:00, 84969.25it/s]


In [31]:
#####
# Extract all relevant relationships
# reconstruct the field structures
# and load them into acde.relationship.
# `curr_dbid` will be added into `subject` and `object`
#####

# Load relationship extraction SQL script
with open(
    os.path.join(codefolder, "ausstage", "Extract_Relations.sql"), mode="r"
) as sql_script:
    extract_scripts = sql_script.read()


curr_lvl = "relationship"
# Remove the AusStage relationships from a previous run so the cell can be re-run.
acde_db[curr_lvl].delete_many({"data_source": "AusStage"})

# Column-name prefixes that become nested sub-documents of a relationship.
_RELATION_PREFIXES = ("subject", "object", "predicate", "time")


def _nest_relationship_record(record):
    """Rebuild a flat SQL row into the nested ACD-E relationship structure.

    Columns named `<prefix>_<rest>` with a prefix in `_RELATION_PREFIXES` go to
    `result[prefix][rest]`. When `<rest>` starts with "date" it is split once
    more on its last underscore, e.g. `time_date_start_year` ends up in
    `result["time"]["date_start"]["year"]`. All other columns are copied as-is.
    """
    nested = defaultdict(dict)
    for column, value in record.items():
        column_parts = column.split("_", 1)
        prefix, suffix = column_parts[0], column_parts[-1]
        if prefix not in _RELATION_PREFIXES:
            nested[column] = value
        elif suffix.startswith("date"):
            date_parts = suffix.rsplit("_", 1)
            date_key = date_parts[0]  # date_start / date_end
            date_part = date_parts[-1]  # year / month / day
            # Add to the existing sub-document. Re-assigning nested[prefix]
            # would discard sibling keys, e.g. `date_start` as soon as the
            # first `date_end_*` column is processed.
            nested[prefix].setdefault(date_key, {})[date_part] = value
        else:
            nested[prefix][suffix] = value
    return nested


# NOTE: statements are separated naively on ";", so the SQL file must not
# contain semicolons inside string literals or comments.
for script in extract_scripts.split(";"):
    script = script.strip()
    if not script:
        # Blank chunk (e.g. the newline after the final ";"): nothing to run.
        continue
    curr_relation = pd.read_sql(script, ausstage_engine)
    if curr_relation.empty:
        # No rows: there is no class label to read and nothing to load.
        continue
    # First value of each column, used only to label the progress bar.
    subj_oritype = curr_relation["subject__class_ori"].iloc[0]
    obj_oritype = curr_relation["object__class_ori"].iloc[0]
    with tqdm(
        total=curr_relation.shape[0], desc=f"{subj_oritype}_{obj_oritype}",
    ) as pbar:
        for record in curr_relation.to_dict("records"):
            new_record = _nest_relationship_record(record)

            # Both ends must already exist in the ACDE database; otherwise the
            # relationship is skipped.
            is_exist = True
            for curr_type in ["subject", "object"]:
                curr_dbid = curr_dbid_mapping[record[f"{curr_type}__class"]].get(
                    (
                        record[f"{curr_type}__class_ori"],
                        record[f"{curr_type}_ori_dbid"],
                    )
                )
                if not curr_dbid:
                    is_exist = False
                    break
                new_record[curr_type]["curr_dbid"] = curr_dbid

            if is_exist:
                # insert relation to database
                acde_db[curr_lvl].insert_one(new_record)
                # Only inserted relationships advance the bar, so a bar that
                # stops short of 100% indicates skipped rows.
                pbar.update(1)
    del curr_relation

venue_venue: 100%|███████████████████████████████████████████████████████████████| 1609/1609 [00:01<00:00, 1398.91it/s]
event_event: 100%|███████████████████████████████████████████████████████████████| 7406/7406 [00:05<00:00, 1428.71it/s]
contributor_event: 100%|███████████████████████████████████████████████████| 1010748/1010748 [12:13<00:00, 1378.34it/s]
contributor_organisation: 100%|██████████████████████████████████████████████████| 4617/4617 [00:02<00:00, 1578.70it/s]
contributor_contributor: 100%|███████████████████████████████████████████████████| 2514/2514 [00:01<00:00, 1576.45it/s]
contributor_venue: 100%|███████████████████████████████████████████████████████████| 694/694 [00:00<00:00, 1353.89it/s]
contributor_venue: 100%|███████████████████████████████████████████████████████████| 368/368 [00:00<00:00, 1294.68it/s]
organisation_organisation: 100%|███████████████████████████████████████████████████| 557/557 [00:00<00:00, 1473.71it/s]
organisation_event: 100%|███████████████

##### 3.7.2 Add All Related Objects to Root Objects

In [32]:
#####
# Clean the existing related_XXXX fields from 'AusStage'
#####

# NOTE(review): the field names removed are the related_XXX names taken from
# Class_RelatedFN_mapping, yet the target collection is "relationship" -
# confirm that this is the collection that should be cleaned.
ausstage_only = {"data_source": "AusStage"}
related_field_names = list(acde_opr.Class_RelatedFN_mapping.values())
gen_manip.mdb_remove_fields(
    acde_db, "relationship", ausstage_only, remove_fields=related_field_names
)

#####
# Update the original records with references
# to the relationships of their related records using DBRefs.
#####

acde_opr.acde_update_related_DBRef(db=acde_db, data_source="AusStage")

Extracting related objects from relationship collection: 100%|████████████| 1749357/1749357 [01:02<00:00, 28080.32it/s]
place_related_objects_update: 100%|████████████████████████████████████████████| 11492/11492 [00:10<00:00, 1076.99it/s]
event_related_objects_update: 100%|██████████████████████████████████████████| 124745/124745 [01:53<00:00, 1102.30it/s]
person_related_objects_update: 100%|█████████████████████████████████████████| 182121/182121 [02:32<00:00, 1197.68it/s]
organization_related_objects_update: 100%|██████████████████████████████████████| 15899/15899 [00:16<00:00, 975.83it/s]
work_related_objects_update: 100%|█████████████████████████████████████████████| 19832/19832 [00:14<00:00, 1370.09it/s]
resource_related_objects_update: 100%|█████████████████████████████████████████| 72740/72740 [00:52<00:00, 1394.72it/s]


The DBrefs of the relationships have been successfully updated to `related_XXX` fields!


In [33]:
# Every key of the class -> related-field-name mapping, except "recognition".
loading_colls = set(acde_opr.Class_RelatedFN_mapping.keys()) - {"recognition"}

# Projection passed to the update helper: keep the listed fields, drop `_id`
# and expose its value as `relationship_dbid` instead.
proj_cond = {
    "predicate": 1,
    "subject": 1,
    "object": 1,
    "_id": 0,
    "relationship_dbid": "$_id",
    "relation_class": 1,
    "data_source": 1,
}

data_source = "AusStage"

#####
# Update the original records having relationship DBRefs with lookup documents.
#####

# An earlier alternative ("Method 1") called `acde_add_related_objects` with
# `loading_objs`; the call below ("Method 2") is the one in use.
update_options = {
    "data_source": data_source,
    "proj_cond": proj_cond,
    "loading_colls": loading_colls,
    "db": acde_db,
}
acde_opr.acde_update_related_fields(**update_options)

person_related_people: 100%|█████████████████████████████████████████████████████| 1876/1876 [00:01<00:00, 1197.99it/s]
person_related_organizations: 100%|██████████████████████████████████████████████| 4345/4345 [00:03<00:00, 1411.78it/s]
person_related_places: 100%|███████████████████████████████████████████████████████| 818/818 [00:00<00:00, 1125.11it/s]
person_related_resources: 100%|████████████████████████████████████████████████| 10723/10723 [00:10<00:00, 1033.75it/s]
person_related_works: 100%|██████████████████████████████████████████████████████| 8475/8475 [00:08<00:00, 1004.55it/s]
person_related_events: 100%|██████████████████████████████████████████████████| 178206/178206 [03:04<00:00, 964.26it/s]
organization_related_people: 100%|████████████████████████████████████████████████████| 63/63 [00:00<00:00, 150.39it/s]
organization_related_organizations: 100%|███████████████████████████████████████████| 718/718 [00:00<00:00, 970.28it/s]


organization_related_places doesn't have any records.


organization_related_resources: 100%|███████████████████████████████████████████████| 141/141 [00:00<00:00, 190.07it/s]
organization_related_works: 100%|██████████████████████████████████████████████████| 602/602 [00:00<00:00, 1012.90it/s]
organization_related_events: 100%|██████████████████████████████████████████████| 15738/15738 [00:23<00:00, 661.89it/s]
place_related_people: 100%|████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 1114.33it/s]


place_related_organizations doesn't have any records.


place_related_places: 100%|██████████████████████████████████████████████████████| 1876/1876 [00:01<00:00, 1373.96it/s]
place_related_resources: 100%|███████████████████████████████████████████████████| 1124/1124 [00:00<00:00, 1221.03it/s]


place_related_works doesn't have any records.


place_related_events: 100%|█████████████████████████████████████████████████████| 11223/11223 [00:13<00:00, 803.73it/s]
resource_related_people: 100%|█████████████████████████████████████████████████| 47516/47516 [00:32<00:00, 1478.59it/s]
resource_related_organizations: 100%|██████████████████████████████████████████| 19243/19243 [00:12<00:00, 1529.25it/s]
resource_related_places: 100%|███████████████████████████████████████████████████| 3210/3210 [00:02<00:00, 1526.13it/s]
resource_related_resources: 100%|██████████████████████████████████████████████| 54292/54292 [00:39<00:00, 1373.90it/s]
resource_related_works: 100%|████████████████████████████████████████████████████| 1938/1938 [00:01<00:00, 1431.98it/s]
resource_related_events: 100%|█████████████████████████████████████████████████████████| 61/61 [00:07<00:00,  8.38it/s]
work_related_people: 100%|█████████████████████████████████████████████████████| 18441/18441 [00:12<00:00, 1453.99it/s]
work_related_organizations: 100%|███████

work_related_places doesn't have any records.


work_related_resources: 100%|████████████████████████████████████████████████████| 1667/1667 [00:01<00:00, 1390.57it/s]
work_related_works: 100%|██████████████████████████████████████████████████████████| 158/158 [00:00<00:00, 1287.95it/s]
work_related_events: 100%|█████████████████████████████████████████████████████| 19369/19369 [00:15<00:00, 1248.45it/s]
event_related_people: 100%|███████████████████████████████████████████████████| 105964/105964 [02:07<00:00, 828.75it/s]
event_related_organizations: 100%|███████████████████████████████████████████| 107002/107002 [01:23<00:00, 1280.06it/s]
event_related_places: 100%|██████████████████████████████████████████████████| 124745/124745 [01:28<00:00, 1413.19it/s]
event_related_resources: 100%|███████████████████████████████████████████████| 114146/114146 [01:24<00:00, 1349.42it/s]
event_related_works: 100%|█████████████████████████████████████████████████████| 57591/57591 [00:44<00:00, 1308.04it/s]
event_related_events: 100%|█████████████

The lookup documents of DBRefs have been successfully updated to `related_XXX` fields!





## Learning Notes

https://stackoverflow.com/questions/2952366/dump-csv-from-sqlalchemy

https://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator

https://stackoverflow.com/questions/53480458/sqlalchemy-view-handling