## 1 Setting

In [1]:
import itertools
import json
import os
import pprint
import re
import sqlite3
import sys
from collections import Counter, defaultdict

import pandas as pd
from tqdm import tqdm

# Location of the project helper code and of the CircusOZ source data.
# The defaults are the original machine-specific paths; set the environment
# variables below to run the notebook on another machine without editing it.
codefolder = os.environ.get(
    "ACDE_CODE_FOLDER",
    "C:/ProjectCollections/Programs/Australia_Cultural_Data_Engine/codes",
)

data_folder = os.environ.get(
    "ACDE_CIRCUSOZ_DATA_FOLDER",
    "D:/Program_Data/Australia_Cultural_Data_Engine_Data/circusoz",
)

# Make the project packages importable. Guarded so that re-running the cell
# does not keep appending the same entry to sys.path.
if codefolder not in sys.path:
    sys.path.append(codefolder)
from acde import MongoDBManipulation as acde_manip
from general import GeneralFunctions as gf
from general import JsonProcessing as jp
from general import MongoDBManipulation as gen_manip

# sqlite3.connect() creates a new, empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
# Fail early with the offending path instead.
circusoz_db_path = os.path.join(data_folder, "circuzOz_draft.db")
if not os.path.isfile(circusoz_db_path):
    raise FileNotFoundError(f"CircusOZ SQLite database not found: {circusoz_db_path}")
circuzoz_conn = sqlite3.connect(circusoz_db_path)

# Project wrapper that exposes the target database handle as `acde_db`.
acde_opr = acde_manip.ACDE_Manipulation()
acde_db = acde_opr.acde_db

## Place Loading

In [2]:
curr_lvl = "place"
# Remove CircusOZ places left over from a previous run before re-inserting.
acde_db[curr_lvl].delete_many({"data_source": "CircusOZ"})

venue_df = pd.read_sql("""SELECT * from acde_venue;""", con=circuzoz_conn,).fillna("")
# Venues named "UNKNOWN" are not loaded as places.
venue_loading_df = venue_df[venue_df["name"] != "UNKNOWN"][
    ["data_source", "_class", "_class_ori", "name", "address", "type"]
].copy()

# Address components in the order they are read from the END of the address
# string (presumably "street, suburb, state, postcode, country" - verify
# against the source data).
address_fields = ["country", "postcode", "state", "suburb", "street"]
venue_loading_df[address_fields] = (
    venue_loading_df["address"]
    # Split from the right so that any extra ", " separators stay inside the
    # leading (street) component instead of leaking into the country.
    .apply(lambda x: x.rsplit(", ", len(address_fields) - 1)[::-1] if x else [None])
    .apply(pd.Series)
    # Guarantee one column per address field even when no address in the data
    # has all components; missing components stay NaN.
    .reindex(columns=range(len(address_fields)))
)
# Carry the parsed components back onto the full venue table (index-aligned);
# rows that were filtered out above get NaN for these columns.
venue_df = venue_df.merge(
    venue_loading_df[address_fields],
    left_index=True,
    right_index=True,
    how="left",
)
venue_loading_df = venue_loading_df.drop_duplicates()

# Top-level fields that are folded into `display_name` / `address` below.
removeFields_place = [
    "name",
    "country",
    "state",
    "suburb",
    "street",
    "postcode",
]

with tqdm(total=venue_loading_df.shape[0], desc=f"Loading {curr_lvl} level") as pbar:
    for idx, record in enumerate(venue_loading_df.to_dict("records")):
        record["display_name"] = record.get("name")
        # The right-hand side is evaluated first, so "ori_address" still sees
        # the original address string before the key is overwritten.
        record["address"] = {
            "country": record.get("country"),
            "state": record.get("state"),
            "suburb": record.get("suburb"),
            "street": record.get("street"),
            "postcode": record.get("postcode"),
            "ori_address": record.get("address"),
        }
        for f in removeFields_place:
            record.pop(f)
        record = jp.clean_empty_values(record)
        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            # Best effort: report the failing record and keep loading the rest.
            print(e)
            print(idx, record)
        pbar.update(1)

Loading place level: 100%|█████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 1560.92it/s]


In [3]:
# Lookup of venue details keyed by event number, used when building events.
# When several venue rows share an event number, the last row wins.
collection_places = {}

venue_columns = [
    "event_number",
    "name",
    "address",
    "type",
    "country",
    "postcode",
    "state",
    "suburb",
    "street",
]
address_parts = ("country", "state", "suburb", "street", "postcode")
# Keys that only serve to build the nested address / the lookup key.
flattened_keys = ("event_number", "country", "postcode", "state", "suburb", "street")

for venue in venue_df[venue_columns].to_dict("records"):
    nested_address = {part: venue.get(part) for part in address_parts}
    nested_address["ori_address"] = venue.get("address")
    venue["address"] = nested_address

    venue = jp.clean_empty_values(venue)
    event_number = venue.get("event_number")
    collection_places[event_number] = {
        key: value for key, value in venue.items() if key not in flattened_keys
    }

## Event Loading

In [4]:
def _strip_leading_zeros(value):
    """Return `value` without leading zeros, or None when it is empty/missing."""
    return value.lstrip("0") if value else None


def _date_parts(record, prefix):
    """Build a {"year", "month", "day"} dict from the `<prefix>_*` fields of `record`."""
    return {
        "year": record.get(f"{prefix}_year"),
        "month": _strip_leading_zeros(record.get(f"{prefix}_month")),
        "day": _strip_leading_zeros(record.get(f"{prefix}_day")),
    }


event_df = pd.read_sql("""SELECT * from acde_event;""", con=circuzoz_conn,)
# Dropping the person column and de-duplicating leaves the event-level rows.
event_loading_df = event_df.drop("person_number", axis=1).drop_duplicates().copy()
# Combine both umbrella columns into one list without None and without
# duplicates. Label-based access avoids the deprecated positional fallback of
# Series.__getitem__, and dict.fromkeys keeps a stable (umbrella_1 first) order
# instead of the hash-dependent order of a set.
event_loading_df["umbrella"] = event_loading_df[["umbrella_1", "umbrella_2"]].apply(
    lambda row: list(
        dict.fromkeys(
            value
            for value in (row["umbrella_1"], row["umbrella_2"])
            if value is not None
        )
    ),
    axis=1,
)

# Flat fields that are folded into `coverage_ranges`, `umbrella` or `types`.
removeFields_event = [
    "date_end_year",
    "date_end_month",
    "date_end_day",
    "date_start_year",
    "date_start_month",
    "date_start_day",
    "umbrella_1",
    "umbrella_2",
    "type",
]

curr_lvl = "event"
# Remove CircusOZ events left over from a previous run before re-inserting.
acde_db[curr_lvl].delete_many({"data_source": "CircusOZ"})

with tqdm(total=event_loading_df.shape[0], desc=f"Loading {curr_lvl} level") as pbar:
    for idx, record in enumerate(event_loading_df.to_dict("records")):
        # Venue details prepared earlier, keyed by event number (None if absent).
        event_venue = collection_places.get(record.get("ori_id"))
        record["types"] = [record.get("type")]
        record["coverage_ranges"] = [
            {
                "date_range": {
                    "date_start": _date_parts(record, "date_start"),
                    "date_end": _date_parts(record, "date_end"),
                },
                "place": event_venue,
            }
        ]
        # NOTE(review): "altertative_titles" looks like a misspelling of
        # "alternative_titles"; kept as-is because it must match the column
        # name in the source table - confirm against the acde_event schema.
        if record.get("altertative_titles"):
            record["altertative_titles"] = record["altertative_titles"].split("; ")
        for f in removeFields_event:
            record.pop(f)
        record = jp.clean_empty_values(record)
        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            # Stop at the first failure; records after this one are not loaded.
            print(e)
            break
        pbar.update(1)

Loading event level: 100%|█████████████████████████████████████████████████████████| 480/480 [00:00<00:00, 1436.54it/s]


## Person Loading

In [5]:
person_df = pd.read_sql("""SELECT * from acde_person;""", con=circuzoz_conn,)
# str.capitalize upper-cases the first character and lower-cases the rest.
person_df["family_name"] = person_df.family_name.str.capitalize()
person_df["given_names"] = person_df.given_names.str.capitalize()
# One row per person: collect the distinct values of each name column, drop
# None, and keep a single value (an arbitrary one if several distinct exist).
person_loading_df = (
    person_df[
        ["ori_id", "given_names", "family_name", "display_name", "alternative_names",]
    ]
    .groupby("ori_id")
    .agg(set)
    .applymap(lambda x: list(x - set([None])))
    .applymap(lambda x: x[0] if x else None)
    .copy()
)
# Split the ";"-separated alternative names. Each name is stripped and empty
# pieces are dropped, so a "; " separator or a trailing ";" does not produce
# names with leading spaces or empty strings.
person_loading_df["alternative_names"] = person_loading_df["alternative_names"].apply(
    lambda x: [name.strip() for name in x.split(";") if name.strip()] if x else []
)
# Long format: one row per (person, attribute column, value), enriched with the
# role number from ROLE_AS and the broad role category from ROLE.
pa_df = pd.read_sql("""SELECT * from acde_pa;""", con=circuzoz_conn,)
pa_df = (
    pa_df.melt(
        id_vars=["ori_id", "id_(AS)", "gender",],
        var_name="ori_attr_name",
        value_name="detailed_role",
    )
    .merge(
        pd.read_sql("SELECT * from ROLE_AS;", con=circuzoz_conn,),
        left_on=["ori_attr_name", "detailed_role"],
        right_on=["ori_attr_name", "ori_attr_value"],
        how="left",
    )
    .merge(
        pd.read_sql(
            """SELECT
	"ROLE CATEGORIES" as broad_role,
	"ROLE.NUMBER"
from
	ROLE;""",
            con=circuzoz_conn,
        ),
        on="ROLE.NUMBER",
        how="left",
    )
    .drop(["ori_attr_name", "ROLE.NUMBER", "ori_attr_value"], axis=1)
)
pf_df = pd.read_sql("""SELECT * from acde_pf;""", con=circuzoz_conn,)
pc_df = pd.read_sql("""SELECT * from acde_pc;""", con=circuzoz_conn,)

# Stack the three person-detail tables; columns missing from a table are NaN.
person_loading_dep_df = pd.concat([pa_df, pc_df, pf_df], axis=0)
# Pair each row's roles as (broad_role, detailed_role), with "Unknown" standing
# in for missing values. Columns are accessed by label rather than by position
# to avoid the deprecated positional fallback of Series.__getitem__.
person_loading_dep_df["longterm_roles"] = (
    person_loading_dep_df[["detailed_role", "broad_role"]]
    .fillna("Unknown")
    .apply(lambda x: (x["broad_role"], x["detailed_role"]), axis=1)
)
# One row per person: every remaining column becomes the list of its distinct
# non-empty values.
person_loading_dep_df = (
    person_loading_dep_df.drop(["detailed_role", "broad_role"], axis=1)
    .fillna("")
    .groupby("ori_id")
    .agg(set)
    .applymap(lambda x: list(x - set([""])) if len(x - set([""])) != 0 else [])
)
# These columns are reduced to a single value per person (None when empty).
person_loading_dep_df[
    ["id_(AS)", "gender", "videoid_(CO-vids)", "id_(FMP)", "notes_(FMP)"]
] = person_loading_dep_df[
    ["id_(AS)", "gender", "videoid_(CO-vids)", "id_(FMP)", "notes_(FMP)"]
].applymap(
    lambda x: x[0] if x else None
)
# Turn the role tuples into documents, skipping the fully unknown pair.
person_loading_dep_df["longterm_roles"] = person_loading_dep_df["longterm_roles"].apply(
    lambda x: [
        {"broad_role": k, "detailed_role": v}
        for k, v in x
        if not (k == "Unknown" and v == "Unknown")
    ]
)
# Attach the details to the name table; persons without details keep NaN there.
person_loading_df = person_loading_df.merge(
    person_loading_dep_df, left_on="ori_id", right_index=True, how="left"
).reset_index()
person_loading_df[["data_source", "_class", "_class_ori"]] = [
    "CircusOZ",
    "person",
    "person",
]

In [6]:
# Inspect columns, non-null counts and dtypes of the person table before loading.
person_loading_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ori_id             749 non-null    object 
 1   given_names        747 non-null    object 
 2   family_name        746 non-null    object 
 3   display_name       749 non-null    object 
 4   alternative_names  749 non-null    object 
 5   id_(AS)            202 non-null    float64
 6   gender             202 non-null    object 
 7   videoid_(CO-vids)  45 non-null     object 
 8   id_(FMP)           621 non-null    object 
 9   notes_(FMP)        298 non-null    object 
 10  longterm_roles     744 non-null    object 
 11  data_source        749 non-null    object 
 12  _class             749 non-null    object 
 13  _class_ori         749 non-null    object 
dtypes: float64(1), object(13)
memory usage: 82.0+ KB


In [7]:
curr_lvl = "person"
# Start from a clean slate for this data source so the cell can be re-run.
acde_db[curr_lvl].delete_many({"data_source": "CircusOZ"})

# Flat name fields that move into the nested `primary_name` document.
removeFields_person = ["given_names", "family_name"]

person_records = person_loading_df.to_dict("records")
with tqdm(total=person_loading_df.shape[0], desc=f"Loading {curr_lvl} level") as pbar:
    for idx, record in enumerate(person_records):
        # Take the flat name fields out of the record and nest them instead.
        name_parts = {field: record.pop(field) for field in removeFields_person}
        record["primary_name"] = {
            "given_names": name_parts["given_names"],
            "family_name": name_parts["family_name"],
        }
        record = jp.clean_empty_values(record)
        try:
            # insert new record
            acde_db[curr_lvl].insert_one(record)
        except Exception as e:
            # Abort on the first failed insert; later records are not loaded.
            print(e)
            break
        pbar.update(1)

Loading person level: 100%|████████████████████████████████████████████████████████| 749/749 [00:00<00:00, 1411.64it/s]


## Relationship Loading

In [8]:
# event - venue
# Projections rename the stored fields so that, after the merges, every column
# carries a "venue_" or "event_" prefix.
place_projection = {
    "venue_curr_dbid": "$_id",
    "venue_label": "$display_name",
    "venue__class": "$_class",
    "venue__class_ori": "$_class_ori",
    "_id": 0,
}
event_projection = {
    "event_curr_dbid": "$_id",
    "event_ori_id": "$ori_id",
    "_id": 0,
    "event_label": "$title",
    "event__class": "$_class",
    "event__class_ori": "$_class_ori",
}
place_lookup_df = pd.DataFrame(
    acde_db["place"].find({"data_source": "CircusOZ"}, place_projection)
)
event_lookup_df = pd.DataFrame(
    acde_db["event"].find({"data_source": "CircusOZ"}, event_projection)
)

# Venue rows with a real name, reduced to the two join keys.
named_venue_df = venue_df.query("name != 'UNKNOWN'")[["event_number", "name"]]

# Places are matched by name, events by their original id. Pairs where either
# side was not found in the database are dropped.
ev_df = (
    named_venue_df.merge(
        place_lookup_df, left_on="name", right_on="venue_label", how="left"
    )
    .merge(
        event_lookup_df, left_on="event_number", right_on="event_ori_id", how="left"
    )
    .drop(["event_number", "name"], axis=1)
    .dropna(subset=["venue_curr_dbid", "event_curr_dbid"])
    .fillna("")
)
ev_df[["relation_class", "data_source", "_class"]] = [
    "Event_RelatedPlace",
    "CircusOZ",
    "relationship",
]

In [9]:
# event - person
# Build one row per (event, person) pair and attach the database ids, labels
# and classes of both sides, using prefixed column names.
ep_df = (
    event_df[["ori_id", "person_number"]]
    .dropna()
    .merge(
        pd.DataFrame(
            acde_db["event"].find(
                {"data_source": "CircusOZ"},
                {
                    "event_curr_dbid": "$_id",
                    "ori_id": 1,
                    "_id": 0,
                    "event_label": "$title",
                    "event__class": "$_class",
                    "event__class_ori": "$_class_ori",
                },
            )
        ),
        on="ori_id",
        how="left",
    )
    .merge(
        pd.DataFrame(
            acde_db["person"].find(
                {"data_source": "CircusOZ"},
                {
                    "person_curr_dbid": "$_id",
                    "person_number": "$ori_id",
                    "_id": 0,
                    "person_label": "$display_name",
                    "person__class": "$_class",
                    "person__class_ori": "$_class_ori",
                },
            )
        ),
        on="person_number",
        how="left",
    )
    .rename({"ori_id": "event_ori_id", "person_number": "person_ori_id"}, axis=1)
    # Both joins are left joins, so a pair whose event or person was not found
    # in the database has a missing dbid. Drop those pairs (as the event-venue
    # table does) instead of letting fillna("") turn them into relationships
    # that point at an empty id.
    .dropna(subset=["event_curr_dbid", "person_curr_dbid"])
    .fillna("")
)
ep_df[["relation_class", "data_source", "_class"]] = [
    "Person_RelatedEvent",
    "CircusOZ",
    "relationship",
]

In [10]:
def _relationship_endpoint(row, prefix):
    """Collect the `<prefix>_*` columns of `row` into one endpoint document."""
    return {
        "curr_dbid": row.get(f"{prefix}_curr_dbid"),
        "label": row.get(f"{prefix}_label"),
        "_class": row.get(f"{prefix}__class"),
        "_class_ori": row.get(f"{prefix}__class_ori"),
    }


def _insert_relationships(collection, rows, endpoints, predicate):
    """Insert one relationship document per row into `collection`.

    `endpoints` maps the role ("subject"/"object") to the column prefix that
    holds that side's data; `predicate` is stored on every document. All
    columns starting with one of the prefixes are dropped from the stored
    document. Insertion stops at the first failure, after printing the error.
    """
    drop_prefixes = tuple(endpoints.values())
    for row in rows:
        for role, prefix in endpoints.items():
            row[role] = _relationship_endpoint(row, prefix)
            # Assigned inside the loop so the stored field order stays
            # "<first role>, predicate, <second role>".
            row["predicate"] = dict(predicate)
        document = {k: v for k, v in row.items() if not k.startswith(drop_prefixes)}
        try:
            # insert new record
            collection.insert_one(document)
        except Exception as e:
            print(e)
            break


curr_lvl = "relationship"
# Remove CircusOZ relationships left over from a previous run.
acde_db[curr_lvl].delete_many({"data_source": "CircusOZ"})
# event - venue
_insert_relationships(
    acde_db[curr_lvl],
    ev_df.to_dict("records"),
    {"subject": "event", "object": "venue"},
    {"term": "was hosted in", "reverse_term": "is the host place of"},
)
# event - person
_insert_relationships(
    acde_db[curr_lvl],
    ep_df.to_dict("records"),
    {"object": "event", "subject": "person"},
    {"term": "contributed to", "reverse_term": "was contributed by"},
)

In [11]:
#####
# Remove the related_XXXX fields from the 'CircusOZ' documents of the
# relationship collection (presumably what mdb_remove_fields does - see the
# project helper for details).
#####

related_field_names = list(acde_opr.Class_RelatedFN_mapping.values())
gen_manip.mdb_remove_fields(
    acde_db,
    "relationship",
    {"data_source": "CircusOZ"},
    remove_fields=related_field_names,
)

#####
# Update the original records with DBRef references
# to the relationships of their related records.
#####

acde_opr.acde_update_related_DBRef(db=acde_db, data_source="CircusOZ")

Extracting related objects from relationship collection: 100%|██████████████████████| 733/733 [00:01<00:00, 716.12it/s]
event_related_objects_update: 100%|████████████████████████████████████████████████| 456/456 [00:00<00:00, 1197.65it/s]
place_related_objects_update: 100%|████████████████████████████████████████████████| 137/137 [00:00<00:00, 1194.50it/s]
person_related_objects_update: 100%|██████████████████████████████████████████████████| 72/72 [00:00<00:00, 937.55it/s]

The DBrefs of the relationships have been successfully updated to `related_XXX` fields!





In [12]:
data_source = "CircusOZ"

# Collections to update: the mapped classes restricted to the three entity
# levels loaded by this notebook.
loading_colls = {"event", "person", "place"} & set(
    acde_opr.Class_RelatedFN_mapping.keys()
)

# Fields of each relationship document to copy into the lookup documents;
# the relationship's own _id is exposed as `relationship_dbid`.
proj_cond = dict(
    predicate=1,
    subject=1,
    object=1,
    _id=0,
    relationship_dbid="$_id",
    relation_class=1,
    data_source=1,
)

#####
# Replace the relationship DBRefs on the original records with lookup documents.
#####
acde_opr.acde_update_related_fields(
    db=acde_db,
    data_source=data_source,
    loading_colls=loading_colls,
    proj_cond=proj_cond,
)

event_related_events doesn't have any records.


event_related_people: 100%|████████████████████████████████████████████████████████| 445/445 [00:00<00:00, 1362.21it/s]
event_related_places: 100%|████████████████████████████████████████████████████████| 246/246 [00:00<00:00, 1394.73it/s]
person_related_events: 100%|█████████████████████████████████████████████████████████| 72/72 [00:00<00:00, 1203.24it/s]


person_related_people doesn't have any records.
person_related_places doesn't have any records.


place_related_events: 100%|████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 1493.29it/s]

place_related_people doesn't have any records.
place_related_places doesn't have any records.
The lookup documents of DBRefs have been successfully updated to `related_XXX` fields!



