# [Digital Archive of Queensland Architecture](https://qldarch.net/) Data Loading

## 0. Setting

In [1]:
import os
import pprint
from collections import defaultdict

from tqdm import tqdm

pp = pprint.PrettyPrinter(indent=2)

In [2]:
import sys

codefolder = "C:/ProjectCollections/Programs/Australia_Cultural_Data_Engine/codes"

data_folder = "D:/Program_Data/Australia_Cultural_Data_Engine_Data/digital_archive_of_queensland_architecture"

sys.path.append(codefolder)
from acde import MongoDBManipulation as acde_manip
from daqa import DataManipulation as daqa_manip
from general import GeneralFunctions as gen_gf
from general import JsonProcessing as gen_jp
from general import MongoDBManipulation as gen_manip

## 1. DAQA MongoDB Connection

In [3]:
acde_opr = acde_manip.ACDE_Manipulation()
daqa_etl = daqa_manip.DAQA_DbManipulation()

## 1. Transfering & Loading Original Records

### 1.1 Person

In [4]:
loading_obj = "person"

person_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
person_firstL_attrs = list(set(person_firstL_attrs) - set(["inserted_ts_acde"]))
export_firstL_attrs = person_firstL_attrs

person_Group_interviews = {
    attr: {"$first": f"${attr}"}
    for attr in person_firstL_attrs
    if attr not in ["interviews"]
}
person_Group_interviews.update(
    {"_id": "$_id", "interviews": {"$addToSet": {"$first": "$interviews"}},}
)

In [5]:
loading_dbname = daqa_etl.class_type_mapping[loading_obj]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {"$unwind": {"path": "$interviews", "preserveNullAndEmptyArrays": True}},
            {
                "$lookup": {
                    "from": "interview",
                    "localField": "interviews",
                    "foreignField": "id",
                    "pipeline": [
                        {
                            "$lookup": {
                                "from": "interview",
                                "localField": "interview_dbref.$id",
                                "foreignField": "_id",
                                "pipeline": [{"$project": {"id": 1, "label": 1,},},],
                                "as": "interview",
                            },
                        },
                        {
                            "$project": {
                                "_id": 1,
                                "id": 1,
                                "type": 1,
                                "label": 1,
                                "ori_url": 1,
                            },
                        },
                    ],
                    "as": "interviews",
                }
            },
            {"$group": person_Group_interviews},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "alternative_names": {
                        "$cond": {
                            "if": {
                                "$and": [
                                    {"$ne": ["$label", "$preflabel"]},
                                    {"$ne": ["$preflabel", None]},
                                    {"$ne": ["$preflabel", ""]},
                                ]
                            },
                            "then": [{"display_name": "$preflabel"}],
                            "else": [],
                        }
                    },
                    "primary_name": {
                        "given_names": "$firstname",
                        "family_name": "$lastname",
                    },
                    "display_name": "$label",
                    "_class_ori": "$type",
                    "_class": "person",
                    "summary": 1,
                    "gender": 1,
                    "birth": 1,
                    "death": 1,
                    "is_architect": "$architect",
                    "is_practiceInQueensland": "$practicedinqueensland",
                    "longterm_roles": [
                        {
                            "broad_role": {
                                "$cond": {
                                    "if": {"$eq": ["$architect", True]},
                                    "then": "architect",
                                    "else": "non-architect",
                                }
                            }
                        }
                    ],
                    "education_trainings": 1,
                    "career": 1,
                    # "relationships": 1,
                    # "media": 1,
                    # "interviews": 1,
                    # "associatedMedia": 1,
                    "note": 1,
                    "ori_url_data": 1,
                    "influence_on_QLD_architecture": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_person_loading: 100%|████████████████████████████████████████████████████████| 1103/1103 [00:01<00:00, 921.24it/s]


### 1.2 Work (structure)

In [6]:
loading_obj = "structure"

export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

In [7]:
loading_dbname = daqa_etl.class_type_mapping[loading_obj]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "title": "$label",
                    "_class_ori": "$type",
                    "_class": "work",
                    "description": "$summary",
                    "coverage_range": {
                        "place": {
                            "geo_coord": {
                                "latitude": "$latitude",
                                "longitude": "$longitude",
                            },
                            "display_name": "$label",
                            "address": {
                                "country": "$coord_loc_details.address.country",
                                "state": "$coord_loc_details.address.state",
                                "city": "$coord_loc_details.address.city",
                                "suburb": "$coord_loc_details.address.suburb",
                                "postcode": "$coord_loc_details.address.postcode",
                                "ori_address": "$location",
                            },
                        },
                        "date_range": {"date_end": "$completion", "date_start": None,},
                    },
                    "is_australian": "$australian",
                    "is_demolished": "$demolished",
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "note": 1,
                    "ori_url_data": 1,
                    "coord_loc_details": 1,
                    "completionpd": 1,
                    "typology": 1,
                    "typologies": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_structure_loading: 100%|████████████████████████████████████████████████████| 2203/2203 [00:01<00:00, 1402.70it/s]


### 1.3 Event

In [8]:
loading_obj = "event"

export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

In [9]:
loading_dbname = daqa_etl.class_type_mapping[loading_obj]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})
test = []
with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "title": "$label",
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        # daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        test.append(obj_record)
        pbar.update(1)

DAQA_event_loading: 100%|███████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 10395.42it/s]


### 1.4 Recognition (award)

In [10]:
loading_obj = "award"

export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

In [11]:
loading_dbname = daqa_etl.class_type_mapping[loading_obj]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "title": "$label",
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_award_loading: 100%|████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 1473.64it/s]


### 1.5 Place

In [12]:
loading_obj = "place"

export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

In [13]:
loading_dbname = daqa_etl.class_type_mapping[loading_obj]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "display_name": "$label",
                    "address": 1,
                    "geo_coord": 1,
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_place_loading: 100%|████████████████████████████████████████████████████████| 1939/1939 [00:01<00:00, 1524.75it/s]


### 1.6 Organization (education/government/organisation/firm)

In [14]:
loading_dbname = daqa_etl.related_obj_mapping["related_organizations"]["acde_level"]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})

<pymongo.results.DeleteResult at 0x213fad6bf70>

In [15]:
for loading_obj in set(
    daqa_etl.related_obj_mapping["related_organizations"]["target"]
) - set(["firm"]):

    export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
    export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

    with tqdm(
        total=daqa_etl.daqa_db[loading_obj].count_documents({}),
        desc=f"DAQA_{loading_obj}_loading",
    ) as pbar:
        for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
            [
                {"$project": {attr: 1 for attr in export_firstL_attrs}},
                {
                    "$project": {
                        "data_source": "DAQA",
                        "ori_url": 1,
                        "ori_id": "$id",
                        "_id": 0,
                        "ori_dbid": "$_id",
                        "date_created": "$created",
                        "pubts": 1,
                        "contributors": ["$owner"],
                        "record_status": {
                            "$cond": {
                                "if": {"$eq": ["$is_published", True]},
                                "then": "published",
                                "else": "unpublished",
                            }
                        },
                        "version": 1,
                        "primary_name": "$label",
                        "_class_ori": "$type",
                        "_class": loading_dbname,
                        # "relationships": 1,
                        # "media": 1,
                        "note": 1,
                        # "associatedMedia": 1,
                        "ori_url_data": 1,
                    }
                },
            ]
        ):
            # remove all attributes having empty values
            obj_record = gen_jp.clean_empty_values(obj_record)
            daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
            pbar.update(1)

DAQA_organisation_loading: 100%|██████████████████████████████████████████████████████| 15/15 [00:00<00:00, 856.75it/s]
DAQA_education_loading: 100%|████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 1524.15it/s]
DAQA_government_loading: 100%|█████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1206.30it/s]


In [16]:
loading_obj = "firm"
export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "primary_name": "$label",
                    "operation": {"date_start": "$start", "date_end": "$end"},
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    "is_australian": "$australian",
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_firm_loading: 100%|███████████████████████████████████████████████████████████| 907/907 [00:00<00:00, 1633.01it/s]


### 1.7 Resource (publication/article/interview/media)

In [17]:
loading_dbname = daqa_etl.related_obj_mapping["related_resources"]["acde_level"]
daqa_etl.acde_db[loading_dbname].delete_many({"data_source": "DAQA"})

<pymongo.results.DeleteResult at 0x213f8145850>

In [18]:
loading_obj = "publication"
export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "title": "$label",
                    "description": "$summary",
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    # "relationships": 1,
                    # "media": 1,
                    "note": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_publication_loading: 100%|██████████████████████████████████████████████████████| 46/46 [00:00<00:00, 1132.36it/s]


In [19]:
loading_obj = "article"
export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$created",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "title": "$label",
                    "description": "$summary",
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    "authoring_info": {"authors": {"name": "$authors"}},
                    "annotations": 1,
                    "date_info": {"date_published": "$published"},
                    "source_info": {
                        "issue": "$issue",
                        "pages": "$pages",
                        "periodical": "$periodical",
                        "volume": "$volume",
                    },
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_article_loading: 100%|████████████████████████████████████████████████████████| 783/783 [00:00<00:00, 1446.80it/s]


In [20]:
loading_obj = "interview"
export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$published",
                    "pubts": 1,
                    "contributors": ["$owner"],
                    "record_status": {
                        "$cond": {
                            "if": {"$eq": ["$is_published", True]},
                            "then": "published",
                            "else": "unpublished",
                        }
                    },
                    "version": 1,
                    "title": "$label",
                    "description": "$summary",
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    "authoring_info": {
                        "coverage": {
                            "place": {
                                "display_name": "$location",
                                "address": {"ori_address": "$location"},
                            }
                        }
                    },
                    "transcript": 1,
                    "note": 1,
                    "interviewer": 1,
                    "interviewee": 1,
                    # "relationships": 1,
                    # "media": 1,
                    # "associatedMedia": 1,
                    "ori_url_data": 1,
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_interview_loading: 100%|█████████████████████████████████████████████████████████| 92/92 [00:00<00:00, 169.00it/s]


In [21]:
loading_obj = "media"
export_firstL_attrs = gen_manip.mdb_get_firstL_fields(daqa_etl.daqa_db[loading_obj])
export_firstL_attrs = list(set(export_firstL_attrs) - set(["inserted_ts_acde"]))

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for obj_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {"$project": {attr: 1 for attr in export_firstL_attrs}},
            {
                "$project": {
                    "data_source": "DAQA",
                    "ori_url": 1,
                    "ori_id": "$id",
                    "_id": 0,
                    "ori_dbid": "$_id",
                    "date_created": "$published",
                    "download_url": 1,
                    "category": 1,
                    "contributors": ["$creator"],
                    "title": "$label",
                    "description": 1,
                    "_class_ori": "$type",
                    "_class": loading_dbname,
                    "coverage": {
                        "place": {
                            "display_name": "$location",
                            "address": {"ori_address": "$location"},
                        }
                    },
                    "format_info": {"mimetype": "$mimetype"},
                    "identifier_info": {"note": "$identifier"},
                    "right_info": {"copyright": {"owner": "$rights"}},
                    "source_info": {
                        "digital_file_name": "$filename",
                        "size": "$filesize",
                    },
                    # "depicts": 1,
                    "project_number": "$projectnumber",
                }
            },
        ]
    ):
        # remove all attributes having empty values
        obj_record = gen_jp.clean_empty_values(obj_record)
        daqa_etl.acde_db[loading_dbname].insert_one(obj_record)
        pbar.update(1)

DAQA_media_loading: 100%|████████████████████████████████████████████████████████| 6775/6775 [00:04<00:00, 1640.06it/s]


### 1.8 Relationship

In [22]:
loading_objs = [obj for obj in acde_opr.Class_RelatedFN_mapping.keys()]

curr_dbid_mapping = acde_opr.acde_extRcd2dict(
    db=daqa_etl.acde_db,
    data_source="DAQA",
    loading_objs=loading_objs,
    selected_fields=["_class_ori", "ori_dbid"],
)

person_fetching: 100%|███████████████████████████████████████████████████████████| 1103/1103 [00:00<00:00, 6698.45it/s]
organization_fetching: 100%|██████████████████████████████████████████████████████| 967/967 [00:00<00:00, 37205.58it/s]
work_fetching: 100%|████████████████████████████████████████████████████████████| 2203/2203 [00:00<00:00, 46852.68it/s]
event_fetching: 0it [00:00, ?it/s]
recognition_fetching: 100%|██████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 4475.60it/s]
resource_fetching: 100%|████████████████████████████████████████████████████████| 7696/7696 [00:00<00:00, 55072.68it/s]
place_fetching: 100%|███████████████████████████████████████████████████████████| 1939/1939 [00:00<00:00, 40685.95it/s]


In [23]:
loading_obj = "relationship"
daqa_etl.acde_db[loading_obj].delete_many({"data_source": "DAQA"})
missing_objs = defaultdict(set)

with tqdm(
    total=daqa_etl.daqa_db[loading_obj].count_documents({}),
    desc=f"DAQA_{loading_obj}_loading",
) as pbar:
    for relation_record in daqa_etl.daqa_db[loading_obj].aggregate(
        [
            {
                "$addFields": {
                    "data_source": "DAQA",
                    "ori_id": "$relationshipid",
                    "ori_dbid": "$_id",
                    "subject": {
                        "ori_id": "$subject",
                        "ori_dbid": "$subject_dbid",
                        "label": "$subjectlabel",
                        "_class_ori": "$subjecttype",
                        "ori_url": "$subject_ori_url",
                    },
                    "object": {
                        "ori_id": "$object",
                        "ori_dbid": "$object_dbid",
                        "label": "$objectlabel",
                        "_class_ori": "$objecttype",
                        "ori_url": "$object_ori_url",
                        "category": "$object_category",
                    },
                    "predicate": {
                        "term": "$relationship",
                        "reverse_term": "$relationship_reverse_term",
                    },
                    "_class": loading_obj,
                }
            },
            {
                "$project": {
                    f: 0
                    for f in {
                        "_id",
                        "note",
                        "relationship",
                        "relationshipid",
                        "subjectlabel",
                        "subjecttype",
                        "subject_ori_url",
                        "subject_dbid",
                        "objectlabel",
                        "objecttype",
                        "object_ori_url",
                        "object_dbid",
                        "object_category",
                        "relationship_reverse_term",
                    }
                }
            },
        ]
    ):
        is_inserted = True
        for e_type in ["subject", "object"]:
            if relation_record.get(e_type):
                relation_record[e_type]["_class"] = daqa_etl.class_type_mapping[
                    relation_record[e_type]["_class_ori"]
                ]
                try:
                    et_cdbid = curr_dbid_mapping[relation_record[e_type]["_class"]][
                        (
                            relation_record[e_type]["_class_ori"],
                            relation_record[e_type]["ori_dbid"],
                        )
                    ]
                    relation_record[e_type]["curr_dbid"] = et_cdbid
                except:
                    missing_objs[relation_record[e_type]["_class"]].add(
                        relation_record[e_type]["ori_dbid"]
                    )
                    is_inserted = False
                # try:
                #     relation_record[e_type].update(
                #         daqa_etl.acde_db[relation_record[e_type]["_class"]].find_one(
                #             {"ori_id": relation_record[e_type]["ori_id"]},
                #             {"curr_dbid": "$_id", "_id": 0},
                #         )
                #     )
                # except:
                #     missing_objs[relation_record[e_type]["_class"]].append(
                #         relation_record[e_type]["ori_id"]
                #     )
                #     is_inserted = False
                #     break

        if relation_record.get("subject") and relation_record.get("object"):
            relation_record[
                "relation_class"
            ] = f"{relation_record['subject']['_class'].capitalize()}_Related{relation_record['object']['_class'].capitalize()}"
        else:
            is_inserted = False

        if is_inserted:
            # remove all attributes having empty values
            relation_record = gen_jp.clean_empty_values(relation_record)
            daqa_etl.acde_db[loading_obj].insert_one(relation_record)
        else:
            missing_objs["relationship"].add(relation_record["ori_dbid"])
        pbar.update(1)

DAQA_relationship_loading: 100%|███████████████████████████████████████████████| 17549/17549 [00:11<00:00, 1586.23it/s]


In [24]:
for obj_type, obj_ids in missing_objs.items():
    total_doc_num = daqa_etl.daqa_db[obj_type].count_documents({})
    missing_doc_num = len(obj_ids)
    print("#" * 20)
    print(
        f"""
{obj_type} was found {total_doc_num} records.
It has missing/invalid {missing_doc_num} records according to relationships.
        """
    )

####################

event was found 21 records.
It has missing/invalid 11 records according to relationships.
        
####################

relationship was found 17549 records.
It has missing/invalid 98 records according to relationships.
        
####################

topic was found 44 records.
It has missing/invalid 14 records according to relationships.
        


### 1.9 Add Related Objects to Root Objects

In [25]:
#####
# Clean the existing related_XXXX fields from 'AusStage'
#####

gen_manip.mdb_remove_fields(
    daqa_etl.acde_db,
    "relationship",
    {"data_source": "DAQA",},
    remove_fields=list(acde_opr.Class_RelatedFN_mapping.values()),
)

#####
# Update the original records with references
# to the relationships of their related records using DBRefs.
#####

acde_opr.acde_update_related_DBRef(data_source="DAQA", db=daqa_etl.acde_db)

Extracting related objects from relationship collection: 100%|████████████████| 17451/17451 [00:01<00:00, 11332.00it/s]
person_related_objects_update: 100%|███████████████████████████████████████████████| 944/944 [00:00<00:00, 1241.94it/s]
resource_related_objects_update: 100%|███████████████████████████████████████████| 6911/6911 [00:04<00:00, 1521.78it/s]
organization_related_objects_update: 100%|█████████████████████████████████████████| 728/728 [00:00<00:00, 1433.11it/s]
work_related_objects_update: 100%|███████████████████████████████████████████████| 2183/2183 [00:01<00:00, 1442.15it/s]
recognition_related_objects_update: 100%|████████████████████████████████████████████| 24/24 [00:00<00:00, 1333.06it/s]
place_related_objects_update: 100%|██████████████████████████████████████████████| 1924/1924 [00:01<00:00, 1579.21it/s]

The DBrefs of the relationships have been successfully updated to `related_XXX` fields!





In [26]:
loading_colls = set(coll for coll in acde_opr.Class_RelatedFN_mapping.keys())
proj_cond = {
    "predicate": 1,
    "subject": 1,
    "object": 1,
    "_id": 0,
    "relationship_dbid": "$_id",
    "relation_class": 1,
    "data_source": 1,
}

data_source = "DAQA"
# # Method 1
# acde_opr.acde_add_related_objects(
#     daqa_etl.acde_db, data_source="DAQA", loading_objs=loading_objs
# )
# Method 2
acde_opr.acde_update_related_fields(
    data_source=data_source,
    proj_cond=proj_cond,
    loading_colls=loading_colls,
    db=daqa_etl.acde_db,
)

event_related_events doesn't have any records.
event_related_organizations doesn't have any records.
event_related_works doesn't have any records.
event_related_places doesn't have any records.
event_related_recognitions doesn't have any records.
event_related_people doesn't have any records.
event_related_resources doesn't have any records.
person_related_events doesn't have any records.


person_related_organizations: 100%|████████████████████████████████████████████████| 464/464 [00:00<00:00, 1258.82it/s]
person_related_works: 100%|████████████████████████████████████████████████████████| 281/281 [00:00<00:00, 1083.82it/s]
person_related_places: 100%|███████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1011.41it/s]
person_related_recognitions: 100%|███████████████████████████████████████████████████| 25/25 [00:00<00:00, 1207.13it/s]
person_related_people: 100%|███████████████████████████████████████████████████████| 642/642 [00:00<00:00, 1212.43it/s]
person_related_resources: 100%|████████████████████████████████████████████████████| 383/383 [00:00<00:00, 1244.82it/s]


resource_related_events doesn't have any records.


resource_related_organizations: 100%|██████████████████████████████████████████████| 103/103 [00:00<00:00, 1211.80it/s]
resource_related_works: 100%|████████████████████████████████████████████████████| 4878/4878 [00:03<00:00, 1540.37it/s]
resource_related_places: 100%|████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 591.91it/s]
resource_related_recognitions: 100%|██████████████████████████████████████████████████| 11/11 [00:00<00:00, 758.88it/s]
resource_related_people: 100%|█████████████████████████████████████████████████████| 495/495 [00:00<00:00, 1127.29it/s]
resource_related_resources: 100%|████████████████████████████████████████████████| 1557/1557 [00:01<00:00, 1365.77it/s]


place_related_events doesn't have any records.
place_related_organizations doesn't have any records.


place_related_works: 100%|███████████████████████████████████████████████████████| 1843/1843 [00:01<00:00, 1561.98it/s]


place_related_places doesn't have any records.
place_related_recognitions doesn't have any records.


place_related_people: 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1335.02it/s]
place_related_resources: 100%|███████████████████████████████████████████████████████| 77/77 [00:00<00:00, 1513.12it/s]


organization_related_events doesn't have any records.


organization_related_organizations: 100%|██████████████████████████████████████████| 352/352 [00:00<00:00, 1389.79it/s]
organization_related_works: 100%|███████████████████████████████████████████████████| 205/205 [00:00<00:00, 898.58it/s]


organization_related_places doesn't have any records.


organization_related_recognitions: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 501.23it/s]
organization_related_people: 100%|█████████████████████████████████████████████████| 615/615 [00:00<00:00, 1344.35it/s]
organization_related_resources: 100%|████████████████████████████████████████████████| 90/90 [00:00<00:00, 1313.43it/s]


work_related_events doesn't have any records.


work_related_organizations: 100%|████████████████████████████████████████████████| 1297/1297 [00:00<00:00, 1459.94it/s]


work_related_works doesn't have any records.


work_related_places: 100%|███████████████████████████████████████████████████████| 1843/1843 [00:01<00:00, 1432.71it/s]
work_related_recognitions: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1002.94it/s]
work_related_people: 100%|███████████████████████████████████████████████████████| 1785/1785 [00:01<00:00, 1462.36it/s]
work_related_resources: 100%|████████████████████████████████████████████████████| 1392/1392 [00:01<00:00, 1210.53it/s]


recognition_related_events doesn't have any records.


recognition_related_organizations: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 503.16it/s]
recognition_related_works: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1032.06it/s]


recognition_related_places doesn't have any records.
recognition_related_recognitions doesn't have any records.


recognition_related_people: 100%|████████████████████████████████████████████████████| 24/24 [00:00<00:00, 1365.39it/s]
recognition_related_resources: 100%|█████████████████████████████████████████████████| 10/10 [00:00<00:00, 1118.72it/s]

The lookup documents of DBRefs have been successfully updated to `related_XXX` fields!



