## 0. Setup

In [1]:
import copy
import os
import pprint
from collections import Counter, defaultdict

import schedule
from bson import DBRef, ObjectId
from tqdm import tqdm

# Shared pretty-printer (2-space indent); later cells call `pp.pprint(...)` to
# display the assembled aggregation pipelines.
pp = pprint.PrettyPrinter(indent=2)

In [2]:
import sys

# Folder containing the project-local packages (`acde`, `daao`, `general`) that
# are imported right below. The ACDE_CODE_FOLDER environment variable overrides
# the default, so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original path is used unchanged.
codefolder = os.environ.get(
    "ACDE_CODE_FOLDER",
    "C:/ProjectCollections/Programs/Australia_Cultural_Data_Engine/codes",
)

# Register the folder only once: re-running this cell must not pile up
# duplicate entries on sys.path.
if codefolder not in sys.path:
    sys.path.append(codefolder)
from acde import MongoDBManipulation as acde_manip
from daao import MongoDBManipulation as daao_manip
from general import GeneralFunctions as gen_gf
from general import JsonProcessing as gen_jp
from general import MongoDBManipulation as gen_manip

## 1. DAAO DB Meta Collection

1. Get all collections and their corresponding classes.
2. Get the field information of each collection (Removed, All, Date Objects).

In [3]:
# Handle on the DAAO metadata helper. Later cells reach the database through
# its `daao_db` attribute (list/drop/rename collections, aggregate, update).
# NOTE(review): connection details live inside DAAO_MetaCollection — not visible here.
daao_meta = daao_manip.DAAO_MetaCollection()

### 1.0 reset collection names

In [4]:
# Restore every "<name>_ori" backup to "<name>", discarding the current
# "<name>" collection first, so the notebook can be re-run from the original data.
for coll_name in daao_meta.daao_db.list_collection_names():
    if coll_name.endswith("_ori"):
        # Strip only the trailing suffix. Splitting on the first "_" would
        # truncate names that themselves contain underscores (e.g.
        # "foo_bar_ori" -> "foo") and drop/overwrite an unrelated collection.
        new_name = coll_name[: -len("_ori")]
        if not new_name:
            # A collection literally called "_ori" has no original name to restore.
            continue
        daao_meta.daao_db.drop_collection(new_name)
        daao_meta.daao_db[coll_name].rename(new_name)
        print(
            f"{coll_name} is renamed as {new_name}. The original {new_name} collection is deleted."
        )

### 1.1 DAAO Class Extraction

In [5]:
# Extract the DAAO classes via the metadata helper.
# NOTE(review): the shape of the returned value is defined in
# daao.MongoDBManipulation and is not used by any cell visible here — confirm.
class_names = daao_meta.extract_class()

### 1.2 DAAO Field Name Extraction

In [6]:
# Field information per DAAO class. The date-cleaning section below iterates
# this as a mapping of class name -> info dict and reads the "collection_name"
# and "date_objects" entries of each info dict.
daao_objs_info = daao_meta.extract_fieldName()

person: 100%|███████████████████████████████████████████████████████████████████| 30693/30693 [00:38<00:00, 806.42it/s]
personGroup: 100%|███████████████████████████████████████████████████████████████| 4660/4660 [00:01<00:00, 2685.00it/s]
event: 100%|███████████████████████████████████████████████████████████████████| 21906/21906 [00:08<00:00, 2478.12it/s]
eventGroup: 100%|████████████████████████████████████████████████████████████████████| 94/94 [00:00<00:00, 2354.54it/s]
recognition: 100%|███████████████████████████████████████████████████████████████| 5481/5481 [00:01<00:00, 3363.23it/s]
work: 100%|████████████████████████████████████████████████████████████████████| 23780/23780 [00:06<00:00, 3682.72it/s]
collection: 100%|█████████████████████████████████████████████████████████████| 11747/11747 [00:00<00:00, 14341.81it/s]
ansicOcc_menu: 100%|██████████████████████████████████████████████████████████████| 825/825 [00:00<00:00, 27571.46it/s]
eventType_menu: 100%|███████████████████

## 2. Clean Date Format


### Rename Original Collections

In [7]:
# Map each collection name to the set of dotted paths of its date objects.
# Classes without date objects and the "relationship" class are skipped.
coll_date_objs = defaultdict(set)
for class_label, class_info in daao_objs_info.items():
    date_objects = class_info.get("date_objects")
    if not date_objects or class_label == "relationship":
        continue
    coll_date_objs[class_info["collection_name"]].update(date_objects)

# Point every path at the `_date` member of the date object.
for target_coll in list(coll_date_objs):
    coll_date_objs[target_coll] = {
        f"{obj_path}._date" for obj_path in coll_date_objs[target_coll]
    }

# Show which date paths were found for each collection.
separator = "#" * 20
for target_coll, date_paths in coll_date_objs.items():
    print(separator)
    print(target_coll, ">>", ", ".join(date_paths))

####################
xparty >> periods_active.end._date, trainings.coverage_range.date_range.start._date, death.coverage.date._date, other_occupations.coverage_range.date_range.end._date, birth.coverage.date._date, trainings.coverage_range.date_range.end._date, residences.coverage_range.date_range.end._date, arrivals.date._date, periods_active.start._date, residences.coverage_range.date_range.start._date, other_occupations.coverage_range.date_range.start._date
####################
xactivity >> coverages.date_range.start._date, coverages.date_range.end._date, dates._date, coverages.date._date
####################
xwork >> date.end._date, date.start._date
####################
externalresource >> date._date


In [8]:
# Keep the untouched data under "<name>_ori"; the cells below rebuild "<name>"
# from that backup with the date fields reformatted.
for source_name in list(coll_date_objs):
    backup_name = f"{source_name}_ori"
    daao_meta.daao_db[source_name].rename(backup_name)
    print(f"{source_name} is renamed as {backup_name}.")

xparty is renamed as xparty_ori.
xactivity is renamed as xactivity_ori.
xwork is renamed as xwork_ori.
externalresource is renamed as externalresource_ori.


### xparty

In [9]:
# Working collection, its untouched "_ori" backup, and the date paths to clean.
clone_coll = "xparty"
curr_coll = f"{clone_coll}_ori"
curr_date_fields = coll_date_objs[clone_coll]

In [10]:
# Output key -> aggregation operator for the year/month/day split.
xparty_ymd_operators = (("year", "$year"), ("month", "$month"), ("day", "$dayOfMonth"))

# Top-level fields that contain a date path, plus the two record timestamps.
xparty_firstL_fields = [
    *{date_path.split(".")[0] for date_path in curr_date_fields},
    "date_modified",
    "date_created",
]

# Date paths whose top-level field is unwound in stage 2, keyed by that field.
# `birth` / `death` paths are left out; they are handled in stages 3 and 4.
xparty_unwind_dates = defaultdict(list)
for date_path in curr_date_fields:
    if date_path.startswith(("death", "birth")):
        continue
    xparty_unwind_dates[date_path.split(".")[0]].append(date_path)

# Stage 1: keep only the fields that carry dates.
xparty_Stage1_ProjectDates = {"$project": dict.fromkeys(xparty_firstL_fields, 1)}

# Stage 2: per unwound field -> unwind, parse each date string, split it into
# year/month/day strings, then append the stages built by the regroup helper.
xparty_Stage2_ReconstructDateArray = []
for array_field, nested_paths in xparty_unwind_dates.items():
    xparty_Stage2_ReconstructDateArray.append(
        {"$unwind": {"path": f"${array_field}", "preserveNullAndEmptyArrays": True}}
    )
    for nested_path in nested_paths:
        nested_ref = f"${nested_path}"
        # `onError: None` substitutes null when the string cannot be parsed.
        parse_stage = {
            "$set": {
                nested_path: {
                    "$dateFromString": {"dateString": nested_ref, "onError": None}
                }
            }
        }
        split_stage = {
            "$set": {
                nested_path: {
                    part: {"$toString": {operator: nested_ref}}
                    for part, operator in xparty_ymd_operators
                }
            }
        }
        xparty_Stage2_ReconstructDateArray.extend([parse_stage, split_stage])
    # NOTE(review): presumably regroups the unwound elements per document with
    # `$push` — confirm in daao.MongoDBManipulation.daao_construct_groupStage.
    xparty_Stage2_ReconstructDateArray.extend(
        daao_manip.daao_construct_groupStage(
            xparty_firstL_fields, [array_field], regroup_opr="$push"
        )
    )

# Stage 3: parse the `birth` / `death` date strings (not unwound above).
xparty_Stage3_String2Date = {"$set": {}}
for date_path in curr_date_fields:
    if date_path.startswith(("death", "birth")):
        xparty_Stage3_String2Date["$set"][date_path] = {
            "$dateFromString": {"dateString": f"${date_path}", "onError": None}
        }

# Stage 4: split every path not already handled in stage 2, plus the record
# timestamps, into year/month/day strings.
# NOTE(review): date_modified / date_created are not parsed first, so they are
# assumed to be stored as dates already — confirm.
xparty_unwound_paths = set(
    nested_path
    for nested_paths in xparty_unwind_dates.values()
    for nested_path in nested_paths
)
xparty_scalar_paths = list(curr_date_fields - xparty_unwound_paths) + [
    "date_modified",
    "date_created",
]
xparty_Stage4_ReconstructDates = {
    "$set": {
        scalar_path: {
            part: {"$toString": {operator: f"${scalar_path}"}}
            for part, operator in xparty_ymd_operators
        }
        for scalar_path in xparty_scalar_paths
    }
}

xparty_cleanDatePipeline = [
    xparty_Stage1_ProjectDates,
    *xparty_Stage2_ReconstructDateArray,
    xparty_Stage3_String2Date,
    xparty_Stage4_ReconstructDates,
]

In [11]:
# Display the assembled xparty pipeline for inspection before it is executed.
pp.pprint(xparty_cleanDatePipeline)

[ { '$project': { 'arrivals': 1,
                  'birth': 1,
                  'date_created': 1,
                  'date_modified': 1,
                  'death': 1,
                  'other_occupations': 1,
                  'periods_active': 1,
                  'residences': 1,
                  'trainings': 1}},
  {'$unwind': {'path': '$periods_active', 'preserveNullAndEmptyArrays': True}},
  { '$set': { 'periods_active.end._date': { '$dateFromString': { 'dateString': '$periods_active.end._date',
                                                                 'onError': None}}}},
  { '$set': { 'periods_active.end._date': { 'day': { '$toString': { '$dayOfMonth': '$periods_active.end._date'}},
                                            'month': { '$toString': { '$month': '$periods_active.end._date'}},
                                            'year': { '$toString': { '$year': '$periods_active.end._date'}}}}},
  { '$set': { 'periods_active.start._date': { '$dateFromString': { 'd

In [12]:
# clone
# Rebuild the working collection from the untouched "_ori" backup. The drop
# keeps the original behaviour of starting from a brand-new collection; `$out`
# then copies every document server-side in a single command instead of one
# `insert_one` round trip per document.
daao_meta.daao_db.drop_collection(clone_coll)
daao_meta.daao_db[curr_coll].aggregate([{"$out": clone_coll}])
print(f"The `{curr_coll}` collection is cloned as `{clone_coll}`")

# extract date fields and reconstruct as ymd format
# Each aggregated document holds only the fields returned by the pipeline;
# `$set` writes them back onto the stored document with the same `_id`.
for doc in tqdm(
    daao_meta.daao_db[clone_coll].aggregate(
        xparty_cleanDatePipeline, allowDiskUse=True,
    ),
    total=daao_meta.daao_db[clone_coll].count_documents({}),
    desc=f"{clone_coll} Date Field Format Cleansing",
    leave=True,
):
    daao_meta.daao_db[clone_coll].update_one({"_id": doc.get("_id")}, {"$set": doc})

The `xparty_ori` collection is cloned as `xparty`


xparty Date Field Format Cleansing: 100%|██████████████████████████████████████| 35353/35353 [00:32<00:00, 1093.90it/s]


### xactivity

In [13]:
# Working collection, its untouched "_ori" backup, and the date paths to clean.
clone_coll = "xactivity"
curr_coll = f"{clone_coll}_ori"
curr_date_fields = coll_date_objs[clone_coll]

In [14]:
# Output key -> aggregation operator for the year/month/day split.
xactivity_ymd_operators = (("year", "$year"), ("month", "$month"), ("day", "$dayOfMonth"))

# Top-level fields that contain a date path, plus the two record timestamps.
xactivity_firstL_fields = [
    *{date_path.split(".")[0] for date_path in curr_date_fields},
    "date_modified",
    "date_created",
]

# Every date path of this collection is unwound, keyed by its top-level field.
xactivity_unwind_dates = defaultdict(list)
for date_path in curr_date_fields:
    xactivity_unwind_dates[date_path.split(".")[0]].append(date_path)

# Stage 1: keep only the fields that carry dates.
xactivity_Stage1_ProjectDates = {"$project": dict.fromkeys(xactivity_firstL_fields, 1)}

# Stage 2: per unwound field -> unwind, parse each date string, split it into
# year/month/day strings, then append the stages built by the regroup helper.
xactivity_Stage2_ReconstructDateArray = []
for array_field, nested_paths in xactivity_unwind_dates.items():
    xactivity_Stage2_ReconstructDateArray.append(
        {"$unwind": {"path": f"${array_field}", "preserveNullAndEmptyArrays": True}}
    )
    for nested_path in nested_paths:
        nested_ref = f"${nested_path}"
        # `onError: None` substitutes null when the string cannot be parsed.
        parse_stage = {
            "$set": {
                nested_path: {
                    "$dateFromString": {"dateString": nested_ref, "onError": None}
                }
            }
        }
        split_stage = {
            "$set": {
                nested_path: {
                    part: {"$toString": {operator: nested_ref}}
                    for part, operator in xactivity_ymd_operators
                }
            }
        }
        xactivity_Stage2_ReconstructDateArray.extend([parse_stage, split_stage])
    # NOTE(review): presumably regroups the unwound elements per document with
    # `$push` — confirm in daao.MongoDBManipulation.daao_construct_groupStage.
    xactivity_Stage2_ReconstructDateArray.extend(
        daao_manip.daao_construct_groupStage(
            xactivity_firstL_fields, [array_field], regroup_opr="$push"
        )
    )

# Stage 3: split any path not handled in stage 2, plus the record timestamps,
# into year/month/day strings.
# NOTE(review): date_modified / date_created are not parsed first, so they are
# assumed to be stored as dates already — confirm.
xactivity_unwound_paths = set(
    nested_path
    for nested_paths in xactivity_unwind_dates.values()
    for nested_path in nested_paths
)
xactivity_scalar_paths = list(set(curr_date_fields) - xactivity_unwound_paths) + [
    "date_modified",
    "date_created",
]
xactivity_Stage3_ReconstructDates = {
    "$set": {
        scalar_path: {
            part: {"$toString": {operator: f"${scalar_path}"}}
            for part, operator in xactivity_ymd_operators
        }
        for scalar_path in xactivity_scalar_paths
    }
}

xactivity_cleanDatePipeline = [
    xactivity_Stage1_ProjectDates,
    *xactivity_Stage2_ReconstructDateArray,
    xactivity_Stage3_ReconstructDates,
]

In [15]:
# Display the assembled xactivity pipeline for inspection before it is executed.
pp.pprint(xactivity_cleanDatePipeline)

[ { '$project': { 'coverages': 1,
                  'date_created': 1,
                  'date_modified': 1,
                  'dates': 1}},
  {'$unwind': {'path': '$coverages', 'preserveNullAndEmptyArrays': True}},
  { '$set': { 'coverages.date_range.start._date': { '$dateFromString': { 'dateString': '$coverages.date_range.start._date',
                                                                         'onError': None}}}},
  { '$set': { 'coverages.date_range.start._date': { 'day': { '$toString': { '$dayOfMonth': '$coverages.date_range.start._date'}},
                                                    'month': { '$toString': { '$month': '$coverages.date_range.start._date'}},
                                                    'year': { '$toString': { '$year': '$coverages.date_range.start._date'}}}}},
  { '$set': { 'coverages.date_range.end._date': { '$dateFromString': { 'dateString': '$coverages.date_range.end._date',
                                                             

In [16]:
# clone
# Rebuild the working collection from the untouched "_ori" backup. The drop
# keeps the original behaviour of starting from a brand-new collection; `$out`
# then copies every document server-side in a single command instead of one
# `insert_one` round trip per document.
daao_meta.daao_db.drop_collection(clone_coll)
daao_meta.daao_db[curr_coll].aggregate([{"$out": clone_coll}])
print(f"The `{curr_coll}` collection is cloned as `{clone_coll}`")

# extract date fields and reconstruct as ymd format
# Each aggregated document holds only the fields returned by the pipeline;
# `$set` writes them back onto the stored document with the same `_id`.
for doc in tqdm(
    daao_meta.daao_db[clone_coll].aggregate(
        xactivity_cleanDatePipeline, allowDiskUse=True,
    ),
    total=daao_meta.daao_db[clone_coll].count_documents({}),
    desc=f"{clone_coll} Date Field Format Cleansing",
    leave=True,
):
    daao_meta.daao_db[clone_coll].update_one({"_id": doc.get("_id")}, {"$set": doc})

The `xactivity_ori` collection is cloned as `xactivity`


xactivity Date Field Format Cleansing: 100%|███████████████████████████████████| 27481/27481 [00:19<00:00, 1440.26it/s]


### xwork

In [17]:
# Working collection, its untouched "_ori" backup, and the date paths to clean.
clone_coll = "xwork"
curr_coll = f"{clone_coll}_ori"
curr_date_fields = coll_date_objs[clone_coll]

In [18]:
# Output key -> aggregation operator for the year/month/day split.
xwork_ymd_operators = (("year", "$year"), ("month", "$month"), ("day", "$dayOfMonth"))
xwork_timestamp_fields = ["date_modified", "date_created"]

# Stage 1: keep the record timestamps and every top-level field holding a date.
xwork_Stage1_ProjectDates = {
    "$project": dict.fromkeys(
        xwork_timestamp_fields
        + list({date_path.split(".")[0] for date_path in curr_date_fields}),
        1,
    )
}

# Stage 2: parse each date string; `onError: None` substitutes null when the
# string cannot be parsed.
xwork_Stage2_String2Date = {"$set": {}}
for date_path in curr_date_fields:
    xwork_Stage2_String2Date["$set"][date_path] = {
        "$dateFromString": {"dateString": f"${date_path}", "onError": None}
    }

# Stage 3: split the parsed dates and the record timestamps into
# year/month/day strings.
# NOTE(review): date_modified / date_created are not parsed first, so they are
# assumed to be stored as dates already — confirm.
xwork_Stage3_ConvertDates = {
    "$set": {
        target_path: {
            part: {"$toString": {operator: f"${target_path}"}}
            for part, operator in xwork_ymd_operators
        }
        for target_path in [*curr_date_fields, *xwork_timestamp_fields]
    }
}

xwork_cleanDatePipeline = [
    xwork_Stage1_ProjectDates,
    xwork_Stage2_String2Date,
    xwork_Stage3_ConvertDates,
]

In [19]:
# Display the assembled xwork pipeline for inspection before it is executed.
pp.pprint(xwork_cleanDatePipeline)

[ {'$project': {'date': 1, 'date_created': 1, 'date_modified': 1}},
  { '$set': { 'date.end._date': { '$dateFromString': { 'dateString': '$date.end._date',
                                                       'onError': None}},
              'date.start._date': { '$dateFromString': { 'dateString': '$date.start._date',
                                                         'onError': None}}}},
  { '$set': { 'date.end._date': { 'day': { '$toString': { '$dayOfMonth': '$date.end._date'}},
                                  'month': { '$toString': { '$month': '$date.end._date'}},
                                  'year': { '$toString': { '$year': '$date.end._date'}}},
              'date.start._date': { 'day': { '$toString': { '$dayOfMonth': '$date.start._date'}},
                                    'month': { '$toString': { '$month': '$date.start._date'}},
                                    'year': { '$toString': { '$year': '$date.start._date'}}},
              'date_created': { 'day':

In [20]:
# clone
# Rebuild the working collection from the untouched "_ori" backup. The drop
# keeps the original behaviour of starting from a brand-new collection; `$out`
# then copies every document server-side in a single command instead of one
# `insert_one` round trip per document.
daao_meta.daao_db.drop_collection(clone_coll)
daao_meta.daao_db[curr_coll].aggregate([{"$out": clone_coll}])
print(f"The `{curr_coll}` collection is cloned as `{clone_coll}`")

# extract date fields and reconstruct as ymd format
# Each aggregated document holds only the fields returned by the pipeline;
# `$set` writes them back onto the stored document with the same `_id`.
for doc in tqdm(
    daao_meta.daao_db[clone_coll].aggregate(
        xwork_cleanDatePipeline, allowDiskUse=True,
    ),
    total=daao_meta.daao_db[clone_coll].count_documents({}),
    desc=f"{clone_coll} Date Field Format Cleansing",
    leave=True,
):
    daao_meta.daao_db[clone_coll].update_one({"_id": doc.get("_id")}, {"$set": doc})

The `xwork_ori` collection is cloned as `xwork`


xwork Date Field Format Cleansing: 100%|███████████████████████████████████████| 35527/35527 [00:22<00:00, 1596.36it/s]


### externalresource

In [21]:
# Working collection, its untouched "_ori" backup, and the date paths to clean.
clone_coll = "externalresource"
curr_coll = f"{clone_coll}_ori"
curr_date_fields = coll_date_objs[clone_coll]

In [22]:
# Output key -> aggregation operator for the year/month/day split.
extlrsc_ymd_operators = (("year", "$year"), ("month", "$month"), ("day", "$dayOfMonth"))
extlrsc_timestamp_fields = ["date_modified", "date_created"]

# Stage 1: keep the record timestamps and every top-level field holding a date.
extlrsc_Stage1_ProjectDates = {
    "$project": dict.fromkeys(
        extlrsc_timestamp_fields
        + list({date_path.split(".")[0] for date_path in curr_date_fields}),
        1,
    )
}

# Stage 2: parse each date string; `onError: None` substitutes null when the
# string cannot be parsed.
extlrsc_Stage2_String2Date = {"$set": {}}
for date_path in curr_date_fields:
    extlrsc_Stage2_String2Date["$set"][date_path] = {
        "$dateFromString": {"dateString": f"${date_path}", "onError": None}
    }

# Stage 3: split the parsed dates and the record timestamps into
# year/month/day strings.
# NOTE(review): date_modified / date_created are not parsed first, so they are
# assumed to be stored as dates already — confirm.
extlrsc_Stage3_ConvertDates = {
    "$set": {
        target_path: {
            part: {"$toString": {operator: f"${target_path}"}}
            for part, operator in extlrsc_ymd_operators
        }
        for target_path in [*curr_date_fields, *extlrsc_timestamp_fields]
    }
}

extlrsc_cleanDatePipeline = [
    extlrsc_Stage1_ProjectDates,
    extlrsc_Stage2_String2Date,
    extlrsc_Stage3_ConvertDates,
]

In [23]:
# Display the assembled externalresource pipeline for inspection before it is executed.
pp.pprint(extlrsc_cleanDatePipeline)

[ {'$project': {'date': 1, 'date_created': 1, 'date_modified': 1}},
  { '$set': { 'date._date': { '$dateFromString': { 'dateString': '$date._date',
                                                   'onError': None}}}},
  { '$set': { 'date._date': { 'day': { '$toString': { '$dayOfMonth': '$date._date'}},
                              'month': {'$toString': {'$month': '$date._date'}},
                              'year': {'$toString': {'$year': '$date._date'}}},
              'date_created': { 'day': { '$toString': { '$dayOfMonth': '$date_created'}},
                                'month': { '$toString': { '$month': '$date_created'}},
                                'year': { '$toString': { '$year': '$date_created'}}},
              'date_modified': { 'day': { '$toString': { '$dayOfMonth': '$date_modified'}},
                                 'month': { '$toString': { '$month': '$date_modified'}},
                                 'year': { '$toString': { '$year': '$date_modified'}}}}}]

In [24]:
# clone
# Rebuild the working collection from the untouched "_ori" backup. The drop
# keeps the original behaviour of starting from a brand-new collection; `$out`
# then copies every document server-side in a single command instead of one
# `insert_one` round trip per document.
daao_meta.daao_db.drop_collection(clone_coll)
daao_meta.daao_db[curr_coll].aggregate([{"$out": clone_coll}])
print(f"The `{curr_coll}` collection is cloned as `{clone_coll}`")

# extract date fields and reconstruct as ymd format
# Each aggregated document holds only the fields returned by the pipeline;
# `$set` writes them back onto the stored document with the same `_id`.
for doc in tqdm(
    daao_meta.daao_db[clone_coll].aggregate(
        extlrsc_cleanDatePipeline, allowDiskUse=True,
    ),
    total=daao_meta.daao_db[clone_coll].count_documents({}),
    desc=f"{clone_coll} Date Field Format Cleansing",
    leave=True,
):
    daao_meta.daao_db[clone_coll].update_one({"_id": doc.get("_id")}, {"$set": doc})

The `externalresource_ori` collection is cloned as `externalresource`


externalresource Date Field Format Cleansing: 100%|████████████████████████████| 52597/52597 [00:30<00:00, 1707.89it/s]
