## 0. Setting

In [2]:
import json
import os
import pprint
import re
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

import jsonlines
import numpy as np
import pandas as pd
import pymongo
from bson import ObjectId, json_util
from pymongo import MongoClient
from tqdm import tqdm, trange

In [1]:
import sys

codefolder = (
    "C:/ProjectCollections/Programs/Australia_Cultural_Data_Engine/codes"
)
sys.path.append(codefolder)
from daao import JsonProcessing as daao_jp
from daao import MongoDBManipulation as daao_manip
from daao import PlaceManipulation as daao_pm
from general import GeneralFunctions as gen_gf
from general import JsonProcessing as gen_jp

In [3]:
data_folder = (
    "D:/Program_Data/Australia_Cultural_Data_Engine_Data/design_art_australia_online"
)
js_folder = os.path.join(data_folder, "exported_json")
csv_folder = os.path.join(data_folder, "exported_csv")
pp = pprint.PrettyPrinter(indent=2)

## 1. MongoDB Connection

In [4]:
client = MongoClient("mongodb://localhost:27017/")
daao3 = client.daao3
daao_place = daao3.place

##  2. DAAO Place (of Event) Fetching

### Place Data Retrieval

In [5]:
place_basic_projections = [
    "_id",
    "name",
    "centroid",
    "url",
    "address",
    "country",
]

#####
#
# Construct xparty Projection + Lookup (Grouping) Stage
#
#####
place_plstage = daao_manip.construct_FinalProjectionStage(
    keep_list=place_basic_projections
)

pp.pprint(place_plstage)

[ { '$project': { '_id': 1,
                  'address': 1,
                  'centroid': 1,
                  'country': 1,
                  'name': 1,
                  'url': 1}}]


In [6]:
places = []
places_flatten = []

with tqdm(total=35353, position=0, leave=True) as t:
    for place_doc in daao_place.aggregate(place_plstage, allowDiskUse=True,):
        place_doc = gen_jp.clean_empty_values(place_doc)
        place_doc = JsonProcessing.GetYearFromDateString(place_doc)
        places.append(place_doc)
        places_flatten.append(gen_jp.flatten_json_iterative_solution(place_doc))
        t.update(1)

 47%|██████████████████████████████████▍                                      | 16691/35353 [00:00<00:01, 18214.07it/s]


In [7]:
places_df = pd.json_normalize(places_flatten)

In [8]:
places_df.head()

Unnamed: 0,_id,centroid.latitude,centroid._types.0,centroid._cls,centroid.longitude,centroid.city,centroid.city_district,centroid.country,centroid.country_code,centroid.postcode,centroid.state,centroid.suburb,address,name,country,url
0,4da194f4b538b73d930005f4,-33.868901,GeoLocation,GeoLocation,151.207091,Sydney,Unknown,Australia,au,2000,New South Wales,Sydney,"Sydney, NSW",,,
1,4da194f5b538b73d930005fa,-27.470933,GeoLocation,GeoLocation,153.023502,Unknown,Brisbane City,Australia,au,4004,Queensland,Brisbane City,"Brisbane, Qld",,,
2,4da194f5b538b73d930005fb,43.6525,GeoLocation,GeoLocation,-79.381667,Toronto,Unknown,Canada,ca,M5H 3R3,Ontario,Unknown,"Toronto, Canada",,,
3,4da194f5b538b73d930005fc,40.714353,GeoLocation,GeoLocation,-74.005973,New York,Unknown,United States,us,10007,New York,Manhattan,"New York, USA",,,
4,4da194f5b538b73d930005fd,-37.813187,GeoLocation,GeoLocation,144.96298,Melbourne,Unknown,Australia,au,3000,Victoria,Melbourne,"Melbourne, Vic.",,,


### Exploration

In [9]:
places_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16691 entries, 0 to 16690
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   _id                     16691 non-null  object 
 1   centroid.latitude       14019 non-null  float64
 2   centroid._types.0       14019 non-null  object 
 3   centroid._cls           14019 non-null  object 
 4   centroid.longitude      14019 non-null  float64
 5   centroid.city           16691 non-null  object 
 6   centroid.city_district  16691 non-null  object 
 7   centroid.country        16691 non-null  object 
 8   centroid.country_code   16691 non-null  object 
 9   centroid.postcode       16691 non-null  object 
 10  centroid.state          16691 non-null  object 
 11  centroid.suburb         16691 non-null  object 
 12  address                 16141 non-null  object 
 13  name                    2548 non-null   object 
 14  country                 1776 non-null 

In [10]:
places_df.describe(include="all")

Unnamed: 0,_id,centroid.latitude,centroid._types.0,centroid._cls,centroid.longitude,centroid.city,centroid.city_district,centroid.country,centroid.country_code,centroid.postcode,centroid.state,centroid.suburb,address,name,country,url
count,16691,14019.0,14019,14019,14019.0,16691,16691,16691,16691,16691,16691,16691,16141,2548,1776,35
unique,16691,,1,1,,706,603,110,110,2801,369,1563,14992,2293,171,14
top,4da194f4b538b73d930005f4,,GeoLocation,GeoLocation,,Unknown,Unknown,Australia,au,Unknown,New South Wales,Unknown,"Sydney, NSW",The Hart Centre of Arts,Australia,http://www.hart.com.cn/
freq,1,,14019,14019,,6406,14270,10099,10099,3738,3891,6916,129,21,967,21
mean,,-13.33018,,,105.488099,,,,,,,,,,,
std,,35.454553,,,74.375595,,,,,,,,,,,
min,,-55.983333,,,-175.198242,,,,,,,,,,,
25%,,-35.308056,,,114.109497,,,,,,,,,,,
50%,,-33.85628,,,144.964595,,,,,,,,,,,
75%,,14.583333,,,151.200318,,,,,,,,,,,


## 3. Place Normalization 

### Brief Description

To denormalize the irregular place information into a standard format with united names (country-state-city-suburb-(postcode)), reverse-geocoding and manual geo information extraction are implemented in this process.

There are 3 stages in DAAO place normalization:

【Stage 1】<br>
Get a group of standardized location candidates (aka geo-candidates) from reverse-geocoding (Step 1, 2, 3).<br>
【Stage 2】<br>
Extract the other group of standardized location candidates (aka extracted-candidates) from the cleaned raw location text (Step 4, 5, 6).<br> 
【Stage 3】<br> 
By comparing with the geo-candidates and extracted-candidates, the location unite would be "inferred" from country level to (Australian) suburb level (Step 7, 8, 9).<br>

1. [Reverse Geocoding](#reverse-geocoding)<br>
This section will get the (potential) **standardized** details of geo information of existing place, either from centroid coordinates or address and name.<br>
If the coordinates/address and name information doesn't exist in ACDE_Coordinate2Loc.jsonl/ACDE_Loc2Coordinate.jsonl, the coordinates/address and name will be reverse-geocoded and stored into these jsonl files.<br>
    *Reference*: [Get the city, state, and country names from latitude and longitude using Python](https://www.geeksforgeeks.org/get-the-city-state-and-country-names-from-latitude-and-longitude-using-python/)<br>

2. [Merge Reverse-Geocoding Data into Original Place Data](#merge-reverse-geo-into-original-place)<br>
<br>
3. [Extract Location Candidates from Reverse-Geocoding Information](#get-ptnl-loc)<br>
This section extracts standardized location combination (country, state, city, suburb, postcode) candidates from reverse-geocoded data getting from [Reverse Geocoding](#reverse-geocoding). <br>

4. [Place Normalizer Initialization](#place-normalizer-initiallization)<br>

5. [Raw Input Cleansing](#raw-input-cleansing)<br>
This section cleans the original country/name/address from database, and then finds the longest place information, which can be treated as "the best location information".<br>
The initial cleasing process including:<br>
    1. Redundant punctuation removal.<br>
    2. State/Country abbreviation standardization >> expansion.<br>
    3. Spelling Checking (country only regarding the time efficiency).<br>

6. [Potential Place Extraction](#ptnl-places-extraction)<br>
This section extracts the potential location components by comparing with the names in (standardized) country/state/city/(Australian) suburb datasets.<br>

7. [Merge Extracted-Candidates & Geo-Candidates](#merge-two-groups-candidates)<br>
This section merges 2 types of candidates for the following inferring.<br>

8. [Location Components Inferring](#loc-comp-infer)<br>
This section infers the location components from country level to Australian suburb level (if there's any)<br>

9. [Redundancy Removal](#redundancy-removal)

### <a  name="reverse-geocoding">Reverse Geocoding<a>

In [11]:
from geopy.geocoders import Nominatim

locinfo_folder = "D:/Program_Data/Australia_Cultural_Data_Engine_Data/general"

centroid_fn = "ACDE_Coordinate2Loc.jsonl"
centroid_fpath = os.path.join(locinfo_folder, centroid_fn)
Loc2Cor_fn = "ACDE_Loc2Coordinate.jsonl"
Loc2Cor_fpath = os.path.join(locinfo_folder, Loc2Cor_fn)

#### Reverse Geocoding: Centroid Coordinates

This section will create/update ACDE_Coordinate2Loc.jsonl, which is the mapping table for the existing coordinates in place collection of DAAO database. If there's a new pair of coodinates, it will be added into the table.

1. 
github: tyne@tynedaile.com and nat@cutters.org
narrative:

In [12]:
#####
# Load existing coordinates
#####
existing_centroids = set()
my_file = Path(centroid_fpath)
if my_file.is_file():
    with jsonlines.open(centroid_fpath) as js_reader:
        for obj in js_reader:
            existing_centroids.add(tuple(obj.get("ori_coordinates")))

# Initialize geocoder
geolocator = Nominatim(user_agent="geoapiExercises")

all_centroids = (
    places_df[["centroid.latitude", "centroid.longitude"]].drop_duplicates().dropna()
)

with jsonlines.open(centroid_fpath, mode="a") as js_writer, tqdm(
    total=all_centroids.shape[0]
) as pbar:
    for idx, lat, long in all_centroids[
        ["centroid.latitude", "centroid.longitude"]
    ].itertuples():
        loc_info = {"ori_coordinates": [lat, long]}
        if (lat, long) not in existing_centroids:
            rsp_loc = geolocator.reverse(
                f"{lat},{long}", addressdetails=True, language="en"
            )
            if rsp_loc:
                loc_info.update(rsp_loc.raw)
            js_writer.write(loc_info)
            # time.sleep(1)
        pbar.update(1)

100%|██████████████████████████████████████████████████████████████████████████| 5307/5307 [00:00<00:00, 376591.12it/s]


#### Reverse Geocoding: Address/Name

In [13]:
#####
# Reload coordinates info
#####

all_Coord2Loc = []

with jsonlines.open(centroid_fpath) as js_reader:
    for obj in js_reader:
        all_Coord2Loc.append(obj)

all_Coord2Loc = pd.json_normalize(all_Coord2Loc)
all_Coord2Loc["ori_coordinates"] = all_Coord2Loc["ori_coordinates"].apply(tuple)
Coord2Ctycode = (
    all_Coord2Loc[["ori_coordinates", "address.country_code"]]
    .fillna("")
    .set_index("ori_coordinates")
    .to_dict()
    .get("address.country_code")
)

#####
# Get all locations from retrieved places
#####

all_locs = (
    places_df[["country", "name", "address"]]
    .unstack()
    .dropna()
    .str.strip()
    .drop_duplicates()
)
all_locs = (
    all_locs.to_frame("loc_info")
    .reset_index(level=0)
    .merge(
        places_df[["centroid.latitude", "centroid.longitude"]],
        left_index=True,
        right_index=True,
        how="left",
    )
)

In [14]:
#####
# Load existing locations
#####
existing_locs = set()
Loc2Cor_file = Path(Loc2Cor_fpath)
if Loc2Cor_file.is_file():
    with jsonlines.open(Loc2Cor_fpath) as js_reader:
        for obj in js_reader:
            existing_locs.add(obj.get("search_query"))
            existing_locs.add(obj.get("raw_loc_info"))
    existing_locs.discard(None)

# Initialize geolocator
geolocator = Nominatim(user_agent="geoapiExercises")

with jsonlines.open(Loc2Cor_fpath, mode="a") as js_writer, tqdm(
    total=len(all_locs)
) as pbar:
    for (idx, _, raw_loc, lat, lon) in all_locs.itertuples():
        search_ctycode = Coord2Ctycode.get((lat, lon))
        loc_info = {
            "search_query": raw_loc,
            "search_ctycode": search_ctycode,
            "rsp": [],
        }
        potential_locs = [raw_loc]
        if raw_loc not in existing_locs:
            if "," in raw_loc:
                potential_names = raw_loc.split(",", 1)
                potential_locs = potential_locs + potential_names
                potential_locs = [i.strip() for i in potential_locs]
            for curr_idx, curr_loc in enumerate(potential_locs):
                if curr_idx == 0:
                    rsp_loc = geolocator.geocode(
                        curr_loc,
                        addressdetails=True,
                        language="en",
                        country_codes=search_ctycode,
                        timeout=None,
                    )
                    if rsp_loc:
                        loc_info["rsp"].append(rsp_loc.raw)
                        js_writer.write(loc_info)
                        break
                    else:
                        js_writer.write(loc_info)
                elif curr_idx == 1:
                    if curr_loc not in existing_locs:
                        rsp_loc = geolocator.geocode(
                            curr_loc,
                            addressdetails=True,
                            language="en",
                            country_codes=search_ctycode,
                            timeout=None,
                            exactly_one=False,
                        )
                        loc_info.update(
                            {"search_query": curr_loc, "raw_loc_info": raw_loc}
                        )
                        if rsp_loc:
                            loc_info["rsp"] = loc_info["rsp"] + [
                                rsp.raw for rsp in rsp_loc
                            ]
                        js_writer.write(loc_info)
                        # here's a bug
                        existing_locs.add(curr_loc)
                else:
                    # Independent section, New Ballarat General Cemetery, Ballarat, Vic.
                    # If this part can recursively check the part, that might be better
                    # But not worthy
                    if curr_loc not in existing_locs:
                        rsp_loc = geolocator.geocode(
                            curr_loc,
                            addressdetails=True,
                            language="en",
                            country_codes=search_ctycode,
                            timeout=None,
                            exactly_one=True,
                        )
                        loc_info.update(
                            {"search_query": curr_loc, "raw_loc_info": raw_loc}
                        )
                        if rsp_loc:
                            loc_info["rsp"].append(rsp_loc.raw)
                        js_writer.write(loc_info)
                        existing_locs.add(curr_loc)
                # time.sleep(0.9)
        pbar.update(1)
        # if idx >= 80:
        #     break

100%|████████████████████████████████████████████████████████████████████████| 17126/17126 [00:00<00:00, 300446.50it/s]


### <a name=merge-reverse-geo-into-original-place>Merge Reverse-Geocoding into Original DAAO Place Data</a>

In [15]:
######
# Reload Existing Geo Information
######

locinfo_folder = "D:/Program_Data/Australia_Cultural_Data_Engine_Data/general"

centroid_fn = "ACDE_Coordinate2Loc.jsonl"
centroid_fpath = os.path.join(locinfo_folder, centroid_fn)
Loc2Cor_fn = "ACDE_Loc2Coordinate.jsonl"
Loc2Cor_fpath = os.path.join(locinfo_folder, Loc2Cor_fn)

c2l_cols = [
    "ori_latitude",
    "ori_longitude",
    "address.country",
    "address.country_code",
    "address.state",
    "address.city",
    "address.city_district",
    "address.suburb",
    "address.postcode",
]
l2c_cols = [
    "search_query",
    "raw_loc_info" "display_name",
    "address.region",
    "address.country",
    "address.country_code",
    "address.state_district",
    "address.state",
    "address.county",
    "address.city_district",
    "address.city",
    "address.suburb",
    "address.town",
    "address.road",
    "address.house_number",
    "address.postcode",
]

existing_loc2cor = []
existing_cor2loc = []

with jsonlines.open(Loc2Cor_fpath) as js_reader:
    for obj in js_reader:
        existing_loc2cor.append(obj)

with jsonlines.open(centroid_fpath) as js_reader:
    for obj in js_reader:
        existing_cor2loc.append(obj)

existing_cor2loc = pd.json_normalize(existing_cor2loc, meta="address")
existing_cor2loc[["ori_latitude", "ori_longitude"]] = existing_cor2loc[
    "ori_coordinates"
].apply(pd.Series)

existing_loc2cor = pd.json_normalize(existing_loc2cor)

In [16]:
places_df_corrd2loc = places_df.merge(
    existing_cor2loc[c2l_cols].set_index(["ori_latitude", "ori_longitude"]),
    right_index=True,
    left_on=["centroid.latitude", "centroid.longitude"],
    how="left",
).copy()

# places_df_corrd2loc.columns = [
#     col_name.replace("address.", "centroid.")
#     for col_name in places_df_corrd2loc.columns.to_list()
# ]

In [17]:
valid_loc2corrd_rsp_addr = daao_pm.FindPtnlLoc2Corrd(
    places_df_corrd2loc, "address", existing_loc2cor
)

valid_loc2corrd_rsp_name = daao_pm.FindPtnlLoc2Corrd(
    places_df_corrd2loc, "name", existing_loc2cor
)

In [18]:
# concatenate address results and name results
valid_loc2corrd_rsp = pd.concat(
    [valid_loc2corrd_rsp_addr, valid_loc2corrd_rsp_name], axis=0
)
ori_valid_loc2corrd_rsp = valid_loc2corrd_rsp.copy()

# Processing valid_loc2corrd_rsp
valid_loc2corrd_rsp = valid_loc2corrd_rsp.groupby("index")["loc2corrd_rsp"].agg(sum)
valid_loc2corrd_rsp = valid_loc2corrd_rsp.apply(daao_pm.SelectUniqueLoc2Corrd).to_frame(
    "loc2corrd_rsp"
)

#####
# Do merging only
# Don't do the index aggregation here (keep original long df)
# Attention to duplicates search_query (but different raw_loc_info)
#####
ori_valid_loc2corrd_rsp = ori_valid_loc2corrd_rsp.merge(
    existing_loc2cor[["search_query", "raw_loc_info"]],
    right_index=True,
    left_on="loc2_cor_idx",
    how="left",
)

Finally, we get **valid_loc2corrd_rsp**, which is the valid location information from existing address/name,  and **places_df_corrd2loc**, which is the location information reverse-geocoded from centorids.

### <a name='get-loc-candidates'>Extract Location Candidates from Reverse-Geocoding Information</a>

In [19]:
ptnlLoc_fromGeocode = (
    places_df_corrd2loc[
        [
            "centroid.country",
            "centroid.state",
            "centroid.city",
            "centroid.suburb",
            "centroid.postcode",
        ]
    ]
    .merge(valid_loc2corrd_rsp, left_index=True, right_index=True, how="left")
    .apply(
        lambda x: daao_pm.GetPtnlLoc_Geo(
            [
                x["centroid.country"],
                x["centroid.state"],
                x["centroid.city"],
                x["centroid.suburb"],
                x["centroid.postcode"],
            ],
            x["loc2corrd_rsp"],
            ("country", "state", "city", "suburb", "postcode"),
        ),
        axis=1,
    )
).to_frame("ptnlLoc_fromGeocode")

### <a name='place-normalizer-initiallization'>Place Normalizer Initialization</a>

Initialize the normalizer for following sections. The supplement geo datasets are [World Cities Database](https://simplemaps.com/data/world-cities).

In [20]:
geo_folder = "D:/Program_Data/Australia_Cultural_Data_Engine_Data/supplemental_datasets"

In [21]:
p_normalizer = daao_pm.DAAO_PlaceNormalization(geo_folder=geo_folder)

### <a name='raw-input-cleansing'>Raw Input (address/name/country) Cleansing</a>

In [22]:
# Correcting PRC
places_df["country"] = places_df.country.fillna("").apply(
    lambda x: "China"
    if "," not in x and "PRC" in "".join([char for char in x if char.isupper()])
    else x
)
####
# Initial Cleaning Process
####
ori_places = (
    places_df[["country", "address", "name"]]
    .rename({"country": "1_country", "address": "3_address", "name": "2_name"}, axis=1)
    .stack()
    .to_frame("place_info")
    .reset_index()
)
cleaned_places = (
    ori_places.sort_values("level_1").drop_duplicates("place_info", keep="first").copy()
)

cty_idx = cleaned_places.query('level_1=="1_country"').index
cleaned_places["cleaned_place_info"] = ""

cleaned_places.loc[cty_idx, "cleaned_place_info"] = cleaned_places.loc[
    cty_idx, "place_info"
].apply(lambda x: p_normalizer.place_initProcess(x, spell_check=True))

other_idx = cleaned_places.query('level_1!="1_country"').index
cleaned_places.loc[other_idx, "cleaned_place_info"] = cleaned_places.loc[
    other_idx, "place_info"
].apply(lambda x: p_normalizer.place_initProcess(x, spell_check=False))
ori_places = ori_places.reset_index()
cleaned_places = cleaned_places.drop(["level_1", "level_0"], axis=1)
cleaned_places = ori_places.merge(
    cleaned_places, on="place_info", suffixes=["_ori", ""]
).drop("place_info", axis=1)
cleaned_places = cleaned_places.pivot(
    index="level_0", columns="level_1", values="cleaned_place_info"
).fillna("")
cleaned_places = cleaned_places.rename(
    {"1_country": "country", "3_address": "address", "2_name": "name"}, axis=1
)
cleaned_places = cleaned_places.add_prefix("cleaned_")
cleaned_places = cleaned_places.applymap(
    lambda x: x.replace("Washington District of Columbia", "Washington")
)
###
# Fix S.H. Ervin Gallery...
###
for col in ("cleaned_country", "cleaned_name", "cleaned_address"):
    cleaned_places[col] = cleaned_places[col].apply(
        lambda x: x.replace("SH ", "S.H. ").replace("S H ", "S.H. ")
        if "Ervin" in x
        else x
    )

####
# Get union of place-part tokens
####
cleaned_places["merged_place_parts"] = cleaned_places.apply(
    lambda x: daao_pm.merging_places_toParts(x), axis=1
)
cleaned_places["merged_place_parts_NoStreet"] = cleaned_places[
    "merged_place_parts"
].apply(
    lambda x: set(
        part for part in x if not re.search(" (street|road|Rd)", part.lower())
    )
)
cleaned_places["merged_place_tokens"] = cleaned_places.apply(
    lambda x: set(
        token.strip("(),[].")
        for part in x.merged_place_parts_NoStreet
        for token in part.split()
    ),
    axis=1,
)
# Get the longest place info (presumably the best)???

cleaned_places["longest_place_info"] = cleaned_places.apply(
    p_normalizer.pick_place, axis=1
)

Finally, we get **cleaned_places**

### <a name=ptnl-places-extraction>Potential Places Extraction</a>

In [23]:
# Initial Extraction and searching for countries, states, city and AU suburbs
for set_name, loc_pool in [
    ("cty_set", p_normalizer.cty_set),
    ("state_set", p_normalizer.state_set),
    ("city_set", p_normalizer.city_set),
    ("au_suburb_set", p_normalizer.au_suburb_set),
]:
    #
    cleaned_places[
        [f"extracted_{set_name}", f"extracted_{set_name}_status"]
    ] = cleaned_places.apply(
        lambda x: daao_pm.extractPtnlPlace(
            x.merged_place_parts,
            x.merged_place_parts_NoStreet,
            x.merged_place_tokens,
            loc_pool,
        ),
        result_type="expand",
        axis=1,
    )

# Fix UK relative places
cleaned_places[
    ["extracted_cty_set", "extracted_cty_set_status"]
] = cleaned_places.apply(
    lambda x: (
        (set(x.extracted_cty_set) - {"U", "e", "k", "n", "o", "r", "s", "u", "w"})
        | {"United Kingdom"},
        "Accurate",
    )
    if (set(x.extracted_state_set) & {"Scotland", "England"})
    or (x.extracted_city_set == {"London"})
    else (x.extracted_cty_set, x.extracted_cty_set_status),
    result_type="expand",
    axis=1,
)

# Fix "Australian" for Australia
cleaned_places[
    ["extracted_cty_set", "extracted_cty_set_status"]
] = cleaned_places.apply(
    lambda x: ({"Australia"}, "Accurate")
    if ("Australian" in x.merged_place_tokens)
    or (
        set(
            (
                x.extracted_cty_set_status,
                x.extracted_state_set_status,
                x.extracted_city_set_status,
            )
        )
        == {"Unknown"}
        and x.extracted_au_suburb_set_status == "Accurate"
    )
    else (x.extracted_cty_set, x.extracted_cty_set_status),
    result_type="expand",
    axis=1,
)

### <a name=merge-two-groups-candidates>Merge Extracted-Candidates & Geo-Candidates</a>

In [24]:
extracted_locs = [
    "extracted_cty_set",
    "extracted_cty_set_status",
    "extracted_state_set",
    "extracted_state_set_status",
    "extracted_city_set",
    "extracted_city_set_status",
    "extracted_au_suburb_set",
    "extracted_au_suburb_set_status",
]
ptnlLocs_overall = (
    cleaned_places[extracted_locs]
    .merge(ptnlLoc_fromGeocode, left_index=True, right_index=True)
    .apply(
        lambda x: daao_pm.GetPtnlLoc(
            x["extracted_cty_set"],
            x["extracted_state_set"],
            x["extracted_state_set_status"],
            x["extracted_city_set"],
            x["extracted_city_set_status"],
            x["extracted_au_suburb_set"],
            x["extracted_au_suburb_set_status"],
            x["ptnlLoc_fromGeocode"],
        ),
        result_type="expand",
        axis=1,
    )
    .copy()
)
# Assign column names
ptnlLocs_overall.columns = ["ptnlLoc_fromGeocode"] + extracted_locs
# Fix 'Washington' and 'District of Columnbia'
ptnlLocs_overall[
    ["extracted_state_set", "extracted_state_set_status"]
] = ptnlLocs_overall[["extracted_state_set", "extracted_state_set_status"]].apply(
    lambda x: (x["extracted_state_set"] - set(["Washington"]), "Accurate")
    if "District of Columbia" in x["extracted_state_set"]
    and "Washington" in x["extracted_state_set"]
    else x,
    axis=1,
)

### <a name=loc-comp-infer>Location Components Inferring</a>

#### Country Level

In [25]:
# Infer country
ptnlLocs_overall[
    ["extracted_cty_set_inferred", "extracted_cty_set_status_inferred"]
] = ptnlLocs_overall.apply(
    lambda x: p_normalizer.inferCty(
        x.extracted_cty_set,
        x.extracted_state_set,
        x.extracted_state_set_status,
        x.extracted_city_set,
        x.extracted_city_set_status,
        x.extracted_au_suburb_set,
        x.extracted_au_suburb_set_status,
    )
    if x.extracted_cty_set_status != "Accurate"
    else (x.extracted_cty_set, x.extracted_cty_set_status),
    result_type="expand",
    axis=1,
)

#### State Level

In [26]:
ptnlLocs_overall[
    ["extracted_state_set_inferred", "extracted_state_set_status_inferred"]
] = ptnlLocs_overall.apply(
    lambda x: p_normalizer.inferState(
        x.extracted_cty_set_inferred,
        x.extracted_cty_set_status_inferred,
        x.extracted_state_set,
        x.extracted_state_set_status,
        x.extracted_city_set,
        x.extracted_city_set_status,
        x.extracted_au_suburb_set,
        x.extracted_au_suburb_set_status,
    ),
    result_type="expand",
    axis=1,
)

In [27]:
# Fix United Kingdom
ptnlLocs_overall.extracted_state_set_inferred = ptnlLocs_overall.extracted_state_set_inferred.apply(
    lambda x: x - {"United Kingdom"} if isinstance(x, set) else x
)

#### City Level

In [28]:
ptnlLocs_overall[
    ["extracted_city_set_inferred", "extracted_city_set_status_inferred"]
] = ptnlLocs_overall.apply(
    lambda x: p_normalizer.inferCity(
        x.extracted_cty_set_inferred,
        x.extracted_state_set_inferred,
        x.extracted_state_set_status_inferred,
        x.extracted_city_set,
        x.extracted_city_set_status,
        x.extracted_au_suburb_set,
        x.extracted_au_suburb_set_status,
    ),
    result_type="expand",
    axis=1,
)

#### AU Suburb Level

In [30]:
ptnlLocs_overall[
    ["extracted_au_suburb_set_inferred", "extracted_au_suburb_set_status_inferred"]
] = ptnlLocs_overall.apply(
    lambda x: p_normalizer.inferSuburb(
        x.extracted_cty_set_inferred,
        x.extracted_state_set_inferred,
        x.extracted_state_set_status_inferred,
        x.extracted_city_set_inferred,
        x.extracted_city_set_status_inferred,
        x.extracted_au_suburb_set,
        x.extracted_au_suburb_set_status,
    ),
    result_type="expand",
    axis=1,
)

#### Inferred Attributes Selection

In [38]:
ptnlLocs_overall = ptnlLocs_overall[
    [
        "ptnlLoc_fromGeocode",
        "extracted_cty_set_inferred",
        "extracted_cty_set_status_inferred",
        "extracted_state_set_inferred",
        "extracted_state_set_status_inferred",
        "extracted_city_set_inferred",
        "extracted_city_set_status_inferred",
        "extracted_au_suburb_set_inferred",
        "extracted_au_suburb_set_status_inferred",
    ]
]

### <a name=redundancy-removal>Redundancy Removal</a>

After inferring, information in each level contains entropy from extracted text that exactly exists in original text.

If there's still additional part that can be found in *Coord2Loc* dataframe, that means it's redundant. Just remove it.

In [39]:
for inferred_attr, c2l_attr in [
    ("extracted_state_set_inferred", "centroid.state"),
    ("extracted_city_set_inferred", "centroid.city"),
    ("extracted_au_suburb_set_inferred", "centroid.suburb"),
]:
    ptnlLocs_overall[inferred_attr] = (
        ptnlLocs_overall[[inferred_attr]]
        .merge(places_df_corrd2loc[c2l_attr], left_index=True, right_index=True)
        .apply(
            lambda x: daao_pm.RemoveRedCoord2Loc(x[inferred_attr], x[c2l_attr]), axis=1
        )
    )

## Update Place Collection

In [41]:
venues_df = places_df_corrd2loc.merge(
    cleaned_places, left_index=True, right_index=True
).copy()
venues_df = venues_df.merge(ptnlLocs_overall, left_index=True, right_index=True).copy()
venues_df = venues_df.merge(
    ptnlLocs_validCorrd2Loc, left_index=True, right_index=True, how="left"
).copy()

venues_df["valid_loc2corrd_rsps"] = venues_df["valid_loc2corrd_rsps"].apply(
    lambda x: [] if isinstance(x, float) else x
)
# Unify the Format
for col_name in venues_df.columns:
    if col_name.endswith("_inferred") or col_name.startswith(("merged_", "extracted_")):
        venues_df[col_name] = venues_df[col_name].apply(
            lambda x: [] if isinstance(x, str) else list(x)
        )
    elif col_name.startswith("centroid."):
        venues_df[col_name] = venues_df[col_name].fillna("Unknown")

In [42]:
# set the columns that need to be updated
dbupdate_cols = [
    "_id",
    "centroid.city",
    "centroid.city_district",
    "centroid.country",
    "centroid.country_code",
    "centroid.postcode",
    "centroid.state",
    "centroid.suburb",
    "longest_place_info",
    "extracted_cty_set_inferred",
    "extracted_state_set_inferred",
    "extracted_city_set_inferred",
    "extracted_au_suburb_set_inferred",
    "cleaned_country",
    "cleaned_name",
    "cleaned_address",
    "extracted_cty_set_status_inferred",
    "extracted_state_set_status_inferred",
    "extracted_city_set_status_inferred",
    "extracted_au_suburb_set_status_inferred",
    "valid_loc2corrd_rsps",
    "merged_place_parts",
    "merged_place_parts_NoStreet",
    "merged_place_tokens",
    "extracted_cty_set",
    "extracted_cty_set_status",
    "extracted_state_set",
    "extracted_state_set_status",
    "extracted_city_set",
    "extracted_city_set_status",
    "extracted_au_suburb_set",
    "extracted_au_suburb_set_status",
]
venues_updateDict = venues_df[dbupdate_cols].to_dict("records")

In [43]:
# update the place collection
with tqdm(total=len(venues_updateDict)) as pbar:
    for record in venues_updateDict:
        daao_place.update_one(
            filter={"_id": record.get("_id")},
            update={"$set": {k: v for k, v in record.items() if k != "_id"}},
        )
        pbar.update(1)

100%|██████████████████████████████████████████████████████████████████████████| 16691/16691 [00:14<00:00, 1160.01it/s]


In [45]:
# get all possible remove field in place collection
# just in case we might want to remove them
all_placeFields = daao_manip.daao_getAllFields(
    daao3, "place", filter_dict={}, pbar_desc="place_update"
)

remove_fields = {
    i: 1
    for i in all_placeFields["all_fields"]
    if (
        i in dbupdate_cols
        and ((i != "_id" and "." not in i) or i.startswith("centroid."))
    )
}
remove_fields

place_update: 100%|████████████████████████████████████████████████████████████| 16691/16691 [00:07<00:00, 2141.66it/s]


{'centroid.city': 1,
 'longest_place_info': 1,
 'extracted_city_set_inferred': 1,
 'extracted_au_suburb_set_status_inferred': 1,
 'extracted_cty_set_status_inferred': 1,
 'extracted_au_suburb_set_status': 1,
 'extracted_city_set_status': 1,
 'merged_place_parts_NoStreet': 1,
 'extracted_state_set': 1,
 'extracted_city_set': 1,
 'centroid.country': 1,
 'extracted_cty_set_status': 1,
 'extracted_cty_set_inferred': 1,
 'extracted_city_set_status_inferred': 1,
 'cleaned_country': 1,
 'extracted_au_suburb_set': 1,
 'centroid.suburb': 1,
 'extracted_cty_set': 1,
 'cleaned_address': 1,
 'extracted_au_suburb_set_inferred': 1,
 'centroid.state': 1,
 'merged_place_tokens': 1,
 'extracted_state_set_status': 1,
 'cleaned_name': 1,
 'extracted_state_set_inferred': 1,
 'centroid.city_district': 1,
 'centroid.country_code': 1,
 'extracted_state_set_status_inferred': 1,
 'merged_place_parts': 1,
 'valid_loc2corrd_rsps': 1,
 'centroid.postcode': 1}

## Export as static files (Optional)

In [None]:
export_cols = [
    "_id",
    "centroid.latitude",
    "centroid._types.0",
    "centroid._cls",
    "centroid.longitude",
    "address",
    "name",
    "country",
    "url",
    "centroid.city",
    "centroid.city_district",
    "centroid.country",
    "centroid.country_code",
    "centroid.postcode",
    "centroid.state",
    "centroid.suburb",
    "longest_place_info",
    "extracted_cty_set_inferred",
    "extracted_state_set_inferred",
    "extracted_city_set_inferred",
    "extracted_au_suburb_set_inferred",
    "cleaned_country",
    "cleaned_name",
    "cleaned_address",
    "extracted_cty_set_status_inferred",
    "extracted_state_set_status_inferred",
    "extracted_city_set_status_inferred",
    "extracted_au_suburb_set_status_inferred",
    "valid_loc2corrd_rsps",
    "merged_place_parts",
    "merged_place_parts_NoStreet",
    "merged_place_tokens",
    "extracted_cty_set",
    "extracted_cty_set_status",
    "extracted_state_set",
    "extracted_state_set_status",
    "extracted_city_set",
    "extracted_city_set_status",
    "extracted_au_suburb_set",
    "extracted_au_suburb_set_status",
]
venues_dict = venues_df[export_cols].to_dict("records")

In [80]:
data_folder = (
    "D:/Program_Data/Australia_Cultural_Data_Engine_Data/design_art_australia_online"
)
js_folder = os.path.join(data_folder, "exported_json")
csv_folder = os.path.join(data_folder, "exported_csv")
curr_ts = gen_gf.time_now()
with open(
    os.path.join(js_folder, f"daao_venue_level_{gen_gf.time_now()}.json"),
    encoding="utf-8",
    mode="w",
) as f_w:
    json.dump(json.loads(json_util.dumps(venues_dict)), f_w, indent=2)

venues_df.to_csv(
    os.path.join(csv_folder, f"daao_venue_level_{gen_gf.time_now()}.csv"), index=False
)