In [1]:
# connect Google drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip3 install -U geopy 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 13.2 MB/s 
Installing collected packages: geopy
  Attempting uninstall: geopy
    Found existing installation: geopy 1.17.0
    Uninstalling geopy-1.17.0:
      Successfully uninstalled geopy-1.17.0
Successfully installed geopy-2.3.0


In [3]:
# create working dir
!mkdir ./data

In [54]:
# pd data pre load
import pandas as pd
import os
import numpy as np

class DatasetLoader():
	def __init__(self, filepath, filetype, sheet_name=None, usecols=None, output_dir="./data", drop_subset_duplicates=None):
		data = None
		if not os.path.exists(filepath):
			raise ValueError('file does not exist!')
		if filetype == "xlsx":
			try:
				data = pd.read_excel(filepath, sheet_name=sheet_name, usecols=usecols)
			except RuntimeError as e:
				print(e.args)
		elif filetype == "csv":
			data = pd.read_csv(filepath)
		if data.empty:
			raise RuntimeError('data empty!')
		if drop_subset_duplicates:
			data = data.drop_duplicates(subset=drop_subset_duplicates, keep="first")
		if not os.path.exists(output_dir):
			os.mkdir(output_dir)
		data.to_csv(os.path.join(output_dir, "data.csv"), index=None)

	def get_dataset(self, file_path="./data/data.csv", objtype="df", split=0.8, filter_subset=None):
		if not os.path.exists(file_path):
			raise ValueError('file does not exist!')
		data = None
		if objtype == "df":
			data = pd.read_csv(file_path)
			if filter_subset:
				data = data.dropna(subset=filter_subset)
		elif objtype == "ds":
			file_format = os.path.splitext(file_path)[-1]
			if file_format in ["csv", "json"]:
				data = data.load_dataset("csv", data_files=file_path)
			else:
				raise ValueError('file format not supported')
			if filter_subset:
				data = data.filter(lambda x: x[f] for f in filter_subset)
		else:
			# return list
			pass
		return data


In [55]:
file_path = "/content/drive/MyDrive/data/RM_Wyoming_georef_training_2022-07-11.xlsx"
data_dir = "/content/data"

obj_df = DatasetLoader(
    filepath = file_path,
    filetype = "xlsx",
    sheet_name = "RM_Woming_georef_training",
    usecols = [0, 16, 17, 18, 19, 23, 26, 27],
    output_dir = data_dir,
    drop_subset_duplicates = ['Country','State', 'County', 'Locality', 'GeorefLatitude', 'GeorefLongitude']
)

data_df = obj_df.get_dataset(file_path = os.path.join(data_dir, "data.csv"), objtype = "df", split = 0.8, filter_subset = ["Locality", 'GeorefLatitude', 'GeorefLongitude'])
# print(data_df)

In [None]:
!pip3 install -U spacy
!python3 -m spacy download en_core_web_trf

In [56]:
# NER
import spacy
from geopy.geocoders import Nominatim

# init spacy
trf = spacy.load("en_core_web_trf")

# test
# data_df = data_df[0:500]
print(data_df)

# collect places name
def collect_places(locality):
  res = []
  docs = trf(locality)
  label_ls = ('GEO','GPE','ORG','FAC','LOC')
  for ent in docs.ents:
    if ent.label_ in label_ls:
      res.append(ent.text)
  return "|".join(res)

spacy_df = data_df[["ID", "Locality", "Country", "State", "County", "GeorefLatitude", "GeorefLongitude"]].copy()
spacy_df = pd.concat([spacy_df, pd.DataFrame(columns=["SpacyNER", "prediction_mean", "prediction_smallest"])], sort=False)
print(spacy_df)


          ID Country    State      County  \
0     129957  U.S.A.  Wyoming        Park   
1     199041  U.S.A.  Wyoming  Sweetwater   
2     199207  U.S.A.  Wyoming      Carbon   
3     199643  U.S.A.  Wyoming       Crook   
4     199816  U.S.A.  Wyoming    Big Horn   
...      ...     ...      ...         ...   
1548  921265  U.S.A.  Wyoming    Sublette   
1549  922154  U.S.A.  Wyoming       Teton   
1550  922900  U.S.A.  Wyoming     Fremont   
1551  925313  U.S.A.  Wyoming      Carbon   
1552  925387  U.S.A.  Wyoming     Fremont   

                                               Locality  \
0     Yellowstone Plateau: Yellowstone National Park...   
1     Washakie Basin: steeply tilted sandstone ridge...   
2     Medicine Bow Range: isolated fen complex betwe...   
3     Sand Creek crossing, just upstream, west side ...   
4                                       Horseshoe Bend.   
...                                                 ...   
1548             Wyoming National Forest: Snid

In [57]:
iso3166_file = "/content/drive/MyDrive/data/ISO3166-1_countryCode.csv"
country_codes_df = pd.read_csv(iso3166_file)
# print(country_codes_df)

In [62]:
# geopy

# get coordinates
geolocator = Nominatim(user_agent="test_geo")

# country code
# code_idx = country_codes_df[(country_codes_df["country name"]=="U.S.A.")].index
# print(code_idx)

def get_country_code(country_name):
  code = None
  code_idx = country_codes_df[(country_codes_df["country name"]==country_name)].index.tolist()
  if len(code_idx)>0:
    # print(code_idx)
    # code = country_codes_df.iloc[code_idx[0], 0]
    code = country_codes_df.loc[code_idx[0], "country code"]
    # print(code)
  else:
    pass
  return code

# def get_coordinates(places, countryCode, state, county):
#   granularity_level = "state"
#   corrds_res, smallest_res = [], []
#   county_, state_ = None, None
#   locs = places.split("|")
#   for loc in locs:
#     locations = geolocator.geocode(loc, exactly_one=False, country_codes=countryCode, language="english", namedetails=True, addressdetails=True)
#     print(loc, locations)
#     if locations:
#       for location in locations:
#         # print(location.raw)
#         print(location.raw['address'], location.latitude, location.longitude)
#         if location.latitude and location.longitude:
#           state_ = location.raw['address']["state"]
#           country_code_ = location.raw['address']["country_code"]
#           if country_code == countryCode and state_ == state:
#             if "county" in location.raw['address']:
#               county_ = country_code_ = location.raw['address']["county"]
#               if county_ == county:
#                 if granularity_level == "state":
#                   granularity_level = "county"
#                   smallest_res = [[location.latitude, location.longitude]]
#                 else:
#                   smallest_res.append([location.latitude, location.longitude])
#             else:
#               if granularity_level == "state":
#                 smallest_res.append([location.latitude, location.longitude])
#               else:
#                 pass
#             corrds_res.append([location.latitude, location.longitude])
#        # print(f"placeName: {loc} location: {location} latitude: {location.latitude} longitude: {location.longitude}")
#     return np.array(corrds_res, dtype=np.float32), np.array(smallest_res, dtype=np.float32)

def get_coordinates(places, countryCode, state, county):
  def process_loc(loc, state, county):
    addr = loc.raw['address']
    remove_ls = ["ISO3166-2-lvl4", "country", "postcode", "country_code"]
    res = None
    for i in remove_ls:
      addr.pop(i, "Not have the key")
    if "county" in addr and county == addr["county"]:
      res = loc
    else:
      if "state" in addr and state == addr["state"]:
        res = loc
    return res
  
  corrds_res, smallest_res = [], []
  locs = places.split("|")
  for loc in locs:
    locations = geolocator.geocode(loc, exactly_one=False, country_codes=countryCode, language="english", namedetails=True, addressdetails=True, timeout=2)
    # print(loc, locations)
    if locations:
      for location in locations:
        # print(location.raw)
        # print(location.raw['address'], location.latitude, location.longitude)
        if location.latitude and location.longitude:
          # print(f"placeName: {loc} location: {location} latitude: {location.latitude} longitude: {location.longitude}")
          corrds_res.append([location.latitude, location.longitude])
          res = process_loc(location, state, county)
          if res:
            smallest_res.append([location, len(res)])
    return np.array(corrds_res, dtype=np.float32), smallest_res


In [59]:
# mean coordinates
def compute_mean(locs):
  return np.mean(locs, axis=0)

def compute_smallest(locs):
  smallest = None
  res = [None, 0]
  for loc in locs:
    if loc[1] > res[1]:
      res = loc
  return np.array([res[0].latitude, res[0].longitude])

In [None]:
import time

for index, row in spacy_df.iterrows():
  places = collect_places(row["Locality"])
  # # print(index, row)
  if places:
    spacy_df.loc[index, "SpacyNER"] = places
    country_code = get_country_code(row["Country"])
    # geo_res = get_coordinates(places=places, country_codes=row["Country"], county=row["County"], state=row["State"])
    corrds_res, smallest_res = get_coordinates(places=places, countryCode=country_code, state=row["State"], county=row["County"])
    if len(corrds_res) > 0:
      spacy_df.loc[index, "prediction_mean"] = np.array2string(compute_mean(corrds_res), separator=",")[1:-1]
    if len(smallest_res) > 0:
      spacy_df.loc[index, "prediction_smallest"] = np.array2string(compute_smallest(smallest_res), separator=",")[1:-1]
    print(index, "done!")
    time.sleep(1.5)

spacy_df.to_csv(os.path.join("./data/", "result.csv"), index=None)

In [96]:
# evaluate
result_file = "./data/result.csv"
result_df = pd.read_csv(result_file)
# print(result_df)

# offset for evaluate
offset = 2

# compute accuracy
empty_counts = 0
accuracy_counts = 0
total = len(result_df)

def compute_accuracy(data, pred, offset_=0.0):
  pred = pred.split(",")
  lat_val, lon_val = False, False
  lat_range = [data[0]-offset_, data[0]+offset_]
  lon_range = [data[1]-offset_, data[1]+offset_]
  pred = np.array(pred, dtype=np.float32)
  if pred[0] > lat_range[0] and pred[0] < lat_range[1]: lat_val = True
  if pred[1] > lon_range[0] and pred[1] < lon_range[1]: lon_range = True
  return lat_val and lon_range

# priority smallest, if empty, get mean
for index, row in result_df.iterrows():
  res_mean = row["prediction_mean"]
  res_smallest = row["prediction_smallest"]
  if not pd.isna(res_smallest):
    if compute_accuracy(data=[row["GeorefLatitude"], row["GeorefLongitude"]], pred=res_smallest, offset_=offset):
      accuracy_counts += 1
  elif not pd.isna(res_mean):
    if compute_accuracy(data=[row["GeorefLatitude"], row["GeorefLongitude"]], pred=res_mean, offset_=offset):
      accuracy_counts += 1
  else:
    empty_counts += 1

# empty rate
print("pred empty: %0.2f%%" % (empty_counts/total*100))

# accuracy rate
print("accuracy: %0.2f%%" % (accuracy_counts/total*100))

# accuracy rate(without empty)
# print(accuracy_counts/(total-empty_counts))
print("accuracy without empty values: %0.2f%%" % (accuracy_counts/(total-empty_counts)*100))



pred empty: 21.98%
accuracy: 34.14%
accuracy without empty values: 43.75%


In [97]:
!cp ./data/result.csv /content/drive/MyDrive/data/