In [1]:
# Download NECHR: clone the cbdb-project named-entities repository into the
# working directory. A later cell reads cbdb_entity_address_types.csv from it.
!git clone https://github.com/cbdb-project/named-entities-for-premodern-chinese-history-research.git

Cloning into 'named-entities-for-premodern-chinese-history-research'...
remote: Enumerating objects: 354, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 354 (delta 95), reused 79 (delta 39), pack-reused 170[K
Receiving objects: 100% (354/354), 18.45 MiB | 4.38 MiB/s, done.
Resolving deltas: 100% (174/174), done.


In [4]:
import requests
import os

# Fetch the latest CBDB SQLite archive and store it as ./cbdb_sqlite/latest.7z.
url = "https://huggingface.co/datasets/cbdb/cbdb-sqlite/resolve/main/latest.7z?download=true"
os.makedirs("./cbdb_sqlite", exist_ok=True)
# Stream the body to disk in chunks instead of holding the whole archive in
# memory; the timeout is (connect seconds, read seconds).
with requests.get(url, stream=True, timeout=(10, 300)) as r:
    # Fail loudly on an HTTP error rather than saving an error page as a .7z file.
    r.raise_for_status()
    with open("./cbdb_sqlite/latest.7z", "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

In [5]:
import os

# Path of the downloaded archive; later cells extract it and open the database.
latest_file = "./cbdb_sqlite/latest.7z"
# Name of the database file expected after extraction: the archive's base name
# up to its first dot, plus ".db". Using basename keeps this independent of how
# many directories the path contains.
latest_db = os.path.basename(latest_file).split(".")[0] + ".db"
print(latest_db)
# Delete a copy left in the working directory by an earlier run
# (presumably so the extraction below does not collide with it).
if os.path.isfile(latest_db):
    os.remove(latest_db)

latest.db


In [10]:
# Explode latest database file
!pip install pyunpack
!pip install patool
from pyunpack import Archive
Archive(latest_file).extractall(".")



In [11]:
# Create connection to database
import sqlite3
import pandas as pd

# The database file name is the archive's base name up to its first dot, plus
# ".db", resolved relative to the working directory.
database_file = os.path.basename(latest_file).split(".")[0]+".db"
# CONN is a module-level name used by the query cells and closed in the last
# cell; no `global` statement is needed at the top level of a cell.
CONN = sqlite3.connect(database_file)

In [12]:
# Distinct, non-null primary Chinese names from ADDR_CODES.
addr_df_c_name_chn = (
    pd.read_sql_query("SELECT c_name_chn FROM ADDR_CODES", CONN)
    .drop_duplicates()
    .dropna()
)
# Distinct, non-null alternative-name strings from ADDR_CODES.
addr_df_c_alt_names = (
    pd.read_sql_query("SELECT c_alt_names FROM ADDR_CODES", CONN)
    .drop_duplicates()
    .dropna()
)
# Put both columns side by side, stack them into one series, renumber it, and
# keep each distinct value once in a single-column frame named addr_chn.
addr_df = (
    pd.concat([addr_df_c_name_chn, addr_df_c_alt_names], axis=1)
    .stack()
    .reset_index(drop=True)
    .drop_duplicates()
    .dropna()
    .to_frame(name='addr_chn')
)
addr_df

Unnamed: 0,addr_chn
0,[信息缺乏]
1,[未詳]
2,
3,中華人民共和國
4,北京省市
...,...
13381,濠梁衛
13382,平越府
13383,平越軍民府
13384,尋甸軍民府


In [13]:
# Load the address-type names from the CSV in the cloned NECHR repository.
address_type_df = pd.read_csv('./named-entities-for-premodern-chinese-history-research/cbdb_entity_address_types.csv')
# Take the 'name' column and add two extra suffixes that are not in the CSV.
address_type_list = [*address_type_df['name'].to_list(), "等", "等處"]
# Order longest first: the suffix stripper stops at the first entry that
# matches, so longer types must be tried before shorter ones. Sorting ascending
# and then reversing keeps the original relative order of equal-length entries.
address_type_list.sort(key=len)
address_type_list = address_type_list[::-1]
print(len(address_type_list))

139


In [14]:
def recursively_remove_addr_type_from_text(res, text, address_type_list):
  """Collect `text` and every shorter form left after stripping trailing address types.

  Appends `text` to `res`, removes the first entry of `address_type_list`
  that `text` ends with, and repeats on the remainder until nothing matches
  or the remainder is empty. An empty remainder is never appended.

  Args:
    res: list that receives the results; it is modified in place.
    text: the address name to process; a falsy value appends nothing.
    address_type_list: suffixes to try, in priority order (the first match
      wins, so callers that want longest-match should pass longest first).

  Returns:
    None. Results are delivered through `res`.
  """
  # Iterative form of the original tail recursion: same results, no growth of
  # the call stack.
  while text:
    res.append(text)
    # Skip empty entries: every string ends with "", and stripping it would
    # never shorten the text, so the loop would not terminate.
    suffix = next(
      (word for word in address_type_list if word and text.endswith(word)),
      None,
    )
    if suffix is None:
      return
    text = text[:len(text) - len(suffix)]

def remove_addr_type_from_text(text, address_type_list):
  """Return the list built by recursively_remove_addr_type_from_text for `text`.

  A fresh list is handed to the helper, which fills it using `text` and the
  suffixes in `address_type_list`; that list is then returned.
  """
  variants = []
  recursively_remove_addr_type_from_text(variants, text, address_type_list)
  return variants

In [15]:
# Smoke-test remove_addr_type_from_text on a name that ends in several stacked
# address types. The variable is not called `input`, which would shadow the
# builtin for the rest of the session.
sample_text = "沿邊第一區備禦後千戶所左衛"
res = remove_addr_type_from_text(sample_text, address_type_list)
print(res)

['沿邊第一區備禦後千戶所左衛', '沿邊第一區備禦後千戶所', '沿邊第一區']


In [16]:
def names_massage(df, column):
  """Clean the names in `df[column]` and expand them into suffix-stripped variants.

  Steps, in order:
    1. split each value on ';' into one row per piece;
    2. drop rows containing '、';
    3. remove all whitespace;
    4. drop single-character names;
    5. drop names containing ASCII letters or any bracket character;
    6. replace each name by itself plus the shorter forms returned by
       remove_addr_type_from_text (using the module-level address_type_list),
       one row per form;
    7. keep only names that are 2 to 13 characters long.
  Duplicate rows and rows with missing values are dropped after each explode.

  Args:
    df: DataFrame whose `column` holds strings.
    column: name of the column to clean.

  Returns:
    A new DataFrame. Rows keep the index label of the source row they came
    from, so labels repeat after exploding. The input frame is not modified.
  """
  # Work on a copy. Assigning into the caller's frame would turn its strings
  # into lists, so calling this function a second time on the same frame would
  # fail on x.split.
  df = df.copy()
  # split line by ;
  df[column] = df[column].apply(lambda x: x.split(';'))
  df = df.explode(column).drop_duplicates().dropna()
  # remove row with 、 (copy so the assignment below targets an independent frame)
  df = df[~df[column].str.contains('、')].copy()
  # replace spaces with blank
  df[column] = df[column].str.replace(r"\s", "", regex=True)
  # remove single-character row
  df = df[df[column].str.len() > 1]
  # remove the addresses which contain any English letters or any sort of brackets
  df = df[~df[column].str.contains(r'[a-zA-Z]')]
  # Raw-string character class: avoids the invalid "\(" style escapes that a
  # plain string literal would contain.
  df = df[~df[column].str.contains(r'[()\[\]{}（）「」【】]')].copy()
  # remove address type recursively
  df[column] = df[column].apply(lambda x: remove_addr_type_from_text(x, address_type_list))
  df = df.explode(column).drop_duplicates().dropna()
  # remove single-character rows and rows that are too long (14 characters or more)
  df = df[df[column].str.len().between(2, 13)]
  return df

In [17]:
# Run the cleaning and suffix-stripping pipeline on the addr_chn column.
filtered_addr_df = names_massage(addr_df, 'addr_chn')

In [18]:
# Display the cleaned frame; index labels repeat where one name produced several variants.
filtered_addr_df

Unnamed: 0,addr_chn
3,中華人民共和國
4,北京省市
4,北京省
4,北京
5,天津省市
...,...
13383,平越軍民府
13383,平越軍民
13384,尋甸軍民府
13384,尋甸軍民


In [19]:
# Sort by descending order of the length: the key negates each name's length,
# so the default ascending sort puts the longest names first.
# NOTE(review): no sort `kind` is passed, so the relative order of names with
# equal length is presumably not guaranteed to follow the input order - confirm.
final_df = filtered_addr_df.sort_values(by="addr_chn", key=lambda x: -x.str.len())
final_df

Unnamed: 0,addr_chn
9772,松潘宕疊威茂州等處軍民安撫
12471,鎮南管內節度使暨鎮南都護府
2663,鎮沅彝族哈尼族拉祜族自治縣
9770,階文扶州等處番漢軍上千戶所
12470,鎮南管內經略使暨鎮南都護府
...,...
3060,陝州
3061,湖城
3062,閿鄉
6620,零祿


In [20]:
# Write the sorted names to cbdb_entity_addresses.csv in the working directory;
# index=False leaves the index labels out of the file.
final_df.to_csv('cbdb_entity_addresses.csv', index=False)

In [21]:
# Close the SQLite connection opened earlier as CONN.
CONN.close()