### Script purpose.

This script generates user dictionaries for Python package _jieba_.
Part-of-speech tag is using file `POS_tag.md`

Cleaning data fields.
- Strip whitespaces.
- Skip null values, e.g., "[未詳]", str(s)!="None".
- Normalize text from full-width to half-width.
- Split string containing multiple values (use of reexp intead of split).

TODO:

- [X] - Wrap the data cleaning procedures into function.
- [] - Unify the data cleaning in all sections.
- [] - Test on 全宋文墓誌铭.

In [1]:
import sqlite3
import pandas as pd
import re
import itertools as it
import unicodedata

In [77]:
# Init with sqlite file.
sqlite_file='../data_raw/sqlite_20180302.db'
conn = sqlite3.connect(sqlite_file)

# value_clean(string, sep_char=',')
def value_clean(string, sep_char=','):
    if str(string)!="[未詳]" and str(string)!='' and str(string)!='None' and str(string)!='nan':
        string=''.join(string.split())
        string=unicodedata.normalize('NFKC', string)
        return re.findall(r"[\w]+", string)
    else:
        return 'NA'

### People Names.

In [71]:
# Read tables.
df_biog_main=pd.read_sql_query("SELECT * FROM BIOG_MAIN", conn)
df_altname_data=pd.read_sql_query("SELECT * FROM ALTNAME_DATA", conn)

print(len(set(df_altname_data['c_alt_name_chn'])), 
      len(set(df_biog_main['c_name_chn'])), 
      len(set(df_altname_data['c_alt_name_chn']))+len(set(df_biog_main['c_name_chn'])))

71614 321207 392821


In [26]:
# Merge ang tag the dict.
df_name=pd.DataFrame(pd.Series(list(set(df_biog_main['c_name_chn']))), columns=['name'])
df_name['pos']='nr'
df_name['name']=[re.sub(r'\(.+', '', s) for s in df_name['name']]

# Save to file.
df_name.to_csv('../data_build/cn_traditional_jieba/nr_nra_dict_CBDB.txt', 
               sep=' ', index=False, header=False)

### Address.

In [45]:
# Read tables.
# Merge data.
addresses_set=set(''.join(s.split()) for s in pd.read_sql_query("SELECT * FROM ADDRESSES", conn)['c_name_chn'] if str(s)!='None' and str(s)!='[未詳]' and str(s)!='')
place_codes_set=set(''.join(s.split()) for s in pd.read_sql_query("SELECT * FROM PLACE_CODES", conn)['c_name_chn'] if str(s)!='None' and str(s)!='[未詳]' and str(s)!='')
ns_set=place_codes_set.union(addresses_set)
print(len(addresses_set), len(place_codes_set), len(ns_set))

9773 2491 9793


In [47]:
pd.DataFrame([[s, 'ns'] for s in ns_set], 
             columns=['name', 'pos']).to_csv('../data_build/cn_traditional_jieba/ns_dict_CBDB.txt', 
                                             sep=' ', index=False, header=False)

#### Song addresses.

In [136]:
df_song_addr=pd.read_sql_query("SELECT * FROM ADDRESSES", conn)
df_song_addr=df[(df.belongs1_Name=='宋朝')|(df.belongs2_Name=='宋朝')|(df.belongs3_Name=='宋朝')|(df.belongs4_Name=='宋朝')|(df.belongs5_Name=='宋朝')]
df_song_addr['pos']='ns'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [156]:
df_song_addr=df_song_addr[['c_name_chn', 'pos']].drop_duplicates() # Drop duplicates.
# Handle order effect while matching. See notes in qsw_sent_comp script.
df_song_addr.index=df_song_addr['c_name_chn'].str.len()
df_song_addr=df_song_addr.sort_index(ascending=False).reset_index(drop=True)

In [157]:
df_song_addr.to_csv('../data_build/cn_traditional_jieba/song_ns_dict.csv', sep='\t', index=False, header=False)

### Office Title.

In [158]:
# Read tables.
df_office_codes=pd.read_sql_query("SELECT * FROM OFFICE_CODES", conn)
df_office_categories=pd.read_sql_query("SELECT * FROM OFFICE_CATEGORIES", conn)

In [114]:
# Clean data.
office_set=set([s for s in df_office_codes['c_office_chn']])
office_alt_list=[]
for office_alt in df_office_codes['c_office_chn_alt']:
    if office_alt!=None and str(office_alt)!='':
        office_alt_list+=office_alt.split(';')
office_set=office_set.union(set([s for s in office_alt_list if s!='']))
office_cata_set=set(s for s in df_office_categories['c_category_desc_chn'])

In [115]:
# Merge ang tag the dict.
df_no_noc=pd.DataFrame([[s, 'no'] for s in office_set]+[[s, 'noc'] for s in office_cata_set],
                      columns=['name', 'pos'])
# Save to file.
df_no_noc.to_csv('../data_build/cn_traditional_jieba/no_noc_dict_CBDB.txt', 
                  sep=' ', index=False, header=False)

#### Song office titles.

In [160]:
df_office_codes_song=df_office_codes[df_office_codes['c_dy']==15]
df_office_codes_song.sample(3)

Unnamed: 0,tts_sysno,c_office_id,c_dy,c_office_pinyin,c_office_chn,c_office_pinyin_alt,c_office_chn_alt,c_office_trans,c_office_trans_alt,c_source,c_pages,c_notes,c_category_1,c_category_2,c_category_3,c_category_4,c_office_id_old
4145,4146.0,4703,15,tai fu si di er an,太府寺第二案,,,,,,,,機構,太府寺門,,,0.0
5451,5452.0,6010,15,tai shang huang hou,太上皇后,,,,,,,,,,,,0.0
4379,4380.0,4937,15,xi jing liu shou si yu shi tai,西京留守司御史臺,liu tai;xi jing liu tai;xi du liu tai;xi jing ...,留臺;西京留臺;西都留臺;西京留司御史臺;外臺,,,,,,機構,御史臺門,,,0.0


In [181]:
song_no_set=set(it.chain(*df_office_codes_song['c_office_chn'].apply(value_clean).values)).union(set(it.chain(*df_office_codes_song['c_office_chn_alt'].apply(value_clean).values)))

In [188]:
df_song_no=pd.DataFrame(sorted([(s, len(s), 'no') for s in song_no_set if s!='NA'], key=lambda x:x[1], reverse=True),
                        columns=['name', 'len', 'pos']
                       ).drop('len', axis=1)

In [189]:
df_song_no.to_csv('../data_build/cn_traditional_jieba/song_no_dict.csv', 
                  sep='\t', index=False, header=False)

### Time.

In [24]:
# Read tables.
#df_dyna=pd.read_sql_query("SELECT * FROM DYNASTIES", conn)
df_reign=pd.read_sql_query("SELECT * FROM NIAN_HAO", conn)

In [25]:
# Get data.
dynasty_set=set(df_reign['c_dynasty_chn'])
reign_set=set(df_reign['c_nianhao_chn'])

In [26]:
# Merge ang tag the dict.
df_td_tr=pd.DataFrame([[s, 'td'] for s in dynasty_set]+[[s, 'tr'] for s in reign_set],
                      columns=['name', 'pos'])
# Save to file.
df_td_tr.to_csv('../data_build/cn_traditional_jieba/td_tr_dict_CBDB.txt', 
                  sep=' ', index=False, header=False)

### Kinship title.

In [45]:
# Read tables.
df_kin=pd.read_sql_query("SELECT * FROM KINSHIP_CODES", conn)

In [46]:
# Get and clean data.
kin_list=[''.join(s.split()) for s in set(it.chain(*[s.split(';') for s in df_kin['c_kinrel_chn'] if s!=None]))]

In [47]:
# Merge ang tag the dict.
df_nk=pd.DataFrame([(s, 'nk') for s in kin_list], columns=['name', 'pos'])

In [49]:
# Save to file.
df_nk.to_csv('../data_build/cn_traditional_jieba/nk_dict_CBDB.txt', 
             sep=' ', index=False, header=False)

### Institution names.

In [82]:
# Read tables.
inst_nm_set=set(it.chain(*[value_clean(s) for s in 
                           pd.read_sql_query("SELECT * FROM SOCIAL_INSTITUTION_NAME_CODES", conn)['c_inst_name_hz']]))

In [83]:
# Clean data and save to dict file.
pd.DataFrame([[s, 'nt'] for s in inst_nm_set], 
             columns=['name', 'pos']).to_csv('../data_build/cn_traditional_jieba/nt_dict_CBDB.txt',
                                             sep=' ', index=False, header=False)

### TESTING AREA.

In [40]:
pd.read_sql_query("SELECT * FROM FOREIGNKEYS", conn)

Unnamed: 0,AccessTblNm,AccessFldNm,ForeignKey,ForeignKeyBaseField,FKString,FKName,skip,IndexOnField,DataFormat,NULL_allowed
0,ADDR_BELONGS_DATA,c_addr_id,ADDR_CODES,c_addr_id,c_addr_id,ADDR_BELONGS_ADDR_ID,1,Primary Key,Long,0
1,ADDR_BELONGS_DATA,c_belongs_to,ADDR_CODES,c_addr_id,c_addr_id,ADDR_BELONGS_BELONG_ID,0,Primary Key,Long,0
2,ADDR_BELONGS_DATA,c_source,TEXT_CODES,c_textid,c_textid,ADDR_BELONGS_TEXT_CODE,0,,Long,1
3,altname_data,c_alt_name_type_code,ALTNAME_CODES,c_name_type_code,c_name_type_code,ALTNAME_DATA_ALTNAME_CODES,1,Primary Key,Integer,0
4,altname_data,c_personid,BIOG_MAIN,c_personid,c_personid,ALTNAME_DATA_PERSON_ID,1,Primary Key,Long,0
5,altname_data,c_source,TEXT_CODES,c_textid,c_textid,ALTNAME_DATA_TEXT_CODE,1,,Long,1
6,ASSOC_CODE_TYPE_REL,c_assoc_code,ASSOC_CODES,c_assoc_code,c_assoc_code,ASSOC_CODE_TYPE_REL_ASSOC_CODE,0,Primary Key,Integer,0
7,ASSOC_CODE_TYPE_REL,c_assoc_type_id,ASSOC_TYPES,c_assoc_type_id,c_assoc_type_id,ASSOC_CODE_TYPE_REL_ASSOC_TYPE,0,Primary Key,Text,0
8,ASSOC_CODES,c_assoc_pair2,ASSOC_CODES,c_assoc_code,c_assoc_code,ASSOC_CODES_ASSOC_PAIR,0,,Integer,1
9,assoc_data,c_addr_id,ADDR_CODES,c_addr_id,c_addr_id,ASSOC_DATA_ADDR_ID,1,,Long,1
