In [1]:
from tqdm import tqdm
import pandas as pd
import math
import re
import warnings
warnings.filterwarnings('ignore')

### Separate ADM and FUNC.

In [2]:
# Load the C_OT_ADM sheet from the published workbook and discard the id column
# stored in the sheet; a fresh id is derived from the row position below.
df_c_ot_adm_raw = pd.read_excel(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vQwXjRmlMR9w2ZV2tcenPSz9UgE7WAgeumGxxCJlceQOZRQFgm6_mgMCAlC_GzM0yxxNsDOlU1-5aH-/pub?output=xlsx',
    # `sheet_name` replaces the `sheetname` spelling, which pandas deprecated
    # in 0.21 and later removed (the old keyword raises TypeError there).
    sheet_name='C_OT_ADM',
).drop('c_ot_adm_id', axis=1)
# Length of the stringified Chinese name. Cells are passed through str(), so a
# missing (NaN) name is measured as len('nan') == 3 rather than raising.
df_c_ot_adm_raw['length'] = [len(str(s)) for s in df_c_ot_adm_raw['c_ot_adm_chinm']]
# Longest names first.
df_c_ot_adm_raw.sort_values(by='length', ascending=False, inplace=True)
# Sorting keeps the original row labels; naming the index and resetting it
# turns those labels into the new `c_ot_adm_id` column.
df_c_ot_adm_raw.index.name = 'c_ot_adm_id'
df_c_ot_adm_raw.reset_index(inplace=True)

In [3]:
# Rows whose `type` is 'f' or 'b' are written out as the C_OT_FUNC table, with
# every `c_ot_adm_*` column renamed to its `c_ot_func_*` counterpart.
(
    df_c_ot_adm_raw
    .loc[lambda rows: (rows['type'] == 'f') | (rows['type'] == 'b')]
    .rename(columns={
        'c_ot_adm_id': 'c_ot_func_id',
        'c_ot_adm_chinm': 'c_ot_func_chinm',
        'c_ot_adm_engnm': 'c_ot_func_engnm',
        'c_ot_adm_desc': 'c_ot_func_desc',
        'c_ot_adm_start': 'c_ot_func_start',
        'c_ot_adm_end': 'c_ot_func_end',
    })
    .to_csv('../data_output/C_OT_FUNC.tsv', encoding='utf8', sep='\t', index=False)
)

In [4]:
# Rows whose `type` is 'o' are written out, columns unchanged, as C_OT_ADM.
(
    df_c_ot_adm_raw
    .loc[lambda rows: rows['type'] == 'o']
    .to_csv('../data_output/C_OT_ADM.tsv', encoding='utf8', sep='\t', index=False)
)

### Refined C_OT_TIT.

In [5]:
# Load the C_OT_TIT sheet from the published workbook and discard the id column
# stored in the sheet; a fresh id is derived from the row position below.
df_c_ot_tit_raw = pd.read_excel(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vQwXjRmlMR9w2ZV2tcenPSz9UgE7WAgeumGxxCJlceQOZRQFgm6_mgMCAlC_GzM0yxxNsDOlU1-5aH-/pub?output=xlsx',
    # `sheet_name` replaces the `sheetname` spelling, which pandas deprecated
    # in 0.21 and later removed (the old keyword raises TypeError there).
    sheet_name='C_OT_TIT',
).drop('c_ot_tit_id', axis=1)
# Length of the stringified Chinese name. Cells are passed through str(), so a
# missing (NaN) name is measured as len('nan') == 3 rather than raising.
df_c_ot_tit_raw['length'] = [len(str(s)) for s in df_c_ot_tit_raw['c_ot_tit_chinm']]
# Longest names first.
df_c_ot_tit_raw.sort_values(by='length', ascending=False, inplace=True)
# Sorting keeps the original row labels; naming the index and resetting it
# turns those labels into the new `c_ot_tit_id` column.
df_c_ot_tit_raw.index.name = 'c_ot_tit_id'
df_c_ot_tit_raw.reset_index(inplace=True)

In [6]:
# Only rows flagged with `value_to_run` equal to 1 or 2 are exported.
(
    df_c_ot_tit_raw
    .loc[lambda rows: (rows['value_to_run'] == 1) | (rows['value_to_run'] == 2)]
    .to_csv('../data_output/C_OT_TIT.tsv', encoding='utf8', sep='\t', index=False)
)

### Refined C_OT_CLS

In [8]:
# Load the C_OT_CLS sheet from the published workbook and discard the id column
# stored in the sheet; a fresh id is derived from the row position below.
df_c_ot_cls_raw = pd.read_excel(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vQwXjRmlMR9w2ZV2tcenPSz9UgE7WAgeumGxxCJlceQOZRQFgm6_mgMCAlC_GzM0yxxNsDOlU1-5aH-/pub?output=xlsx',
    # `sheet_name` replaces the `sheetname` spelling, which pandas deprecated
    # in 0.21 and later removed (the old keyword raises TypeError there).
    sheet_name='C_OT_CLS',
).drop('c_ot_cls_id', axis=1)
# Length of the stringified Chinese name. Cells are passed through str(), so a
# missing (NaN) name is measured as len('nan') == 3 rather than raising.
df_c_ot_cls_raw['length'] = [len(str(s)) for s in df_c_ot_cls_raw['c_ot_cls_chinm']]
# Longest names first.
df_c_ot_cls_raw.sort_values(by='length', ascending=False, inplace=True)
# Sorting keeps the original row labels; naming the index and resetting it
# turns those labels into the new `c_ot_cls_id` column.
df_c_ot_cls_raw.index.name = 'c_ot_cls_id'
df_c_ot_cls_raw.reset_index(inplace=True)

In [9]:
# Only rows flagged with `value_to_run` equal to 1 or 2 are exported.
(
    df_c_ot_cls_raw
    .loc[lambda rows: (rows['value_to_run'] == 1) | (rows['value_to_run'] == 2)]
    .to_csv('../data_output/C_OT_CLS.tsv', encoding='utf8', sep='\t', index=False)
)

### Refined Appointment Type.

In [None]:
# Load the APPOINTMENT_TYPE_CODES sheet from the published workbook.
df_app_ty = pd.read_excel(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vQwXjRmlMR9w2ZV2tcenPSz9UgE7WAgeumGxxCJlceQOZRQFgm6_mgMCAlC_GzM0yxxNsDOlU1-5aH-/pub?output=xlsx',
    # `sheet_name` replaces the `sheetname` spelling, which pandas deprecated
    # in 0.21 and later removed (the old keyword raises TypeError there).
    sheet_name='APPOINTMENT_TYPE_CODES',
)
# Length of the stringified Chinese description. Cells are passed through
# str(), so a missing (NaN) description is measured as len('nan') == 3.
df_app_ty['length'] = [len(str(s)) for s in df_app_ty['c_appt_type_desc_chn']]
# Longest descriptions first, so the tail shows the shortest ones.
df_app_ty.sort_values(by='length', ascending=False, inplace=True)
df_app_ty.tail()

In [None]:
# Write the length-sorted appointment types (including the `length` helper
# column) as a tab-separated file without the row index.
df_app_ty.to_csv(
    '../data_output/APPOINTMENT_TYPE_CODES.tsv',
    sep='\t',
    encoding='utf8',
    index=False,
)

### Build Classification table (`C_OT_CLS`).

In [7]:
# Empty classification table: five `c_ot_cls_*` columns, no rows yet.
df_C_OT_CLS = pd.DataFrame(
    columns=['c_ot_cls_' + suffix for suffix in ('chinm', 'engnm', 'desc', 'start', 'end')]
)

In [8]:
# Distinct classification labels found in 'Institution 1' or 'Institution 2'.
# Missing cells (whose str() is 'nan') are skipped. Whitespace is stripped
# BEFORE de-duplicating so that 'X' and 'X ' collapse into a single entry, and
# the result is sorted because set iteration order for strings can change
# between interpreter runs, which would make downstream ids unstable.
c_ot_cls_list = sorted({
    s.strip()
    for column in ('Institution 1', 'Institution 2')
    for s in df_ming_title_refined[column].unique()
    if str(s) != 'nan'
})

In [9]:
# Each label is cut at its first space: the text before it goes to the Chinese
# name list, everything after it (possibly empty) to the English name list.
c_ot_cls_chinm_list = [str(label).partition(' ')[0] for label in c_ot_cls_list]
c_ot_cls_engnm_list = [str(label).partition(' ')[2] for label in c_ot_cls_list]

In [10]:
# Fill the name columns, then rank rows by Chinese-name length (longest first).
df_C_OT_CLS['c_ot_cls_chinm'] = c_ot_cls_chinm_list
df_C_OT_CLS['c_ot_cls_engnm'] = c_ot_cls_engnm_list
df_C_OT_CLS['len'] = list(map(len, df_C_OT_CLS['c_ot_cls_chinm']))
df_C_OT_CLS = (
    df_C_OT_CLS
    .sort_values(by='len', ascending=False)
    .reset_index(drop=True)
)
# The fresh 0..n-1 index is the table id and is written as the first column.
df_C_OT_CLS.index.name = 'c_ot_cls_id'
# `len` is only a sort helper: it stays on the in-memory frame but is left out
# of the exported file.
df_C_OT_CLS.drop('len', axis=1).to_csv(
    '../data_output/C_OT_CLS.tsv',
    sep='\t',
    encoding='utf8',
)

### Build Admin Unit table (`C_OT_ADM`) and Title table (`C_OT_TIT`).

In [11]:
# Empty admin-unit and title tables sharing the same five-field layout
# (chinm, engnm, desc, start, end) under their own column prefix.
df_C_OT_ADM = pd.DataFrame(
    columns=['c_ot_adm_' + suffix for suffix in ('chinm', 'engnm', 'desc', 'start', 'end')]
)
df_C_OT_TIT = pd.DataFrame(
    columns=['c_ot_tit_' + suffix for suffix in ('chinm', 'engnm', 'desc', 'start', 'end')]
)

In [12]:
# Distinct labels found in 'Institution 3'. Missing cells (whose str() is
# 'nan') are skipped. Whitespace is stripped BEFORE de-duplicating so that 'X'
# and 'X ' collapse into a single entry, and the result is sorted because set
# iteration order for strings can change between interpreter runs.
c_ot_adm_list = sorted({
    s.strip()
    for s in df_ming_title_refined['Institution 3'].unique()
    if str(s) != 'nan'
})

In [13]:
adm_list=[]
for adm_title in [s.split('/') for s in df_ming_title_refined['concat'].unique()]:
    adm_list+=adm_title[:-1]
    
tit_list=[]
for adm_title in [s.split('/') for s in df_ming_title_refined['concat'].unique()]:
    tit_list.append(adm_title[-1])

In [14]:
# Total vs. distinct counts for admin units, then for titles.
(
    len(adm_list),
    len(set(adm_list)),
    len(tit_list),
    len(set(tit_list)),
)

(1967, 942, 2836, 1349)

In [18]:
# Build Admin Unit table.
df_C_OT_ADM['c_ot_adm_chinm']=list(set(adm_list))
df_C_OT_ADM['len']=[len(s) for s in df_C_OT_ADM['c_ot_adm_chinm']]
df_C_OT_ADM.sort_values(by='len', ascending=False, inplace=True)
df_C_OT_ADM.reset_index(inplace=True, drop=True)
df_C_OT_ADM.index.name='c_ot_adm_id'
df_C_OT_ADM.drop('len', axis=1).to_csv('../data_output/C_OT_ADM.tsv', sep='\t', encoding='utf8')
# Build Title table.
df_C_OT_TIT['c_ot_tit_chinm']=list(set(tit_list))
df_C_OT_TIT['len']=[len(s) for s in df_C_OT_TIT['c_ot_tit_chinm']]
df_C_OT_TIT.sort_values(by='len', ascending=False, inplace=True)
df_C_OT_TIT.reset_index(inplace=True, drop=True)
df_C_OT_TIT.index.name='c_ot_tit_id'
df_C_OT_TIT.drop('len', axis=1).to_csv('../data_output/C_OT_TIT.tsv', sep='\t', encoding='utf8')

In [16]:
# Eyeball five random admin-unit rows (unseeded, so the rows differ per run).
df_C_OT_ADM.sample(n=5)

Unnamed: 0_level_0,c_ot_adm_chinm,c_ot_adm_engnm,c_ot_adm_desc,c_ot_adm_start,c_ot_adm_end,len
c_ot_adm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
895,驃騎,,,,,2
217,蘇州織造,,,,,4
384,兵馬司,,,,,3
186,巡視京營,,,,,4
602,議政,,,,,2


In [17]:
# Eyeball five random title rows (unseeded, so the rows differ per run).
df_C_OT_TIT.sample(n=5)

Unnamed: 0_level_0,c_ot_tit_chinm,c_ot_tit_engnm,c_ot_tit_desc,c_ot_tit_start,c_ot_tit_end,len
c_ot_TIT_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
939,谷王,,,,,2
1053,襄王,,,,,2
1120,德嬪,,,,,2
1024,侍書,,,,,2
1017,寺正,,,,,2


### Draft.
```Python
# Draft: read the published title sheet (tab-separated) into df_ming_title_raw.
df_ming_title_raw=pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vSCmhbCk1B-9jjINMhy_VwikM6_Sn7bjdO7b_vaZJkVcYCCYlWVlhYVCFtAs0fPX-UEO62GWxaX1qAS/pub?gid=843266982&single=true&output=tsv',
                              sep='\t')

# Quick look at three random rows.
df_ming_title_raw.sample(3)

# Row by row, stringify and strip the four title columns and join them with
# '/' into a new 'concat' column.
for index in tqdm(df_ming_title_raw.index):
    df_ming_title_raw.loc[index, 'concat']='/'.join([str(s).strip() for s in df_ming_title_raw[['titles to revise','Unnamed: 11','Unnamed: 12','Unnamed: 13',]].loc[index]])
# Remove every '/nan' substring -- presumably the segments produced by empty
# cells, since str(NaN) is 'nan'; TODO confirm no real segment starts with 'nan'.
df_ming_title_raw['concat']=[s.replace('/nan', '') for s in df_ming_title_raw['concat']]
# Keep the identifying columns plus 'concat', then only rows with data_cat == 1.
df_ming_title_refined=df_ming_title_raw[['LINE #','c_office_id（Dictionary Ser#)','Institution 1','Institution 2','Institution 3','data_cat','concat']]
df_ming_title_refined=df_ming_title_refined[df_ming_title_refined.data_cat==1]
```

# Build df_ming_title_refined, the frame whose 'Institution 1/2/3' and 'concat'
# columns the table-building cells read. It expects df_ming_title_raw to be
# loaded already and to carry a 'title_revised_to run' column.
#
# An explicit copy is taken of the column subset so the column assignment below
# writes to a frame this cell owns, instead of assigning onto a slice of
# df_ming_title_raw (the SettingWithCopyWarning pattern).
df_ming_title_refined = (
    df_ming_title_raw[['LINE #','c_office_id（Dictionary Ser#)','Institution 1','Institution 2','Institution 3','data_cat','title_revised_to run']]
    .copy()
    .rename(columns={'title_revised_to run': 'concat'})
)
# Drop leading/trailing '/' so splitting on '/' yields no empty edge segments.
df_ming_title_refined['concat'] = [s.strip('/') for s in df_ming_title_refined['concat']]
# Keep only rows whose data_cat is 1 or 7.
df_ming_title_refined = df_ming_title_refined[
    (df_ming_title_refined['data_cat'] == 1) | (df_ming_title_refined['data_cat'] == 7)
]

df_ming_title_refined.sample(3)