In [1]:
import pandas as pd
import re
from tqdm import tqdm
import math
import numpy as np
import json
from ast import literal_eval

In [2]:
# Read kinship dictionary. nk = POS Kinship.
df_nk=pd.read_csv('../data_dict/song_nk_dict.csv', sep='\t', header=None).rename(columns={0:'name', 1:'pos'})
# Read office title. no = office title. noc = office category.
df_no_noc=pd.read_csv('../data_dict/song_no_noc_dict.csv', sep='\t', header=None).rename(columns={0:'name', 1:'pos'})
df_ns=pd.read_csv('../data_dict/song_ns_dict.csv', sep='\t', header=None).rename(columns={0:'name', 1:'pos'})
df_vno=pd.read_csv('../data_dict/song_vno_dict.csv', sep='\t')
df_nz=pd.read_csv('../data_dict/song_nz_dict.csv', sep='\t', header=None).rename(columns={0:'name', 1:'pos'})
df_kin_mismatch=pd.read_csv('../data_dict/kin_mismatch.csv', sep='\t', header=None).rename(columns={0:'name'})

In [3]:
# All QSW records.
df_qsw_raw=pd.read_excel('../data_raw/quan_song_wen_muzhi.xlsx', sheet_name='墓誌銘墓表壙誌行狀神道碑塔銘墓碑')[['content_id', 'content', 'subject', 'author']].set_index('content_id')
df_qsw_raw['content']=[str(s) for s in df_qsw_raw['content']]
df_qsw_raw.dropna(inplace=True)
df_qsw_raw = df_qsw_raw[df_qsw_raw['content'] != "nan"]

  warn(msg)


In [4]:
df_qsw_raw

Unnamed: 0_level_0,content,subject,author
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4966988.0,公諱文蔚，字君章。其先濟陽考城人也。昔高陽恢若水之靈，光有萬國；伯益獲箕山之護，克成夏功。故...,江文蔚,徐鉉
4966982.0,公諱潭，字孟澤，洛陽人也。周先同姓，即列國之諸侯；漢得名臣，乃洛陽之才子。攀鱗河北，豈須方面...,賈潭,徐鉉
4966978.0,公諱敬宣，字文褒。其先尋陽人，因官徙籍，今爲合淝人也。西京作相，開國封侯，於是貽孫南國，主盟...,陶敬宣,徐鉉
4967079.0,公諱訥，字希仁。其先河南人也，後世從官，徙籍新安，支派繁衍，遂爲郡之著姓。迨公數世，皆以儒雅...,方訥,徐鉉
4967008.0,君諱廷構，字正材，洛陽人也。岐山至德，綿瓜瓞者萬邦；洛宅舊都，守枌榆者百世。簪組相繼，譜諜存...,周廷構,徐鉉
...,...,...,...
5887773.0,南平始隸渝州，元豐始創郡。傳記所載賢牧，前惟劉孝標，後惟江君叔文，賢令惟陳少遊，而江君行始見...,江叔文,劉克莊
5911515.0,詩能窮人尚矣，有生而窮者，有死而窮者。借車載家，蹇驢破帽，此生而窮也；耒陽荒土，采石孤墳，此...,劉過,吕大中
5938065.0,淳祐辛亥十一月二十日戊申，前蒼梧司法曲江張君之夫彭城劉氏終於廣之東莞，壽六十有六。寶祐丙辰十...,劉伯盛,劉宗
5958020.0,頃予登句曲，直東南望，數峰儼然，水倉玉立。其下谿谷迴合，林隰衍沃，其氣乍清乍潤，類非山澤枯槁...,徐洪,鄧光薦


In [5]:
# Function for seperating sentences.
def sep_mark_sent(string):
    string=string.replace('，', '/ws1ep/').replace('；', '/ws2ep/').replace('、', '/ws3ep/')
    string=string.replace('！', '/wend/').replace('。', '/wend/').replace('？', '/wend/').replace('！', '/wend/')
    string=string.replace('：', '/wm/')
    return [s for s in string.split("/wend/") if s!='']

In [6]:
# Seperate sentences and retain sentences with kinship words.
all_sent_count=0
kin_sent_count=0
df_qsw_refined=pd.DataFrame()
for index in tqdm(df_qsw_raw.index):
    content=df_qsw_raw.loc[index]['content']
    sent_list=sep_mark_sent(content) # Seperate sentences.
    all_sent_count+=len(sent_list)
    subject=df_qsw_raw.loc[index]['subject']
    author=df_qsw_raw.loc[index]['author']
    kin_sent_list=[]
    content_id=index
    # Retain sentences with kinship information.
    for sent in sent_list:
        # Remove all kin_mismatch words first.
        sent_temp=sent
        for kin_mismatch in df_kin_mismatch['name']:
            sent_temp=sent_temp.replace(kin_mismatch, '')
        # See if kin_nm still in sentence.
        kin_nm_list=[]
        for kin_nm in df_nk['name']:
            if kin_nm in sent_temp:
                kin_nm_list.append(kin_nm)
        if kin_nm_list!=[]:
            kin_sent_list.append(sent)
    kin_sent_count+=len(kin_sent_list)
    df_qsw_refined=pd.concat([pd.DataFrame(data=[[content_id, subject, author, sent, content] for sent in kin_sent_list],
                                          columns=['content_id', 'subject', 'author', 'sent', 'content']
                                          ), df_qsw_refined], axis=0, ignore_index=True
                            )
print('Kinship sentences / All sentences: ', round(float(kin_sent_count)/all_sent_count, 3))
# Run on entire corpus, 21.2%.

100%|██████████| 4701/4701 [00:33<00:00, 140.60it/s]

Kinship sentences / All sentences:  0.274





In [7]:
drop_index_list=[]
for index in tqdm(df_qsw_refined.index):
    sent=df_qsw_refined.loc[index, 'sent']
    if ('娶' in sent or '取' in sent or '配' in sent) and ('夫人' not in sent and '氏' not in sent):
        drop_index_list.append(index)
for index in tqdm(df_qsw_refined.index):
    sent=df_qsw_refined.loc[index, 'sent']
    if ('歸' in sent or '嫁' in sent or '適' in sent or '許' in sent) and ('女' not in sent and '妹' not in sent and '姑' not in sent and '夫人' not in sent and '姊' not in sent):
        drop_index_list.append(index)
        
df_qsw_refined=df_qsw_refined.drop(drop_index_list)
len(drop_index_list)

100%|██████████| 70834/70834 [00:00<00:00, 171412.99it/s]
100%|██████████| 70834/70834 [00:00<00:00, 166828.85it/s]


13466

In [8]:
def change(s):
    s = s.replace("/ws1ep/", "，")
    s = s.replace("/ws2ep/", "；")
    s = s.replace("/ws3ep/", "、")
    s = s.replace("/wm/", "：")
    return s
def change1(s):
    s = s.replace("/ws1ep/", "/wsep/")
    s = s.replace("/ws2ep/", "/wsep/")
    s = s.replace("/ws3ep/", "/wsep/")
    return s

In [9]:
df_qsw_refined['original'] = df_qsw_refined['sent']
df_qsw_refined['original'] = df_qsw_refined['original'].apply(change)
df_qsw_refined['sent'] = df_qsw_refined['sent'].apply(change1)

In [10]:
# First, subtract 固定称呼 (POS: nz).
for index in tqdm(df_qsw_refined.index):
    sent=df_qsw_refined.loc[index]['sent']
    nz_list=[]
    for nz in df_nz['name']:
        if nz in sent:
            nz_list+=re.findall(nz, sent)
            sent=sent.replace(nz, '/nz/')
    df_qsw_refined.loc[index, 'sent_comp']=sent
    df_qsw_refined.loc[index, 'nz']='#'.join(nz_list)

100%|██████████| 57679/57679 [01:10<00:00, 819.33it/s]


In [11]:
# Second, subtract place names (POS:ns).
for index in tqdm(df_qsw_refined.index):
    sent=df_qsw_refined.loc[index]['sent_comp']
    ns_list=[]
    for ns in df_ns['name']:
        if ns in sent:
            ns_list+=re.findall(ns, sent)
            sent=sent.replace(ns, '/ns/')
    df_qsw_refined.loc[index, 'sent_comp']=sent
    df_qsw_refined.loc[index, 'ns']='#'.join(ns_list)

100%|██████████| 57679/57679 [01:34<00:00, 607.49it/s]


In [12]:
# Third, subtract office title (POS:no_noc).
for index in tqdm(df_qsw_refined.index):
    sent=df_qsw_refined.loc[index]['sent_comp']
    no_noc_list=[]
    for no_noc in df_no_noc['name']:
        if no_noc in sent:
            no_noc_list+=re.findall(no_noc, sent)
            sent=sent.replace(no_noc, '/no_noc/')
    df_qsw_refined.loc[index, 'sent_comp']=sent
    df_qsw_refined.loc[index, 'no_noc']='#'.join(no_noc_list)

100%|██████████| 57679/57679 [02:20<00:00, 410.85it/s]


In [18]:
df_qsw_refined[df_qsw_refined["content_id"] == 5687408]

Unnamed: 0,content_id,subject,author,sent,content,original,sent_comp,nz,ns,no_noc,vno
34911,5687408.0,王恪女、宋許妻,張栻,淳熙二年秋/wsep/安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍/wsep/...,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘,淳熙二年秋/wsep//ns/宋文仲與其弟剛仲書來告其母夫人八月辛酉没於/ns//no_no...,,安陸#袁州,教授,
34912,5687408.0,王恪女、宋許妻,張栻,夫人姓王氏/wsep/六世祖太傅明/wsep/佐藝祖有勳勞/wsep/在太史,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,夫人姓王氏，六世祖太傅明，佐藝祖有勳勞，在太史,夫人姓王氏/wsep/六世祖/no_noc/明/wsep/佐藝祖有/no_noc/勞/wse...,,,太史#太傅#勳,
34913,5687408.0,王恪女、宋許妻,張栻,曾祖臨/wsep/事仁宗爲寶文閣待制,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,曾祖臨，事仁宗爲寶文閣待制,曾祖臨/wsep/事仁宗/vno//no_noc/,,,寶文閣待制,
34914,5687408.0,王恪女、宋許妻,張栻,祖承/wsep/提舉利州路常平事,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,祖承，提舉利州路常平事,祖承/wsep//no_noc//ns/常/no_noc/,,利州路,平事#提舉,
34915,5687408.0,王恪女、宋許妻,張栻,父恪/wsep/爲漢州雒縣令〔一〕,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,父恪，爲漢州雒縣令〔一〕,父恪/wsep//vno//ns//ns//no_noc/〔一〕,,雒縣#漢州,令,
34916,5687408.0,王恪女、宋許妻,張栻,母解氏,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,母解氏,母解氏,,,,
34917,5687408.0,王恪女、宋許妻,張栻,夫人適右朝議大夫/wsep/知德慶府宋許/wsep/生兩男子/wm/文仲/wsep/迪功郎/...,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,夫人適右朝議大夫、知德慶府宋許，生兩男子：文仲，迪功郎、全州清湘縣主簿；剛仲，迪功郎、袁州州學教授,夫人適/no_noc//wsep/知/ns/宋許/wsep/生兩男子/wm/文仲/wsep/...,,德慶府#袁州#清湘#全州,右朝議大夫#縣主簿#迪功郎#迪功郎#學教授,
34918,5687408.0,王恪女、宋許妻,張栻,一女/wsep/適承事郎/wsep/監饒州景德鎮税万俟傳,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,一女，適承事郎、監饒州景德鎮税万俟傳,一女/wsep/適/no_noc//wsep/監/ns/景德鎮税万俟傳,,饒州,承事郎,
34919,5687408.0,王恪女、宋許妻,張栻,孫男女凡七人,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,孫男女凡七人,孫男女凡七人,,,,
34920,5687408.0,王恪女、宋許妻,張栻,德慶君之没先夫人九年/wsep/葬於衡州衡陽縣五馬山之原,淳熙二年秋，安陸宋文仲與其弟剛仲書來告其母夫人八月辛酉没於袁州教授官舍，以喪歸葬，求予銘。予...,德慶君之没先夫人九年，葬於衡州衡陽縣五馬山之原,德慶君之没先夫人九年/wsep/葬於/ns//ns/縣/no_noc/山之原,,衡陽#衡州,五馬,


In [16]:
def vno_mark(vno, sent):
    vno_re=re.compile(vno+'/no_noc/|'+vno+'/ns/')
    vno_list_temp=re.findall(vno_re, sent)
    if vno_list_temp!=[]:
        sent_comp_temp=sent.replace(vno+'/no_noc/', '/vno//no_noc/')
        sent_comp_temp=sent_comp_temp.replace(vno+'/ns/', '/vno//ns/')
        return {'sent_comp':sent_comp_temp, 
                'vno_list':[s.strip('/no_noc/').strip('/ns/') for s in vno_list_temp]}
    else:
        return None

In [17]:
# Subtract appointing verb (POS:vno).
for index in df_qsw_refined.index:
    sent=df_qsw_refined.loc[index]['sent_comp']
    vno_list=[]
    for vno in df_vno['name']:
        tag_result=vno_mark(vno, sent)
        if tag_result!=None:
            sent=tag_result['sent_comp']
            vno_list+=tag_result['vno_list']
    df_qsw_refined.loc[index, 'sent_comp']=sent
    df_qsw_refined.loc[index, 'vno']='#'.join(vno_list)

In [14]:
df_qsw_refined.fillna('', inplace=True)

In [15]:
df_sent_comp=df_qsw_refined

In [16]:
re_rule_list=[r'(三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世祖|四代祖母|曾祖王父|外曾大父|外曾王父|外曾祖父|六代祖|十代祖|十世祖|九世祖|八世祖|七世祖|六世祖|五世祖|五代祖|四世祖|四代祖|高祖母|曾祖考|曾大父|大王父|曾王父|皇曾祖|曾大考|曾伯祖|曾叔祖|先祖考|先大父|先王父|皇祖考|伯祖父|叔祖父|從伯祖|再從伯|再從叔|再從舅|從伯父|從叔父|外祖父|外王父|外大父|外叔祖|外伯祖|再從舅|外伯父|外叔父|外大祖|外曾祖|遠祖|其先|高祖|曾祖|先祖|大父|王父|大考|從祖|族祖|伯祖|叔祖|皇考|先父|先考|先君|伯父|世父|叔父|族父|從伯|從父|從叔|從舅|外舅|母舅|外祖|從舅|外舅|祖|父|考|舅|舅)[諱|曰](.{0,2}?)/',
              r'^(三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世祖|四代祖母|曾祖王父|外曾大父|外曾王父|外曾祖父|六代祖|十代祖|十世祖|九世祖|八世祖|七世祖|六世祖|五世祖|五代祖|四世祖|四代祖|高祖母|曾祖考|曾大父|大王父|曾王父|皇曾祖|曾大考|曾伯祖|曾叔祖|先祖考|先大父|先王父|皇祖考|伯祖父|叔祖父|從伯祖|再從伯|再從叔|再從舅|從伯父|從叔父|外祖父|外王父|外大父|外叔祖|外伯祖|再從舅|外伯父|外叔父|外大祖|外曾祖|遠祖|高祖|曾祖|先祖|大父|王父|大考|從祖|族祖|伯祖|叔祖|皇考|先父|先考|伯父|世父|叔父|族父|從伯|從父|從叔|從舅|外舅|母舅|外祖|從舅|外舅|祖|父|考|舅)([^居某早丧卒母妣諱曰也]{0,2})/',
              r'[公君吾夫人我/](三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世祖|四代祖母|曾祖王父|外曾大父|外曾王父|外曾祖父|六代祖|十代祖|十世祖|九世祖|八世祖|七世祖|六世祖|五世祖|五代祖|四世祖|四代祖|高祖母|曾祖考|曾大父|大王父|曾王父|皇曾祖|曾大考|曾伯祖|曾叔祖|先祖考|先大父|先王父|皇祖考|伯祖父|叔祖父|從伯祖|再從伯|再從叔|再從舅|從伯父|從叔父|外祖父|外王父|外大父|外叔祖|外伯祖|再從舅|外伯父|外叔父|外大祖|外曾祖|遠祖|高祖|曾祖|先祖|大父|王父|大考|從祖|族祖|伯祖|叔祖|皇考|先父|先考|伯父|世父|叔父|族父|從伯|從父|從叔|從舅|外舅|母舅|外祖|從舅|外舅|祖|父|考|舅)([^居某早丧卒母妣諱曰也]{0,2}?)/',
              r'[公君吾夫人我/](三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世祖|四代祖母|曾祖王父|外曾大父|外曾王父|外曾祖父|六代祖|十代祖|十世祖|九世祖|八世祖|七世祖|六世祖|五世祖|五代祖|四世祖|四代祖|高祖母|曾祖考|曾大父|大王父|曾王父|皇曾祖|曾大考|曾伯祖|曾叔祖|先祖考|先大父|先王父|皇祖考|伯祖父|叔祖父|從伯祖|再從伯|再從叔|再從舅|從伯父|從叔父|外祖父|外王父|外大父|外叔祖|外伯祖|再從舅|外伯父|外叔父|外大祖|外曾祖|遠祖|高祖|曾祖|先祖|大父|王父|大考|從祖|族祖|伯祖|叔祖|皇考|先父|先考|伯父|世父|叔父|族父|從伯|從父|從叔|從舅|外舅|母舅|外祖|從舅|外舅|祖|父|考|舅)([^居某早丧卒母妣諱曰也]{0,2}?)$',
              r'(三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世祖|四代祖母|曾祖王父|外曾大父|外曾王父|外曾祖父|六代祖|十代祖|十世祖|九世祖|八世祖|七世祖|六世祖|五世祖|五代祖|四世祖|四代祖|高祖母|曾祖考|曾大父|大王父|曾王父|皇曾祖|曾大考|曾伯祖|曾叔祖|先祖考|先大父|先王父|皇祖考|伯祖父|叔祖父|從伯祖|再從伯|再從叔|再從舅|從伯父|從叔父|外祖父|外王父|外大父|外叔祖|外伯祖|再從舅|外伯父|外叔父|外大祖|外曾祖|遠祖|其先|高祖|曾祖|先祖|大父|王父|大考|從祖|族祖|伯祖|叔祖|皇考|先父|先考|先君|伯父|世父|叔父|族父|從伯|從父|從叔|從舅|外舅|母舅|外祖|從舅|外舅|祖|父|考|舅)(某官{0,1}?)',
              r'(三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世祖|四代祖母|曾祖王父|外曾大父|外曾王父|外曾祖父|六代祖|十代祖|十世祖|九世祖|八世祖|七世祖|六世祖|五世祖|五代祖|四世祖|四代祖|高祖母|曾祖考|曾大父|大王父|曾王父|皇曾祖|曾大考|曾伯祖|曾叔祖|先祖考|先大父|先王父|皇祖考|伯祖父|叔祖父|從伯祖|再從伯|再從叔|再從舅|從伯父|從叔父|外祖父|外王父|外大父|外叔祖|外伯祖|再從舅|外伯父|外叔父|外大祖|外曾祖|遠祖|其先|高祖|曾祖|先祖|大父|王父|大考|從祖|族祖|伯祖|叔祖|皇考|先父|先考|先君|伯父|世父|叔父|族父|從伯|從父|從叔|從舅|外舅|母舅|外祖|從舅|外舅|祖|父|考|舅)[諱|曰](.{0,2})$',
             ]

In [17]:
kin_list=[]
df_kin_extracted=pd.DataFrame()
for re_rule in re_rule_list:
    for index in tqdm(df_sent_comp.index):
        string=df_sent_comp.loc[index, 'sent_comp']
        content_id=df_sent_comp.loc[index, 'content_id']
        content=df_sent_comp.loc[index, 'content']
        author=df_sent_comp.loc[index, 'author']
        subj=df_sent_comp.loc[index, 'subject']
        kin_list_temp=re.findall(re_rule, string)
        if kin_list_temp!=[]:
                df_kin_extracted=pd.concat([pd.DataFrame([[index, kin[0], '-',kin[1], re_rule, string, content_id, content,
                                                       author, subj] 
                                                        for kin in kin_list_temp]), df_kin_extracted], ignore_index=True)

100%|██████████| 57679/57679 [00:06<00:00, 9019.47it/s] 
100%|██████████| 57679/57679 [00:12<00:00, 4525.73it/s]
100%|██████████| 57679/57679 [00:07<00:00, 7845.15it/s]
100%|██████████| 57679/57679 [00:03<00:00, 18323.52it/s]
100%|██████████| 57679/57679 [00:03<00:00, 14534.11it/s]
100%|██████████| 57679/57679 [00:02<00:00, 21387.37it/s]


In [18]:
df_kin_extracted.columns=['index', 'kin', 'number', 'name_string', 're_rule', 'sent_comp', 'content_id', 'content','author', 'subject']

In [19]:
for index in tqdm(df_kin_extracted.index):
    sent_comp_index = df_kin_extracted.loc[index, 'index']
    df_temp=df_sent_comp.loc[sent_comp_index][['original']]
    df_kin_extracted.loc[index, 'sent']=df_temp['original']

100%|██████████| 8475/8475 [00:08<00:00, 975.91it/s] 


In [20]:
df_kin_extracted = df_kin_extracted[df_kin_extracted['name_string'] != ""]
df_kin_extracted = df_kin_extracted[df_kin_extracted['name_string'] != "某"]
df_kin_extracted = df_kin_extracted[df_kin_extracted['name_string'] !=  "□"]

In [21]:
df_kin_extracted.sample(5)

Unnamed: 0,index,kin,number,name_string,re_rule,sent_comp,content_id,content,author,subject,sent
3975,46175,曾大父,-,琰,^(三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世...,曾大父琰/wsep/以其子/no_noc/知幾/vno//no_noc//no_noc/,5387970.0,紹興二年正月甲子，吴君公擇以疾終於家。明年四月壬子葬於吴興之安吉曰銅山鄉俞塢之原。君之子罕過...,劉一止,吴擇,曾大父琰，以其子通議大夫知幾累封尚書都官郎中
1744,42774,祖,-,澄,[公君吾夫人我/](三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|...,曾祖皓/wsep//vno//no_noc//wsep/祖澄/wsep//vno//no_n...,5502834.0,公諱有嘉，字會之，世居開封。曾祖皓，贈右衛將軍；祖澄，贈左屯衛將軍；父世立，贈金紫光禄大夫。...,胡寅,田有嘉,曾祖皓，贈右衛將軍；祖澄，贈左屯衛將軍；父世立，贈金紫光禄大夫
2520,17017,父,-,允元,[公君吾夫人我/](三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|...,曾祖儒/wsep/祖承雋/wsep/父允元/wsep/皆隱惪不耀,8713032.0,族姪朝請大夫、權成都府路計度轉運副使、借紫金魚袋亞撰。朝散郎、守尚書左司員外郎兼參詳官制格目...,祝亞,吳煇妻、祝允元女,曾祖儒，祖承雋，父允元，皆隱惪不耀
3822,49873,祖,-,睦,^(三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|十一代祖|十一世...,祖睦/wsep//no_noc//no_noc//no_noc/,5316985.0,夫人王氏，曾祖沔，事太宗、真宗，爲執政，號名臣。祖睦，尚書司勳員外郎。父乙，尚書庫部員外郎、...,張耒,王乙女,祖睦，尚書司勳員外郎
1957,34801,祖,-,日章,[公君吾夫人我/](三十一世祖|十八世祖|十六代祖|十五代祖|十三代祖|十二世祖|十二代祖|...,曾祖及/wsep/祖日章/wsep/父徽之,5689984.0,孺人王氏，其先有名宰諱肇者，起其家，繼是登甲乙科蟬聯有人，孺人羣從兄弟也。占籍撫之崇仁。高祖...,陳造,王徽之女、繆昭妻,曾祖及，祖日章，父徽之


In [22]:
re_rule_list=[r'(外叔祖母|外伯祖母|曾祖母|曾祖妣|曾大母|曾王母|曾祖姑|大王母|從祖母|繼祖母|叔祖母|從祖姑|伯祖母|叔祖母|再從姑|從伯姑|外王母|外祖母|祖母|王母|大母|祖妣|祖姑|伯母|叔母|仲姑|從姑|皇妣|先妣|後母|繼母|嫡母|生母|庶母|族母|姨母|外姑|從妗|外姑|姑|母|妣|姨|姑)([^曰也].{0,20}?)氏',
              r'([始先早前首再次又後繼]?娶|取|配)(.{1,10}?)氏',
              r'(繼)([^母].{0,10}?)氏',
              r'(妻)(.{0,10}?)氏',
             ]

In [23]:
kin_list=[]
df_kin_extracted1=pd.DataFrame()
for re_rule in re_rule_list:
    for index in tqdm(df_sent_comp.index):
        string=df_sent_comp.loc[index, 'sent_comp']
        content_id=df_sent_comp.loc[index, 'content_id']
        content=df_sent_comp.loc[index, 'content']
        author=df_sent_comp.loc[index, 'author']
        subj=df_sent_comp.loc[index, 'subject']
        kin_list_temp=re.findall(re_rule, string)
        if kin_list_temp!=[]:
            for kin_nm in kin_list_temp:
                if kin_nm[1]!='':
                    kin_nm=(kin_nm[0], kin_nm[1][-1])
                df_kin_extracted1=pd.concat([pd.DataFrame([[index, kin_nm[0], '-', kin_nm[1],re_rule, string, content_id, content, author, subj]]), df_kin_extracted1], ignore_index=True)
#                 else:
#                     print(string, kin_list_temp)

100%|██████████| 57679/57679 [00:08<00:00, 6984.94it/s]
100%|██████████| 57679/57679 [00:11<00:00, 5142.33it/s]
100%|██████████| 57679/57679 [00:03<00:00, 18564.45it/s]
100%|██████████| 57679/57679 [00:02<00:00, 20389.74it/s]


In [24]:
df_kin_extracted1.columns=['index', 'kin', 'number', 'name_string', 're_rule', 'sent_comp', 'content_id', 'content','author', 'subject']

In [25]:
for index in tqdm(df_kin_extracted1.index):
    sent_comp_index = df_kin_extracted1.loc[index, 'index']
    df_temp=df_sent_comp.loc[sent_comp_index][['original']]
    df_kin_extracted1.loc[index, 'sent']=df_temp['original']

100%|██████████| 5843/5843 [00:05<00:00, 1029.35it/s]


In [26]:
df_kin_extracted1.sample(5)

Unnamed: 0,index,kin,number,name_string,re_rule,sent_comp,content_id,content,author,subject,sent
2981,9884,娶,-,黄,"([始先早前首再次又後繼]?娶|取|配)(.{1,10}?)氏",娶黄氏/wsep/尚氏/wsep/俱/vno//no_noc/,5516979.0,高，齊太公之後，食采於高，因氏焉。齊高氏爲春秋著姓，魯文公時有宋子哀者，《左氏》以爲高哀。某...,胡銓,高世吏,娶黄氏、尚氏，俱封宜人
2475,23511,配,-,徐,"([始先早前首再次又後繼]?娶|取|配)(.{1,10}?)氏",公配徐氏/wsep//vno//no_noc/,5840749.0,昔者洙泗之教亦多術矣，然綜其要歸，不過曰學以成性、行以成己，施之於政則以成物而已。然豈有二致...,真德秀,張彦清,公配徐氏，封安人
5263,12571,曾祖妣,-,薛,(外叔祖母|外伯祖母|曾祖母|曾祖妣|曾大母|曾王母|曾祖姑|大王母|從祖母|繼祖母|叔祖母...,曾祖妣薛氏/wsep/祖妣皇甫氏/wsep/妣聶氏/wsep/皆/vno//ns/太夫人,5216752.0,上即位之三年，朝廷清明，百揆時叙，民安其生，風俗一變。異時薄夫鄙人，皆洗心易德，務爲忠厚，人...,蘇軾,司馬光,曾祖妣薛氏，祖妣皇甫氏，妣聶氏，皆封溫國太夫人
2869,13089,娶,-,陳,"([始先早前首再次又後繼]?娶|取|配)(.{1,10}?)氏",娶陳氏/wsep//vno//ns//no_noc/,5169528.0,曾祖諱惟忠，彰化軍節度使、舒國公。祖諱從謹，宣州觀察使、宣城侯。父諱世崇，貴州防禦使。公諱令...,楊傑,趙令蠙,娶陳氏，封樂壽縣君
582,68637,娶,-,楊,"([始先早前首再次又後繼]?娶|取|配)(.{1,10}?)氏",’嗚呼盛哉!娶夫人楊氏/wsep/關西之裔孫/wsep/柔芳靜淑/wsep/爲內外姻表式/w...,5037264.0,慶曆三年，歲舍鶉首，秋七月丁丑，光祿卿致仕南陽葉君齊終於京師，享年八十。其孤翰林學士清臣奉柩...,宋祁,葉參,’嗚呼盛哉!娶夫人楊氏，關西之裔孫，柔芳靜淑，爲內外姻表式，先君十三年而終，終而葬


In [27]:
re_rule_list=[r'(曾孫男|孫男|子男|孙男|曾孫|子|孫|孙|男|女|婿)曰(.+)',
              r'(子男)(.{0,3}?)人(\/.+)',
              r'([^女]*曾孫|孫)([^女]{0,3}?)人(\/.+)',
              r'([^女]*孫子|子)([^男女]{0,3}?)人(\/.+)',
              r'[一二三四五六七八九](曾孫|孫|孫子|子|男子)(\/.+)',
              r'(子男子|男)[一二三四五六七八九](\/.+)',
              r'(孫男)(\/.+)',
              r'生?[一二三四五六七八九](男)(\/.+)'
             ]

In [28]:
kin_list=[]
df_kin_extracted2=pd.DataFrame()
list_temp=[]
for re_rule in re_rule_list:
    for index in tqdm(df_sent_comp.index):
        string=df_sent_comp.loc[index, 'sent_comp']
        content_id=df_sent_comp.loc[index, 'content_id']
        content=df_sent_comp.loc[index, 'content']
        author=df_sent_comp.loc[index, 'author']
        subj=df_sent_comp.loc[index, 'subject']
        kin_list_temp=re.findall(re_rule, string)
        if kin_list_temp!=[]:
            for kin_nm in kin_list_temp:
                if len(kin_nm)==2:
                    kin_nm=(kin_nm[0], '', kin_nm[1])
                df_kin_extracted2=pd.concat([pd.DataFrame([[index]+list(kin_nm)+[re_rule, string, content_id, content, author, subj]]), 
                                            df_kin_extracted2], ignore_index=True)
                list_temp.append(kin_nm)

100%|██████████| 57679/57679 [00:03<00:00, 16567.87it/s]
100%|██████████| 57679/57679 [00:03<00:00, 16660.59it/s]
100%|██████████| 57679/57679 [00:06<00:00, 9398.24it/s] 
100%|██████████| 57679/57679 [00:04<00:00, 12442.17it/s]
100%|██████████| 57679/57679 [00:06<00:00, 8959.61it/s] 
100%|██████████| 57679/57679 [00:02<00:00, 20425.03it/s]
100%|██████████| 57679/57679 [00:02<00:00, 26000.44it/s]
100%|██████████| 57679/57679 [00:03<00:00, 17782.25it/s]


In [29]:
df_kin_extracted2=df_kin_extracted2.rename(columns={0:'index', 1:'kin', 2:'number', 3:'name_string', 
                                                  4:'re_rule', 5:'sent_comp', 6:'content_id',
                                                  7:'content', 8:'author', 9:'subject'})

In [30]:
del_dict=['皆其婿也','季曰','伯曰','仲曰','叔曰','次即','及第','中第',
          '中舉','今以','今爲','江淮','兩浙','某官','税務','支鹽',
          '士族','士人','許嫁','許適','許歸','未冠','未仕','未官',
          '未命','先歿','先亡','先公','早夭','早亡','早世','尚幼',
          '未名','前卒','先卒','蚤卒','俱有','早卒','早喪','一人',
          '二人','一尚','二尚','三尚','三女','三人','三曰','四人',
          '四先','四女','未仕','未銓','左銓','司户','户部','長',
          '次','幼','曰','即','也','貢','等','第','登','科','今',
          '授','事','都','轄','新','知','舊','監','倉','庫','起',
          '終','故','前','後','左','右','州','軍','路','郡','縣',
          '府','寺','尉','官','俱','業','業','習','適','嫁','歸',
          '許','皆','並','并','餘','俱','竝','未','夭','卒','幼','CF]','張氏','宗室','王氏','内殿','朱氏', '李氏','之子',
          '尚','早','喪','一','二','三','四','户','女','五','六','七','八','九','十','某','孫','〔〕','人','曾孫','以','其','而','孫人','□','在室','某某','某某某','亡','生于','小', '出家']

In [31]:
def clean_name_string(name_string):
    global del_dict
    name_string_list=[s for s in re.sub(r'[a-z_]', '', name_string).split('/') if s!='']
    name_string_list_refined=[]
    for name_string in name_string_list:
        for del_string in del_dict:
            name_string=name_string.replace(del_string, '')
        if name_string!='' and len(name_string)<=2:
            name_string_list_refined.append(name_string)
    return name_string_list_refined

In [32]:
for index in tqdm(df_kin_extracted2.index):
    sent_comp_index = df_kin_extracted2.loc[index, 'index']
    df_temp=df_sent_comp.loc[sent_comp_index][['original']]
    df_kin_extracted2.loc[index, 'sent']=df_temp['original']

100%|██████████| 5250/5250 [00:05<00:00, 1023.28it/s]


In [33]:
df_kin_extracted2['name_string']=[clean_name_string(s) for s in df_kin_extracted2['name_string']]

In [34]:
df_kin_extracted2 = df_kin_extracted2[df_kin_extracted2['name_string'].map(lambda d: len(d)) > 0]

In [None]:
x = {}
for index in df_kin_extracted2.index:
    row = df_kin_extracted2.loc[index]
    for name in literal_eval(row['name_string']):
        if name in x.keys():
            x[name] += 1
        else:
            x[name] = 1

In [37]:
for index in df_kin_extracted2.index:
    namelist = df_kin_extracted2.loc[index]['name_string']
    namelist = [s for s in namelist if s != '某'and s != '孫' and s != '〔〕' and s!= '人' and s!= '曾孫' and s!= '以' and s!='其' and s!= '而' and s!='孫人' and s!= '□' and s!= '在室' and s!= '某某' and s!= '某某某' and s!= '亡']
    df_kin_extracted2.loc[index, 'name_string'] = str(namelist)

In [38]:
df_kin_extracted2 = df_kin_extracted2[df_kin_extracted2['name_string'].map(lambda d: len(d)) > 0]

In [39]:
#{k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

In [40]:
#df_kin_extracted2['name_string'].value_counts().head(50)

In [41]:
df_kin_extracted2.sample(10)

Unnamed: 0,index,kin,number,name_string,re_rule,sent_comp,content_id,content,author,subject,sent
3177,38607,孫,男三,"['垓', '圯', '埴']","([^女]*曾孫|孫)([^女]{0,3}?)人(\/.+)",孫男三人/wm/垓/wsep/圯/wsep/埴/wsep/女三人/wsep/尚幼,5631350.0,伯父沅陵公好賢喜士，其規模寧與時利相反。一時巨室，衆方慕嚮，有來請交，多捨去不顧；即故家若寒...,周必大,蔡衢,孫男三人：垓、圯、埴；女三人，尚幼
3395,30565,孫,男二,"['載', '古']","([^女]*曾孫|孫)([^女]{0,3}?)人(\/.+)",孫男二人/wsep/曰載/wsep/曰古,5764363.0,少南以謫死，葬不及銘。後三十年，當淳熙五年三月己酉，改葬於舊墓南百步先人之側，其子六齡，始次...,葉適,陳鵬飛,孫男二人，曰載，曰古
1941,68477,子,五,"['巽', '叔', '鼎', '觀', '賁']","([^女]*孫子|子)([^男女]{0,3}?)人(\/.+)",〔三〕‘子五人……/vno//no_noc/’/wm/《補正》作‘子五人/wsep/巽/vn...,5039996.0,夫人姓徐，錢塘人，父啓，當錢氏王吴越，以勤廉事忠懿王，爲閤門祗候。王以土入朝，國官并隨附京邸...,李之才,衛廷諤妻,〔三〕‘子五人……舉進士’：《補正》作‘子五人，巽爲右班殿直，叔、鼎、觀、賁皆舉進士
153,49105,男,,"['公發', '公才', '公美']",生?[一二三四五六七八九](男)(\/.+),三男/wsep/公發/wsep/公才/wsep/公美/wsep/皆幼,5326412.0,君孫氏，諱握，字叔權，世爲常州晋陵人。曾祖世南兄弟六人皆有聲場屋，同時薦禮部者四人，自後子孫...,鄒浩,孫握,三男，公發、公才、公美，皆幼
438,30385,男,,['齒'],(子男子|男)[一二三四五六七八九](\/.+),曾孫男七/wsep/女一/wsep/皆未齒也,5764605.0,越新昌黄公，諱仁静，字仲山。其先婺徙也。曾祖朴，祖巽，父惠之。公累封朝奉大夫，賜服金紫。年八...,葉適,黄仁静,曾孫男七，女一，皆未齒也
2626,69096,曾孫,男二,"['才臣', '純臣']","([^女]*曾孫|孫)([^女]{0,3}?)人(\/.+)",曾孫男二人/wm/才臣/wsep/純臣,5019873.0,君諱約，字不約，掞之一從兄也。遠祖范陽人，五代亂離，挈家南渡，遂家歷城焉。曾祖諱從實，祖諱光...,張掞,張約,曾孫男二人：才臣、純臣
3502,24052,孫,男三,"['龕', '集', '壽']","([^女]*曾孫|孫)([^女]{0,3}?)人(\/.+)",孫男三人/wm/龕/wsep//no_noc//wsep/新/ns//ns//no_noc/...,5840544.0,寶文閣待制李公既没，其孤洪宗爲書赴某於西山精舍，且請銘。某曰僕非能銘者也，矧方儼然在衰絰中，...,真德秀,李訦,孫男三人：龕，迪功郎、新興化軍莆田縣主簿；集孫、壽孫，尚幼
1640,18513,子,,"['應牛', '應', '應鳳']",[一二三四五六七八九](曾孫|孫|孫子|子|男子)(\/.+),三子/wm/應牛〔一〕/wsep/應登/wsep/應鳳,5959515.0,安成古長沙郡，吾廬陵之劉未嘗不本長沙也，而譜多不可考，多故不可考也。按《唐思禪師塔碑》，思劉...,劉辰翁,劉可仕,三子：應牛〔一〕、應登、應鳳
3939,1897,孫,男一,['洤'],"([^女]*曾孫|孫)([^女]{0,3}?)人(\/.+)",）……人/wm/鈆/wsep//no_noc//wsep/……/no_noc//wsep/新...,8714787.0,宋故……（題）太安人……冠南……先君……升……享年……■日，其……義城鄉白巖坤（？）……人：...,王鈆,王鈆母,）……人：鈆，朝奉郎，……文林郎，新監……孫男一人：洤，將仕郎
2208,48083,子,四,"['輔之', '翼之', '直之', '得之']","([^女]*孫子|子)([^男女]{0,3}?)人(\/.+)",子四人/wm/輔之/wsep//no_noc//wsep/翼之/wsep//no_noc//...,5338751.0,惟博平侯叔樂，字和甫，昭化軍節度使、觀察留後、彭城郡公克孚之子，深州團練使承訓之孫，廣平郡王...,慕容彦逢,趙叔樂,子四人：輔之，左班殿直；翼之，右班殿直；直之、得之，皆三班奉職


In [42]:
re_rule_list=[r'(曾孫女|孫女|女)[許適嫁歸](.{1,200})',
              r'(十一女|十六女|一女|二女|三女|四女|五女|六女|七女|八女|九女|十女).{0,50}?[許適嫁歸](.+)',
              r'(曾孫女|孫女|女孫|女|妹).{0,2}人.{0,50}?[許適嫁歸](.+)',
              r'(曾孫女|孫女|女孫|女|妹)[一二三四五六七八九十]/(.+)', # Some records are not about son in law but daughters' name.
              r'(再從妹|再從姊|外甥女|曾孫女|外孫女|姪孫女|表甥女|從孫女|同母姊|堂姊|堂妹|表妹|表姊|仲女|長女|次女|幼女|季女|甥女|表甥|孫婦|甥女|姑|姊|妹)[^一二三四五六七八九十].{1,50}?[適嫁歸](.+)',
             ]

In [43]:
kin_list=[]
df_kin_extracted3=pd.DataFrame()
list_temp=[]
for re_rule in re_rule_list:
    for index in tqdm(df_sent_comp.index):
        string=df_sent_comp.loc[index, 'sent_comp']
        content_id=df_sent_comp.loc[index, 'content_id']
        content=df_sent_comp.loc[index, 'content']
        author=df_sent_comp.loc[index, 'author']
        subj=df_sent_comp.loc[index, 'subject']
        kin_list_temp=re.findall(re_rule, string)
        if kin_list_temp!=[]:
            for kin_nm in kin_list_temp:
                df_kin_extracted3=pd.concat([pd.DataFrame([[index]+list(kin_nm)+[re_rule, string, content_id, content, author, subj]]), 
                                            df_kin_extracted3], ignore_index=True)
                list_temp.append(kin_nm)

100%|██████████| 57679/57679 [00:03<00:00, 19124.48it/s]
100%|██████████| 57679/57679 [00:04<00:00, 14272.73it/s]
100%|██████████| 57679/57679 [00:05<00:00, 10115.43it/s]
100%|██████████| 57679/57679 [00:02<00:00, 21524.28it/s]
100%|██████████| 57679/57679 [00:02<00:00, 21903.73it/s]


In [44]:
df_kin_extracted3=df_kin_extracted3.rename(columns={0:'index', 1:'kin', 2:'name_string', 
                                                  3:'re_rule', 4:'sent_comp', 5:'content_id',
                                                  6:'content', 7:'author', 8:'subject'})

In [45]:
for index in tqdm(df_kin_extracted3.index):
    sent_comp_index = df_kin_extracted3.loc[index, 'index']
    df_temp=df_sent_comp.loc[sent_comp_index][['original']]
    df_kin_extracted3.loc[index, 'sent']=df_temp['original']

100%|██████████| 3045/3045 [00:02<00:00, 1062.60it/s]


In [46]:
def clean_name_string(name_string):
    global del_dict
    name_string_list=[s for s in re.sub(r'[a-z_]', '', name_string).split('/') if s!='']
    name_string_list_refined=[]
    for name_string in name_string_list:
        for del_string in del_dict:
            name_string=name_string.replace(del_string, '')
        if name_string!='' and len(name_string)<=3:
            name_string_list_refined.append(name_string)
    return name_string_list_refined

In [47]:
df_kin_extracted3['name_string']=[clean_name_string(s) for s in df_kin_extracted3['name_string']]

In [48]:
x = {}
for index in df_kin_extracted3.index:
    row = df_kin_extracted3.loc[index]
    for name in row['name_string']:
        if name in x.keys():
            x[name] += 1
        else:
            x[name] = 1
#{k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

In [49]:
df_kin_extracted3 = df_kin_extracted3[df_kin_extracted3['name_string'].map(lambda d: len(d)) > 0]

In [50]:
df_kin_extracted3.sample(5)

Unnamed: 0,index,kin,name_string,re_rule,sent_comp,content_id,content,author,subject,sent
2038,51707,四女,"[軌, 仲箎, 劉炳, 仲葳]","(十一女|十六女|一女|二女|三女|四女|五女|六女|七女|八女|九女|十女).{0,50}...",四女/wm/長適/ns//no_noc//no_noc/軌/wsep/次適/ns//no_n...,5302240.0,自古佐命功臣，逮國家平定之後，鮮□以功名終者。唯焦氏之先，翊戴藝祖，削平區夏。其後乘輿幸洛，...,李昭玘,趙允弼,四女：長適成州防禦使令軌，次適恩州防禦使仲箎，次適國學進士劉炳，次適右千牛衛將軍仲葳
2603,66060,女,[毛楷],"(曾孫女|孫女|女)[許適嫁歸](.{1,200})",次子鼎未仕/wsep/一女適/ns//no_noc/毛楷,5071129.0,太常博士周君，熙寧四年十一月二十五日以疾卒。其子尚書屯田郎中革，號訴於余曰：‘革嘗從事於幕府...,韓琦,周孝恭,次子鼎未仕，一女適保州司理參軍毛楷
1138,34364,女,[曹源],"(曾孫女|孫女|女孫|女|妹).{0,2}人.{0,50}?[許適嫁歸](.+)",女一人/wsep/適/no_noc/曹源,5703194.0,陳在婺爲右族，名數布於七縣，其譜或通族爲之。其籍永康之前黄者，不知於婺何别也。蓋傳至數世而得...,吕祖謙,陳持,女一人，適進士曹源
1801,66060,一女,[毛楷],"(十一女|十六女|一女|二女|三女|四女|五女|六女|七女|八女|九女|十女).{0,50}...",次子鼎未仕/wsep/一女適/ns//no_noc/毛楷,5071129.0,太常博士周君，熙寧四年十一月二十五日以疾卒。其子尚書屯田郎中革，號訴於余曰：‘革嘗從事於幕府...,韓琦,周孝恭,次子鼎未仕，一女適保州司理參軍毛楷
2182,39865,二女,"[同年, 方翥, 友陵]","(十一女|十六女|一女|二女|三女|四女|五女|六女|七女|八女|九女|十女).{0,50}...",二女/wsep/一嫁同年/no_noc//no_noc/方翥之子/wsep/即友陵也/wse...,5583338.0,吾友趙德莊將葬于饒州餘干縣某山之原，其婿方友陵以狀來曰：盍爲之銘?始與德莊遊，蓋三十年，在朝...,韓元吉,趙彦端,二女，一嫁同年進士秘書省正字方翥之子，即友陵也，一尚幼


In [51]:
df_kin = pd.concat([df_kin_extracted, df_kin_extracted1, df_kin_extracted2, df_kin_extracted3], axis=0)
df_kin = df_kin[df_kin['name_string'] != '']
df_kin = df_kin[df_kin['name_string'].map(lambda d: len(d)) > 0]
df_kin = df_kin.sort_values(by='content_id')
df_kin.reset_index(drop=True, inplace=True)
df_kin.drop('index', axis=1, inplace=True) 

In [52]:
df_kin.to_excel('kin_result.xlsx', encoding='utf8')

In [53]:
df_kin_extracted.to_excel('father.xlsx', encoding='utf8')
df_kin_extracted1.to_excel('mother.xlsx', encoding='utf8')
df_kin_extracted2.to_excel('son.xlsx', encoding='utf8')
df_kin_extracted3.to_excel('daughter.xlsx', encoding='utf8')

In [None]:
def getdoccano(id):
    table = df_kin_extracted[df_kin_extracted['content_id'] == id]
    content = table['content'].iloc[0]
    list = []
    for index in table.index:
        subcontent = table['sent'].loc[index]
        name = table['name_string'].loc[index]
        kin = table['kin'].loc[index]
        length = len(name)
        start = subcontent.find(name)
        if start != -1:
            start = start + content.find(subcontent)
            end = start + length
            if start != end:
                list.append([start, end, kin])
                
    table1 = df_kin_extracted1[df_kin_extracted1['content_id'] == id]
    for index in table1.index:
        subcontent = table1['sent'].loc[index]
        name = table1['name_string'].loc[index]
        kin = table1['kin'].loc[index]
        length = len(name)
        start = subcontent.find(name)
        if start != -1:
            start = start + content.find(subcontent)
            end = start + length
            if start != end:
                list.append([start, end, kin])
    
    table2 = df_kin_extracted2[df_kin_extracted2['content_id'] == id]
    for index in table2.index:
        subcontent = table2['sent'].loc[index]
        namelist = table2['name_string'].loc[index]
        kin = table2['kin'].loc[index]
        offset = content.find(subcontent)
        for name in namelist:
            length = len(name)
            start = subcontent.find(name)
            if start != -1:
                start = start + offset
                end = start + length
                if start != end:
                    list.append([start, end, kin])
                
    table3 = df_kin_extracted3[df_kin_extracted3['content_id'] == id]
    for index in table3.index:
        subcontent = table3['sent'].loc[index]
        namelist = table3['name_string'].loc[index]
        kin = table3['kin'].loc[index]
        offset = content.find(subcontent)
        for name in namelist:
            length = len(name)
            start = subcontent.find(name)
            if start != -1:
                start = start + offset
                end = start + length
                if start != end:
                    list.append([start, end, kin])
        
    dict = {"id": id, "text": content, "label":list}
    dict = json.dumps(dict, ensure_ascii=False)
    return dict

In [None]:
with open('data.jsonl', 'w') as f:
    for id in df_kin_extracted['content_id'].unique():
        table = df_kin_extracted[df_kin_extracted['content_id'] == id]
        content = table['content'].iloc[0]
        f.write(getdoccano(id) + "\n")