# Create a Dataframe of texts

In [3]:
import pandas as pd
import glob

In [None]:
def read_file(file_path: str) -> pd.DataFrame:
    file_path = Path(file_path)
    
    # Check if the file exists
    if not file_path.is_file():
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # Read the file content directly instead of using pd.read_csv
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()
    except Exception as e:
        raise IOError(f"An error occurred while reading the file: {e}")

    # Create a DataFrame with a single row containing the text
    df = pd.DataFrame({'text': [text_content]})
    
    # Extracting metadata from the file name
    file_name = file_path.stem 
    info_list = file_name.split(' ') 
    if len(info_list) < 4:
        raise ValueError(f"Filename '{file_name}' does not contain enough parts separated by spaces.")
    name_str = info_list[0]
    type_str = info_list[1]
    time_str = info_list[2]
    style_str = info_list[3]

    # Adding extracted metadata as new columns in the DataFrame
    df['name'] = name_str
    df['type'] = type_str
    df['time'] = time_str
    df['style'] = style_str
    
    return df

In [5]:
def read_files(folder_path: str) -> pd.DataFrame:
    """
    Read all the txt files in the folder and return a DataFrame
    """
    # Use glob to get the paths of all the txt files in the directory and subdirectory of "./public"
    file_list = glob.glob(folder_path + '/**/*.txt', recursive=True)
    # Read all the files and concatenate them
    df = pd.concat([read_file(file_path) for file_path in file_list], ignore_index=True)
    return df

In [6]:
from pathlib import Path
df_concated = read_files('./public')
df_concated

Unnamed: 0,text,name,type,time,style
0,卷一 古艳部\n\n\n升官\n一官升职，谓其妻曰：“我的官职比前更大了。”妻曰：“官大，...,笑林广记,笑话,近古,散文
1,﻿ 齐民要术 后魏 贾思勰\n ●序\n 《史记》曰：“齐民无盖藏。”如...,齐民要术,农书,中古,散文
2,浮生六记 清 沈三白\n\n ●卷一　闺房记乐\n 余生乾隆癸未冬卜一月二十有二...,浮生六记,自传,近古,散文
3,《牡丹亭》 \n作者：明朝·汤显祖\n\n 天下女子有情，宁有如杜丽娘者乎！梦其人即病...,牡丹亭,剧曲,中古,杂剧
4,全元曲·杂剧\n\n☆【关汉卿】\n \n 关汉卿，名不详，字汉卿，号已斋，又作一斋。大都...,全元曲杂剧,剧曲,中古,杂剧
5,西厢记 元 王实甫\n第一本 张君瑞闹道场杂剧\n\n○楔子\n〔外扮老夫人上开〕老身姓郑，...,西厢记,剧曲,中古,杂剧
6,关张双赴西蜀梦\n\n一折\n\n【仙吕】【点绛唇】织履编席，能勾做大蜀皇帝，非容易。官里旦...,全元曲,剧曲,中古,元曲
7,试一出　先　声\n\n康熙甲子八月 \n\n【蝶恋花】（副末毡巾、道袍、白鬚上）古董先生谁似...,桃花扇,剧曲,中古,杂剧
8,第一回纣王女娲宫进香\n \n \n \n 混沌初分盘古先，太极两仪四象悬，子天...,封神演义,小说,近古,小说
9,醒世恒言 \n冯梦龙 编著 \n 第一卷　两县令竞义婚孤女\n 风水人间不可无，也须阴骘两...,醒世恒言,小说,近古,小说


# Data Preprocessing

In [None]:
# Strip unwanted characters
def clean_text(text: str) -> str:
    cleaned_text = text.replace("\u3000", " ").replace("\n", " ").strip()
    return cleaned_text

In [8]:
df_concated2 = df_concated.copy()
df_concated2['text'] = df_concated['text'].apply(clean_text)
df_concated2

Unnamed: 0,text,name,type,time,style
0,卷一 古艳部 升官 一官升职，谓其妻曰：“我的官职比前更大了。”妻曰：“官大，不知此物...,笑林广记,笑话,近古,散文
1,﻿ 齐民要术 后魏 贾思勰 ●序 《史记》曰：“齐民无盖藏。”如淳注...,齐民要术,农书,中古,散文
2,浮生六记 清 沈三白 ●卷一 闺房记乐 余生乾隆癸未冬卜一月二十有二日，正...,浮生六记,自传,近古,散文
3,《牡丹亭》 作者：明朝·汤显祖 天下女子有情，宁有如杜丽娘者乎！梦其人即病，病即...,牡丹亭,剧曲,中古,杂剧
4,全元曲·杂剧 ☆【关汉卿】 关汉卿，名不详，字汉卿，号已斋，又作一斋。大都（今北京...,全元曲杂剧,剧曲,中古,杂剧
5,西厢记 元 王实甫 第一本 张君瑞闹道场杂剧 ○楔子 〔外扮老夫人上开〕老身姓郑，夫主姓崔...,西厢记,剧曲,中古,杂剧
6,关张双赴西蜀梦 一折 【仙吕】【点绛唇】织履编席，能勾做大蜀皇帝，非容易。官里旦暮朝夕，...,全元曲,剧曲,中古,元曲
7,试一出 先 声 康熙甲子八月 【蝶恋花】（副末毡巾、道袍、白鬚上）古董先生谁似我？非玉...,桃花扇,剧曲,中古,杂剧
8,第一回纣王女娲宫进香 混沌初分盘古先，太极两仪四象悬，子天丑地人寅出，...,封神演义,小说,近古,小说
9,醒世恒言 冯梦龙 编著 第一卷 两县令竞义婚孤女 风水人间不可无，也须阴骘两相扶。...,醒世恒言,小说,近古,小说


# Cut Sentences

In [None]:
import re

def cut_sentences(text: str) -> list[str]:
    split_pattern = r'([。！？…])'
    parts = re.split(split_pattern, text)
    sentences = []
    current_sentence = ""
    for part in parts:
        if re.match(split_pattern, part): 
            current_sentence += part
            sentences.append(current_sentence.strip())
            current_sentence = ""
        else:
            current_sentence += part
    if current_sentence.strip():
        sentences.append(current_sentence.strip())
    final_sentences = []
    i = 0
    while i < len(sentences):
        s = sentences[i]
        if i + 1 < len(sentences) and re.fullmatch(r'”|」|』|》', sentences[i+1].strip()):
            final_sentences.append(s + sentences[i+1].strip())
            i += 2
        else:
            final_sentences.append(s)
            i += 1
    return [s for s in final_sentences if s.strip()]

In [None]:
# create a dataframe, for each sentence, add the name, type, time, style
def add_sentences(df_concated2: pd.DataFrame) -> pd.DataFrame:
    df_concated3 = df_concated2.copy()
    df_concated3['sentences'] = df_concated2['text'].apply(cut_sentences)
    # drop rows with empty sentences, i.e. empty list
    df_concated3 = df_concated3[df_concated3['sentences'].map(len) > 0]
    df_concated3 = df_concated3.explode('sentences')
    df_concated3 = df_concated3.reset_index(drop=True)
    df_concated3 = df_concated3[["sentences",'name','type','time','style']]
    return df_concated3

In [11]:
df_concated3 = add_sentences(df_concated2)
df_concated3

Unnamed: 0,sentences,name,type,time,style
0,卷一 古艳部 升官 一官升职，谓其妻曰：“我的官职比前更大了。,笑林广记,笑话,近古,散文
1,”妻曰：“官大，不知此物亦大不？,笑林广记,笑话,近古,散文
2,”官曰：“自然。,笑林广记,笑话,近古,散文
3,”及行事，妻怪其藐小如故，官曰：“大了许多，汝自不觉着。,笑林广记,笑话,近古,散文
4,”妻曰：“如何不觉？,笑林广记,笑话,近古,散文
...,...,...,...,...,...
2193804,为听杨柳曲，行役几伤心。,全唐诗,诗歌,中古,近体诗
2193805,卷903_39 【素】李峤 濯手天津女，纤腰洛浦妃。,全唐诗,诗歌,中古,近体诗
2193806,鱼肠远方至，雁足上林飞。,全唐诗,诗歌,中古,近体诗
2193807,妙夺鲛绡色，光腾月扇辉。,全唐诗,诗歌,中古,近体诗


# Split corpus
把语料分成一批一批处理，一次性处理要耗费数小时，中途失败可能要重来。因此分批处理，并保存结果。
另外，需要去除长句子，有的长句子不是句子，却长达数千字，可能占用30g以上内存导致python内核崩溃。

In [None]:
test_corpus = df_concated3['sentences'].to_list()
# split test_corpus by 10000
test_corpus_split = [test_corpus[i:i + 10000] for i in range(0, len(test_corpus), 10000)]

--- Test Corpus 0 ---
Number of sentences: 10000
--- Test Corpus 1 ---
Number of sentences: 10000
--- Test Corpus 2 ---
Number of sentences: 10000
--- Test Corpus 3 ---
Number of sentences: 10000
--- Test Corpus 4 ---
Number of sentences: 10000
--- Test Corpus 5 ---
Number of sentences: 10000
--- Test Corpus 6 ---
Number of sentences: 10000
--- Test Corpus 7 ---
Number of sentences: 10000
--- Test Corpus 8 ---
Number of sentences: 10000
--- Test Corpus 9 ---
Number of sentences: 10000
--- Test Corpus 10 ---
Number of sentences: 10000
--- Test Corpus 11 ---
Number of sentences: 10000
--- Test Corpus 12 ---
Number of sentences: 10000
--- Test Corpus 13 ---
Number of sentences: 10000
--- Test Corpus 14 ---
Number of sentences: 10000
--- Test Corpus 15 ---
Number of sentences: 10000
--- Test Corpus 16 ---
Number of sentences: 10000
--- Test Corpus 17 ---
Number of sentences: 10000
--- Test Corpus 18 ---
Number of sentences: 10000
--- Test Corpus 19 ---
Number of sentences: 10000
--- Test C

In [None]:
# show max 5 len item in test_corpus_split[88]
for i, corpus in enumerate(test_corpus_split[88]):
    if len(corpus) > 300:
        print(f"--- Test Corpus No.{i} ---")
        print(f"Number of characters: {len(corpus)}")
        print(corpus)

# get rid of items with len > 300
test_corpus_split_limited = []
for i, splited in enumerate(test_corpus_split):
    if i >= 88:
        list_temp = [corpus for corpus in splited if len(corpus) <= 300]
        test_corpus_split_limited.append(list_temp)
    else:
        test_corpus_split_limited.append(splited)

test_corpus_split_merged = []
for i in test_corpus_split_limited:
    test_corpus_split_merged += i
len(test_corpus_split_merged)

for i, corpus in enumerate(test_corpus_split_limited):
    print(f"--- Test Corpus No.{i} ---")
    print(f"Number of characters: {len(corpus)}")

--- Test Corpus No.1071 ---
Number of characters: 397
武散阶四十有五；从一品曰骠骑大将军；正二品曰辅国大将军；从二品曰镇军大将军；正三品上曰冠军大将军、怀化大将军；正三品下曰怀化将军；从三品上曰云麾将军、归德大将军；从三品下曰归德将军；正四品上曰忠武将军；正四品下曰壮武将军、怀化中郎将；从四品上曰宣威将军；从四品下曰明威将军、归德中郎将；正五品上曰定远将军；正五品下曰宁远将军、怀化郎将；从五品上曰游骑将军；从五品下曰游击将军、归德郎将；正六品上曰昭武校尉；正六品下曰昭武副尉、怀化司阶；从六品上曰振威校尉；从六品下曰振威副尉、归德司阶；正七品上曰致果校尉；正七品下曰致果副尉、怀化中候；从七品上曰翊麾校尉；从七品下曰翊麾副尉、归德中候；正八品上曰宣节校尉；正八品下曰宣节副尉、怀化司戈；从八品上曰御侮校尉；从八品下曰御侮副尉、归德司戈；正九品上曰仁勇校尉；正九品下曰仁勇副尉、怀化执戟长上；从九品上曰陪戎校尉；从九品下曰陪戎副尉、归德执戟长上。
--- Test Corpus No.3332 ---
Number of characters: 482
唐初，兵之戍边者，大曰军，小曰守捉，曰城，曰镇，而总之者曰道：若卢龙军一，东军等守捉十一，曰平卢道；横海、北平、高阳、经略、安塞、纳降、唐兴、渤海、怀柔、威武、镇远、静塞、雄武、镇安、怀远、保定军十六，曰范阳道；天兵、大同、天安、横野军四，岢岚等守捉五，曰河东道；朔方经略、丰安、定远、新昌、天柱、宥州经略、横塞、天德、天安军九，三受降、丰宁、保宁、乌延等六城，新泉守捉一，曰关内道；赤水、大斗、白亭、豆卢、墨离、建康、宁寇、玉门、伊吾、天山军十，乌城等守捉十四，曰河西道；瀚海、清海、静塞军三，沙钵等守捉十，曰北庭道；保大军一，鹰娑都督一，兰城等守捉八，曰安西道；镇西、天成、振威、安人、绥戎、河源、白水、天威、榆林、临洮、莫门、神策、宁边、威胜、金天、武宁、曜武、积石军十八，平夷、绥和、合川守捉三，曰陇右道；威戎、安夷、昆明、宁远、洪源、通化、松当、平戎、天保、威远军十，羊灌田等守捉十五，新安等城三十二，犍为等镇三十八，曰剑南道；岭南、安南、桂管、邕管、容管经略、清海军六，曰岭南道；福州经略军一，曰江南道；平海军一，东牟、东莱守捉二，蓬莱镇一，曰河南道。
--- Test Co

# Get tokens, pos and dep
hanlp的mtl工具包能一次性处理分词、词性标注和依存句法分析任务

In [None]:
import hanlp
import pickle
mtl = hanlp.load(hanlp.pretrained.mtl.KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH)
for i, corpus in enumerate(test_corpus_split_limited):
    # check if the pickle file exists
    if not Path(f'./temp/processed_sentences_{i}.pkl').is_file():
        # Process the sentences with the MTL model
        print(f"--- Test Corpus {i} ---")
        processed_sentences = mtl(corpus, tasks=['tok/coarse', 'pos/upos', 'dep'])
        # Save the processed sentences to a pickle file
        with open(f'./temp/processed_sentences_{i}.pkl', 'wb') as f:
            pickle.dump(processed_sentences, f)
        del processed_sentences
    else:
        continue

                                   

--- Test Corpus 93 ---
--- Test Corpus 94 ---
--- Test Corpus 95 ---
--- Test Corpus 96 ---
--- Test Corpus 97 ---
--- Test Corpus 98 ---
--- Test Corpus 99 ---
--- Test Corpus 100 ---
--- Test Corpus 101 ---
--- Test Corpus 102 ---
--- Test Corpus 103 ---
--- Test Corpus 104 ---
--- Test Corpus 105 ---
--- Test Corpus 106 ---
--- Test Corpus 107 ---
--- Test Corpus 108 ---
--- Test Corpus 109 ---
--- Test Corpus 110 ---
--- Test Corpus 111 ---
--- Test Corpus 112 ---
--- Test Corpus 113 ---
--- Test Corpus 114 ---
--- Test Corpus 115 ---
--- Test Corpus 116 ---
--- Test Corpus 117 ---
--- Test Corpus 118 ---
--- Test Corpus 119 ---
--- Test Corpus 120 ---
--- Test Corpus 121 ---
--- Test Corpus 122 ---
--- Test Corpus 123 ---
--- Test Corpus 124 ---
--- Test Corpus 125 ---
--- Test Corpus 126 ---
--- Test Corpus 127 ---
--- Test Corpus 128 ---
--- Test Corpus 129 ---
--- Test Corpus 130 ---
--- Test Corpus 131 ---
--- Test Corpus 132 ---
--- Test Corpus 133 ---
--- Test Corpus 134 ---

In [None]:
import pickle
import os
import glob
from pathlib import Path

# read the pickle files and concatenate them
def read_pickle_files(folder_path: str) -> list:
    file_list = glob.glob(folder_path + '/*.pkl')
    # Sort the file_list by the numbers before the ".pkl" extension
    file_list.sort(key=lambda x: int(Path(x).stem.split('_')[-1]))
    
    print(file_list)
    processed_sentences = []
    for file_path in file_list:
        # print(f"Reading file: {file_path}")
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
            processed_sentences.append(data)
    return processed_sentences
processed_sentences = read_pickle_files('./temp')
len(processed_sentences)

['./results/processed_sentences_0.pkl', './results/processed_sentences_1.pkl', './results/processed_sentences_2.pkl', './results/processed_sentences_3.pkl', './results/processed_sentences_4.pkl', './results/processed_sentences_5.pkl', './results/processed_sentences_6.pkl', './results/processed_sentences_7.pkl', './results/processed_sentences_8.pkl', './results/processed_sentences_9.pkl', './results/processed_sentences_10.pkl', './results/processed_sentences_11.pkl', './results/processed_sentences_12.pkl', './results/processed_sentences_13.pkl', './results/processed_sentences_14.pkl', './results/processed_sentences_15.pkl', './results/processed_sentences_16.pkl', './results/processed_sentences_17.pkl', './results/processed_sentences_18.pkl', './results/processed_sentences_19.pkl', './results/processed_sentences_20.pkl', './results/processed_sentences_21.pkl', './results/processed_sentences_22.pkl', './results/processed_sentences_23.pkl', './results/processed_sentences_24.pkl', './result

220

In [24]:
processed_sentences_df = pd.DataFrame()
count = 0
for i in processed_sentences:
    df_temp = pd.DataFrame(i)
    print(f"--- Processed Sentences {count} ---")
    count += 1
    print(len(df_temp))
    processed_sentences_df = pd.concat([processed_sentences_df, df_temp], ignore_index=True)
processed_sentences_df.reset_index(drop=True, inplace=True)
# Display the first few rows of the processed sentences DataFrame
processed_sentences_df

--- Processed Sentences 0 ---
10000
--- Processed Sentences 1 ---
10000
--- Processed Sentences 2 ---
10000
--- Processed Sentences 3 ---
10000
--- Processed Sentences 4 ---
10000
--- Processed Sentences 5 ---
10000
--- Processed Sentences 6 ---
10000
--- Processed Sentences 7 ---
10000
--- Processed Sentences 8 ---
10000
--- Processed Sentences 9 ---
10000
--- Processed Sentences 10 ---
10000
--- Processed Sentences 11 ---
10000
--- Processed Sentences 12 ---
10000
--- Processed Sentences 13 ---
10000
--- Processed Sentences 14 ---
10000
--- Processed Sentences 15 ---
10000
--- Processed Sentences 16 ---
10000
--- Processed Sentences 17 ---
10000
--- Processed Sentences 18 ---
10000
--- Processed Sentences 19 ---
10000
--- Processed Sentences 20 ---
10000
--- Processed Sentences 21 ---
10000
--- Processed Sentences 22 ---
10000
--- Processed Sentences 23 ---
10000
--- Processed Sentences 24 ---
10000
--- Processed Sentences 25 ---
10000
--- Processed Sentences 26 ---
10000
--- Process

Unnamed: 0,tok/coarse,pos/upos,dep
0,"[卷, 一, 古艳, 部, 升, 官, 一, 官, 升, 职, ，, 谓, 其, 妻, 曰,...","[NOUN, NUM, PROPN, NOUN, VERB, NOUN, NUM, NOUN...","[(3, nmod), (4, nummod), (4, nmod), (5, nsubj)..."
1,"[”, 妻, 曰, ：, “, 官, 大, ，, 不, 知, 此, 物, 亦, 大, 不, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, NOUN, VERB, ...","[(3, discourse), (3, nsubj), (0, root), (7, di..."
2,"[”, 官, 曰, ：, “, 自然, 。]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, PRON, PUNCT]","[(3, nsubj:outer), (3, nsubj), (0, root), (3, ..."
3,"[”, 及, 行事, ，, 妻, 怪, 其, 藐, 小, 如, 故, ，, 官, 曰, ：,...","[PUNCT, VERB, VERB, PUNCT, NOUN, VERB, PRON, V...","[(2, nsubj), (0, root), (2, ccomp), (6, csubj:..."
4,"[”, 妻, 曰, ：, “, 如, 何, 不, 觉, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, VERB, PRON, ...","[(3, discourse), (3, nsubj), (0, root), (3, di..."
...,...,...,...
2193747,"[为, 听, 杨柳, 曲, ，, 行役, 几, 伤, 心, 。]","[ADP, VERB, NOUN, NOUN, PUNCT, VERB, ADV, VERB...","[(2, advmod), (8, advcl), (4, nmod), (2, obj),..."
2193748,"[卷, 903, _, 39, 【, 素, 】, 李峤, 濯手, 天津, 女, ，, 纤腰,...","[VERB, NUM, PUNCT, NUM, PUNCT, NOUN, PUNCT, PR...","[(11, nsubj), (11, nummod), (2, discourse:sp),..."
2193749,"[鱼肠, 远方, 至, ，, 雁足, 上林, 飞, 。]","[NOUN, VERB, VERB, PUNCT, NOUN, PROPN, VERB, P...","[(3, nsubj), (3, obl:lmod), (0, root), (3, dis..."
2193750,"[妙, 夺, 鲛绡, 色, ，, 光, 腾, 月扇, 辉, 。]","[ADV, VERB, NOUN, NOUN, PUNCT, NOUN, VERB, NOU...","[(2, advmod), (0, root), (4, nmod), (2, obj), ..."


In [25]:
processed_sentences_df['sentences'] = test_corpus_split_merged
processed_sentences_df

Unnamed: 0,tok/coarse,pos/upos,dep,sentences
0,"[卷, 一, 古艳, 部, 升, 官, 一, 官, 升, 职, ，, 谓, 其, 妻, 曰,...","[NOUN, NUM, PROPN, NOUN, VERB, NOUN, NUM, NOUN...","[(3, nmod), (4, nummod), (4, nmod), (5, nsubj)...",卷一 古艳部 升官 一官升职，谓其妻曰：“我的官职比前更大了。
1,"[”, 妻, 曰, ：, “, 官, 大, ，, 不, 知, 此, 物, 亦, 大, 不, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, NOUN, VERB, ...","[(3, discourse), (3, nsubj), (0, root), (7, di...",”妻曰：“官大，不知此物亦大不？
2,"[”, 官, 曰, ：, “, 自然, 。]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, PRON, PUNCT]","[(3, nsubj:outer), (3, nsubj), (0, root), (3, ...",”官曰：“自然。
3,"[”, 及, 行事, ，, 妻, 怪, 其, 藐, 小, 如, 故, ，, 官, 曰, ：,...","[PUNCT, VERB, VERB, PUNCT, NOUN, VERB, PRON, V...","[(2, nsubj), (0, root), (2, ccomp), (6, csubj:...",”及行事，妻怪其藐小如故，官曰：“大了许多，汝自不觉着。
4,"[”, 妻, 曰, ：, “, 如, 何, 不, 觉, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, VERB, PRON, ...","[(3, discourse), (3, nsubj), (0, root), (3, di...",”妻曰：“如何不觉？
...,...,...,...,...
2193747,"[为, 听, 杨柳, 曲, ，, 行役, 几, 伤, 心, 。]","[ADP, VERB, NOUN, NOUN, PUNCT, VERB, ADV, VERB...","[(2, advmod), (8, advcl), (4, nmod), (2, obj),...",为听杨柳曲，行役几伤心。
2193748,"[卷, 903, _, 39, 【, 素, 】, 李峤, 濯手, 天津, 女, ，, 纤腰,...","[VERB, NUM, PUNCT, NUM, PUNCT, NOUN, PUNCT, PR...","[(11, nsubj), (11, nummod), (2, discourse:sp),...",卷903_39 【素】李峤 濯手天津女，纤腰洛浦妃。
2193749,"[鱼肠, 远方, 至, ，, 雁足, 上林, 飞, 。]","[NOUN, VERB, VERB, PUNCT, NOUN, PROPN, VERB, P...","[(3, nsubj), (3, obl:lmod), (0, root), (3, dis...",鱼肠远方至，雁足上林飞。
2193750,"[妙, 夺, 鲛绡, 色, ，, 光, 腾, 月扇, 辉, 。]","[ADV, VERB, NOUN, NOUN, PUNCT, NOUN, VERB, NOU...","[(2, advmod), (0, root), (4, nmod), (2, obj), ...",妙夺鲛绡色，光腾月扇辉。


In [None]:
processed_sentences_df.to_pickle('./results/processed_sentences_df.pkl')

# Revise Data Format

In [1]:
import pandas as pd
processed_sentences_df = pd.read_pickle('./results/processed_sentences_df.pkl')
processed_sentences_df

Unnamed: 0,tok/coarse,pos/upos,dep,sentences
0,"[卷, 一, 古艳, 部, 升, 官, 一, 官, 升, 职, ，, 谓, 其, 妻, 曰,...","[NOUN, NUM, PROPN, NOUN, VERB, NOUN, NUM, NOUN...","[(3, nmod), (4, nummod), (4, nmod), (5, nsubj)...",卷一 古艳部 升官 一官升职，谓其妻曰：“我的官职比前更大了。
1,"[”, 妻, 曰, ：, “, 官, 大, ，, 不, 知, 此, 物, 亦, 大, 不, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, NOUN, VERB, ...","[(3, discourse), (3, nsubj), (0, root), (7, di...",”妻曰：“官大，不知此物亦大不？
2,"[”, 官, 曰, ：, “, 自然, 。]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, PRON, PUNCT]","[(3, nsubj:outer), (3, nsubj), (0, root), (3, ...",”官曰：“自然。
3,"[”, 及, 行事, ，, 妻, 怪, 其, 藐, 小, 如, 故, ，, 官, 曰, ：,...","[PUNCT, VERB, VERB, PUNCT, NOUN, VERB, PRON, V...","[(2, nsubj), (0, root), (2, ccomp), (6, csubj:...",”及行事，妻怪其藐小如故，官曰：“大了许多，汝自不觉着。
4,"[”, 妻, 曰, ：, “, 如, 何, 不, 觉, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, VERB, PRON, ...","[(3, discourse), (3, nsubj), (0, root), (3, di...",”妻曰：“如何不觉？
...,...,...,...,...
2193747,"[为, 听, 杨柳, 曲, ，, 行役, 几, 伤, 心, 。]","[ADP, VERB, NOUN, NOUN, PUNCT, VERB, ADV, VERB...","[(2, advmod), (8, advcl), (4, nmod), (2, obj),...",为听杨柳曲，行役几伤心。
2193748,"[卷, 903, _, 39, 【, 素, 】, 李峤, 濯手, 天津, 女, ，, 纤腰,...","[VERB, NUM, PUNCT, NUM, PUNCT, NOUN, PUNCT, PR...","[(11, nsubj), (11, nummod), (2, discourse:sp),...",卷903_39 【素】李峤 濯手天津女，纤腰洛浦妃。
2193749,"[鱼肠, 远方, 至, ，, 雁足, 上林, 飞, 。]","[NOUN, VERB, VERB, PUNCT, NOUN, PROPN, VERB, P...","[(3, nsubj), (3, obl:lmod), (0, root), (3, dis...",鱼肠远方至，雁足上林飞。
2193750,"[妙, 夺, 鲛绡, 色, ，, 光, 腾, 月扇, 辉, 。]","[ADV, VERB, NOUN, NOUN, PUNCT, NOUN, VERB, NOU...","[(2, advmod), (0, root), (4, nmod), (2, obj), ...",妙夺鲛绡色，光腾月扇辉。


In [12]:
# inner join with df_concated3 by column 'sentences'
df_joined = pd.merge(df_concated3, processed_sentences_df, on='sentences', how='inner')
df_joined.drop_duplicates(subset=['sentences', 'name'], inplace=True)
df_joined.reset_index(drop=True, inplace=True)
df_joined = df_joined[['tok/coarse', 'pos/upos','dep', 'name','sentences']]
df_joined.rename(columns={'tok/coarse': 'token', 'pos/upos': 'upos'}, inplace=True)
df_joined

Unnamed: 0,token,upos,dep,name,sentences
0,"[卷, 一, 古艳, 部, 升, 官, 一, 官, 升, 职, ，, 谓, 其, 妻, 曰,...","[NOUN, NUM, PROPN, NOUN, VERB, NOUN, NUM, NOUN...","[(3, nmod), (4, nummod), (4, nmod), (5, nsubj)...",笑林广记,卷一 古艳部 升官 一官升职，谓其妻曰：“我的官职比前更大了。
1,"[”, 妻, 曰, ：, “, 官, 大, ，, 不, 知, 此, 物, 亦, 大, 不, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, NOUN, VERB, ...","[(3, discourse), (3, nsubj), (0, root), (7, di...",笑林广记,”妻曰：“官大，不知此物亦大不？
2,"[”, 官, 曰, ：, “, 自然, 。]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, PRON, PUNCT]","[(3, nsubj:outer), (3, nsubj), (0, root), (3, ...",笑林广记,”官曰：“自然。
3,"[”, 及, 行事, ，, 妻, 怪, 其, 藐, 小, 如, 故, ，, 官, 曰, ：,...","[PUNCT, VERB, VERB, PUNCT, NOUN, VERB, PRON, V...","[(2, nsubj), (0, root), (2, ccomp), (6, csubj:...",笑林广记,”及行事，妻怪其藐小如故，官曰：“大了许多，汝自不觉着。
4,"[”, 妻, 曰, ：, “, 如, 何, 不, 觉, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, VERB, PRON, ...","[(3, discourse), (3, nsubj), (0, root), (3, di...",笑林广记,”妻曰：“如何不觉？
...,...,...,...,...,...
2158053,"[参差, 横, 凤, 翼, ，, 搜索, 动, 猿吟, 。]","[VERB, VERB, NOUN, NOUN, PUNCT, VERB, VERB, NO...","[(2, nsubj), (0, root), (4, nmod), (2, obj), (...",全唐诗,参差横凤翼，搜索动猿吟。
2158054,"[灵鹤, 时, 来, 到, ，, 仙人, 幸, 见, 寻, 。]","[NOUN, NOUN, VERB, VERB, PUNCT, NOUN, ADV, VER...","[(3, nsubj), (3, obl:tmod), (0, root), (3, par...",全唐诗,灵鹤时来到，仙人幸见寻。
2158055,"[为, 听, 杨柳, 曲, ，, 行役, 几, 伤, 心, 。]","[ADP, VERB, NOUN, NOUN, PUNCT, VERB, ADV, VERB...","[(2, advmod), (8, advcl), (4, nmod), (2, obj),...",全唐诗,为听杨柳曲，行役几伤心。
2158056,"[卷, 903, _, 39, 【, 素, 】, 李峤, 濯手, 天津, 女, ，, 纤腰,...","[VERB, NUM, PUNCT, NUM, PUNCT, NOUN, PUNCT, PR...","[(11, nsubj), (11, nummod), (2, discourse:sp),...",全唐诗,卷903_39 【素】李峤 濯手天津女，纤腰洛浦妃。


In [15]:
df_books = df_concated[['name', 'type', 'time', 'style']]
df_books

Unnamed: 0,name,type,time,style
0,笑林广记,笑话,近古,散文
1,齐民要术,农书,中古,散文
2,浮生六记,自传,近古,散文
3,牡丹亭,剧曲,中古,杂剧
4,全元曲杂剧,剧曲,中古,杂剧
5,西厢记,剧曲,中古,杂剧
6,全元曲,剧曲,中古,元曲
7,桃花扇,剧曲,中古,杂剧
8,封神演义,小说,近古,小说
9,醒世恒言,小说,近古,小说


In [34]:
# inner join with df_joined by column 'name'
df_temp = pd.merge(df_joined, df_books, on='name', how='inner')
df_temp

Unnamed: 0,token,upos,dep,name,sentences,type,time,style
0,"[卷, 一, 古艳, 部, 升, 官, 一, 官, 升, 职, ，, 谓, 其, 妻, 曰,...","[NOUN, NUM, PROPN, NOUN, VERB, NOUN, NUM, NOUN...","[(3, nmod), (4, nummod), (4, nmod), (5, nsubj)...",笑林广记,卷一 古艳部 升官 一官升职，谓其妻曰：“我的官职比前更大了。,笑话,近古,散文
1,"[”, 妻, 曰, ：, “, 官, 大, ，, 不, 知, 此, 物, 亦, 大, 不, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, NOUN, VERB, ...","[(3, discourse), (3, nsubj), (0, root), (7, di...",笑林广记,”妻曰：“官大，不知此物亦大不？,笑话,近古,散文
2,"[”, 官, 曰, ：, “, 自然, 。]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, PRON, PUNCT]","[(3, nsubj:outer), (3, nsubj), (0, root), (3, ...",笑林广记,”官曰：“自然。,笑话,近古,散文
3,"[”, 及, 行事, ，, 妻, 怪, 其, 藐, 小, 如, 故, ，, 官, 曰, ：,...","[PUNCT, VERB, VERB, PUNCT, NOUN, VERB, PRON, V...","[(2, nsubj), (0, root), (2, ccomp), (6, csubj:...",笑林广记,”及行事，妻怪其藐小如故，官曰：“大了许多，汝自不觉着。,笑话,近古,散文
4,"[”, 妻, 曰, ：, “, 如, 何, 不, 觉, ？]","[PUNCT, NOUN, VERB, PUNCT, PUNCT, VERB, PRON, ...","[(3, discourse), (3, nsubj), (0, root), (3, di...",笑林广记,”妻曰：“如何不觉？,笑话,近古,散文
...,...,...,...,...,...,...,...,...
2158053,"[参差, 横, 凤, 翼, ，, 搜索, 动, 猿吟, 。]","[VERB, VERB, NOUN, NOUN, PUNCT, VERB, VERB, NO...","[(2, nsubj), (0, root), (4, nmod), (2, obj), (...",全唐诗,参差横凤翼，搜索动猿吟。,诗歌,中古,近体诗
2158054,"[灵鹤, 时, 来, 到, ，, 仙人, 幸, 见, 寻, 。]","[NOUN, NOUN, VERB, VERB, PUNCT, NOUN, ADV, VER...","[(3, nsubj), (3, obl:tmod), (0, root), (3, par...",全唐诗,灵鹤时来到，仙人幸见寻。,诗歌,中古,近体诗
2158055,"[为, 听, 杨柳, 曲, ，, 行役, 几, 伤, 心, 。]","[ADP, VERB, NOUN, NOUN, PUNCT, VERB, ADV, VERB...","[(2, advmod), (8, advcl), (4, nmod), (2, obj),...",全唐诗,为听杨柳曲，行役几伤心。,诗歌,中古,近体诗
2158056,"[卷, 903, _, 39, 【, 素, 】, 李峤, 濯手, 天津, 女, ，, 纤腰,...","[VERB, NUM, PUNCT, NUM, PUNCT, NOUN, PUNCT, PR...","[(11, nsubj), (11, nummod), (2, discourse:sp),...",全唐诗,卷903_39 【素】李峤 濯手天津女，纤腰洛浦妃。,诗歌,中古,近体诗


# save data

In [18]:
# save df_joined and df_books to pickle files
df_joined.to_pickle('./results/tok_pos_dep.pkl')
df_books.to_pickle('./results/books_info.pkl')