In [1]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory; later cells open files
# through relative paths such as 'pkl_files/...' and 'data/...'.
import os
os.chdir('/content/drive/MyDrive/CorpusforDDL_compute')
print(os.getcwd())  # echo the directory to confirm the chdir took effect

Mounted at /content/drive
/content/drive/MyDrive/CorpusforDDL_compute


In [96]:
import pandas as pd
import pickle
import re
from collections import Counter

In [4]:
# Load the pickled ASBC test data (path is relative to the working directory).
# Presumably a list of POS-tagged sentence strings: later cells pass each
# element to re.sub / re.findall -- confirm against the file that produced it.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('pkl_files/compute_test/asbc_test.pkl', 'rb') as f:
    asbc_test = pickle.load(f)

In [107]:
def asbc_get_sentence(sentence):
  """Return the list of words contained in a POS-tagged sentence string.

  Parenthesised tags such as ``(VC)`` are removed first; then every remaining
  character that is neither a word character nor whitespace (punctuation,
  stray brackets) is dropped, and what is left is split on whitespace.

  Args:
    sentence: A string in which each word is followed by a parenthesised tag,
      e.g. ``'碰到(VC) 這(Nep)'``.

  Returns:
    A list of non-empty word strings, in sentence order. Empty if no word
    remains (e.g. for an empty string).
  """
  # Spell the tag alphabet out explicitly (letters, digits, underscore).
  # The range ``A-z`` would also cover the ASCII characters [ \ ] ^ ` that sit
  # between 'Z' and 'a'; the underscore is kept so tags like (V_2) still match.
  without_tags = re.sub(r'\([A-Za-z0-9_]+\)', '', sentence)
  without_punct = re.sub(r'[^\w\s]', '', without_tags)

  # split() without an argument splits on any run of whitespace (spaces, tabs,
  # full-width spaces) and never produces empty strings.
  return without_punct.split()

In [108]:
# Turn every tagged sentence into its list of words, then preview the first one.
asbc_test_sent = list(map(asbc_get_sentence, asbc_test))
asbc_test_sent[0]

['碰到', '這', '種', '情形', '真', '是', '會', '令', '人', '又', '氣憤', '又', '苦惱']

In [109]:
# Number of sentences in the test set.
len(asbc_test_sent)

3104

In [135]:
# Collect the parenthesised POS tags of every sentence (parentheses included).
# Limiting the tag length to {1,4} filters out the punctuation tags, i.e. the
# long "...CATEGORY" ones. The character class is written out explicitly
# (letters, digits, underscore): the range ``A-z`` would additionally match the
# ASCII characters [ \ ] ^ ` that lie between 'Z' and 'a'.
asbc_test_pos = [re.findall(r'\([A-Za-z0-9_]{1,4}\)', x) for x in asbc_test]
asbc_test_pos[0]

['(VC)',
 '(Nep)',
 '(Nf)',
 '(Na)',
 '(D)',
 '(SHI)',
 '(D)',
 '(VL)',
 '(Na)',
 '(Caa)',
 '(VH)',
 '(Caa)',
 '(VH)']

## 句子長度
計算方法：算出每一個句子中含有幾個詞彙

In [112]:
# Sentence length = number of words in each tokenised sentence.
asbc_sentence_length = list(map(len, asbc_test_sent))
asbc_sentence_length[:10]

[13, 25, 31, 16, 31, 33, 3, 40, 24, 7]

## 詞頻
計算方法：參考 COCT 詞頻資料，以詞頻 100 為標準（詞頻大於 100 視為高頻詞，其餘及查無資料者視為低頻詞），回傳每一個句子中含有幾個高頻詞、幾個低頻詞

In [113]:
# Read the COCT frequency list (plain-text file) into a list of lines with the
# trailing newline removed.
coct_path = 'data/coct_frequency_list_2019.txt'
# The file holds Chinese text, so pass the encoding explicitly instead of
# relying on the platform default (assumed UTF-8, which is what the default
# resolves to on Colab -- confirm if the file is ever re-exported).
with open(coct_path, encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]

In [80]:
# Keep only the data rows: drop the first 5 and the last 2 lines of the file
# (presumably header / footer text -- confirm against the raw file), and split
# each remaining line on tabs.
coct_data = [row.split('\t') for row in lines[5:-2]]
coct_data[:5]

[['1', '，', '23348406'],
 ['2', '的', '16029838'],
 ['3', '。', '11412297'],
 ['4', '是', '4642407'],
 ['5', '一', '3701681']]

In [95]:
# Column 1 of each row is the word, column 2 its raw frequency (as text).
keys = [entry[1] for entry in coct_data]
vals = list(map(int, (entry[2] for entry in coct_data)))
print(keys[:5], vals[:5], sep='\n')

['，', '的', '。', '是', '一']
[23348406, 16029838, 11412297, 4642407, 3701681]


In [93]:
# Word -> frequency lookup table; if a word occurs more than once in the list,
# the later entry wins.
coct_data_dict = dict(zip(keys, vals))
coct_data_dict.get('我')

3033032

In [97]:
def get_high_low_freq(sentence, threshold=100, freq_dict=None):
  """Count the high- and low-frequency words of a tokenised sentence.

  Args:
    sentence: Iterable of word strings.
    threshold: A word is labelled 'High' when its frequency is strictly
      greater than this value, and 'Low' otherwise. Defaults to 100.
    freq_dict: Mapping from word to frequency. Defaults to the module-level
      ``coct_data_dict``.

  Returns:
    A dict mapping 'High' and/or 'Low' to the number of words with that
    label. A label that never occurs is absent, so an empty sentence gives {}.
  """
  if freq_dict is None:
    freq_dict = coct_data_dict

  # A word missing from the frequency table is given frequency 0, so it is
  # counted as 'Low'. Using .get() keeps that fallback without a bare except
  # that would also hide unrelated errors.
  labels = (
      'High' if freq_dict.get(word, 0) > threshold else 'Low'
      for word in sentence
  )

  return dict(Counter(labels))

In [123]:
# Spot-check the function on one sentence (index 5).
get_high_low_freq(asbc_test_sent[5])

{'High': 32, 'Low': 1}

In [122]:
# High/low-frequency word counts for every sentence; preview the first ten.
high_low_freq = list(map(get_high_low_freq, asbc_test_sent))
high_low_freq[:10]

[{'High': 13},
 {'High': 25},
 {'High': 31},
 {'High': 16},
 {'High': 31},
 {'High': 32, 'Low': 1},
 {'High': 3},
 {'High': 40},
 {'High': 24},
 {'High': 7}]

## 詞彙等級
計算方法：參考國教院詞語分級表，回傳每一個句子中，各級別的詞彙分別有幾個

In [125]:
# 參考國教院詞語分級表
naer_word_list = pd.read_excel('data/臺灣華語文能力基準詞語表_111-09-20.xlsx')
naer_word_list

Unnamed: 0,序號,詞語,等別,級別,情境,書面字頻(每百萬字),口語字頻(每百萬字),簡編本系統號,參考注音,參考漢語拼音
0,1,愛,基礎,第1級,核心詞,535,681,"[['愛', ['39542']]]",ㄞˋ,ài
1,2,吧,基礎,第1級,核心詞,706,748,"[['吧', ['32', '103']]]",˙ㄅㄚ,ba
2,3,八,基礎,第1級,核心詞,214,163,"[['八', ['1']]]",ㄅㄚ,bā
3,4,爸爸/爸,基礎,第1級,核心詞,226,806,"[['爸爸', ['82']], ['爸', ['81']]]",ㄅㄚˋ ˙ㄅㄚ / ㄅㄚˋ,bàba / bà
4,5,百,基礎,第1級,核心詞,108,77,"[['百', ['157', '334']]]",ㄅㄞˇ,bǎi
...,...,...,...,...,...,...,...,...,...,...
14462,14463,左右手,精熟,第7級,,3,2,"[['左右手', ['37450']]]",ㄗㄨㄛˇ ㄧㄡˋ ㄕㄡˇ,zuǒ yòu shǒu
14463,14464,坐鎮,精熟,第7級,,5,2,"[['坐鎮', ['37472']]]",ㄗㄨㄛˋ ㄓㄣˋ,zuò zhèn
14464,14465,佐證,精熟,第7級,,5,4,"[['佐證', ['37456']]]",ㄗㄨㄛˇ ㄓㄥˋ,zuǒ zhèng
14465,14466,坐姿,精熟,第7級,,5,1,[],ㄗㄨㄛˋ ㄗ,zuò zī


In [127]:
# Entries in '詞語' may hold several variants joined by '/' (e.g. "爸爸/爸").
# Split them and give every variant its own row in the new column `word_1`;
# all other columns are repeated for each variant.
# NOTE: this cell rebinds `naer_word_list`; running it again on the already
# expanded frame duplicates the multi-variant rows once more, so reload the
# word list before re-running.
naer_word_list = (
    naer_word_list
    .assign(word_1=lambda frame: frame['詞語'].str.split('/'))
    .explode('word_1')
)
naer_word_list

Unnamed: 0,序號,詞語,等別,級別,情境,書面字頻(每百萬字),口語字頻(每百萬字),簡編本系統號,參考注音,參考漢語拼音,var1,word_1
0,1,愛,基礎,第1級,核心詞,535,681,"[['愛', ['39542']]]",ㄞˋ,ài,[愛],愛
1,2,吧,基礎,第1級,核心詞,706,748,"[['吧', ['32', '103']]]",˙ㄅㄚ,ba,[吧],吧
2,3,八,基礎,第1級,核心詞,214,163,"[['八', ['1']]]",ㄅㄚ,bā,[八],八
3,4,爸爸/爸,基礎,第1級,核心詞,226,806,"[['爸爸', ['82']], ['爸', ['81']]]",ㄅㄚˋ ˙ㄅㄚ / ㄅㄚˋ,bàba / bà,"[爸爸, 爸]",爸爸
3,4,爸爸/爸,基礎,第1級,核心詞,226,806,"[['爸爸', ['82']], ['爸', ['81']]]",ㄅㄚˋ ˙ㄅㄚ / ㄅㄚˋ,bàba / bà,"[爸爸, 爸]",爸
...,...,...,...,...,...,...,...,...,...,...,...,...
14462,14463,左右手,精熟,第7級,,3,2,"[['左右手', ['37450']]]",ㄗㄨㄛˇ ㄧㄡˋ ㄕㄡˇ,zuǒ yòu shǒu,[左右手],左右手
14463,14464,坐鎮,精熟,第7級,,5,2,"[['坐鎮', ['37472']]]",ㄗㄨㄛˋ ㄓㄣˋ,zuò zhèn,[坐鎮],坐鎮
14464,14465,佐證,精熟,第7級,,5,4,"[['佐證', ['37456']]]",ㄗㄨㄛˇ ㄓㄥˋ,zuǒ zhèng,[佐證],佐證
14465,14466,坐姿,精熟,第7級,,5,1,[],ㄗㄨㄛˋ ㄗ,zuò zī,[坐姿],坐姿


In [129]:
# Word -> level lookup table built from the expanded word list; when a word
# appears in several rows, the last row's level is the one kept.
levels = naer_word_list['級別'].values
word_level_dict = dict(zip(naer_word_list['word_1'], levels))

In [130]:
def get_word_level(sentence, level_dict=None):
  """Count how many words of a tokenised sentence fall into each level.

  Args:
    sentence: Iterable of word strings.
    level_dict: Mapping from word to level label. Defaults to the
      module-level ``word_level_dict``.

  Returns:
    A dict mapping each level label that occurs to its word count. Words not
    found in the mapping are counted under 'Unknown'. An empty sentence
    gives {}.
  """
  if level_dict is None:
    level_dict = word_level_dict

  # .get() supplies the 'Unknown' fallback for missing words without a bare
  # except that would also hide unrelated errors.
  return dict(Counter(level_dict.get(word, 'Unknown') for word in sentence))

In [131]:
# Per-sentence word counts by level; preview the first sentence.
word_level = list(map(get_word_level, asbc_test_sent))
word_level[0]

{'第4級': 1, '第1級': 3, 'Unknown': 1, '第3*級': 1, '第1*級': 4, '第5級': 2, '第6級': 1}

## 詞彙長度
計算方法：算出每一個句子中有幾個「長度大於或等於三個字」（即三個字以上）的詞彙

In [132]:
def get_long_word_count(sentence):
  """Count the words in ``sentence`` that are three or more characters long.

  Args:
    sentence: Iterable of word strings.

  Returns:
    The number of words whose length is at least 3 (0 for an empty sentence).
  """
  return sum(1 for word in sentence if len(word) >= 3)

In [133]:
# For every sentence, the number of words that are at least three characters
# long; preview the first ten.
long_word_count = list(map(get_long_word_count, asbc_test_sent))
long_word_count[:10]

[0, 2, 1, 1, 0, 1, 1, 1, 0, 0]

## 完整的句子
計算方法：
1. 以。？！結尾
1. 至少含有一個動詞