In [1]:
import os
# Show the kernel's current working directory; relative paths used later in
# the notebook resolve against it.
print(os.getcwd())

/Users/andreashih/Desktop/compute_parameter/compute_parameter


In [2]:
import pandas as pd
import re
import unicodedata
from collections import Counter
import xml.etree.ElementTree as ET

In [3]:
# 載入事先寫好的 functions
from compute_parameters import *

In [4]:
class Corpus():
  """Hold three parallel views of a corpus and compute per-sentence metrics.

  Each view is a nested iterable: an outer sequence of texts, each of which
  is a sequence of sentences. Every metric method returns a list of lists
  with that same nesting (one inner list per text, one value per sentence).

  Attributes:
    ws: view read by the word-based metrics (sentence length, frequencies,
      word level, long-word count).
    ws_pos: view read by the completeness and greylist checks.
    text: view read by the blacklist check.
  """

  def __init__(self, ws, ws_pos, text):
    self.ws = ws
    self.ws_pos = ws_pos
    self.text = text

  @staticmethod
  def _per_sentence(texts, func):
    """Apply ``func`` to every sentence of every text, preserving the nesting."""
    return [[func(sentence) for sentence in text] for text in texts]

  def sentence_length(self):
    """Return ``len(sentence)`` for every sentence in ``self.ws``."""
    return self._per_sentence(self.ws, len)

  def high_low_freq(self):
    """Return ``get_high_low_freq(sentence)`` for every sentence in ``self.ws``."""
    return self._per_sentence(self.ws, get_high_low_freq)

  def word_freq(self):
    """Return ``get_word_freq(sentence)`` for every sentence in ``self.ws``."""
    return self._per_sentence(self.ws, get_word_freq)

  def word_level(self):
    """Return ``get_word_level(sentence)`` for every sentence in ``self.ws``."""
    return self._per_sentence(self.ws, get_word_level)

  def long_word_count(self):
    """Return ``get_long_word_count(sentence)`` for every sentence in ``self.ws``."""
    return self._per_sentence(self.ws, get_long_word_count)

  def is_complete_sentence(self):
    """Return ``get_complete_sentence(sentence)`` for every sentence in ``self.ws_pos``."""
    return self._per_sentence(self.ws_pos, get_complete_sentence)

  def is_complete_context(self):
    """Return ``get_complete_context(sentence)`` for every sentence in ``self.ws_pos``."""
    return self._per_sentence(self.ws_pos, get_complete_context)

  def is_greylist(self):
    """Return ``get_greylist(sentence)`` for every sentence in ``self.ws_pos``."""
    return self._per_sentence(self.ws_pos, get_greylist)

  def is_blacklist(self):
    """Return ``get_blacklist(sentence)`` for every sentence in ``self.text``."""
    return self._per_sentence(self.text, get_blacklist)

In [5]:
from nltk.text import Text
def make_concordance_df(target_words, ws, lines=25):
  """Build one concordance table per target word.

  Args:
    target_words: iterable of words to look up.
    ws: iterable of sentences, each an iterable of tokens. The sentences are
      flattened into a single token stream before searching, so a context
      window may span sentence boundaries.
    lines: maximum number of hits returned per word, forwarded to
      ``Text.concordance_list``. The default of 25 is nltk's own default, so
      existing callers get the same results; pass a larger value to avoid
      truncating frequent words.

  Returns:
    A list of DataFrames, one per entry of ``target_words`` and in the same
    order, with columns ``left_word``, ``target_word``, ``right_word`` and
    ``context``. A word without hits yields an empty DataFrame.
  """
  tokens = [token for sentence in ws for token in sentence]
  searchable = Text(tokens)
  dfs = []

  for word in target_words:
    hits = searchable.concordance_list(word, lines=lines)

    # A hit on the very first / very last token of the stream has an empty
    # left / right context; fall back to '' instead of raising IndexError.
    left_word = [hit.left[-1] if hit.left else '' for hit in hits]
    right_word = [hit.right[0] if hit.right else '' for hit in hits]
    context = [' '.join(hit.left + [word] + hit.right) for hit in hits]

    df = pd.DataFrame({'left_word': left_word,
                       'target_word': word,
                       'right_word': right_word,
                       'context': context})
    dfs.append(df)

  return dfs

## naer_written

In [6]:
# Directory holding the naer_written corpus files, relative to the working directory.
naer_written_path = '../corpora/naer_written'

# List the directory entries and sort them in place so the file order is stable.
all_files = os.listdir(naer_written_path)
all_files.sort()

In [17]:
# Number of directory entries found in naer_written_path.
len(all_files)

116

In [22]:
# Parse every corpus file into an ElementTree.
#
# A file that cannot be read or parsed is reported and skipped. Appending
# unconditionally after a swallowed exception would re-append the previous
# file's tree (or raise NameError if the very first file fails), silently
# duplicating a document under the wrong file name.
trees = []
parsed_files = []
for f in all_files:
    try:
        tree = ET.parse(f'{naer_written_path}/{f}')
    except (ET.ParseError, OSError) as err:
        print(f'Skipping {f}: {err}')
        continue
    trees.append(tree)
    parsed_files.append(f)

# Keep all_files index-aligned with trees: later cells zip all_files with the
# per-document results to build the output file names.
all_files = parsed_files

In [24]:
# Root element of each parsed document, in the same order as `trees`.
roots = [parsed.getroot() for parsed in trees]

In [26]:
# Collect the text of every `sentence` element, one list per document.
naer_written_corpus = []
for root in roots:
    # Element.text is None for an element that has no text content; use ''
    # so that later string processing never receives None.
    corp = [sentence.text or '' for sentence in root.iter('sentence')]
    naer_written_corpus.append(corp)

### 語料格式準備 - 1
斷詞 + pos tag

In [28]:
# Preprocessing
def preprocess_naer(string):
  """Normalise one raw corpus string.

  Steps, in order:
    1. NFKC-normalise the string (full-width forms become half-width).
    2. Remove newline characters.
    3. Remove the literal ``(PUNC)`` punctuation tag.
    4. Turn the half-width marks ``, ! ? : ;`` back into full-width ones.

  Args:
    string: the raw text to clean.

  Returns:
    The cleaned string.
  """
  # Half-width -> full-width mapping applied as the last step.
  to_fullwidth = str.maketrans({',': '，',
                                '!': '！',
                                '?': '？',
                                ':': '：',
                                ';': '；'})

  halfwidth = unicodedata.normalize('NFKC', string)
  single_line = re.sub(r'\n', '', halfwidth)
  untagged_punct = re.sub(r'\(PUNC\)', '', single_line)
  return untagged_punct.translate(to_fullwidth)

In [29]:
# View 1: per document, a list of sentence strings (tokens still carry POS tags).
naer_written_ws_pos = []

for document in naer_written_corpus:
  # Clean every raw string and glue the document back into one string so the
  # sentence boundaries can be re-cut on punctuation.
  merged = ''.join(preprocess_naer(raw) for raw in document)

  # Pass 1: cut right after 。」 ！」 ？」.
  # NOTE(review): the sample output shows tokens separated by spaces, so the
  # mark and the closing quote may be separated by a space and never match
  # these patterns — confirm quotes are handled as intended.
  quote_chunks = re.split(r'(?<=。」|！」|？」)', merged)

  # Pass 2: cut right after 。！？ unless a closing quote follows, and flatten
  # the pieces of all chunks into one list of sentences.
  sentences = [piece
               for chunk in quote_chunks
               for piece in re.split(r'(?<=[。！？])(?!」)', chunk)]

  naer_written_ws_pos.append(sentences)

In [30]:
# Inspect the first sentence of the first document (tokens with their POS tags).
naer_written_ws_pos[0][0]

'這(Nep) 個(Nf) 工作(Na) 是(SHI) 秉(Na) 正(D) 第一(Neu) 次(Nf) 做(VC) 獨當一面(Na) 的(DE) 首長(Na) ， 由於(Cbb) 情況(Na) 複雜(VH) ， 各(Nes) 方(Nf) 人士(Na) 頻頻(D) 關心(VK) 建議(VE) ， 秉正(D) 缺乏(VJ) 決斷(Na) 能力(Na) ， 感到(VK) 極(Dfa) 大(VH) 的(DE) 困擾(Na) 。'

### 語料格式準備 - 2
斷好詞，無標點符號

In [31]:
# View 2: per document, a list of sentences, each a list of bare tokens
# (POS tags and punctuation removed).
naer_written_ws = []
for document in naer_written_ws_pos:
  tokenised = []
  for tagged in document:
    # Strip the parenthesised POS tags.
    # NOTE(review): the range A-z also covers [ \ ] ^ _ and the backtick;
    # that is how tags containing '_' are matched here.
    untagged = re.sub(r'\([A-z0-9]+\)', '', tagged)
    # Drop everything that is neither a word character nor whitespace.
    no_punct = re.sub(r'[^\w\s]', '', untagged)
    # Split on single spaces and discard the empty strings left by runs of spaces.
    tokens = [piece for piece in no_punct.strip().split(' ') if piece]
    tokenised.append(tokens)
  naer_written_ws.append(tokenised)

In [32]:
# Inspect the token list of the first sentence of the first document.
naer_written_ws[0][0]

['這',
 '個',
 '工作',
 '是',
 '秉',
 '正',
 '第一',
 '次',
 '做',
 '獨當一面',
 '的',
 '首長',
 '由於',
 '情況',
 '複雜',
 '各',
 '方',
 '人士',
 '頻頻',
 '關心',
 '建議',
 '秉正',
 '缺乏',
 '決斷',
 '能力',
 '感到',
 '極',
 '大',
 '的',
 '困擾']

### 語料格式準備 - 3
完整句子，含標點符號

In [33]:
# View 3: per document, a list of plain sentences (POS tags and spaces
# removed, punctuation kept).
naer_written_text = []

for document in naer_written_ws_pos:
  plain_sentences = [
    re.sub(r'\([A-z0-9_]+\)', '', tagged).replace(' ', '')
    for tagged in document
  ]
  naer_written_text.append(plain_sentences)

In [34]:
# Inspect the plain-text form of the first sentence of the first document.
naer_written_text[0][0]

'這個工作是秉正第一次做獨當一面的首長，由於情況複雜，各方人士頻頻關心建議，秉正缺乏決斷能力，感到極大的困擾。'

### 計算指標

In [35]:
# Bundle the three views into a Corpus so the metrics can be computed from it.
# NOTE(review): this rebinds naer_written_corpus, which an earlier cell uses
# for the list of raw sentence strings, to a Corpus object; re-running the
# earlier loop over naer_written_corpus after this cell would fail.
naer_written_corpus = Corpus(
  ws=naer_written_ws,
  ws_pos=naer_written_ws_pos,
  text=naer_written_text,
)

In [36]:
# Each call returns a nested list: one inner list per document, one value per sentence.

# Metrics read from the tokenised view (Corpus.ws).
naer_written_sentence_length = naer_written_corpus.sentence_length()
naer_written_high_low_freq = naer_written_corpus.high_low_freq()
naer_written_word_freq = naer_written_corpus.word_freq()
naer_written_word_level = naer_written_corpus.word_level()
naer_written_long_word_count = naer_written_corpus.long_word_count()
# Checks read from the POS-tagged view (Corpus.ws_pos).
naer_written_is_complete_sentence = naer_written_corpus.is_complete_sentence()
naer_written_is_complete_context = naer_written_corpus.is_complete_context()
naer_written_is_greylist = naer_written_corpus.is_greylist()
# Check read from the plain-text view (Corpus.text).
naer_written_is_blacklist = naer_written_corpus.is_blacklist()

### 製作表格

In [37]:
# Build one table per document: one row per sentence, one column per metric.
naer_written_dfs = []

per_document_values = zip(
  naer_written_text, naer_written_ws,
  naer_written_sentence_length, naer_written_word_freq, naer_written_high_low_freq,
  naer_written_word_level, naer_written_long_word_count, naer_written_is_complete_sentence,
  naer_written_is_complete_context, naer_written_is_blacklist, naer_written_is_greylist,
)

for (sentences, tokenised, lengths, word_freqs, high_low_freqs, levels,
     long_counts, complete_sents, complete_conts, blacklist_flags,
     greylist_flags) in per_document_values:

  columns = {
    'sentence': sentences,
    # Token lists are stored as space-joined strings.
    'sentence_preprocessed': [' '.join(tokens) for tokens in tokenised],
    'sentence_length': lengths,
    'word_freq': word_freqs,
    'high_low_freq': high_low_freqs,
    'word_level': levels,
    'long_word_count': long_counts,
    'is_complete_sentence': complete_sents,
    'is_complete_context': complete_conts,
    'is_blacklist': blacklist_flags,
    'is_greylist': greylist_flags,
  }
  naer_written_df = pd.DataFrame(columns)

  # Keep only the sentences that contain at least one token.
  has_tokens = naer_written_df['sentence_length'] > 0
  naer_written_df = naer_written_df.loc[has_tokens]
  naer_written_dfs.append(naer_written_df)

In [38]:
# Write one parameter table per corpus file, named after that file without
# its .xml extension.
parameters_dir = '../results/naer_written/naer_written_parameters'
os.makedirs(parameters_dir, exist_ok=True)  # to_csv does not create missing directories

for filename, df in zip(all_files, naer_written_dfs):
  # Escape the dot and anchor at the end of the name: an unescaped '.' matches
  # any character, so 'xml' preceded by anything would be stripped wherever it
  # occurs in the file name.
  filename = re.sub(r'\.xml$', '', filename)
  df.to_csv(f'{parameters_dir}/parameters_{filename}.csv', index = False)

In [40]:
# Words whose concordance lines are extracted from every document.
target_words = ['難得', '畢竟', '的確', '難免', '總是', '有助於']

# One combined table per document: the per-word concordance tables are
# stacked in target_words order.
naer_written_concordance_dfs = [
  pd.concat(make_concordance_df(target_words, document))
  for document in naer_written_ws
]

In [41]:
# Write one concordance table per corpus file, named after that file without
# its .xml extension.
concordance_dir = '../results/naer_written/naer_written_concordance_df'
os.makedirs(concordance_dir, exist_ok=True)  # to_csv does not create missing directories

for filename, df in zip(all_files, naer_written_concordance_dfs):
  # Escape the dot and anchor at the end of the name: an unescaped '.' matches
  # any character, so 'xml' preceded by anything would be stripped wherever it
  # occurs in the file name.
  filename = re.sub(r'\.xml$', '', filename)
  df.to_csv(f'{concordance_dir}/concordance_df_{filename}.csv', index = False)