In [1]:
import os
print(os.getcwd())

/Users/andreashih/Desktop/compute_parameter/compute_parameter


In [6]:
import pandas as pd
import re
import unicodedata
from collections import Counter
import xml.etree.ElementTree as ET

In [3]:
# 載入事先寫好的 functions
from compute_parameters import *

In [4]:
class Corpus():
  def __init__(self, ws, ws_pos, text):
    self.ws = ws
    self.ws_pos = ws_pos
    self.text = text
  
  def sentence_length(self):
    sentence_length = []
    for text in self.ws:
      length = [len(sentence) for sentence in text]
      sentence_length.append(length)
    return sentence_length
  
  def high_low_freq(self):
    high_low_freq = []
    for text in self.ws:
      freq = [get_high_low_freq(sentence) for sentence in text]
      high_low_freq.append(freq)
    return high_low_freq
  
  def word_freq(self):
    word_freq = []
    for text in self.ws:
      freq = [get_word_freq(sentence) for sentence in text]
      word_freq.append(freq)
    return word_freq
  
  def word_level(self):
    word_level = []
    for text in self.ws:
      level = [get_word_level(sentence) for sentence in text]
      word_level.append(level)
    return word_level
  
  def long_word_count(self):
    long_word_count = []
    for text in self.ws:
      count = [get_long_word_count(sentence) for sentence in text]
      long_word_count.append(count)
    return long_word_count
  
  def is_complete_sentence(self):
    is_complete_sentence = []
    for text in self.ws_pos:
      sent = [get_complete_sentence(sentence) for sentence in text]
      is_complete_sentence.append(sent)
    return is_complete_sentence
  
  def is_complete_context(self):
    is_complete_context = []
    for text in self.ws_pos:
      sent = [get_complete_context(sentence) for sentence in text]
      is_complete_context.append(sent)
    return is_complete_context
  
  def is_greylist(self):
    is_greylist = []
    for text in self.ws_pos:
      gl = [get_greylist(x) for x in text]
      is_greylist.append(gl)
    return is_greylist
  
  def is_blacklist(self):
    is_blacklist = []
    for text in self.text:
      bl = [get_blacklist(x) for x in text]
      is_blacklist.append(bl)
    return is_blacklist

In [5]:
from nltk.text import Text
def make_concordance_df(target_words, ws):

  corpus = [item for sublist in ws for item in sublist]
  text = Text(corpus)
  dfs = []

  for word in target_words:

    con_list = text.concordance_list(word)
    right_word = [x.right[0] for x in con_list]
    left_word = [x.left[-1] for x in con_list]
    context = [x.left + [word] + x.right for x in con_list]
    context = [' '.join(x) for x in context]

    df = pd.DataFrame({'left_word': left_word,
                      'target_word': word,
                      'right_word': right_word,
                      'context': context})
    dfs.append(df)

  return dfs

## naer_bilingual

In [16]:
naer_bilingual_path = '../corpora/naer_bilingual'
all_files = sorted(os.listdir(naer_bilingual_path))

In [17]:
trees = []
for f in all_files:
    try:
        tree = ET.parse(f'{naer_bilingual_path}/{f}')
    except:
        pass
    trees.append(tree)

In [18]:
roots = [tree.getroot() for tree in trees]

In [19]:
naer_bilingual_corpus = []
for root in roots:
    corp = [x.text for x in root.iter('chsentence') if x.text is not None]
    naer_bilingual_corpus.append(corp)

### 語料格式準備 - 1
斷詞 + pos tag

In [20]:
naer_bilingual_ws_pos = []

for text in naer_bilingual_corpus:
  joined = ''.join(text)
  split_1 = re.split(r'(?<=。」|！」|？」)', joined) # 先用 。」 ！」 ？」 分隔
  split_2 = [re.split(r'(?<=[。！？])(?!」)', x) for x in split_1] # 再用 。！？ 分隔
  split = [item for sublist in split_2 for item in sublist] # 將 list of list 攤平

  naer_bilingual_ws_pos.append(split)

In [21]:
naer_bilingual_ws_pos[0][0]

'在/P 澳門/Nc 機場/Nc ，/PUNC 單/Da 是/SHI 去年/Nd 就/D 有/V_2 近/Da 百萬/Neu 人次/Na 的/DE 台灣/Nc 旅客/Na 取道/VC 澳門/Nc 前往/VCL 中國/Nc 大陸/Nc 或/Caa 入境/VA 澳門/Nc ，/PUNC 澳門/Nc 機場/Nc 的/DE 主要/A 旅客/Na 中/Ng ，/PUNC 台灣/Nc 客人/Na 佔/VJ 了/Di 八成/Neqa 。'

### 語料格式準備 - 2
斷好詞，無標點符號

In [22]:
naer_bilingual_ws = []
for text in naer_bilingual_ws_pos:
  ws = []
  for sent in text:
    sent_words = re.sub(r'/[A-z]+', '', sent)
    sent_words = re.sub(r'[^\w\s]', '', sent_words).strip().split(' ')
    sent_words = list(filter(None, sent_words))
    ws.append(sent_words)
  naer_bilingual_ws.append(ws)

In [23]:
naer_bilingual_ws[0][0]

['在',
 '澳門',
 '機場',
 '單',
 '是',
 '去年',
 '就',
 '有2',
 '近',
 '百萬',
 '人次',
 '的',
 '台灣',
 '旅客',
 '取道',
 '澳門',
 '前往',
 '中國',
 '大陸',
 '或',
 '入境',
 '澳門',
 '澳門',
 '機場',
 '的',
 '主要',
 '旅客',
 '中',
 '台灣',
 '客人',
 '佔',
 '了',
 '八成']

### 語料格式準備 - 3
完整句子，含標點符號

In [24]:
naer_bilingual_text = []

for text in naer_bilingual_ws_pos:
  t = []
  for sent in text:
    res = re.sub(r'/[A-z]+', '', sent)
    res = res.replace(' ', '')
    t.append(res)
  naer_bilingual_text.append(t)

In [25]:
naer_bilingual_text[0][0]

'在澳門機場，單是去年就有2近百萬人次的台灣旅客取道澳門前往中國大陸或入境澳門，澳門機場的主要旅客中，台灣客人佔了八成。'

### 計算指標

In [26]:
naer_bilingual_corpus = Corpus(naer_bilingual_ws, naer_bilingual_ws_pos, naer_bilingual_text)

In [27]:
naer_bilingual_sentence_length = naer_bilingual_corpus.sentence_length()
naer_bilingual_high_low_freq = naer_bilingual_corpus.high_low_freq()
naer_bilingual_word_freq = naer_bilingual_corpus.word_freq()
naer_bilingual_word_level = naer_bilingual_corpus.word_level()
naer_bilingual_long_word_count = naer_bilingual_corpus.long_word_count()
naer_bilingual_is_complete_sentence = naer_bilingual_corpus.is_complete_sentence()
naer_bilingual_is_complete_context = naer_bilingual_corpus.is_complete_context()
naer_bilingual_is_greylist = naer_bilingual_corpus.is_greylist()
naer_bilingual_is_blacklist = naer_bilingual_corpus.is_blacklist()

### 製作表格

In [28]:
naer_bilingual_dfs = []

for text, ws, length, w_freq, hl_freq, level, count, complete_sent, complete_cont, bl, gl in zip(naer_bilingual_text, naer_bilingual_ws, 
                  naer_bilingual_sentence_length, naer_bilingual_word_freq, naer_bilingual_high_low_freq,
                  naer_bilingual_word_level, naer_bilingual_long_word_count, naer_bilingual_is_complete_sentence, 
                  naer_bilingual_is_complete_context, naer_bilingual_is_blacklist, naer_bilingual_is_greylist):
  
  ws = [' '.join(sent) for sent in ws]
  naer_bilingual_df = pd.DataFrame({'sentence': text,
                        'sentence_preprocessed': ws,
                        'sentence_length': length,
                        'word_freq': w_freq,
                        'high_low_freq': hl_freq,
                        'word_level': level,
                        'long_word_count': count,
                        'is_complete_sentence': complete_sent,
                        'is_complete_context': complete_cont,
                        'is_blacklist': bl,
                        'is_greylist': gl})
  naer_bilingual_df = naer_bilingual_df[naer_bilingual_df['sentence_length']>0]
  naer_bilingual_dfs.append(naer_bilingual_df)

In [30]:
for filename, df in zip(all_files, naer_bilingual_dfs):
  filename = re.sub('.xml', '', filename)
  df.to_csv(f'../results/naer_bilingual/naer_bilingual_parameters/parameters_{filename}.csv', index = False)

In [31]:
target_words = ['難得', '畢竟', '的確', '難免', '總是', '有助於']

naer_bilingual_concordance_dfs = []
for text in naer_bilingual_ws:
  dfs = make_concordance_df(target_words, text)
  concordance_df = pd.concat(dfs)
  naer_bilingual_concordance_dfs.append(concordance_df)

In [33]:
for filename, df in zip(all_files, naer_bilingual_concordance_dfs):
  filename = re.sub('.xml', '', filename)
  df.to_csv(f'../results/naer_bilingual/naer_bilingual_concordance_df/concordance_df_{filename}.csv', index = False)