In [None]:
import pandas as pd
import re
import unicodedata
import pickle
from collections import Counter

In [None]:
import os
# Show the working directory: the relative '../corpora' and '../results'
# paths used further down are resolved against it.
print(os.getcwd())

In [None]:
# Load the pre-written helper functions.
# NOTE(review): the star import hides where names come from; the get_* helpers
# called by the Corpus class are presumably provided by this module -- confirm.
from compute_parameters import *

In [None]:
class Corpus:
  """Computes per-sentence measures over three parallel views of a corpus.

  Every view is a list of documents and every document is a list of
  sentences. Each public method returns a list with the same two-level
  nesting, holding one value per sentence.

  The ``get_*`` helpers are not defined in this class; they are looked up
  when a method is called (presumably they come from ``compute_parameters``
  -- confirm).

  Attributes:
    ws: sentences in the form measured by ``len`` and passed to the
      frequency/level/long-word helpers (this notebook passes token lists).
    ws_pos: sentences in the form passed to the complete-sentence,
      complete-context and greylist helpers (this notebook passes
      POS-tagged strings).
    text: sentences in the form passed to the blacklist helper (this
      notebook passes untagged strings that keep their punctuation).
  """

  def __init__(self, ws, ws_pos, text):
    self.ws = ws
    self.ws_pos = ws_pos
    self.text = text

  @staticmethod
  def _per_sentence(documents, func):
    """Apply ``func`` to every sentence while keeping the document nesting."""
    return [[func(sentence) for sentence in document] for document in documents]

  def sentence_length(self):
    """Return ``len(sentence)`` for every sentence in ``ws``."""
    return self._per_sentence(self.ws, len)

  def high_low_freq(self):
    """Return ``get_high_low_freq(sentence)`` for every sentence in ``ws``."""
    return self._per_sentence(self.ws, get_high_low_freq)

  def word_freq(self):
    """Return ``get_word_freq(sentence)`` for every sentence in ``ws``."""
    return self._per_sentence(self.ws, get_word_freq)

  def word_level(self):
    """Return ``get_word_level(sentence)`` for every sentence in ``ws``."""
    return self._per_sentence(self.ws, get_word_level)

  def long_word_count(self):
    """Return ``get_long_word_count(sentence)`` for every sentence in ``ws``."""
    return self._per_sentence(self.ws, get_long_word_count)

  def is_complete_sentence(self):
    """Return ``get_complete_sentence(sentence)`` for every sentence in ``ws_pos``."""
    return self._per_sentence(self.ws_pos, get_complete_sentence)

  def is_complete_context(self):
    """Return ``get_complete_context(sentence)`` for every sentence in ``ws_pos``."""
    return self._per_sentence(self.ws_pos, get_complete_context)

  def is_greylist(self):
    """Return ``get_greylist(sentence)`` for every sentence in ``ws_pos``."""
    return self._per_sentence(self.ws_pos, get_greylist)

  def is_blacklist(self):
    """Return ``get_blacklist(sentence)`` for every sentence in ``text``."""
    return self._per_sentence(self.text, get_blacklist)

In [None]:
from nltk.text import Text
def make_concordance_df(target_words, ws, lines=25):
  """Build one concordance table per target word.

  Args:
    target_words: iterable of words to look up.
    ws: list of tokenised sentences (each a list of tokens). The sentences
      are flattened into a single token stream before indexing, so a
      context window can cross sentence boundaries.
    lines: maximum number of hits kept per word. It is forwarded to NLTK's
      ``Text.concordance_list``, whose own default is 25, so the default
      here reproduces the previous behaviour; pass a larger number to keep
      more hits.

  Returns:
    A list of DataFrames, one per target word and in the same order, with
    the columns ``left_word``, ``target_word``, ``right_word`` and
    ``context`` (context tokens joined with single spaces).
  """
  tokens = [token for sentence in ws for token in sentence]
  text = Text(tokens)
  dfs = []

  for word in target_words:
    hits = text.concordance_list(word, lines=lines)

    # A hit on the very first / last token of the stream has an empty
    # left / right context; indexing it directly raised IndexError.
    left_word = [hit.left[-1] if hit.left else None for hit in hits]
    right_word = [hit.right[0] if hit.right else None for hit in hits]
    context = [' '.join(hit.left + [word] + hit.right) for hit in hits]

    df = pd.DataFrame({'left_word': left_word,
                      'target_word': word,
                      'right_word': right_word,
                      'context': context})
    dfs.append(df)

  return dfs

## ASBC

In [None]:
# Read the ASBC txt files (one list of raw lines per file, in sorted file order)
asbc_path = '../corpora/ASBC_去XML標記'
all_files = sorted(os.listdir(asbc_path))

asbc_corpus = []
for filename in all_files:
  # Decode explicitly instead of relying on the platform default encoding.
  # 'utf-8-sig' reads UTF-8 and drops a leading byte-order mark, which
  # otherwise ends up as '\ufeff' at the start of each file's first sentence.
  # The handle gets its own name so it no longer shadows the loop variable.
  with open(f'{asbc_path}/{filename}', encoding='utf-8-sig') as handle:
      asbc_corpus.append(handle.readlines())

In [None]:
# Preprocessing
# Punctuation that NFKC folds to ASCII and that is restored to full-width.
# Built once here instead of on every call.
_ASBC_FULLWIDTH_PUNCT = str.maketrans({',': '，',
                                       '!': '！',
                                       '?': '？',
                                       ':': '：',
                                       ';': '；'})

def preprocess_asbc(string):
  """Normalise one raw ASBC line and strip annotation noise.

  Steps, in order: NFKC normalisation (full-width -> half-width), removal of
  newlines, removal of every run of '-', removal of feature markers of the
  form ``[+name]``, removal of ``(...CATEGORY)`` punctuation tags, and
  finally conversion of ``, ! ? : ;`` back to their full-width forms.

  Args:
    string: one line of tagged corpus text.

  Returns:
    The cleaned string.
  """
  clean_string = unicodedata.normalize('NFKC', string)  # full-width -> half-width
  clean_string = clean_string.replace('\n', '')  # drop newlines
  clean_string = re.sub(r'\-+', '', clean_string)  # drop every run of '-' (single ones too)
  # Feature markers: letters, digits and '_' only. The previous range 'A-z'
  # also covered the ASCII symbols [ \ ] ^ _ ` that sit between 'Z' and 'a'.
  clean_string = re.sub(r'\[\+[A-Za-z0-9_]+\]', '', clean_string)
  clean_string = re.sub(r'\(\w+CATEGORY\)', '', clean_string)  # drop punctuation tags

  # Restore selected punctuation to full-width
  return clean_string.translate(_ASBC_FULLWIDTH_PUNCT)

### 語料格式準備 - 1
斷詞 + pos tag

In [None]:
asbc_ws_pos = []

for raw_lines in asbc_corpus:
  # Clean every line, then glue the file back into one long string
  document = ''.join(preprocess_asbc(line) for line in raw_lines)
  # Pass 1: cut right after 。」 ！」 ？」
  quoted_chunks = re.split(r'(?<=。」|！」|？」)', document)
  # Pass 2: cut after 。！？ unless a closing 」 follows; flatten the result
  # and keep only pieces longer than one character
  sentences = [
    piece
    for chunk in quoted_chunks
    for piece in re.split(r'(?<=[。！？])(?!」)', chunk)
    if len(piece) > 1
  ]

  asbc_ws_pos.append(sentences)

In [None]:
# Inspect the first sentence of the first file (POS-tagged form)
asbc_ws_pos[0][0]

'\ufeff時間(Na) ：三月(Nd) 十日(Nd) ( 星期四(Nd) ) 上午(Nd) 十時(Nd) 。'

### 語料格式準備 - 2
斷好詞，無標點符號

In [None]:
asbc_ws = []
for text in asbc_ws_pos:
  ws = []
  for sent in text:
    # Drop parenthesised POS tags made of letters, digits and '_'. The old
    # range 'A-z' also matched the ASCII symbols [ \ ] ^ ` between 'Z' and
    # 'a', and only covered '_' by accident.
    sent_words = re.sub(r'\([A-Za-z0-9_]+\)', '', sent)
    # Remove everything that is neither a word character nor whitespace,
    # then split on single spaces
    sent_words = re.sub(r'[^\w\s]', '', sent_words).strip().split(' ')
    # Consecutive spaces leave empty strings behind; discard them
    sent_words = list(filter(None, sent_words))
    ws.append(sent_words)
  asbc_ws.append(ws)

In [None]:
# Inspect the first sentence of the first file (token list, no punctuation)
asbc_ws[0][0]

['時間', '三月', '十日', '星期四', '上午', '十時']

### 語料格式準備 - 3
完整句子，含標點符號

In [None]:
asbc_text = []

for text in asbc_ws_pos:
  t = []
  for sent in text:
    # Drop parenthesised POS tags made of letters, digits and '_'. The old
    # range 'A-z' also matched the ASCII symbols [ \ ] ^ ` between 'Z' and 'a'.
    res = re.sub(r'\([A-Za-z0-9_]+\)', '', sent)
    # Tokens are space-separated; remove the spaces to restore running text
    res = res.replace(' ', '')
    t.append(res)
  asbc_text.append(t)

In [None]:
# Inspect the first sentence of the first file (untagged text with punctuation)
asbc_text[0][0]

'\ufeff時間：三月十日(星期四)上午十時。'

### 計算指標

In [None]:
# Bundle the three parallel views of the corpus.
# NOTE(review): this rebinds `asbc_corpus`, which until now held the raw lines
# of each file; the sentence-splitting cell cannot be re-run after this cell
# without reloading the files first.
asbc_corpus = Corpus(ws=asbc_ws, ws_pos=asbc_ws_pos, text=asbc_text)

In [None]:
# Each call returns one value per sentence, nested as [file][sentence].
# Measures computed from the token lists (ws)
asbc_sentence_length = asbc_corpus.sentence_length()
asbc_high_low_freq = asbc_corpus.high_low_freq()
asbc_word_freq = asbc_corpus.word_freq()
asbc_word_level = asbc_corpus.word_level()
asbc_long_word_count = asbc_corpus.long_word_count()
# Measures computed from the POS-tagged sentences (ws_pos)
asbc_is_complete_sentence = asbc_corpus.is_complete_sentence()
asbc_is_complete_context = asbc_corpus.is_complete_context()
asbc_is_greylist = asbc_corpus.is_greylist()
# Measure computed from the untagged sentence text
asbc_is_blacklist = asbc_corpus.is_blacklist()

### 製作表格

In [None]:
# Column name -> per-file values, listed in the column order of the output table
asbc_columns = {
  'sentence': asbc_text,
  'sentence_preprocessed': [[' '.join(sent) for sent in doc] for doc in asbc_ws],
  'sentence_length': asbc_sentence_length,
  'word_freq': asbc_word_freq,
  'high_low_freq': asbc_high_low_freq,
  'word_level': asbc_word_level,
  'long_word_count': asbc_long_word_count,
  'is_complete_sentence': asbc_is_complete_sentence,
  'is_complete_context': asbc_is_complete_context,
  'is_blacklist': asbc_is_blacklist,
  'is_greylist': asbc_is_greylist,
}

asbc_dfs = []
# Walk the files in lockstep: each step yields one file's values for every column
for per_file_values in zip(*asbc_columns.values()):
  asbc_df = pd.DataFrame(dict(zip(asbc_columns, per_file_values)))
  # Drop sentences that ended up with no tokens
  asbc_df = asbc_df[asbc_df['sentence_length']>0]
  asbc_dfs.append(asbc_df)

In [None]:
asbc_parameters_dir = '../results/asbc/asbc_parameters'
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(asbc_parameters_dir, exist_ok=True)

for filename, df in zip(all_files, asbc_dfs):
  # Strip only a trailing ".txt". The previous pattern '.txt' treated "." as
  # "any character" and matched anywhere in the name.
  filename = re.sub(r'\.txt$', '', filename)
  df.to_csv(f'{asbc_parameters_dir}/parameters_{filename}.csv', index = False)

In [None]:
target_words = ['難得', '畢竟', '的確', '難免', '總是', '有助於']

# One concordance table per file: the per-word tables are stacked into one frame
asbc_concordance_dfs = [
  pd.concat(make_concordance_df(target_words, file_sentences))
  for file_sentences in asbc_ws
]

In [None]:
asbc_concordance_dir = '../results/asbc/asbc_concordance_df'
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(asbc_concordance_dir, exist_ok=True)

for filename, df in zip(all_files, asbc_concordance_dfs):
  # Strip only a trailing ".txt". The previous pattern '.txt' treated "." as
  # "any character" and matched anywhere in the name.
  filename = re.sub(r'\.txt$', '', filename)
  df.to_csv(f'{asbc_concordance_dir}/concordance_df_{filename}.csv', index = False)