In [1]:
import os
# Print the kernel's working directory; the relative '../corpora' and
# '../results' paths used further down are resolved against it.
print(os.getcwd())

/Users/andreashih/Desktop/compute_parameter/compute_parameter


In [2]:
import pickle
import re
import unicodedata
import xml.etree.ElementTree as ET
from collections import Counter

import pandas as pd

In [3]:
# 載入事先寫好的 functions
from compute_parameters import *

In [5]:
class Corpus():
  """Bundle three parallel views of a corpus and compute per-sentence metrics.

  Each view is iterated as a sequence of texts, and each text as a sequence
  of sentences. Every metric method returns a nested list with the same
  text -> sentence layout, holding one value per sentence.

  Attributes:
    ws: view read by ``sentence_length`` and the word-based metrics
      (the notebook passes lists of segmented words).
    ws_pos: view read by the completeness and greylist metrics
      (the notebook passes ``word(TAG)`` strings).
    text: view read by ``is_blacklist``
      (the notebook passes plain sentence strings).

  The ``get_*`` helpers are not defined in this class; they must already be
  in scope (presumably via ``from compute_parameters import *``), and what
  each one returns is decided there.
  """

  def __init__(self, ws, ws_pos, text):
    self.ws = ws
    self.ws_pos = ws_pos
    self.text = text

  def sentence_length(self):
    """Return ``len(sentence)`` for every sentence of every text in ``ws``."""
    return [[len(sent) for sent in doc] for doc in self.ws]

  def high_low_freq(self):
    """Apply ``get_high_low_freq`` to every sentence in ``ws``."""
    return [[get_high_low_freq(sent) for sent in doc] for doc in self.ws]

  def word_freq(self):
    """Apply ``get_word_freq`` to every sentence in ``ws``."""
    return [[get_word_freq(sent) for sent in doc] for doc in self.ws]

  def word_level(self):
    """Apply ``get_word_level`` to every sentence in ``ws``."""
    return [[get_word_level(sent) for sent in doc] for doc in self.ws]

  def long_word_count(self):
    """Apply ``get_long_word_count`` to every sentence in ``ws``."""
    return [[get_long_word_count(sent) for sent in doc] for doc in self.ws]

  def is_complete_sentence(self):
    """Apply ``get_complete_sentence`` to every sentence in ``ws_pos``."""
    return [[get_complete_sentence(sent) for sent in doc] for doc in self.ws_pos]

  def is_complete_context(self):
    """Apply ``get_complete_context`` to every sentence in ``ws_pos``."""
    return [[get_complete_context(sent) for sent in doc] for doc in self.ws_pos]

  def is_greylist(self):
    """Apply ``get_greylist`` to every sentence in ``ws_pos``."""
    return [[get_greylist(sent) for sent in doc] for doc in self.ws_pos]

  def is_blacklist(self):
    """Apply ``get_blacklist`` to every sentence in ``text``."""
    return [[get_blacklist(sent) for sent in doc] for doc in self.text]

In [53]:
from nltk.text import Text
def make_concordance_df(target_words, ws, lines=25):
  """Build one concordance DataFrame per target word.

  Args:
    target_words: iterable of words to look up.
    ws: a text given as a list of sentences, each a list of tokens. The
      sentences are flattened into one token sequence before indexing, so
      a context window can span sentence boundaries.
    lines: maximum number of hits kept per word, forwarded to
      ``Text.concordance_list``. The default of 25 is NLTK's own default,
      which the original call relied on implicitly; pass a larger number
      to keep more hits.

  Returns:
    A list of DataFrames, one per entry of ``target_words`` and in the same
    order, with columns ``left_word``, ``target_word``, ``right_word`` and
    ``context`` (the context tokens joined by single spaces). A word with
    no hits yields an empty frame.
  """
  tokens = [token for sentence in ws for token in sentence]
  text = Text(tokens)
  dfs = []

  for word in target_words:
    hits = text.concordance_list(word, lines=lines)

    # A hit on the first (last) token of the text has an empty left (right)
    # context; record None there instead of raising IndexError.
    left_word = [hit.left[-1] if hit.left else None for hit in hits]
    right_word = [hit.right[0] if hit.right else None for hit in hits]
    context = [' '.join(hit.left + [word] + hit.right) for hit in hits]

    dfs.append(pd.DataFrame({'left_word': left_word,
                             'target_word': word,
                             'right_word': right_word,
                             'context': context}))

  return dfs

## essay

In [8]:
# Directory holding the pickled essay files; listed in sorted order so
# that later cells see the files in a deterministic sequence.
essay_path = '../corpora/essay'
all_files = os.listdir(essay_path)
all_files.sort()

In [24]:
# Load every pickled essay file, separating word-segmentation results
# (file names containing 'ws') from POS-tag results (every other file).
# ws_all[i] and pos_all[i] are paired by position in later cells, so both
# lists follow the sorted order of all_files.
# NOTE(review): 'ws' is matched anywhere in the file name; confirm that no
# POS file name happens to contain it.
ws_all = []
pos_all = []

for file in all_files:
    # Skip hidden entries such as macOS '.DS_Store': they are not pickles and
    # would otherwise fall into the POS branch and fail to load.
    if file.startswith('.'):
        continue

    # SECURITY: pickle.load can execute arbitrary code; only load corpus
    # files that come from a trusted source.
    with open(os.path.join(essay_path, file), 'rb') as f:
        loaded = pickle.load(f)

    if re.findall(r'ws', file):
        ws_all.append(loaded)
    else:
        pos_all.append(loaded)

# Later cells zip the two lists together; a count mismatch would silently
# drop or misalign essays, so fail here instead.
assert len(ws_all) == len(pos_all), (
    f'{len(ws_all)} ws files but {len(pos_all)} pos files in {essay_path}')

In [28]:
ws_all[0][0]

['新', '的', '開始', '，', '從', '心', '開始', '！']

In [29]:
pos_all[0][0]

['VH', 'DE', 'Nv', 'COMMACATEGORY', 'P', 'Na', 'VH', 'EXCLAMATIONCATEGORY']

### 語料格式準備 - 1
斷詞 + pos tag

In [38]:
def get_essay_ws_pos(ws, pos):
  """Merge one essay's words and POS tags into tagged sentence strings.

  Args:
    ws: list of sentences, each a list of words.
    pos: list of sentences, each a list of POS tags, parallel to ``ws``.

  Returns:
    One string per sentence in which every word is rendered as
    ``word(TAG)`` and the items are joined by single spaces. Pairing uses
    ``zip``, so surplus sentences or words on either side are dropped.
  """
  return [
    ' '.join(f"{word}({tag})" for word, tag in zip(sent_words, sent_tags))
    for sent_words, sent_tags in zip(ws, pos)
  ]

In [37]:
# Tag every essay: pair each essay's segmented sentences with its POS tags
# (stops at the shorter of the two lists).
essay_ws_pos = list(map(get_essay_ws_pos, ws_all, pos_all))

In [41]:
essay_ws_pos[0][0]

'新(VH) 的(DE) 開始(Nv) ，(COMMACATEGORY) 從(P) 心(Na) 開始(VH) ！(EXCLAMATIONCATEGORY)'

### 語料格式準備 - 2
斷好詞，無標點符號

In [42]:
# Build the punctuation-free word lists: for every essay, one list of words
# per sentence.
essay_ws = []
for text in essay_ws_pos:
  ws = []
  for sent in text:
    # Strip the "(TAG)" suffix appended to every token. The class is spelled
    # out as A-Za-z0-9_ because the range A-z also covers [ \ ] ^ _ and `.
    # The lookahead restricts the match to a tag that ends a token (followed
    # by the joining space or the end of the string), so a parenthesised
    # token that belongs to the text itself is not deleted.
    sent_words = re.sub(r'\([A-Za-z0-9_]+\)(?= |$)', '', sent)
    # Drop punctuation, then split on any run of whitespace so that empty
    # and whitespace-only tokens are never counted as words.
    sent_words = re.sub(r'[^\w\s]', '', sent_words).split()
    ws.append(sent_words)
  essay_ws.append(ws)

In [43]:
essay_ws[0][0]

['新', '的', '開始', '從', '心', '開始']

### 語料格式準備 - 3
完整句子，含標點符號

In [44]:
# Rebuild the plain sentences (punctuation kept): for every essay, one
# string per sentence.
essay_text = []

for text in essay_ws_pos:
  t = []
  for sent in text:
    # Strip the "(TAG)" suffix appended to every token. The class is spelled
    # out as A-Za-z0-9_ because the range A-z also covers [ \ ] ^ and `.
    # The lookahead restricts the match to a tag that ends a token (followed
    # by the joining space or the end of the string), so parenthesised text
    # that belongs to the sentence itself is preserved.
    res = re.sub(r'\([A-Za-z0-9_]+\)(?= |$)', '', sent)
    # Remove the spaces that separated the tagged tokens.
    res = res.replace(' ', '')
    t.append(res)
  essay_text.append(t)

In [45]:
essay_text[0][0]

'新的開始，從心開始！'

### 計算指標

In [47]:
# Wrap the three prepared views of the essay corpus for metric computation.
essay_corpus = Corpus(ws=essay_ws, ws_pos=essay_ws_pos, text=essay_text)

In [48]:
# Compute every per-sentence metric once; each result is a nested list laid
# out as essay -> sentence.
# Metrics computed from the word lists (essay_ws):
essay_sentence_length = essay_corpus.sentence_length()
essay_high_low_freq = essay_corpus.high_low_freq()
essay_word_freq = essay_corpus.word_freq()
essay_word_level = essay_corpus.word_level()
essay_long_word_count = essay_corpus.long_word_count()
# Metrics computed from the POS-tagged sentences (essay_ws_pos):
essay_is_complete_sentence = essay_corpus.is_complete_sentence()
essay_is_complete_context = essay_corpus.is_complete_context()
essay_is_greylist = essay_corpus.is_greylist()
# Metric computed from the plain sentence strings (essay_text):
essay_is_blacklist = essay_corpus.is_blacklist()

### 製作表格

In [49]:
# Assemble one DataFrame per essay with one row per sentence. Each entry
# below maps an output column to its per-essay list of per-sentence values;
# the dict order is the column order of the resulting frames.
essay_columns = {
  'sentence': essay_text,
  # Word lists are stored as space-joined strings.
  'sentence_preprocessed': [[' '.join(sent) for sent in essay] for essay in essay_ws],
  'sentence_length': essay_sentence_length,
  'word_freq': essay_word_freq,
  'high_low_freq': essay_high_low_freq,
  'word_level': essay_word_level,
  'long_word_count': essay_long_word_count,
  'is_complete_sentence': essay_is_complete_sentence,
  'is_complete_context': essay_is_complete_context,
  'is_blacklist': essay_is_blacklist,
  'is_greylist': essay_is_greylist,
}

essay_dfs = []
for per_essay_values in zip(*essay_columns.values()):
  essay_df = pd.DataFrame(dict(zip(essay_columns.keys(), per_essay_values)))
  # Keep only sentences that still contain at least one word.
  essay_df = essay_df[essay_df['sentence_length']>0]
  essay_dfs.append(essay_df)

In [51]:
# Write one parameter CSV per essay.
# NOTE(review): the output names come from the first three entries of
# all_files, not from the files the frames were built from. This presumably
# relies on exactly three essays whose files sort first and in matching
# order; confirm against the corpus directory.
essay_parameters_dir = '../results/essay/essay_parameters'
# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs(essay_parameters_dir, exist_ok=True)

for filename, df in zip(all_files[:3], essay_dfs):
  # Escape the dot and anchor the match so that only a trailing '.pkl'
  # extension is removed (an unescaped '.' matches any character).
  filename = re.sub(r'\.pkl$', '', filename)
  df.to_csv(f'{essay_parameters_dir}/parameters_{filename}.csv', index = False)

In [54]:
# Words whose concordance lines are collected from every essay.
target_words = ['難得', '畢竟', '的確', '難免', '總是', '有助於']

# One DataFrame per essay: the per-word concordance frames stacked in the
# order of target_words.
essay_concordance_dfs = [
  pd.concat(make_concordance_df(target_words, essay_sentences))
  for essay_sentences in essay_ws
]

In [55]:
# Write one concordance CSV per essay.
# NOTE(review): the output names come from the first three entries of
# all_files, not from the files the frames were built from. This presumably
# relies on exactly three essays whose files sort first and in matching
# order; confirm against the corpus directory.
essay_concordance_dir = '../results/essay/essay_concordance_df'
# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs(essay_concordance_dir, exist_ok=True)

for filename, df in zip(all_files[:3], essay_concordance_dfs):
  # Escape the dot and anchor the match so that only a trailing '.pkl'
  # extension is removed (an unescaped '.' matches any character).
  filename = re.sub(r'\.pkl$', '', filename)
  df.to_csv(f'{essay_concordance_dir}/concordance_df_{filename}.csv', index = False)