### 1. 處理前準備

In [1]:
import json
import pickle
import CwnGraph, DistilTag, CwnSenseTagger
from DistilTag import DistilTag  
from CwnSenseTagger import senseTag

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Half-width to full-width conversion (per the original note, the word
# segmenter needs full-width symbols in order to work).
def strB2Q(ustring):
    """Return *ustring* with half-width ASCII characters made full-width.

    Every item of ``ustring`` is iterated, and every character of each item
    is converted, so a plain string and a sequence of strings are both
    handled; all converted characters are concatenated into one string.

    The space (code point 32) becomes code point 12288, code points 33-126
    are shifted up by 65248, and every other character is kept unchanged.
    """
    def _to_fullwidth(char):
        # Map one character to its full-width counterpart when it has one.
        code_point = ord(char)
        if code_point == 32:
            return chr(12288)
        if 33 <= code_point <= 126:
            return chr(code_point + 65248)
        return char

    return ''.join(
        _to_fullwidth(char) for piece in ustring for char in piece
    )

In [3]:
# Define senseTag2: sense-tag each item on its own so that one failing item
# does not abort the whole batch.
def senseTag2(tagged):
    """Run ``senseTag`` on every item of *tagged*, skipping items that fail.

    Each item is wrapped in a single-element list and passed to ``senseTag``;
    the results are collected in input order.

    Args:
        tagged: iterable of items accepted by ``senseTag`` when wrapped in a
            list (presumably POS-tagged text — confirm against the tagger).

    Returns:
        A list with one ``senseTag`` result per successfully processed item.
        Items for which ``senseTag`` raised are dropped, so the list may be
        shorter than *tagged*.
    """
    out = []
    for t in tagged:
        try:
            tt = senseTag([t])
        except Exception:
            # Best-effort: drop the item and keep going. ``Exception`` is
            # caught instead of using a bare ``except`` so KeyboardInterrupt
            # and SystemExit still propagate and the loop can be interrupted.
            continue
        out.append(tt)
    return out

### 2. Load Data

In [7]:
def _load_pickle(path):
    """Open *path* in binary mode and return the unpickled object."""
    # NOTE: pickle.load can execute arbitrary code; only use it on files you
    # produced yourself or otherwise trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Loaded in the same order as before: Kaohsiung tagged data, Kaohsiung
# sense-tagged data, then Taipei tagged data.
kh_tagged = _load_pickle("kh_tagged.pkl")
kh_tagged_sense = _load_pickle("kh_tagged_sense.pkl")
tp_tagged = _load_pickle("tp_tagged.pkl")

### 3. Make Frequency Lists

In [5]:
# Build the word, word+POS and word+sense frequency tables.
def freq_tables(sense_tagged_data):
    """Count word, word/POS and word/sense occurrences.

    ``sense_tagged_data`` is walked as posts -> sentences -> 4-tuples of
    ``(form, pos, sense_id, sense_def)``.

    Returns:
        A 3-tuple of dicts ``(word_freq, pos_freq, sense_freq)`` mapping,
        respectively, ``form``, ``"{form}_{pos}"`` and
        ``"{form}_{sense_id}_{sense_def}"`` to their occurrence counts.
    """
    word_freq, pos_freq, sense_freq = {}, {}, {}

    # Flatten the post/sentence nesting into a single stream of tokens.
    tokens = (
        token
        for post in sense_tagged_data
        for sent in post
        for token in sent
    )

    for form, pos, sense_id, sense_def in tokens:
        word_freq[form] = word_freq.get(form, 0) + 1

        pos_key = f"{form}_{pos}"
        pos_freq[pos_key] = pos_freq.get(pos_key, 0) + 1

        sense_key = f"{form}_{sense_id}_{sense_def}"
        sense_freq[sense_key] = sense_freq.get(sense_key, 0) + 1

    return word_freq, pos_freq, sense_freq

### 4. 高雄板詞頻表、詞類頻表、詞義頻表

#### 4.1 高雄板詞頻表

In [8]:
# Kaohsiung board: build all three frequency tables once. freq_tables returns
# (word_freq, pos_freq, sense_freq), so index 0 is the word table.
kh_tables = freq_tables(kh_tagged_sense)
kh_tables_word = [*kh_tables[0].items()]
# Show (word, count) pairs, highest count first.
sorted(kh_tables_word, key=lambda entry: entry[1], reverse=True)

[('：', 1296),
 ('，', 802),
 ('判決', 424),
 ('。', 408),
 ('．', 282),
 ('原文', 259),
 ('的', 249),
 ('不', 237),
 ('依', 210),
 ('水桶', 165),
 ('、', 158),
 ('或', 153),
 ('屬', 152),
 ('處', 150),
 ('］', 146),
 ('［', 144),
 ('是', 141),
 ('之', 140),
 ('一', 131),
 ('有', 127),
 ('故', 121),
 ('（', 113),
 ('６１', 111),
 ('）', 110),
 ('日', 107),
 ('新聞', 100),
 ('高雄', 95),
 ('７', 91),
 ('針對', 89),
 ('規定', 88),
 ('就', 85),
 ('者', 80),
 ('了', 78),
 ('經', 73),
 ('板友', 72),
 ('內容', 71),
 ('文', 70),
 ('在', 68),
 ('？', 62),
 ('已', 62),
 ('引起', 62),
 ('爭端', 62),
 ('請', 59),
 ('影射', 59),
 ('上', 57),
 ('以', 54),
 ('特定', 54),
 ('不當', 54),
 ('為', 53),
 ('徵求', 52),
 ('族群', 52),
 ('無', 51),
 ('人', 50),
 ('會', 47),
 ('這', 47),
 ('並', 47),
 ('歧視', 47),
 ('／／ｉ', 46),
 ('ｉｍｇｕｒ', 46),
 ('我', 45),
 ('但', 43),
 ('次', 43),
 ('都', 43),
 ('交換', 43),
 ('查', 42),
 ('退文', 41),
 ('被', 40),
 ('內文', 40),
 ('也', 39),
 ('內', 39),
 ('文章', 39),
 ('個', 39),
 ('警告', 39),
 ('ｈｔｔｐｓ', 38),
 ('與', 37),
 ('刪', 37),
 ('禁止', 36),
 ('應', 36),
 ('

#### 4.2 高雄板詞類頻表

In [9]:
# Kaohsiung board: index 1 of the freq_tables result is the word+POS table.
kh_tables_pos = [*kh_tables[1].items()]
# Show ("word_POS", count) pairs, highest count first.
sorted(kh_tables_pos, key=lambda entry: entry[1], reverse=True)

[('：_COLONCATEGORY', 1296),
 ('，_COMMACATEGORY', 802),
 ('。_PERIODCATEGORY', 408),
 ('判決_VE', 408),
 ('．_PERIODCATEGORY', 282),
 ('原文_Na', 259),
 ('不_D', 237),
 ('的_DE', 233),
 ('依_P', 210),
 ('水桶_Na', 165),
 ('、_PAUSECATEGORY', 158),
 ('或_Caa', 153),
 ('屬_VG', 152),
 ('］_PARENTHESISCATEGORY', 146),
 ('［_PARENTHESISCATEGORY', 144),
 ('是_SHI', 141),
 ('一_Neu', 129),
 ('有_V_2', 126),
 ('故_Cbb', 118),
 ('之_DE', 116),
 ('（_PARENTHESISCATEGORY', 113),
 ('６１_Neu', 111),
 ('）_PARENTHESISCATEGORY', 110),
 ('日_Nf', 106),
 ('新聞_Na', 100),
 ('高雄_Nc', 94),
 ('針對_P', 89),
 ('處_Nf', 83),
 ('就_D', 81),
 ('者_Na', 80),
 ('經_P', 73),
 ('板友_Na', 72),
 ('內容_Na', 71),
 ('文_Na', 70),
 ('７_Neu', 67),
 ('處_Nc', 64),
 ('？_QUESTIONCATEGORY', 62),
 ('已_D', 62),
 ('規定_Na', 62),
 ('引起_VC', 62),
 ('爭端_Na', 62),
 ('了_Di', 59),
 ('請_VF', 59),
 ('在_P', 57),
 ('影射_Na', 55),
 ('特定_A', 54),
 ('不當_VH', 54),
 ('族群_Na', 52),
 ('人_Na', 50),
 ('以_P', 50),
 ('徵求_VC', 47),
 ('歧視_Nv', 47),
 ('／／ｉ_FW', 46),
 ('ｉｍｇｕｒ_FW', 46),
 ('

#### 4.3 高雄板詞義頻表

In [10]:
# Kaohsiung board: index 2 of the freq_tables result is the word+sense table.
kh_tables_sense = [*kh_tables[2].items()]
# Show ("word_senseid_definition", count) pairs, highest count first.
sorted(kh_tables_sense, key=lambda entry: entry[1], reverse=True)

[('：__', 1296),
 ('，__', 802),
 ('。__', 408),
 ('判決__', 408),
 ('．__', 282),
 ('原文_04060101_原來的文本。', 259),
 ('不_05010901_表疑問的語氣，置於句末。', 234),
 ('水桶__', 165),
 ('、__', 158),
 ('屬__', 152),
 ('］__', 146),
 ('［__', 144),
 ('（__', 113),
 ('６１__', 111),
 ('）__', 110),
 ('日_03036209_計算時間的單位。一日為二十四小時。', 106),
 ('依_04018504_引介遵循的原則。', 98),
 ('高雄_06047401_位於臺灣南部，介於臺東縣、屏東縣、臺南縣之間的地區。', 91),
 ('７__', 91),
 ('或_04001201_連接語意相似的詞組或子句，表選擇關係或並列關係。', 90),
 ('針對_04017001_引介事件所涉及的特定對象。', 89),
 ('的_07023402_表領屬關係，用於修飾語和中心語間。', 88),
 ('規定_06678002_要求他人在進行特定事件時必須遵守的內容。', 86),
 ('處_03007402_計算部位、地點的單位。', 83),
 ('故_03015807_表帶有明確意圖做事。', 82),
 ('者_06641401_具有前述身份或進行前述行為的人或團體。', 78),
 ('之_04090401_表一般的修飾關係，用於修飾語和中心語間。', 76),
 ('經_05002105_引介事件進行的過程或方法。', 73),
 ('板友__', 72),
 ('內容_06773301_特定事物內部所包含的實質對象。', 70),
 ('？__', 62),
 ('已_06668401_表後述事件在說話之前發生或完成。', 62),
 ('引起_06772201_因使特定對象注意而導致後述結果。', 62),
 ('爭端__', 62),
 ('文_05156301_獨立而首尾完整的成篇文字。', 60),
 ('影射__', 59),
 ('依_04018505_引介判斷的標準或條件。', 58),
 ('請_06532302_

### 5. 台北板詞頻表、詞類頻表、詞義頻表

#### 5.1 台北板詞頻表

In [None]:
# Taipei board: build all three frequency tables once. freq_tables returns
# (word_freq, pos_freq, sense_freq), so index 0 is the word table.
# NOTE(review): tp_tagged_sense is not assigned anywhere in the visible part
# of this notebook (the load cell only creates kh_tagged, kh_tagged_sense and
# tp_tagged), so this cell raises NameError until it is loaded or computed.
tp_tables = freq_tables(tp_tagged_sense)
tp_tables_word = [*tp_tables[0].items()]
# Show (word, count) pairs, highest count first.
sorted(tp_tables_word, key=lambda entry: entry[1], reverse=True)

#### 5.2 台北板詞類頻表

In [None]:
# Taipei board: index 1 of the freq_tables result is the word+POS table.
tp_tables_pos = [*tp_tables[1].items()]
# Show ("word_POS", count) pairs, highest count first.
sorted(tp_tables_pos, key=lambda entry: entry[1], reverse=True)

#### 5.3 台北板詞義頻表

In [None]:
# Taipei board: index 2 of the freq_tables result is the word+sense table.
tp_tables_sense = [*tp_tables[2].items()]
# Show ("word_senseid_definition", count) pairs, highest count first.
sorted(tp_tables_sense, key=lambda entry: entry[1], reverse=True)