# 自製智能中文選字系統  (1)

In [1]:
import sys
sys.version

'3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]'

## 資料前處理

確認版本為 python3

In [2]:
import re

In [3]:
def prepocess_line(line):
    # 僅僅挑出中文字元，並且斷開不連續的中文字
    # YOUR CODE HERE
    segments = re.findall('[\u4E00-\u9FFF]+', line)
    
    # END YOUR CODE
    return segments

In [4]:
prepocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  
# 應該為：['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

In [5]:
segments = []
with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    for line in fr.readlines():
        segments += prepocess_line(line)

In [6]:
segments[:10]

['英語',
 '英語英語',
 '又稱爲英文',
 '是一種西日耳曼語言',
 '誕生於中世紀早期的英格蘭',
 '如今具有全球通用語的地位',
 '英語',
 '一詞源於遷居英格蘭的日耳曼部落盎格魯',
 '而',
 '盎格魯']

In [7]:
total = 0
for segment in segments:          # 確認一下總共有多少字
    total += len(segment)

total

371373

In [8]:
len(segments)                     # 有多少句子

48767

## Ngram

一開始要先計算字詞出現的次數

In [9]:
from collections import Counter

class Counters:
    def __init__(self, n):
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]  # 分別代表計算0、1、...個字的出現次數, 0 代表總共有多少字

    def fit(self, segments):
        # 因為 self.counters 分別代表計算0、1、...個字的出現次數
        # 請在此實作利用 segments 以及函式 _skip 來統計次數
        # YOUR CODE HERE
        for segment in segments:
            for i in range(self.n + 1):
                if i == 0:
                    self.counters[i][''] += len(segment)
                else:
                    result = self._skip(segment, i)
                    if  result != []:
                        for s in result:                        # generator 用 for 來逐個取出
                            self.counters[i][s] += 1
        
        # END YOUR CODE

    def __getitem__(self, k):
        return self.counters[k]

    def _skip(self, segment, n):
        assert n > 0
        if len(segment) < n:
            return []
        shift = n - 1
        for i in range(len(segment) - shift):           # 理論上 _skip 要回傳很大的 'n個字' 的集合，但用 generator 形式
            yield segment[i:i+shift+1]                  # 比較不占空間

In [10]:
counters = Counters(n=3)
counters.fit(segments)

In [11]:
counters[0]
# 應該為： Counter({'': 371373})

Counter({'': 371373})

In [12]:
test = '又稱為英文'
counters[0][test[1:1]]

371373

In [13]:
counters[1]['我'], counters[1]['思']

(201, 181)

In [14]:
class Ngram:
    def __init__(self, n, counters):
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        try:
            assert len(prefix) >= self.n - 1
        except:                                      # 若設 n，則 prefix 至少需要 n-1 個字，否則計算會有問題，因此
            return None                              # 對於這種狀況設定回傳 None
        # 使用 Ngram 的公式計算出下一個字出現的機率
        # 輸出為機率與字的tuple列表，詳見下方輸出範例
        # YOUR CODE HERE
        import math
        prefix_prob = 1
        if len(prefix) == self.n-1:                            # 若 prefix == n-1 則直接紀錄 post_item
            post_item = prefix
        else:                                                  # 從 prefix 去算已知的 prefix_prob，分子分母都多 +1，
            for i in range(len(prefix)-(self.n-1)):            # 避免有 0 的狀況，最下方 demo 在輸入注音時，注音符號就造成
                try:                                           # 因為找不到而出現 0 的 error
                    prefix_prob += math.log((self.major_counter[prefix[i:i+self.n]]+1)/(self.minor_counter[prefix[i:i+(self.n-1)]]+1))
                except Exception as e:
                    print(e)
#                    print('分子: ', self.major_counter[prefix[i:i+self.n]], 'prefix: ', prefix[i:i+self.n])
#                    print('分母: ', self.minor_counter[prefix[i:i+(self.n-1)]], 'prefix: ', prefix[i:i+(self.n-1)])
                    
                if i == len(prefix)-(self.n-1)-1:             # 到最後一個 i 時，記錄下 post_item
                    post_item = prefix[i+1:i+self.n]          # post_item 是給 post_prob 計算分母項


        probs = list()
        for key, value in self.major_counter.items():         # 搜尋每項中前面 n-1 個字是 post_item
            if key[:-1] == post_item:                         # 而不同最後一個字的個別機率
                post_prob = math.log(self.major_counter[key]+1/self.minor_counter[post_item]+1)
                prob = math.exp(prefix_prob + post_prob)
                probs.append([prob, key[-1]]) 

        # normalize
        total = sum(item[0] for item in probs)
        for i in range(len(probs)):
            probs[i][0] /= total
            probs[i] = tuple(probs[i])

        sorted_probs = sorted(probs, key=lambda x:x[0], reverse=True)

        # END YOUR CODE
        return sorted_probs[:top_k] if top_k > 0 else sorted_probs


    def get_proba_dict(self, prefix=''):
        if self.predict_proba(prefix, top_k=-1) != None:
            return {word: prob for prob, word in self.predict_proba(prefix, top_k=-1)}
        else:
            return None


In [15]:
unigram = Ngram(1, counters)

In [16]:
unigram.predict_proba('我')

[(0.03536698042759946, '的'),
 (0.012797245126524493, '國'),
 (0.010513355274945636, '中'),
 (0.009884419446506181, '在'),
 (0.009753835312635285, '一')]

In [17]:
#unigram.predict_proba('我思')
# 應該為：[(0.035732269174118744, '的'),
#         (0.012927703414087723, '國'),
#         (0.010620050461395955, '中'),
#         (0.009984570768472667, '在'),
#         (0.009852627950874188, '一')]

In [18]:
bigram = Ngram(2, counters)
trigram = Ngram(3, counters)

In [19]:
bigram.predict_proba('我')

[(0.3224247877626069, '們'),
 (0.06292789796956295, '的'),
 (0.03933727162474079, '在'),
 (0.019678416337388975, '是'),
 (0.019678416337388975, '思')]

In [20]:
p = trigram.predict_proba('我')
print(p)

None


## 使用Ngram來建立第一版選字系統

In [21]:
class ChineseWordRecommenderV1:
    def __init__(self, unigram, bigram, trigram):
        self.unigram = unigram
        self.bigram = bigram
        self.trigram = trigram
    
    def predict_proba(self, prefix='', top_k=5):
        # 使用Ngram來建立選字系統
        # YOUR CODE HERE
        ngrams = [self.unigram, self.bigram, self.trigram]             # 建立 Ngram 串列，以 index 去識別不同的 n 
        results = []
        for i in range(len(ngrams)):
#            results.append(ngrams[i].predict_proba(prefix=prefix, top_k=-1))
            results.append(ngrams[i].get_proba_dict(prefix=prefix))    # 取得字典形式的推薦字和機率
        
        recom = dict()
        for i in range(len(results)):
            if results[i] != None:
                for word, prob in results[i].items():
                    recom[word] = recom.get(word, 0) + (i+1)*prob      # 合併成一個字典，取加權計算個別機率和
        
        # normalize
        total = 0
        for value in recom.values():
            total += value
        
        recom_final = list()
        for key, value in recom.items():
            recom_final.append((value/total, key))
        
        recommendation = sorted(recom_final, key=lambda x:x[0], reverse=True)
        
        return recommendation[:top_k] if top_k > 0 else recommendation
        # END YOUR CODE

In [22]:
model = ChineseWordRecommenderV1(unigram, bigram, trigram)

In [23]:
probs = model.predict_proba('我思', top_k=10)
probs

[(0.3339990972489473, '故'),
 (0.20647326391523002, '維'),
 (0.10874174156714446, '想'),
 (0.04042359130747717, '考'),
 (0.017383453933229157, '是'),
 (0.014046417838796026, '汗'),
 (0.011162292494646392, '的'),
 (0.010389071579385312, '爲'),
 (0.00689343497548566, '一'),
 (0.0059828993481545285, '成')]

In [24]:
# probs = model.predict_proba('我思', top_k=10)
# probs

## Demo

In [25]:
# !pip install -U pip
# !pip install -q ipywidgets

In [26]:
import ipywidgets as widgets

text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    probs = model.predict_proba(change.new, top_k=10)
    label.value = ' ' + '\t'.join([word for prob, word in probs])

text.observe(func, names='value')

Label(value='')

Textarea(value='')

In [None]:
# import ipywidgets as widgets

# text = widgets.Textarea()
# label = widgets.Label()
# display(label, text)

# def func(change):
#     probs = model.predict_proba(change.new, top_k=10)
#     label.value = ' ' + '\t'.join([word for prob, word in probs])

# text.observe(func, names='value')

Label(value='')

Textarea(value='')