# 自製智能中文選字系統  (1)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd './drive/My Drive/NLP/day29'

/content/drive/My Drive/NLP/day29


In [3]:
import sys
sys.version

'3.6.9 (default, Oct  8 2020, 12:12:24) \n[GCC 8.4.0]'

## 資料前處理

開始前處理之前，先確認執行環境為 Python 3（見上方 `sys.version` 的輸出）。

In [4]:
import re

In [None]:
!ls

data  data.zip	wiki_zh_small.txt  基礎篇_hw.ipynb


In [5]:
def preprocess_line(line):
    """Extract the runs of consecutive Chinese characters from one line.

    Every maximal run of characters in the range U+4E00..U+9FCC becomes one
    segment; any other character (punctuation, Latin letters, digits,
    whitespace) acts as a separator and is dropped.

    Args:
        line: The text to split, or None.

    Returns:
        A list of Chinese-only segments in order of appearance. The list is
        empty when ``line`` is None or contains no character in the range.
    """
    # Previously a None input skipped the assignment and the return statement
    # raised UnboundLocalError; treat it as "nothing to extract" instead.
    if line is None:
        return []
    pattern = '[\u4E00-\u9FCC]+'
    return re.findall(pattern, line)

In [6]:
preprocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  
# Expected: ['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

In [7]:
# Gather the Chinese-only segments of every corpus line, in file order.
segments = []
with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    # Iterating the handle yields the same lines as readlines().
    for line in fr:
        segments += preprocess_line(line)

In [8]:
print(len(segments))

48768


## Ngram

首先統計語料中各種長度的連續字串（n-gram）出現的次數，作為之後計算機率的依據。

In [9]:
from collections import Counter

class Counters:
    """Occurrence counts of character n-grams for every order 0..n.

    ``counters[k]`` (for k >= 1) maps each k-character substring seen in the
    fitted segments to the number of times it occurred. ``counters[0]`` maps
    the empty string to the total number of characters seen (repeated
    characters included).
    """

    def __init__(self, n):
        """Create n + 1 empty counters, one per order 0..n."""
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def fit(self, segments):
        """Add the n-gram counts of ``segments`` to the counters.

        The segments are traversed exactly once, so any iterable works,
        including one-shot iterators such as generators. Counts accumulate:
        calling ``fit`` again adds to the existing totals.

        Args:
            segments: Iterable of strings; n-grams never span two segments.

        Returns:
            The list of counters, indexed by order.
        """
        for segment in segments:
            if not segment:
                # Nothing to count; also avoids creating a zero-valued '' key.
                continue
            # Order 0: total number of characters, stored under the '' key.
            self.counters[0][''] += len(segment)
            for order in range(1, self.n + 1):
                self.counters[order].update(self._skip(segment, order))

        return self.counters

    def __getitem__(self, k):
        """Return the counter holding the k-character substrings."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Yield every contiguous substring of length ``n`` of ``segment``.

        Yields nothing when the segment is shorter than ``n`` (the range
        below is then empty).
        """
        assert n > 0
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [None]:
# Build the count tables for orders 0 through 3 from the corpus segments.
counters = Counters(n=3)
counters.fit(segments)

In [11]:
counters[0]
# Expected: Counter({'': 371373})
# NOTE(review): the recorded output of this cell shows 371370 instead;
# presumably the corpus file differs slightly from the one the expected
# value was taken from. Confirm.

Counter({'': 371370})

In [12]:
class Ngram:
    """Maximum-likelihood n-gram model built on pre-computed counts.

    ``major_counter`` holds the counts of the n-character strings and
    ``minor_counter`` the counts of the (n-1)-character strings, so the
    probability of an n-gram given its first n-1 characters is
    ``major_counter[ngram] / minor_counter[ngram[:-1]]``.
    """

    def __init__(self, n, counters):
        """Bind the model of order ``n`` to ``counters``.

        Args:
            n: Model order, 1 <= n <= counters.n.
            counters: Object exposing ``.n`` and ``counters[k]`` count tables.
        """
        # n < 1 used to be accepted and silently read counters[-1].
        assert 1 <= n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        """Rank the n-grams that continue ``prefix`` by probability.

        Only the last n-1 characters of ``prefix`` are used as context.

        Args:
            prefix: Text typed so far; must have at least n-1 characters.
            top_k: Number of candidates to return; a value <= 0 returns all.

        Returns:
            A list of ``(probability, ngram)`` tuples sorted by descending
            probability. For n >= 2 each ``ngram`` is the full n-character
            string (context plus next character), not just the next
            character. The list is empty when the context was never seen.
        """
        assert len(prefix) >= self.n - 1
        if self.n == 1:
            # A unigram has no context; normalise by the total character count.
            context = ''
            total = sum(self.major_counter.values())
        else:
            context = prefix[-(self.n - 1):]
            total = self.minor_counter[context]  # 0 for an unseen context

        if total == 0:
            # No probability is defined for this context; also avoids a
            # ZeroDivisionError if the two count tables are inconsistent.
            return []

        scored = [
            (count / total, ngram)
            for ngram, count in self.major_counter.items()
            if self.n == 1 or ngram[:-1] == context
        ]
        # Stable sort: equally likely candidates keep their counter order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored[:top_k] if top_k > 0 else scored

    def get_proba_dict(self, prefix=''):
        """Return every candidate for ``prefix`` as an ``{ngram: probability}`` dict."""
        return {word: prob for prob, word in self.predict_proba(prefix, top_k=-1)}


In [13]:
unigram = Ngram(1, counters)

In [14]:
unigram.predict_proba('我思')
# Expected: [(0.035732269174118744, '的'),
#            (0.012927703414087723, '國'),
#            (0.010620050461395955, '中'),
#            (0.009984570768472667, '在'),
#            (0.009852627950874188, '一')]

[(0.035732557826426474, '的'),
 (0.012927807846621966, '國'),
 (0.010620136252255163, '中'),
 (0.00998465142580176, '在'),
 (0.009852707542343216, '一')]

In [15]:
bigram = Ngram(2, counters)
trigram = Ngram(3, counters)

## 使用Ngram來建立第一版選字系統

In [16]:
class ChineseWordRecommenderV1:
    """First-version recommender that picks an n-gram model by prefix length."""

    def __init__(self, unigram, bigram, trigram):
        """Store the three models used for prefixes of length 0, 1 and 2+."""
        self.unigram = unigram
        self.bigram = bigram
        self.trigram = trigram

    def predict_proba(self, prefix='', top_k=5):
        """Delegate to the model matching the amount of available context.

        An empty prefix uses the unigram model, a one-character prefix the
        bigram model, and anything longer the trigram model. The chosen
        model's result is returned unchanged.
        """
        by_context_length = (self.unigram, self.bigram, self.trigram)
        # Prefixes of two or more characters all map to the trigram model.
        chosen = by_context_length[min(len(prefix), 2)]
        return chosen.predict_proba(prefix, top_k)


In [17]:
model = ChineseWordRecommenderV1(unigram, bigram, trigram)

In [18]:
probs = model.predict_proba('我思', top_k=10)
probs

[(0.75, '我思故'), (0.25, '我思維')]

## Demo

In [19]:
!pip install -U pip
!pip install -q ipywidgets



In [20]:
import ipywidgets as widgets

# Input box the user types into.
text = widgets.Textarea()
# The Label widget is useful if you need to build a custom description next to a control using similar styling to the built-in control descriptions.
label = widgets.Label()
# Show the suggestion label above the text area.
display(label, text)

def func(change):
    """Refresh the suggestion label whenever the text area's value changes.

    ``change`` is the payload passed by ``observe``; its ``new`` entry is the
    current text. The payload and its type are printed first (debug output),
    then the model's top 10 candidates for the new text are written to the
    label, separated by tabs.
    """
    print(change)
    print(type(change))
    print('------')
    # https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html
    # https://stackoverflow.com/questions/56286221/what-is-the-difference-between-bunch-and-dictionary-type-in-python
    # A Bunch lets its keys be read as attributes, hence `change.new` below.
    probs = model.predict_proba(change.new, top_k=10)
    label.value = ' ' + '\t'.join([word for prob, word in probs])

#  the observe method of the widget can be used to register a callback
text.observe(func, names='value')

Label(value='')

Textarea(value='')

{'name': 'value', 'old': '', 'new': '我', 'owner': Textarea(value='我'), 'type': 'change'}
<class 'traitlets.utils.bunch.Bunch'>
------
{'name': 'value', 'old': '我', 'new': '我的', 'owner': Textarea(value='我的'), 'type': 'change'}
<class 'traitlets.utils.bunch.Bunch'>
------
