In [1]:
import pandas as pd
import re
import jieba
import os
from hanziconv import HanziConv

# get the text from Wikipedia dataset

In [2]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

TEXT = ""
dirname = 'D:/Machine Learning/NLP/data/zhwiki_partial'
for root, dirs, files in os.walk(dirname):
    for filename in files:
        file_path = root + '/' + filename
        for line in open(file_path, encoding='utf8'):
            sline = line.strip()
            if sline == "": continue
            rline = cleanhtml(sline)
            TEXT += ' '.join(re.findall('[\w|\d]+', HanziConv.toSimplified(rline)))

In [3]:
ALL_TOKENS = list(jieba.cut(TEXT))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.289 seconds.
Prefix dict has been built succesfully.


In [4]:
len(ALL_TOKENS)

20085646

In [5]:
valid_tokens = [t for t in ALL_TOKENS if t.strip()]

In [6]:
len(valid_tokens)

16916508

# 1-Gram model

In [7]:
from collections import Counter
from functools import reduce

In [8]:
words_count = Counter(valid_tokens)

In [9]:
words_count.most_common(10)

[('的', 984482),
 ('在', 255052),
 ('年', 205285),
 ('是', 182671),
 ('和', 157701),
 ('了', 125501),
 ('为', 115853),
 ('与', 91322),
 ('有', 87614),
 ('月', 85988)]

In [10]:
frequencies_all = [f for w, f in words_count.most_common()]
frequencies_sum = sum(frequencies_all)

In [11]:
frequencies_sum

16916508

In [12]:
def get_prob(word):
    esp = 1 / frequencies_sum
    if word in words_count:
        return words_count[word] / frequencies_sum
    else:
        return esp

def product(numbers):
    return reduce(lambda n1, n2: n1 * n2, numbers)

def language_model_one_gram(string):
    words = list(jieba.cut(string))
    return product([get_prob(w) for w in words])

In [18]:
language_model_one_gram('数学是有趣的学科')

2.2080963111330184e-16

In [14]:
language_model_one_gram('我晚上去上课')

6.473819612522043e-17

In [15]:
language_model_one_gram('长征火箭下周发射')

1.4097179320104287e-20

In [19]:
sentences = """
这是一个比较正常的句子
这个一个比较罕见的句子
小明毕业于清华大学
小明毕业于秦华大学
""".split()

In [20]:
for s in sentences:
    print(s, language_model_one_gram(s))

这是一个比较正常的句子 6.040556831807112e-21
这个一个比较罕见的句子 9.5917256504427e-21
小明毕业于清华大学 4.883891421472945e-18
小明毕业于秦华大学 5.86043206824639e-24


In [21]:
need_compared = [
    "今天晚上请你吃大餐，我们一起吃日料 明天晚上请你吃大餐，我们一起吃苹果",
    "真事一只好看的小猫 真是一只好看的小猫",
    "我去吃火锅，今晚 今晚我去吃火锅"
]

for s in need_compared:
    s1, s2 = s.split()
    p1, p2 = language_model_one_gram(s1), language_model_one_gram(s2)
    
    better = s1 if p1 > p2 else s2
    
    print('{} is more possible'.format(better))
    print('-'*4 + ' {} with probility {}'.format(s1, p1))
    print('-'*4 + ' {} with probility {}'.format(s2, p2))

明天晚上请你吃大餐，我们一起吃苹果 is more possible
---- 今天晚上请你吃大餐，我们一起吃日料 with probility 9.341578703953339e-54
---- 明天晚上请你吃大餐，我们一起吃苹果 with probility 4.5342585863035046e-52
真是一只好看的小猫 is more possible
---- 真事一只好看的小猫 with probility 4.570949067777483e-26
---- 真是一只好看的小猫 with probility 3.694664498661956e-23
今晚我去吃火锅 is more possible
---- 我去吃火锅，今晚 with probility 6.567623455714128e-28
---- 今晚我去吃火锅 with probility 1.1110125472957569e-20


# 2-Gram model

In [22]:
all_2_grams_words = [''.join(valid_tokens[i:i+2]) for i in range(len(valid_tokens[:-2]))]

In [23]:
_2_gram_sum = len(all_2_grams_words)
_2_gram_counter = Counter(all_2_grams_words)

def get_combination_prob(w1, w2):
    if w1 + w2 in _2_gram_counter:
        return _2_gram_counter[w1+w2] / _2_gram_sum
    else:
        return 1 / _2_gram_sum
    
def get_prob_2_gram(w1, w2):
    return get_combination_prob(w1, w2) / get_prob(w1)

def language_model_of_2_gram(sentence):
    sentences_probability = 1
    words = list(jieba.cut(sentence))
    for i, word in enumerate(words):
        if i == 0:
            prob = get_prob(word)
        else:
            previous = words[i - 1]
            prob = get_prob_2_gram(previous, word)
        sentences_probability *= prob
    return sentences_probability

In [25]:
language_model_of_2_gram('小明今天抽奖抽到一台苹果手机')

1.5471442945043766e-18

In [27]:
language_model_of_2_gram('自然语言处理取得重大进展')

1.307665875739169e-12

In [35]:
need_compared = [
    "我喜欢学习算法 算法学习喜欢我",
    "今天晚上请你吃大餐，我们一起吃日料 明天晚上请你吃大餐，我们一起吃苹果",
    "真事一只好看的小猫 真是一只好看的小猫",
    "今晚我去吃火锅 今晚火锅去吃我",
    "我想去喝奶茶 奶茶想去我喝"
]

for s in need_compared:
    s1, s2 = s.split()
    p1, p2 = language_model_of_2_gram(s1), language_model_of_2_gram(s2)
    
    better = s1 if p1 > p2 else s2
    
    print('{} is more possible'.format(better))
    print('-'*4 + ' {} with probility {}'.format(s1, p1))
    print('-'*4 + ' {} with probility {}'.format(s2, p2))

我喜欢学习算法 is more possible
---- 我喜欢学习算法 with probility 1.4204430920006666e-12
---- 算法学习喜欢我 with probility 3.580948971430252e-14
今天晚上请你吃大餐，我们一起吃日料 is more possible
---- 今天晚上请你吃大餐，我们一起吃日料 with probility 2.83516590784376e-26
---- 明天晚上请你吃大餐，我们一起吃苹果 with probility 5.670331815687517e-27
真是一只好看的小猫 is more possible
---- 真事一只好看的小猫 with probility 9.760013771756896e-20
---- 真是一只好看的小猫 with probility 1.989090571518405e-16
今晚我去吃火锅 is more possible
---- 今晚我去吃火锅 with probility 2.858776967945884e-14
---- 今晚火锅去吃我 with probility 1.0803097640123967e-15
我想去喝奶茶 is more possible
---- 我想去喝奶茶 with probility 1.8369059633459546e-13
---- 奶茶想去我喝 with probility 3.4782342656190107e-17


# Q: If we need to solve following problems, how can language model help us?
* Voice Recognization.
* Sogou pinyin input.
* Auto correction in search engine.
* Abnormal Detection.

Ans: 在语义识别、搜狗拼音输入检测、搜索引擎自动纠错、异常检测等任务中，语言模型可以量化语句出现的概率即该语句正常出现的可能性，辅助系统提供合理程度最高的语句推荐给用户，供用户选择。

# Q: Compared to the previous learned parsing and pattern match problems. What's the advantage and disavantage of Probability Based Methods?

Ans: 基于概率的模型简单直观易理解，解决了基于模式匹配方法不能解决的复杂问题。