In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd './drive/My Drive/NLP/day38'

/content/drive/My Drive/NLP/day38


In [3]:
#題目: 將某篇文章以上下文相同，比方三連詞(trigram)方式修改內容
#說明：某篇文章中我們可以找出所有的三連詞(trigram)，以及在前字與後字出現時，
#按照出現度隨機選出一個字去換掉中間字，這是利用三連詞修改文章內容的最基本作法。
#一旦字典的資料結構建立，我們就以某種機率(比方20%)去置換原文，並將置換文與原文印出來

#延伸: 可用五連詞或七連詞去取代中間字，可利用三連詞之前兩字去更換第三字，
#可增加詞性的相同性(Part of Speech)提高可讀性，甚至使用 Word2Vec、GloVe 或 RNN 等模型。

#範例程式檔名: article_modifier_自動文件修改器.py。
#模組: sklearn, random, numpy, nltk, bs4
#輸入檔：./electronics/positive.review
#成績：被置換文的合理性與可讀性


# 使用三連詞 trigrams 練習簡易文件產生器
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [4]:
import nltk
import random
import numpy as np

from bs4 import BeautifulSoup

In [5]:
# Load the reviews.
# Read through a context manager so the file handle is closed right after
# its contents have been parsed.
with open('positive(作業數據).review', encoding='ISO-8859-1') as review_file:
    reviews_soup = BeautifulSoup(review_file.read(), "lxml")
# find_all is the current spelling of the deprecated findAll alias.
positive_reviews = reviews_soup.find_all('review_text')

In [6]:
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' package and older ones from 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Extract every trigram and store it in a dictionary:
# the key is (first word, third word), the value is the list of middle words
# seen between that pair (duplicates are kept so they can be counted later).
trigrams = {}
for review in positive_reviews:
    tokens = nltk.word_tokenize(review.text.lower())
    # Walk the token list as overlapping (left, middle, right) windows.
    for left, middle, right in zip(tokens, tokens[1:], tokens[2:]):
        trigrams.setdefault((left, right), []).append(middle)

In [8]:
# Turn each list of middle words into a probability table.
# Only keys with more than one distinct middle word are kept; each value is a
# list of (word, probability) pairs sorted by ascending probability.
trigram_probabilities = {}
for key, middle_words in iteritems(trigrams):
    if len(set(middle_words)) <= 1:
        # A single possible middle word leaves nothing to substitute.
        continue
    # Build a word -> count dictionary.
    counts = {}
    for word in middle_words:
        counts[word] = counts.get(word, 0) + 1
    total = len(middle_words)
    # Normalise counts to relative frequencies.
    weighted = [(word, float(count) / total) for word, count in counts.items()]
    weighted.sort(key=lambda pair: pair[1])
    trigram_probabilities[key] = weighted

In [12]:
def random_sample(d):
    """Draw one word at random, weighted by its probability.

    Args:
        d: The candidate words, either as an iterable of
            ``(word, probability)`` pairs or as a ``{word: probability}``
            dict. The probabilities are expected to sum to roughly 1.

    Returns:
        The first word whose running (cumulative) probability exceeds a
        uniform draw from ``random.random()``. If no running total exceeds
        the draw -- floating-point rounding can leave the final total just
        below 1.0 -- the last word is returned instead of ``None``.
        Returns ``None`` only when ``d`` is empty.
    """
    pairs = d.items() if isinstance(d, dict) else d
    r = random.random()
    cumulative = 0
    w = None
    for w, p in pairs:
        cumulative += p
        if r < cumulative:
            return w
    # The running total never exceeded r: fall back to the last word seen.
    return w

In [13]:
def test_spinner(replace_prob=0.2):
    """Print a random review followed by a "spun" copy of it.

    A review is picked at random from ``positive_reviews``, lower-cased and
    tokenized. Each position that has both a left and a right neighbour is
    considered for replacement with probability ``replace_prob``; when the
    (left word, right word) pair is a key of ``trigram_probabilities``, the
    middle word is replaced by one drawn with ``random_sample``.

    Lookups read the current token list, so a word replaced at one position
    can serve as context for a later replacement.

    Args:
        replace_prob: Chance, between 0 and 1, of attempting a replacement at
            each position. Defaults to 0.2 (20%).

    Returns:
        None. The original and the spun text are printed.
    """
    review = random.choice(positive_reviews)
    s = review.text.lower()
    print("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        if random.random() < replace_prob:
            k = (tokens[i], tokens[i+2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                # Keep the original word if the sampler produced nothing;
                # a None token would make the join below raise TypeError.
                if w is not None:
                    tokens[i+1] = w
    print("Spun:")
    # Re-attach punctuation that the tokenizer split off into separate tokens.
    print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))

In [16]:
# Run one spin demo when this code executes as the main module.
if __name__ == '__main__':
    test_spinner()

Original: 
i just recieved my hdmi cable and am very impressed. the price is just what it should be about $5 and makes me wonder how somebody would spend over $100 for this cable at a store. the service was excellent and the cable arrived in 4 days! i highly recommend this cable. i just plugged it into my cable box and the other end into the tv and wow what a great picture all around. the color is just so much more vivid using hdmi compared to component 3 wire connectors. get this cable for your system and stay away from those high priced others

Spun:
i just removed my hdmi cable and am very impressed. the price is just what it should know about $5 and makes you wonder how those would spend over $100 for this cable at a store. the problem was leveling and regular cable arrived in 4 days! i highly recommend this cable. i just plug it into my cable box and the other end into the tv and wow what a great picture all around. the color is nice so much more vivid using hdmi compared to have 