## 作業目標：搭建一個bag of words模型

---

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd "./drive/My Drive/NLP/day12"

/content/drive/My Drive/NLP/day12


In [3]:
!ls

'Day12- bag of words作業.ipynb'   practice.ipynb   Restaurant_Reviews.tsv


In [4]:
import pandas as pd
import nltk
# download package punkt to use word_tokenize
nltk.download()
import numpy as np
dataset=pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t',quoting=3)
corpus=dataset['Review'].values

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt
    Downloading package punkt to /root/nltk_data...
      Package punkt is already up-to-date!

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


### 從文本中取出所有單字

In [5]:
whole_words = []
for sentence in corpus:
    """
    tokenize(s)
    Return: a tokenized copy of s.  
    Return type:list of str
    """
    tokenized_sentence = nltk.word_tokenize(sentence)  
    for word in tokenized_sentence:
        whole_words.append(word)

In [6]:
print(len(whole_words))

12676


### 移除重複單字

In [7]:
# ref:https://stackoverflow.com/questions/46839277/series-unique-vs-list-of-set-performance
whole_words = set(whole_words)
print('共有{}個單字'.format(len(whole_words)))

共有2356個單字


### 建立字典使每一個單字有對應數值

In [8]:
# 不是做詞頻字典，而是做label encoding
# 如果建立詞頻字典的話，每個詞頻都是1(前面有使用np.unique)
word_index = {}
index_word = {}

for i , word in enumerate(whole_words):
    word_index[word] = i
    index_word[i] = word

In [9]:
word_index

{'tables': 0,
 'passed': 1,
 'ended': 2,
 'salads': 3,
 'fireball': 4,
 'bit': 5,
 'beef': 6,
 'desserts': 7,
 'working/eating': 8,
 'Appetite': 9,
 'peanut': 10,
 'Fantastic': 11,
 'colder': 12,
 'tops': 13,
 'Flower': 14,
 'turn': 15,
 'Sauce': 16,
 'Anyways': 17,
 'piano': 18,
 'shops': 19,
 'cafe': 20,
 'hello': 21,
 'fabulous': 22,
 'vegetarian': 23,
 'showed': 24,
 'any': 25,
 'twice': 26,
 "n't": 27,
 'drawing': 28,
 'away': 29,
 'wash': 30,
 'fare': 31,
 'combos': 32,
 'THIS': 33,
 'blame': 34,
 'is': 35,
 'STALE': 36,
 'said': 37,
 'just': 38,
 'anything': 39,
 'nude': 40,
 'ventilation': 41,
 'each': 42,
 'Phoenix': 43,
 'tepid': 44,
 'Mary': 45,
 'moz': 46,
 'trap': 47,
 'before': 48,
 'serve': 49,
 'pho': 50,
 'use': 51,
 'covered': 52,
 'gem': 53,
 'plastic': 54,
 'expert/connisseur': 55,
 'disgraceful': 56,
 'lukewarm': 57,
 'buying': 58,
 'does': 59,
 'way': 60,
 'patty': 61,
 'recommendation': 62,
 'nigiri': 63,
 'Why': 64,
 'street': 65,
 '%': 66,
 'burned': 67,
 'frus

In [10]:
index_word

{0: 'tables',
 1: 'passed',
 2: 'ended',
 3: 'salads',
 4: 'fireball',
 5: 'bit',
 6: 'beef',
 7: 'desserts',
 8: 'working/eating',
 9: 'Appetite',
 10: 'peanut',
 11: 'Fantastic',
 12: 'colder',
 13: 'tops',
 14: 'Flower',
 15: 'turn',
 16: 'Sauce',
 17: 'Anyways',
 18: 'piano',
 19: 'shops',
 20: 'cafe',
 21: 'hello',
 22: 'fabulous',
 23: 'vegetarian',
 24: 'showed',
 25: 'any',
 26: 'twice',
 27: "n't",
 28: 'drawing',
 29: 'away',
 30: 'wash',
 31: 'fare',
 32: 'combos',
 33: 'THIS',
 34: 'blame',
 35: 'is',
 36: 'STALE',
 37: 'said',
 38: 'just',
 39: 'anything',
 40: 'nude',
 41: 'ventilation',
 42: 'each',
 43: 'Phoenix',
 44: 'tepid',
 45: 'Mary',
 46: 'moz',
 47: 'trap',
 48: 'before',
 49: 'serve',
 50: 'pho',
 51: 'use',
 52: 'covered',
 53: 'gem',
 54: 'plastic',
 55: 'expert/connisseur',
 56: 'disgraceful',
 57: 'lukewarm',
 58: 'buying',
 59: 'does',
 60: 'way',
 61: 'patty',
 62: 'recommendation',
 63: 'nigiri',
 64: 'Why',
 65: 'street',
 66: '%',
 67: 'burned',
 68: '

## 轉換句子為bag of words型式

In [11]:
def _get_bag_of_words_vector(sentence, word_index_dic, whole_words):
    sentence = sentence
    '''創建一個vector'''
    vector = np.zeros(shape=(len(whole_words)))
    for word in nltk.word_tokenize(sentence):
        if word in whole_words:
            id = word_index_dic.get(word)
            vector[id] = vector[id] + 1
    return vector


In [12]:
_get_bag_of_words_vector('Wow... Loved this place.', word_index, whole_words)

array([0., 0., 0., ..., 0., 0., 0.])