<a href="https://colab.research.google.com/github/abs-git/NLP/blob/main/English_Vocabrary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# glue data load

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'

with open(path + '/glue_train.txt', 'r') as f:
  train_sentences = f.readlines()

for i, sen in enumerate(train_sentences):
  train_sentences[i] = sen.rstrip("\n")

with open(path + '/glue_test.txt', 'r') as f:
  test_sentences = f.readlines()

for i, sen in enumerate(test_sentences):
  test_sentences[i] = sen.rstrip("\n")

print("train : {}".format(len(train_sentences)))
print("test : {}".format(len(test_sentences)))

print(train_sentences[:2])
print(test_sentences[:2])


train : 8551
test : 1063
["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up."]
['Bill whistled past the house.', 'The car honked its way down the road.']


In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# tokenizing & cleaning
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

# Word tokenizing
sen_to_tokens = []
for sen in train_sentences[:3]:
  sen_to_tokens.append(TreebankWordTokenizer().tokenize(sen))

print(sen_to_tokens)


# cleaning
stopwords = set(stopwords.words('english'))

clean_tokens = []
for sen in sen_to_tokens:
  for t in sen:
    if t not in stopwords:
      clean_tokens.append(t)

print(clean_tokens)



[['Our', 'friends', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.'], ['One', 'more', 'pseudo', 'generalization', 'and', 'I', "'m", 'giving', 'up', '.'], ['One', 'more', 'pseudo', 'generalization', 'or', 'I', "'m", 'giving', 'up', '.']]
['Our', 'friends', 'wo', "n't", 'buy', 'analysis', ',', 'let', 'alone', 'next', 'one', 'propose', '.', 'One', 'pseudo', 'generalization', 'I', "'m", 'giving', '.', 'One', 'pseudo', 'generalization', 'I', "'m", 'giving', '.']


In [None]:
# Vocabrary
from nltk import FreqDist

import numpy as np
from collections import Counter


# 단어들의 빈도수를 계산한다.
vocab = FreqDist(np.hstack(clean_tokens)) 
vocab = dict(vocab)


# 빈도수가 높은 단어들만 원하는 수 (vocab size) 만큼 불러온다.
vocab_size = 10
filtered_vocab = Counter(vocab).most_common(vocab_size)



# token에 index 부여
word_to_index = {}
word_to_index['pad'] = 0                  # padding을 위해 'pad'를 생성
word_to_index['unk'] = 1                  # vocab에 없는 token은 'unk'

for index, word in enumerate(filtered_vocab):
  word_to_index[word[0]] = index + 2



# Sentence를 index 조합으로 구성
sen_to_index = []
for sen in sen_to_tokens:
  temp = []
  for token in sen:
    try:
      temp.append(word_to_index[token])
    except:
      temp.append(word_to_index['unk'])

  sen_to_index.append(temp)

# padding
max_len = 0
for sen in sen_to_index:
  if max_len <= len(sen):
    max_len = len(sen)

for sen in sen_to_index:
  if len(sen) < max_len:
    sen += [word_to_index['pad']] * (max_len - len(sen))



print('vocab freq      : {}'.format(vocab))
print('vocab filtering : {}'.format(filtered_vocab))
print('word to index   : {}'.format(word_to_index))
print('sentence        : {}'.format(sen_to_tokens))
print('sen to index    : {}'.format(sen_to_index))
print('padded sentence : {}'.format(sen_to_index))


vocab freq      : {'Our': 1, 'friends': 1, 'wo': 1, "n't": 1, 'buy': 1, 'analysis': 1, ',': 1, 'let': 1, 'alone': 1, 'next': 1, 'one': 1, 'propose': 1, '.': 3, 'One': 2, 'pseudo': 2, 'generalization': 2, 'I': 2, "'m": 2, 'giving': 2}
vocab filtering : [('.', 3), ('One', 2), ('pseudo', 2), ('generalization', 2), ('I', 2), ("'m", 2), ('giving', 2), ('Our', 1), ('friends', 1), ('wo', 1)]
word to index   : {'pad': 0, 'unk': 1, '.': 2, 'One': 3, 'pseudo': 4, 'generalization': 5, 'I': 6, "'m": 7, 'giving': 8, 'Our': 9, 'friends': 10, 'wo': 11}
sentence        : [['Our', 'friends', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.'], ['One', 'more', 'pseudo', 'generalization', 'and', 'I', "'m", 'giving', 'up', '.'], ['One', 'more', 'pseudo', 'generalization', 'or', 'I', "'m", 'giving', 'up', '.']]
sen to index    : [[9, 10, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2], [3, 1, 4, 5, 1, 6, 7, 8, 1, 2, 0, 0, 0, 0, 0, 0], [3, 1, 4, 5, 1, 6, 7, 8,

In [None]:
# tensorflow(keras) vocabrary

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer(num_words = 100, oov_token = 'OOV')           # num_words는 vocab size이고, oov_token은 위의 'unk'와 같다.
tokenizer.fit_on_texts(train_sentences[:3])                         # train sentences에 대한 문장들 내의 단어들에 인덱스를 부여한다. 구두점 (, . ! ?)은 제외된다.

word_index = tokenizer.word_index
train_sen_to_index= tokenizer.texts_to_sequences(train_sentences[:3])        # train setences를 index 조합으로 구성한다.
padded_train_sen = pad_sequences(train_sen_to_index, padding = 'post')      # padding

test_sen_to_index = tokenizer.texts_to_sequences(test_sentences[:3])


print("train sentences              : {}".format(train_sentences[:3]))
print("test sentences               : {}".format(test_sentences[:3]))
print()

print("word index                   : {}".format(word_index))
print("train sentences to index seq : {}".format(train_sen_to_index))
print("test sentences to index seq  : {}".format(test_sen_to_index))
print()

print("padded train sentences : {}".format(padded_train_sen))
print()

print("Encoding samples")
print(train_sentences[0], "->", train_sen_to_index[0])
print(test_sentences[0], "->", test_sen_to_index[0])

train sentences              : ["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up.", "One more pseudo generalization or I'm giving up."]
test sentences               : ['Bill whistled past the house.', 'The car honked its way down the road.', 'Bill pushed Harry off the sofa.']

word index                   : {'OOV': 1, 'one': 2, 'more': 3, 'pseudo': 4, 'generalization': 5, "i'm": 6, 'giving': 7, 'up': 8, 'our': 9, 'friends': 10, "won't": 11, 'buy': 12, 'this': 13, 'analysis': 14, 'let': 15, 'alone': 16, 'the': 17, 'next': 18, 'we': 19, 'propose': 20, 'and': 21, 'or': 22}
train sentences to index seq : [[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 2, 19, 20], [2, 3, 4, 5, 21, 6, 7, 8], [2, 3, 4, 5, 22, 6, 7, 8]]
test sentences to index seq  : [[1, 1, 1, 17, 1], [17, 1, 1, 1, 1, 1, 17, 1], [1, 1, 1, 1, 17, 1]]

padded train sentences : [[ 9 10 11 12 13 14 15 16 17 18  2 19 20]
 [ 2  3  4  5 21  6  7  8  0  0  0  0  0]


In [92]:
# torchtext 0.12.0 version

from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from collections import Counter, OrderedDict

tokenizer = get_tokenizer("basic_english")

tokens = []
for sen in train_sentences[:3]:
  sen_to_tokens = tokenizer(sen)
  tokens.extend(sen_to_tokens)

tokens_counter = Counter(tokens)

sorted_tokens_dict = sorted(tokens_counter.items(), key = lambda x : x[1], reverse=True)

ordered_dict = OrderedDict(sorted_tokens_dict)

vocab = vocab(ordered_dict, specials=['pad','unk'])       # 'pad' = 0, 'unk' = 1

token_to_index = vocab.get_stoi()
token_to_index = sorted(token_to_index.items(), key = lambda x : x[1])


print('tokens         : {}'.format(tokens))
print('sorted by freq : {}'.format(sorted_tokens_dict))
print('ordered dict   : {}'.format(ordered_dict))
print('token to index : {}'.format(token_to_index))


tokens         : ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', 'one', 'more', 'pseudo', 'generalization', 'and', 'i', "'", 'm', 'giving', 'up', '.', 'one', 'more', 'pseudo', 'generalization', 'or', 'i', "'", 'm', 'giving', 'up', '.']
sorted by freq : [("'", 3), ('one', 3), ('.', 3), ('more', 2), ('pseudo', 2), ('generalization', 2), ('i', 2), ('m', 2), ('giving', 2), ('up', 2), ('our', 1), ('friends', 1), ('won', 1), ('t', 1), ('buy', 1), ('this', 1), ('analysis', 1), (',', 1), ('let', 1), ('alone', 1), ('the', 1), ('next', 1), ('we', 1), ('propose', 1), ('and', 1), ('or', 1)]
ordered dict   : OrderedDict([("'", 3), ('one', 3), ('.', 3), ('more', 2), ('pseudo', 2), ('generalization', 2), ('i', 2), ('m', 2), ('giving', 2), ('up', 2), ('our', 1), ('friends', 1), ('won', 1), ('t', 1), ('buy', 1), ('this', 1), ('analysis', 1), (',', 1), ('let', 1), ('alone', 1), ('the', 1), ('next', 1), ('we', 1), ('propose',