In [None]:
!pip install torchdata==0.5.1 torchtext==0.14.1

In [3]:
import numpy as np
import torch
import torchtext

In [4]:
# AG News articles; each sample's label is the topic category of the article.
dataset_train_AG_NEWS, dataset_test_AG_NEWS = torchtext.datasets.AG_NEWS(
    root='./data/',
    split=('train', 'test'),
)
# Topic names. Presumably indexed by (label - 1), since the sample labels
# appear to be 1-based -- confirm against the dataset documentation.
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

In [7]:
# Peek at the first training sample: a (label, text) pair.
next(iter(dataset_train_AG_NEWS))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [13]:
# Materialise both splits as in-memory lists so they can be indexed and
# iterated over repeatedly.
dataset_train_AG_NEWS, dataset_test_AG_NEWS = (
    list(split) for split in (dataset_train_AG_NEWS, dataset_test_AG_NEWS)
)
dataset_train_AG_NEWS[0]

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [9]:
# Load the tokenizer.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# Alternative backend: torchtext.data.utils.get_tokenizer('spacy')
# Sanity check: the recorded output shows the text lower-cased and the
# punctuation split off into its own token.
tokenizer('Hello World!')

['hello', 'world', '!']

In [15]:
import collections

# Token frequencies over the training split only; labels are ignored.
# Tokens are counted in corpus order, so the Counter's insertion order is the
# order in which each token is first seen.
counter = collections.Counter(
    token
    for _label, text in dataset_train_AG_NEWS
    for token in tokenizer(text)
)

In [17]:
# The five most frequent tokens in the training split, with their counts.
counter.most_common(5)

[('.', 225971), ('the', 203843), (',', 165685), ('to', 119205), ('a', 110153)]

In [18]:
# Number of distinct tokens seen in the training split.
len(counter)

95810

In [29]:
# Build a torchtext Vocab from the token counts. It behaves like a dictionary
# that assigns an integer index to every token; the index is NOT a frequency
# rank. min_freq=1 keeps every token that was counted.
vocab = torchtext.vocab.vocab(counter, min_freq=1)
print(f"Vocab size if {len(vocab)}")
# Look tokens up directly on the Vocab: every get_stoi() call materialises the
# complete token -> index dict, which is wasteful for three single lookups.
print(vocab.get_itos()[50:53], ',', vocab['on'], ',', vocab['another'], ',', vocab['part'])

Vocab size if 95810
['on', 'another', 'part'] , 50 , 51 , 52


In [33]:
def encode(x):
    """Convert a text string into a list of vocabulary indices.

    The text is split with ``tokenizer`` and every token is mapped to its
    index in ``vocab``. Tokens that are not in the vocabulary (for example
    words that only occur in the test split) are skipped rather than raising
    ``KeyError``, so the result may be shorter than the token sequence.

    Args:
        x: Text to encode.

    Returns:
        List of integer indices, one per in-vocabulary token, in text order.
    """
    known_tokens = [token for token in tokenizer(x) if token in vocab]
    # One batched lookup. Calling vocab.get_stoi() per token would rebuild the
    # entire token -> index dict for every single token.
    return vocab.lookup_indices(known_tokens)
def decode(x):
    """Convert an iterable of vocabulary indices back into tokens.

    Args:
        x: Iterable of integer indices into ``vocab``.

    Returns:
        List of token strings, one per index, in the same order.

    Raises:
        IndexError: If an index is outside the vocabulary.
    """
    # get_itos() builds the full index -> token list, so fetch it once per
    # call instead of once per index.
    itos = vocab.get_itos()
    return [itos[i] for i in x]

# Round trip: encode a sentence to indices, then decode the indices back.
sample_ids = encode('I love to play with my words')
print(sample_ids)
print(decode(sample_ids))

[599, 3279, 97, 1220, 329, 225, 7368]
['i', 'love', 'to', 'play', 'with', 'my', 'words']


Count vectorization (bag-of-words)

In [36]:
# Example: scikit-learn's CountVectorizer on a toy corpus.
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'I like hot dogs.',
    'The dog ran fast.',
    'Its hot outside.',
]
# fit() returns the fitted vectorizer, so construction and fitting can chain.
vectorizer = CountVectorizer().fit(corpus)
print(vectorizer.vocabulary_)

query = ['My dog likes hot dogs on a hot day.']
print(vectorizer.transform(query).toarray())

# In the query: "dog" (column 0) occurs once, "dogs" (column 1) once and
# "hot" (column 3) twice; words outside the fitted vocabulary are not counted.

{'like': 5, 'hot': 3, 'dogs': 1, 'the': 8, 'dog': 0, 'ran': 7, 'fast': 2, 'its': 4, 'outside': 6}


array([[1, 1, 0, 2, 0, 0, 0, 0, 0]])

In [42]:
# Count vectorization: turn a text into a bag-of-words count vector.
def to_bow(text, bow_vocab_size=None):
    """Build a bag-of-words count vector for ``text``.

    Args:
        text: Text to vectorize; it is converted to indices with ``encode``.
        bow_vocab_size: Length of the returned vector. Defaults to the current
            ``len(vocab)``. Indices >= this size are ignored; note that vocab
            indices are not frequency-ordered, so a smaller size does not
            select the most frequent tokens.

    Returns:
        1-D tensor of length ``bow_vocab_size`` whose i-th entry is the number
        of times the token with index i occurs in ``text``.
    """
    if bow_vocab_size is None:
        # Resolved at call time: a `len(vocab)` default in the signature is
        # frozen when the function is defined and silently goes stale if
        # `vocab` is rebuilt without re-running this cell.
        bow_vocab_size = len(vocab)
    res = torch.zeros(bow_vocab_size)
    for i in encode(text):
        if i < bow_vocab_size:
            res[i] += 1
    return res

# Show one training article as raw text, as vocabulary indices and as a
# bag-of-words vector.
example = dataset_train_AG_NEWS[128][1]
print(example)
example_ids = encode(example)
print(example_ids)
example_bow = to_bow(example)
print(example_bow)

Space Science Pioneer Van Allen Questions Human Spaceflight (SPACE.com) SPACE.com - A leading space scientist has called to question the validity of human spaceflight, suggesting that sending astronauts outward from Earth is outdated, too costly, and the science returned is trivial.
[1509, 1469, 1976, 1977, 1978, 1979, 1585, 1980, 9, 1509, 2, 168, 11, 1509, 2, 168, 12, 36, 1082, 1509, 1873, 35, 1778, 97, 1852, 7, 1981, 19, 1585, 1980, 14, 1982, 260, 1983, 1492, 1984, 78, 1833, 179, 1985, 14, 1136, 1986, 14, 41, 7, 1469, 1987, 179, 1988, 2]
tensor([0., 0., 3.,  ..., 0., 0., 0.])


DataLoader

In [45]:
from torch.utils.data import DataLoader

# Default collation: batches of 4 raw (label, text) samples.
dataloader_train_AG_NEWS = DataLoader(dataset_train_AG_NEWS, batch_size=4, shuffle=True)
# The evaluation split is not shuffled: shuffling adds nothing for testing and
# makes the iteration order differ from run to run.
dataloader_test_AG_NEWS = DataLoader(dataset_test_AG_NEWS, batch_size=4, shuffle=False)

In [47]:
# Pull one batch to inspect what the default collation produces (the recorded
# output shows a tensor of labels and a tuple of raw text strings).
c = next(iter(dataloader_train_AG_NEWS))
c

[tensor([3, 4, 2, 2]),
 ("Survey: Incentives dim US cars' image  DETROIT -- The cash rebates and financing deals that Detroit's Big Three automakers have used to drive business in recent years are diminishing the value of their vehicles in the eyes of consumers, a new quarterly survey of customer satisfaction indicates.",
  'Macromedia upgrading Flex presentation server Macromedia on Monday will introduce an upgrade to its Flex presentation server and framework for building rich Internet applications, with improvements in data display, visualization, styling, and performance.',
  'Schwartzel has a very bright future He is so young and so good that the mere sight of him hitting a golf ball might be enough to induce an older player to give up, go into a dark room and babble quietly to himself.',
  'Victorious Vaughan on the right path to glory Michael Vaughan has endured a lean time in one-day cricket but show him an Australian cricketer and he is a man transformed. Yesterday, at Edgbast

In [48]:
# A collate_fn lets the loader emit batches in whatever form is needed
# (most often used for padding when the samples do not have a fixed length).

def bowify(b):
    """Collate a batch of (label, text) samples into model-ready tensors.

    Args:
        b: Sequence of samples, each indexable as (label, text).

    Returns:
        Tuple ``(features, labels)``: ``features`` stacks the ``to_bow``
        vector of every text along a new first dimension; ``labels`` is a
        LongTensor holding each label minus 1 (presumably because the dataset
        labels are 1-based -- confirm).
    """
    bow_rows = [to_bow(sample[1]) for sample in b]
    shifted_labels = [sample[0] - 1 for sample in b]
    return torch.stack(bow_rows), torch.LongTensor(shifted_labels)
# Rebuild the loaders so that every batch is collated by bowify.
dataloader_train_AG_NEWS = DataLoader(dataset_train_AG_NEWS, batch_size=4, collate_fn=bowify, shuffle=True)
# The evaluation split is not shuffled: shuffling adds nothing for testing and
# makes the iteration order differ from run to run.
dataloader_test_AG_NEWS = DataLoader(dataset_test_AG_NEWS, batch_size=4, collate_fn=bowify, shuffle=False)
# Pull one collated batch to check its structure.
c = next(iter(dataloader_train_AG_NEWS))
c

(tensor([[0., 0., 3.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 5.,  ..., 0., 0., 0.],
         [0., 0., 4.,  ..., 0., 0., 0.]]),
 tensor([2, 3, 3, 3]))

In [49]:
# Shape of the batch's feature tensor (recorded output: 4 x 95810, matching
# the batch size and the vocabulary size printed earlier).
c[0].shape

torch.Size([4, 95810])