In [1]:
import tensorflow_datasets as tfds
import urllib.request
import pandas as pd

In [2]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x7fe5f6afb0f0>)

In [3]:
train_df = pd.read_csv('IMDb_Reviews.csv')

In [4]:
train_df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [6]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(train_df['review'], target_vocab_size=3000)

In [11]:
print(tokenizer.subwords[:100])

['the_', ', ', '. ', 's_', 'a_', 'and_', 'of_', 'to_', 'is_', 'br', 'in_', 'I_', 'that_', 'this_', 'it_', 't_', ' /><', ' />', 'was_', 'ing_', 'The_', 'd_', 'as_', 'with_', 'ed_', 'for_', 'y_', 'on_', '.<', 'e_', 'but_', 'movie_', ' (', 'are_', 'ly_', 'it', 'have_', 'his_', 'be_', 'film_', 'not_', 'an_', ' "', 'you_', 'at_', 'one_', 'by_', 'he_', 'or_', 'who_', 'from_', '" ', 'like_', 'all_', 'they_', 'so_', ') ', 'just_', 'has_', 'about_', 'her_', 'n_', 'out_', 'This_', 'ing', 'r_', 'some_', 'film', 'movie', 'es_', 'very_', 'more_', 'er_', 've_', 'on', 'm_', 'It_', 'what_', 'would_', 'ed', 'up_', 'when_', 'if_', 'good_', 'my_', 'which_', 'their_', 'only_', 'can_', '? ', 'even_', 'really_', 'had_', 'l_', 'no_', '! ', 'see_', 'were_', 'er', 'al_']


In [13]:
sample = "My family and I normally do not watch local movies for the simple reason that they are poorly made"

In [14]:
tokenized = tokenizer.encode(sample)
# 정수 인코딩
tokenized

[819,
 886,
 6,
 12,
 818,
 503,
 35,
 129,
 41,
 203,
 2185,
 2756,
 223,
 26,
 1,
 2057,
 2756,
 1066,
 13,
 55,
 34,
 2032,
 35,
 1717]

In [15]:
# 문자 디코딩
original = tokenizer.decode(tokenized)
original

'My family and I normally do not watch local movies for the simple reason that they are poorly made'

In [16]:
for ts in tokenized:
  print('{} ----> {}'.format(ts, tokenizer.decode([ts])))

819 ----> My 
886 ----> family 
6 ----> and 
12 ----> I 
818 ----> nor
503 ----> mal
35 ----> ly 
129 ----> do 
41 ----> not 
203 ----> watch 
2185 ----> local
2756 ---->  
223 ----> movies 
26 ----> for 
1 ----> the 
2057 ----> simple
2756 ---->  
1066 ----> reason 
13 ----> that 
55 ----> they 
34 ----> are 
2032 ----> poor
35 ----> ly 
1717 ----> made


In [17]:
sample2 = "My family and I normally do not watch local movies for the simple reasonxyz that they are poorly made"

In [18]:
tokenized = tokenizer.encode(sample2)
tokenized

[819,
 886,
 6,
 12,
 818,
 503,
 35,
 129,
 41,
 203,
 2185,
 2756,
 223,
 26,
 1,
 2057,
 2756,
 1348,
 2844,
 2845,
 2037,
 13,
 55,
 34,
 2032,
 35,
 1717]

In [19]:
original = tokenizer.decode(tokenized)
original

'My family and I normally do not watch local movies for the simple reasonxyz that they are poorly made'

In [20]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")

('ratings_train.txt', <http.client.HTTPMessage at 0x7fe5edfbc780>)

In [21]:
train_data = pd.read_table('ratings_train.txt')

In [22]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [23]:
train_data.isnull().sum()

id          0
document    5
label       0
dtype: int64

In [24]:
train_data = train_data.dropna(how = 'any')

In [25]:
train_data.isnull().sum()

id          0
document    0
label       0
dtype: int64

In [26]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(train_data['document'], target_vocab_size=3000)

In [27]:
print(tokenizer.subwords[:100])

['. ', '이_', '의_', '영화', '다', '을_', '..', '도_', '는_', '...', '이', '은_', '에_', '고_', '가_', '한_', ', ', '고', '지', '를_', '가', '아', '.. ', '어', '게_', '나', '기', '너무_', '정말_', '리', '영화_', '로_', '다_', '스', '도', '요', '지_', '시', '만_', '자', '... ', '진짜_', '한', '서_', '과_', '대', '안', '으로_', '로', '정', '하', '보', '오', '사', '마', '라', '인', '일', '네', '그', '면_', '해', '? ', '음', '에', '하고_', '주', '전', '네요', '와_', '드', '서', '에서_', '수', '만', '우', '부', '니', '성', '미', '는', '비', '무', '나_', '의', '내', '게', '조', '수_', '개', '....', '상', '하는_', '장', '구', '습니다', '거', '은', '보고_', '함']


In [28]:
sample = train_data['document'][21]

In [31]:
sample

'보면서 웃지 않는 건 불가능하다'

In [29]:
tokenized = tokenizer.encode(sample)
tokenized

[889, 299, 37, 923, 254, 2344, 807, 117]

In [30]:
original = tokenizer.decode(tokenized)
original

'보면서 웃지 않는 건 불가능하다'

In [32]:
sample2 = '보면서 웃지 않는 건 불가능하다 ㅋㅋㅋ'

In [33]:
tokenized = tokenizer.encode(sample2)
tokenized

[889, 299, 37, 923, 254, 2344, 807, 1151, 289]

In [34]:
original = tokenizer.decode(tokenized)

In [35]:
for ts in tokenized:
  print('{} ----> {}'.format(ts, tokenizer.decode([ts])))

889 ----> 보면서 
299 ----> 웃
37 ----> 지 
923 ----> 않는 
254 ----> 건 
2344 ----> 불가
807 ----> 능
1151 ----> 하다 
289 ----> ㅋㅋㅋ


In [36]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 5.8MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94


In [37]:
import sentencepiece as spm

In [39]:
with open('imdb_review.txt', 'w', encoding='utf-8') as f:
  f.write('\n'.join(train_df['review']))

In [42]:
spm.SentencePieceTrainer.Train(
    '--input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=1000'
)

In [44]:
import csv
vocab_list = pd.read_csv('imdb.vocab', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [45]:
vocab_list.sample(10)

Unnamed: 0,0,1
1788,gs,-1785
4228,▁professional,-4225
4473,ensional,-4470
1704,▁eyes,-1701
2022,▁Film,-2019
2607,▁screenplay,-2604
1162,▁piece,-1159
3465,▁island,-3462
1654,▁effort,-1651
3269,▁symp,-3266


In [46]:
sp = spm.SentencePieceProcessor()
vocab_file = 'imdb.model'
sp.load(vocab_file)

True

In [47]:
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]

In [48]:
for l in lines:
  print(l)
  print(sp.encode_as_pieces(l))
  print(sp.encode_as_ids(l))
  print()

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[30, 547, 4951, 4927, 146, 159, 310, 34, 45, 62, 410, 4946]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[30, 124, 1129, 1213, 4, 671, 252, 94, 1013, 32, 89]



In [49]:
sp.GetPieceSize()

5000

In [50]:
sp.IdToPiece(430)

'▁off'

In [53]:
sp.PieceToId('_character')

0

In [54]:
sample = 'I have waited a long time for someone to film'

sp.encode(sample, out_type=str)

['▁I',
 '▁have',
 '▁wa',
 'ited',
 '▁a',
 '▁long',
 '▁time',
 '▁for',
 '▁someone',
 '▁to',
 '▁film']

In [55]:
sp.encode(sample, out_type=int)

[30, 124, 1129, 1213, 4, 671, 252, 94, 1013, 32, 89]

In [56]:
with open('naver_review.txt', 'w', encoding='utf8') as f:
  f.write('\n'.join(train_data['document']))

In [59]:
spm.SentencePieceTrainer.Train(
    '--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe max_sentence_length=9999'
)

In [60]:
vocab_list = pd.read_csv('naver.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)

In [61]:
vocab_list[:10]

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,영화,-1
5,▁영화,-2
6,▁이,-3
7,▁아,-4
8,...,-5
9,▁그,-6


In [62]:
sp = spm.SentencePieceProcessor()
vocab_file = 'naver.model'
sp.load(vocab_file)

True

In [64]:
lines = [
         "뭐 이딴 것도 영화냐.",
         "진짜 최고의 영화입니다 ㅋㅋ"
]
for l in lines:
  print(l)
  print(sp.encode_as_pieces(l))
  print(sp.encode_as_ids(l))
  print()

뭐 이딴 것도 영화냐.
['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']
[124, 965, 1280, 2727, 3289]

진짜 최고의 영화입니다 ㅋㅋ
['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 200, 816, 84]

