# NLP Basic Assignment
## 과제 : spam.csv를 활용하여 유의미한 해석을 도출해주세요!

In [2]:
import pandas as pd

## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다.
- 7주차 주제가 텍스트 기초인만큼 텍스트만 활용하셔도 되고 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다.

In [3]:
spam = pd.read_csv('spam.csv')

In [4]:
spam.iloc[5]['v2']

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [5]:
spam.iloc[8]['v2']

'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

## Tokenizing


- 코퍼스: 분석에 활용하기 위한 자연어 데이터
- 토큰: 코퍼스를 의미 있는 작은 단위로 나누었을 때의 이 단위
- 토큰화: 하나의 코퍼스를 여러 개의 토큰으로 나누는 과정

<참고> https://www.nltk.org/api/nltk.tokenize.html

In [6]:
import nltk

### - word_tokenize

In [7]:
# 예시 코드 코드
from nltk.tokenize import word_tokenize

nltk.download('punkt')
word_tokenize(spam.iloc[5]['v2'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MJHwang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['FreeMsg',
 'Hey',
 'there',
 'darling',
 'it',
 "'s",
 'been',
 '3',
 'week',
 "'s",
 'now',
 'and',
 'no',
 'word',
 'back',
 '!',
 'I',
 "'d",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 '?',
 'Tb',
 'ok',
 '!',
 'XxX',
 'std',
 'chgs',
 'to',
 'send',
 ',',
 'å£1.50',
 'to',
 'rcv']

- word_tokenize : 어퍼스트로피나 콤마를 토큰화의 기준으로 사용함

### - sent_tokenize

In [8]:
from nltk.tokenize import sent_tokenize

sent_tokenize(spam.iloc[5]['v2'])

["FreeMsg Hey there darling it's been 3 week's now and no word back!",
 "I'd like some fun you up for it still?",
 'Tb ok!',
 'XxX std chgs to send, å£1.50 to rcv']

- sent_tokenize : 문장 단위로 토큰화

### - WhitespaceTokenizer

In [9]:
from nltk.tokenize import WhitespaceTokenizer

list(WhitespaceTokenizer().span_tokenize(spam.iloc[5]['v2']))

[(0, 7),
 (8, 11),
 (12, 17),
 (18, 25),
 (26, 30),
 (31, 35),
 (36, 37),
 (38, 44),
 (45, 48),
 (49, 52),
 (53, 55),
 (56, 60),
 (61, 66),
 (67, 70),
 (71, 75),
 (76, 80),
 (81, 84),
 (85, 88),
 (89, 91),
 (92, 95),
 (96, 98),
 (99, 105),
 (106, 108),
 (109, 112),
 (113, 116),
 (117, 120),
 (121, 125),
 (126, 128),
 (129, 134),
 (135, 141),
 (142, 144),
 (145, 148)]

## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram 등이 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [10]:
from collections import Counter

tokenized_word = word_tokenize(spam.iloc[5]['v2'])
vocab = Counter(tokenized_word)
vocab

# counter : 파라미터를 단어 리스트로 받고, 각 단어의 등장 빈도를 딕셔너리 형태로 반환

Counter({'FreeMsg': 1,
         'Hey': 1,
         'there': 1,
         'darling': 1,
         'it': 2,
         "'s": 2,
         'been': 1,
         '3': 1,
         'week': 1,
         'now': 1,
         'and': 1,
         'no': 1,
         'word': 1,
         'back': 1,
         '!': 2,
         'I': 1,
         "'d": 1,
         'like': 1,
         'some': 1,
         'fun': 1,
         'you': 1,
         'up': 1,
         'for': 1,
         'still': 1,
         '?': 1,
         'Tb': 1,
         'ok': 1,
         'XxX': 1,
         'std': 1,
         'chgs': 1,
         'to': 2,
         'send': 1,
         ',': 1,
         'å£1.50': 1,
         'rcv': 1})

### - one-hot encoding

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

tokenizer = Tokenizer()
tokenizer.fit_on_texts([spam.iloc[5]['v2']])

print("단어 집합: ", tokenizer.word_index)

단어 집합:  {'to': 1, 'freemsg': 2, 'hey': 3, 'there': 4, 'darling': 5, "it's": 6, 'been': 7, '3': 8, "week's": 9, 'now': 10, 'and': 11, 'no': 12, 'word': 13, 'back': 14, "i'd": 15, 'like': 16, 'some': 17, 'fun': 18, 'you': 19, 'up': 20, 'for': 21, 'it': 22, 'still': 23, 'tb': 24, 'ok': 25, 'xxx': 26, 'std': 27, 'chgs': 28, 'send': 29, 'å£1': 30, '50': 31, 'rcv': 32}


In [12]:
encoded = tokenizer.texts_to_sequences([spam.iloc[5]['v2']])
print(encoded)

[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 1, 29, 30, 31, 1, 32]]


In [13]:
one_hot = to_categorical([encoded])
print(one_hot)

[[[[0. 0. 1. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0. 1. 0.]
   [0. 1. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 1.]]]]


- [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 1, 29, 30, 31, 1, 32] 을 순서대로 one-hot encoding이 됨

### - cbow

one-hot 인코딩은 희소표현. cbow, skip-gram은 분산표현
- 희소 표현: 표현하고자 하는 단어의 인덱스 값만 1, 나머지는 0. 각 단어간 유사성을 표현할 수 없음.
<br>
ex) 강아지 = [ 0 0 0 0 1 0 0 0 0 0 0 0 ... 중략 ... 0]
- 분산 표현: 저차원에 단어의 의미를 여러 차원에다가 분산하여 표현. 단어 간 유사도 계산 가능.
<br>
ex) 강아지 = [0.2 0.3 0.5 0.7 0.2 ... 중략 ... 0.2]

In [14]:
tokenized_word

['FreeMsg',
 'Hey',
 'there',
 'darling',
 'it',
 "'s",
 'been',
 '3',
 'week',
 "'s",
 'now',
 'and',
 'no',
 'word',
 'back',
 '!',
 'I',
 "'d",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 '?',
 'Tb',
 'ok',
 '!',
 'XxX',
 'std',
 'chgs',
 'to',
 'send',
 ',',
 'å£1.50',
 'to',
 'rcv']

Word2Vec의 sg에 따라 cbow, skip-gram 선택 가능
- sg = 0 -> cbow
- sg = 1 -> skip-gram

In [17]:
from gensim.models import Word2Vec

cbow = Word2Vec(sentences=spam['v2'].apply(word_tokenize), vector_size=100, window=5, min_count=3, workers=4, sg=0)

<gensim.models.word2vec.Word2Vec at 0x25b12369ca0>

### - skip-gram

In [21]:
skipgram = Word2Vec(sentences=spam['v2'].apply(word_tokenize), vector_size=100, window=5, min_count=3, workers=4, sg=1)

## 본인이 도출해낸 해석을 적어주세요!

- 유사도, Wordcloud, 이진 분류 모델, Plot 뭐든 상관없으니 분명하고 인상적인 해석을 적어주시면 됩니다.

### 1. 단어 간 유사도 측정

In [25]:
similarity_score = cbow.wv.similarity("week", "every")
print("Similarity between word1 and word2:", similarity_score)

Similarity between word1 and word2: 0.99920595


In [26]:
similarity_score = skipgram.wv.similarity("week", "every")
print("Similarity between word1 and word2:", similarity_score)

Similarity between word1 and word2: 0.90321475


### 2. 단어 간 가장 유사한 단어

In [27]:
cbow.wv.most_similar('week')

[('Ur', 0.9993053078651428),
 ('every', 0.9992058873176575),
 ('stop', 0.9991354942321777),
 ('Get', 0.9991350173950195),
 ('Good', 0.9990766644477844),
 ('only', 0.9990458488464355),
 ('free', 0.9989662766456604),
 ('Dear', 0.998926043510437),
 ('new', 0.9988760352134705),
 ('8007', 0.9988464713096619)]

In [28]:
skipgram.wv.most_similar('week')

[('word', 0.9381897449493408),
 ('new', 0.9144180417060852),
 ('For', 0.911250114440918),
 ('video', 0.9106596112251282),
 ('every', 0.903214693069458),
 ('UK', 0.8949734568595886),
 ('SMS', 0.8920695185661316),
 ('NOKIA', 0.8890995979309082),
 ('Get', 0.8816767334938049),
 ('collect', 0.8803457021713257)]

### 3. 단어 벡터 검색

In [29]:
vector = cbow.wv["word"]
print("Vector for 'word':", vector)

Vector for 'word': [-1.42568186e-01  2.31209040e-01  1.07032202e-01  4.81613502e-02
  4.53250632e-02 -4.29760665e-01  1.68147832e-01  5.32940090e-01
 -2.58000195e-01 -1.65584818e-01 -7.27882311e-02 -3.38820040e-01
  1.08320631e-01  8.02676156e-02  1.22988231e-01 -1.58413351e-01
  2.04646647e-01 -2.23385870e-01 -6.67287633e-02 -4.94882107e-01
  1.21062405e-01  7.23120347e-02  1.10856242e-01 -1.40261769e-01
 -8.57421681e-02 -2.73279026e-02 -2.55403399e-01 -1.23022057e-01
 -2.70551592e-01  1.24705732e-02  2.31581196e-01  2.03479901e-02
  1.20088384e-01 -2.73764640e-01 -5.91451600e-02  2.30480075e-01
 -5.60998963e-03 -1.51757821e-01 -2.27854699e-01 -4.11120504e-01
  1.30755976e-02 -2.46428832e-01 -1.27879292e-01  8.72472599e-02
  1.93697870e-01 -1.87982783e-01 -1.48701414e-01 -1.09842546e-01
  8.21743831e-02  1.68384939e-01  2.24017963e-01 -3.43009412e-01
 -7.08083287e-02 -4.81659099e-02 -7.14578032e-02  7.67880976e-02
  1.47160843e-01 -6.14247052e-03 -3.46115619e-01  7.23671690e-02
  1.16

In [30]:
vector = skipgram.wv["word"]
print("Vector for 'word':", vector)

Vector for 'word': [-0.03196335  0.10805499  0.19064097  0.10633983 -0.04288951 -0.36377376
  0.20999326  0.34201235 -0.21890335 -0.04363089 -0.03080299 -0.34259453
 -0.13266449  0.13810052 -0.10727    -0.01850996  0.22304724 -0.19274415
  0.01843401 -0.4310174   0.20201294 -0.07779159  0.04901658 -0.03003365
 -0.10303938 -0.10883077 -0.21747072 -0.17795484 -0.2035624   0.0131456
  0.23820229  0.15389147  0.09024838 -0.2658937  -0.0053693  -0.06856731
 -0.00578836 -0.29147112 -0.12758096 -0.30546114  0.02306102 -0.08499873
  0.13350525 -0.1040848   0.0955472  -0.10975656 -0.07003711 -0.11428246
  0.08415549  0.27035     0.11713465 -0.31333873 -0.21494344  0.06611905
 -0.01526462  0.24918462  0.13923153 -0.00556122 -0.3797307   0.04003341
  0.1880127   0.2078167  -0.16021156 -0.10276476 -0.09697908  0.35390842
  0.0323716   0.1375962  -0.19182122  0.22714925 -0.00261581  0.06428365
  0.07657789  0.05006001  0.21826224 -0.03240235  0.13551939 -0.1028408
 -0.21210921 -0.16377506 -0.364283