<a href="https://colab.research.google.com/github/fininsight/text-mining-tutorial/blob/master/2_%EB%AC%B8%EC%9E%A5%EC%9D%98_%ED%91%9C%ED%98%84_Sentence_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 문장의 표현 (Sentence Representation)

# 1 BoW (Bag of Words)

<img src="https://image.slidesharecdn.com/vector-space-models-170118145044/95/cs571-vector-space-models-3-638.jpg?cb=1485433004" />

https://en.wikipedia.org/wiki/Bag-of-words_model
https://www.slideshare.net/jchoi7s/cs571-vector-space-models

## 1.1 동물원 예제

In [0]:
sentence_ls = [
 '오늘 동물원에서 코끼리를 봤어',
 '오늘 동물원에서 원숭이에게 사과를 줬어'   
]

### 1) 띄어쓰기 단위로 토큰화

In [0]:
sentence_ls = [sentence.split() for sentence in sentence_ls]

In [6]:
sentence_ls

[['오늘', '동물원에서', '코끼리를', '봤어'], ['오늘', '동물원에서', '원숭이에게', '사과를', '줬어']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [0]:
from collections import defaultdict

# Every token seen for the first time receives the next free index: the
# default factory returns the current size of the mapping at lookup time.
token_dict = defaultdict(lambda: len(token_dict))

# A bare lookup is enough to register a token; the returned index is unused.
for tok in (t for tokens in sentence_ls for t in tokens):
    token_dict[tok]

In [8]:
token_dict

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '봤어': 3,
             '사과를': 5,
             '오늘': 0,
             '원숭이에게': 4,
             '줬어': 6,
             '코끼리를': 2})

### 3) 토큰 인덱스 정렬

In [9]:
# Pair every index with its token and sort, so tokens come out in index order.
index_token_ls = sorted(zip(token_dict.values(), token_dict.keys()))
index_token_ls

[(0, '오늘'),
 (1, '동물원에서'),
 (2, '코끼리를'),
 (3, '봤어'),
 (4, '원숭이에게'),
 (5, '사과를'),
 (6, '줬어')]

In [10]:
# Drop the index of each (index, token) pair, keeping the sorted order.
token_in_order = [token for index, token in index_token_ls]
token_in_order

['오늘', '동물원에서', '코끼리를', '봤어', '원숭이에게', '사과를', '줬어']

### 4) 빈(empty) BOW 생성

In [0]:
import pandas as pd
import numpy as np

n_words = len(token_dict)  # number of unique tokens
n_sentence = len(sentence_ls)  # number of sentences

# Empty bag-of-words table: one row per sentence, one column per unique
# token, every count starting at zero.
# Row labels are generated from the sentence count so that they always match
# the number of rows, whatever the size of sentence_ls.
BOW = pd.DataFrame(
    np.zeros((n_sentence, n_words), dtype=int),
    columns=token_in_order,
    index=['문장_%d' % n for n in range(1, n_sentence + 1)],
)

In [12]:
BOW

Unnamed: 0,오늘,동물원에서,코끼리를,봤어,원숭이에게,사과를,줬어
문장_1,0,0,0,0,0,0,0
문장_2,0,0,0,0,0,0,0


### 5) 각 토큰을 BOW에 하나씩 담는다.

In [0]:
# Walk every sentence (one row each) and bump the count stored in the
# column that belongs to each of its tokens.
for row, tokens in enumerate(sentence_ls):
    for tok in tokens:
        col = token_dict[tok]  # column position of this token
        BOW.iloc[row, col] += 1

In [14]:
BOW

Unnamed: 0,오늘,동물원에서,코끼리를,봤어,원숭이에게,사과를,줬어
문장_1,1,1,1,1,0,0,0
문장_2,1,1,0,0,1,1,1




---



## 1.2 양념치킨과 후라이드치킨 예제

In [0]:
sentence_ls = ['나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해',
               '나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 싫어해']

#### 1) 띄어쓰기 단위로 토큰화

In [0]:
sentence_ls = [sentence.split() for sentence in sentence_ls]

In [17]:
sentence_ls

[['나는', '양념', '치킨을', '좋아해', '하지만', '후라이드', '치킨을', '싫어해'],
 ['나는', '후라이드', '치킨을', '좋아해', '하지만', '양념', '치킨을', '싫어해']]

#### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [0]:
from collections import defaultdict

# Every token seen for the first time receives the next free index: the
# default factory returns the current size of the mapping at lookup time.
token_dict = defaultdict(lambda: len(token_dict))

# A bare lookup is enough to register a token; the returned index is unused.
for tok in (t for tokens in sentence_ls for t in tokens):
    token_dict[tok]

In [19]:
token_dict

defaultdict(<function __main__.<lambda>>,
            {'나는': 0,
             '싫어해': 6,
             '양념': 1,
             '좋아해': 3,
             '치킨을': 2,
             '하지만': 4,
             '후라이드': 5})

#### 3) 토큰 인덱스 정렬

In [20]:
# Pair every index with its token and sort, so tokens come out in index order.
index_token_ls = sorted(zip(token_dict.values(), token_dict.keys()))
index_token_ls

[(0, '나는'),
 (1, '양념'),
 (2, '치킨을'),
 (3, '좋아해'),
 (4, '하지만'),
 (5, '후라이드'),
 (6, '싫어해')]

In [21]:
# Drop the index of each (index, token) pair, keeping the sorted order.
token_in_order = [token for index, token in index_token_ls]
token_in_order

['나는', '양념', '치킨을', '좋아해', '하지만', '후라이드', '싫어해']

#### 4) 빈(empty) BOW 생성

In [0]:
import pandas as pd
import numpy as np

n_words = len(token_dict)  # number of unique tokens
n_sentence = len(sentence_ls)  # number of sentences

# Empty bag-of-words table: one row per sentence, one column per unique
# token, every count starting at zero.
# Row labels are generated from the sentence count so that they always match
# the number of rows, whatever the size of sentence_ls.
BOW = pd.DataFrame(
    np.zeros((n_sentence, n_words), dtype=int),
    columns=token_in_order,
    index=['문장_%d' % n for n in range(1, n_sentence + 1)],
)

In [23]:
BOW

Unnamed: 0,나는,양념,치킨을,좋아해,하지만,후라이드,싫어해
문장_1,0,0,0,0,0,0,0
문장_2,0,0,0,0,0,0,0


#### 5) 각 토큰을 BOW에 하나씩 담는다.

In [0]:
# Walk every sentence (one row each) and bump the count stored in the
# column that belongs to each of its tokens; repeated tokens add up.
for row, tokens in enumerate(sentence_ls):
    for tok in tokens:
        col = token_dict[tok]  # column position of this token
        BOW.iloc[row, col] += 1

In [25]:
BOW

Unnamed: 0,나는,양념,치킨을,좋아해,하지만,후라이드,싫어해
문장_1,1,1,2,1,1,1,1
문장_2,1,1,2,1,1,1,1




---



https://en.wikipedia.org/wiki/Document-term_matrix

# 2 N-gram

In [26]:
import nltk
from nltk import bigrams, word_tokenize
from nltk.util import ngrams
# Fetch the 'punkt' resource before tokenizing below.
nltk.download('punkt')

sentence = "I am a boy."
# Split the sentence into word and punctuation tokens.
tokens = word_tokenize(sentence)

# Sequences of 2 and of 3 consecutive tokens.
# NOTE(review): these look like one-shot iterators (each is consumed by a
# single loop in the following cells) — confirm before iterating them twice.
bigram = bigrams(tokens)
trigram = ngrams(tokens, 3)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
for t in bigram:
    print(t)

('I', 'am')
('am', 'a')
('a', 'boy')
('boy', '.')


In [28]:
for t in trigram:
    print(t)

('I', 'am', 'a')
('am', 'a', 'boy')
('a', 'boy', '.')


In [29]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
from nltk.corpus import movie_reviews

# Gather the padded bigrams of every corpus sentence into one flat list.
# "SS" and "SE" are the padding symbols placed before the first and after
# the last token of each sentence.
sentences = []
for sent_tokens in movie_reviews.sents():
    sentences.extend(
        ngrams(
            sent_tokens,
            2,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="SS",
            right_pad_symbol="SE",
        )
    )

sentences[:20]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('SS', 'plot'),
 ('plot', ':'),
 (':', 'two'),
 ('two', 'teen'),
 ('teen', 'couples'),
 ('couples', 'go'),
 ('go', 'to'),
 ('to', 'a'),
 ('a', 'church'),
 ('church', 'party'),
 ('party', ','),
 (',', 'drink'),
 ('drink', 'and'),
 ('and', 'then'),
 ('then', 'drive'),
 ('drive', '.'),
 ('.', 'SE'),
 ('SS', 'they'),
 ('they', 'get'),
 ('get', 'into')]

# 3 TDM(Term-Document Matrix)

In [0]:
d1 = '오늘 동물원에서 원숭이를 봤어'
d2 = '오늘 동물원에서 코끼리를 봤어 봤어'
d3 = '동물원에서 원숭이에게 바나나를 줬어 바나나를'

document_ls = [d1, d2, d3]

### 1) 띄어쓰기 단위 토큰화

In [31]:
document_ls = [document.split() for document in document_ls]
document_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index) 지정

In [0]:
from collections import defaultdict

# Every token seen for the first time receives the next free index: the
# default factory returns the current size of the mapping at lookup time.
token_dict = defaultdict(lambda: len(token_dict))

# A bare lookup is enough to register a token; the returned index is unused.
for tok in (t for tokens in document_ls for t in tokens):
    token_dict[tok]

In [33]:
token_dict

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

### 3) 토큰 인덱스 정렬

In [34]:
# Pair every index with its token and sort, so tokens come out in index order.
index_token_ls = sorted(zip(token_dict.values(), token_dict.keys()))
index_token_ls

[(0, '오늘'),
 (1, '동물원에서'),
 (2, '원숭이를'),
 (3, '봤어'),
 (4, '코끼리를'),
 (5, '원숭이에게'),
 (6, '바나나를'),
 (7, '줬어')]

In [35]:
# Drop the index of each (index, token) pair, keeping the sorted order.
token_in_order = [token for index, token in index_token_ls]
token_in_order

['오늘', '동물원에서', '원숭이를', '봤어', '코끼리를', '원숭이에게', '바나나를', '줬어']

### 4) 빈(empty) TDM 생성

In [0]:
import pandas as pd
import numpy as np

n_words = len(token_dict)  # number of unique tokens
n_document = len(document_ls)  # number of documents

# Empty term-document table: one row per document, one column per unique
# token, every count starting at zero.
# Row labels are generated from the document count so that they always match
# the number of rows, whatever the size of document_ls.
TDM = pd.DataFrame(
    np.zeros((n_document, n_words), dtype=int),
    columns=token_in_order,
    index=['문서_%d' % n for n in range(1, n_document + 1)],
)

In [37]:
TDM

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
문서_1,0,0,0,0,0,0,0,0
문서_2,0,0,0,0,0,0,0,0
문서_3,0,0,0,0,0,0,0,0


### 5) 각 토큰을 TDM에 담는다

In [0]:
# Walk every document (one row each) and bump the count stored in the
# column that belongs to each of its tokens; repeated tokens add up.
for row, tokens in enumerate(document_ls):
    for tok in tokens:
        col = token_dict[tok]  # column position of this token
        TDM.iloc[row, col] += 1

In [39]:
TDM

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
문서_1,1,1,1,1,0,0,0,0
문서_2,1,1,0,2,1,0,0,0
문서_3,0,1,0,0,0,1,2,1


# 4 TF-IDF (Term Frequency-Inverse Document Frequency)

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" />



*  TF(단어 빈도, Term Frequency) : 단어가 문서 내에 등장하는 빈도
*  IDF(역문서 빈도, Inverse Document Frequency) : 단어가 등장하는 문서 수(문서 빈도, DF)의 역수에 로그를 취한 값으로, 여러 문서에 공통적으로 등장하는 단어일수록 값이 작아짐
*  한 문서 내에 자주 등장하고 다른 문서에 자주 등장하지 않는 단어를 주요 단어로 판별할 수 있음




<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/81/Logarithm_plots.png/300px-Logarithm_plots.png" />

https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [0]:
d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog" 

## 4.1 직접 계산하기 1

weighting schema|weight
--|--
tf (term frequency)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />
idf(inverse document frequency) |<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

### 1) 띄어쓰기 단위 토큰화

In [0]:
bowA = d1.split()
bowB = d2.split()

In [42]:
bowB

['The', 'dog', 'sat', 'on', 'my', 'bed', 'I', 'love', 'a', 'dog']

### 2) TDM 생성

In [0]:
wordSet = set(bowA).union(set(bowB))

In [44]:
wordSet

{'I',
 'The',
 'a',
 'bed',
 'cat',
 'dog',
 'face',
 'hate',
 'love',
 'my',
 'on',
 'sat'}

In [0]:
wordDictA = dict.fromkeys(wordSet, 0) 
wordDictB = dict.fromkeys(wordSet, 0) 

In [46]:
wordDictA

{'I': 0,
 'The': 0,
 'a': 0,
 'bed': 0,
 'cat': 0,
 'dog': 0,
 'face': 0,
 'hate': 0,
 'love': 0,
 'my': 0,
 'on': 0,
 'sat': 0}

In [0]:
# Count how often each token occurs in its own document; both dictionaries
# already hold a zero entry for every word of the shared vocabulary.
for bag, counts in ((bowA, wordDictA), (bowB, wordDictB)):
    for tok in bag:
        counts[tok] = counts[tok] + 1

In [48]:
wordDictA

{'I': 1,
 'The': 1,
 'a': 1,
 'bed': 0,
 'cat': 2,
 'dog': 0,
 'face': 1,
 'hate': 1,
 'love': 0,
 'my': 1,
 'on': 1,
 'sat': 1}

In [49]:
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,I,The,a,bed,cat,dog,face,hate,love,my,on,sat
0,1,1,1,0,2,0,1,1,0,1,1,1
1,1,1,1,1,0,2,0,0,1,1,1,1


### 3) TF 계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />

TF = 토큰 빈도 / 문서 내 전체 토큰 개수

In [0]:
def computeTF(wordDict, bow):
    """Return the relative frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> occurrence count within one document.
        bow: the token list of that document; its length is the normalizer.

    Returns:
        A new dict mapping each word to ``count / len(bow)`` as a float,
        in the same key order as ``wordDict``.
    """
    total_tokens = float(len(bow))
    return {term: freq / total_tokens for term, freq in wordDict.items()}

In [0]:
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [52]:
tfBowA

{'I': 0.1,
 'The': 0.1,
 'a': 0.1,
 'bed': 0.0,
 'cat': 0.2,
 'dog': 0.0,
 'face': 0.1,
 'hate': 0.1,
 'love': 0.0,
 'my': 0.1,
 'on': 0.1,
 'sat': 0.1}

In [53]:
tfBowB

{'I': 0.1,
 'The': 0.1,
 'a': 0.1,
 'bed': 0.1,
 'cat': 0.0,
 'dog': 0.2,
 'face': 0.0,
 'hate': 0.0,
 'love': 0.1,
 'my': 0.1,
 'on': 0.1,
 'sat': 0.1}

### 4) IDF  계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

IDF = log(전체 문서 수 / 해당 토큰이 등장한 문서 수)

In [0]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: list of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        Dict mapping each word to ``log10(N / df)``, where ``N`` is the
        number of documents and ``df`` the number of documents in which the
        word's count is greater than zero. A word that occurs in no document
        (``df == 0``) gets ``0.0``: its term frequency is zero everywhere, so
        its TF-IDF is zero whatever the IDF. An empty ``docList`` yields an
        empty dict.
    """
    import math

    N = len(docList)

    # Document frequency per word. The vocabulary is the union of the keys of
    # all documents, so documents do not have to share an identical key set.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # Guard df == 0 to avoid dividing by zero for never-occurring words.
    return {
        word: math.log10(N / float(df)) if df else 0.0
        for word, df in docFreq.items()
    }

In [0]:
idfs = computeIDF([wordDictA, wordDictB])

### 5) TF-IDF 계산

In [0]:
def computeTFIDF(tfBow, idfs):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tfBow: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; it must contain
            every word of ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: weight * idfs[term] for term, weight in tfBow.items()}
    

In [0]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [58]:
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,I,The,a,bed,cat,dog,face,hate,love,my,on,sat
0,0.0,0.0,0.0,0.0,0.060206,0.0,0.030103,0.030103,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.030103,0.0,0.060206,0.0,0.0,0.030103,0.0,0.0,0.0


In [59]:
d1

'The cat sat on my face I hate a cat'

In [60]:
d2

'The dog sat on my bed I love a dog'

## 4.2 직접 계산하기 2

weighting schema|weight|설명
--|--|--
tf(double normalization 0.5)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/45badc1c70ec2caa00ed8c21ed75bd9f8d3e650c" />|=0.5 + 0.5 × (토큰 빈도 / 문서 내 최빈 토큰의 빈도)
idf(inverse document frequency smooth)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/23e5ae785c1ddc6bd95d404ea3fac2477fff5eff" />|=log(문서 개수 / (1 + 토큰이 등장한 문서 수))

In [61]:
from math import log10

# Raw frequency of a token inside one document.
def f(t, d):
    """Return how many times ``t`` occurs in ``d`` (uses ``d.count``)."""
    occurrences = d.count(t)
    return occurrences

# Term frequency with double normalization (K = 0.5).
def tf(t, d):
    """Return ``0.5 + 0.5 * count(t, d) / max_count(d)``.

    Args:
        t: the token to score.
        d: the document as a sequence of hashable tokens.

    Returns:
        A float in ``[0.5, 1.0]``: 0.5 when ``t`` does not occur in ``d``,
        1.0 when ``t`` is (one of) the most frequent tokens of ``d``.
        An empty document returns 0.5, the value for an absent token.
    """
    from collections import Counter

    if not d:
        # No token has a frequency here, so treat t as absent.
        return 0.5

    # One counting pass gives the frequency of the most common token,
    # instead of calling d.count once per token of the document.
    peak = max(Counter(d).values())
    # d.count is the same counting primitive that f() relies on.
    return 0.5 + 0.5 * d.count(t) / peak

# Smoothed inverse document frequency.
def idf(t, D):
    """Return ``log10(len(D) / (1 + n_t))``.

    ``n_t`` is the number of documents in ``D`` that contain ``t``.
    The result is zero or negative once ``1 + n_t`` reaches ``len(D)``.
    """
    containing = sum(1 for d in D if t in d)
    return log10(len(D) / (1 + containing))

# TF-IDF weight of a single token.
def tfidf(t, d, D):
    """Return ``tf(t, d) * idf(t, D)`` for token ``t`` of document ``d``.

    ``D`` is the whole corpus, passed through to ``idf``.
    """
    term_weight = tf(t, d)
    rarity = idf(t, D)
    return term_weight * rarity

# Tokenize on whitespace.
def tokenizer(d):
    """Return the result of ``d.split()``: the tokens of document ``d``."""
    tokens = d.split()
    return tokens

# Score every token of every document.
def tfidfScorer(D):
    """Return, per document of ``D``, a list of ``(token, tf-idf)`` pairs.

    Each document is tokenized with ``tokenizer``; every token occurrence is
    scored with ``tfidf`` against the tokenized corpus, so a token that
    appears several times in a document yields several pairs.
    """
    tokenized_docs = list(map(tokenizer, D))
    return [
        [(tok, tfidf(tok, doc_tokens, tokenized_docs)) for tok in doc_tokens]
        for doc_tokens in tokenized_docs
    ]

corpus = [d1, d2]

# Print a header line followed by the (token, score) list of each document.
for doc_no, scored_tokens in enumerate(tfidfScorer(corpus)):
    print('====== document[%d] ======' % doc_no, scored_tokens, sep='\n')

[('The', -0.13206844429176096), ('cat', 0.0), ('sat', -0.13206844429176096), ('on', -0.13206844429176096), ('my', -0.13206844429176096), ('face', 0.0), ('I', -0.13206844429176096), ('hate', 0.0), ('a', -0.13206844429176096), ('cat', 0.0)]
[('The', -0.13206844429176096), ('dog', 0.0), ('sat', -0.13206844429176096), ('on', -0.13206844429176096), ('my', -0.13206844429176096), ('bed', 0.0), ('I', -0.13206844429176096), ('love', 0.0), ('a', -0.13206844429176096), ('dog', 0.0)]


## 4.3 sklearn을 사용하여 계산하기

### 1) sklearn 활용 TF-IDF

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

document_ls = [d1, d2, d2]

vectorizer = TfidfVectorizer()
# NOTE: this rebinds the name `tfidf` to the fitted matrix, shadowing the
# tfidf() function defined earlier in the notebook.
tfidf = vectorizer.fit_transform(document_ls)

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map each vocabulary word to its column index in the TF-IDF matrix.
# NOTE: unknown words fall back to 0, which is also the index of the first
# feature, so a default hit cannot be told apart from that feature.
word2id = defaultdict(lambda : 0)
for idx, feature in enumerate(feature_names):
    word2id[feature] = idx

### 2) dataframe으로 변환하여 출력

In [64]:
import pandas as pd

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_columns = vectorizer.get_feature_names_out()
else:
    tfidf_columns = vectorizer.get_feature_names()

# One row per document, one column per vocabulary word, TF-IDF weights as values.
count_vect_df = pd.DataFrame(tfidf.todense(), columns=tfidf_columns)
count_vect_df

Unnamed: 0,bed,cat,dog,face,hate,love,my,on,sat,the
0,0.0,0.735448,0.0,0.367724,0.367724,0.0,0.217184,0.217184,0.217184,0.217184
1,0.344779,0.0,0.689558,0.0,0.0,0.344779,0.267752,0.267752,0.267752,0.267752
2,0.344779,0.0,0.689558,0.0,0.0,0.344779,0.267752,0.267752,0.267752,0.267752




---

