# 단어의 임베딩
- 빈도수 계산 : 빈도 기반 - TF (Term Frequency, 단어 빈도)
- TDM (Term-Document Matrix) : TF를 행렬로 만든 것, 사전을 이용한 단순 빈도
- TF-IDF : TF * IDF
- IDF (Inverse Document Frequency) : 역문서 빈도

In [1]:
# Sample sentences used for the word-frequency examples.
text = "John likes to watch movies. Mary likes movies too. Mary also likes to watch football games."
# Delete every period in one pass, then tokenize on whitespace.
words = text.translate(str.maketrans('', '', '.')).split()
words

['John',
 'likes',
 'to',
 'watch',
 'movies',
 'Mary',
 'likes',
 'movies',
 'too',
 'Mary',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games']

In [2]:
import numpy as np

# np.unique returns the sorted distinct tokens; return_counts=True adds a
# parallel array holding how many times each token occurs in `words`.
word_count = np.unique(ar=words, return_counts=True)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))


In [3]:
# Build the term-frequency (TF) dictionary: word_count is a
# (unique tokens, counts) pair, so zipping the two arrays yields
# (token, count) items that dict() consumes directly.
word_to_cnt = dict(zip(*word_count))
word_to_cnt

{'John': 1,
 'Mary': 2,
 'also': 1,
 'football': 1,
 'games': 1,
 'likes': 3,
 'movies': 2,
 'to': 2,
 'too': 1,
 'watch': 2}

In [4]:
# Look up the term frequency stored for the token 'movies'.
word_to_cnt['movies']

2

In [5]:
# !pip install scikit-learn pandas

# TDM

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Two-document toy corpus; each string is treated as one document.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

vector = CountVectorizer()
# Learn the vocabulary and count terms per document, then convert the
# result to a dense array so it prints as a plain matrix.
tdm_array = vector.fit_transform(corpus).toarray()
# vocabulary_ maps each term to its column index in the count matrix.
tf_dic = vector.vocabulary_
print(tdm_array, tf_dic, sep='\n')

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [8]:
import pandas as pd

# Re-key the vocabulary in ascending column-index order so that the terms
# line up with the columns of tdm_array when used as DataFrame headers.
tf_dic_sorted = {term: tf_dic[term] for term in sorted(tf_dic, key=tf_dic.get)}
df = pd.DataFrame(tdm_array, columns=tf_dic_sorted.keys())
df

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


# TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
# Fit on the corpus and get the TF-IDF weights as a dense array.
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
# vocabulary_ maps each term to its column index in tfidf_array.
tfidf_dic = tfidf_vec.vocabulary_
# Order the terms by column index so they can serve as column headers.
tfidf_dic_sorted = {
    term: tfidf_dic[term] for term in sorted(tfidf_dic, key=tfidf_dic.get)
}

tfidf_dtm = pd.DataFrame(tfidf_array, columns=tfidf_dic_sorted.keys())
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [10]:
!pip install --upgrade gensim



In [14]:
from gensim.models import Word2Vec

# Tokenize each document: drop the periods, then split on whitespace.
word_list = [document.replace('.', '').split() for document in corpus]

model = Word2Vec(
    word_list,
    sg=0,             # training algorithm: 0 = CBOW, 1 = skip-gram
    vector_size=100,  # dimensionality of the learned word vectors
    window=3,         # context window: up to 3 words before/after the target
    min_count=1,      # drop words whose total frequency is BELOW this value
                      # (e.g. 3 would drop words seen fewer than 3 times)
)

# most_similar(a, b) binds b to the `negative` parameter, so this ranks words
# that are close to 'likes' and far from 'movies'. The keywords make that
# explicit.
# NOTE(review): if the goal was "words similar to both", use
# positive=['likes', 'movies'] instead - confirm the intent.
print(model.wv.most_similar(positive=['likes'], negative=['movies']))
# similarity returns the cosine similarity between the two word vectors.
print(model.wv.similarity('movies', 'games'))

[('John', 0.17164471745491028), ('also', 0.06594578176736832), ('Mary', 0.008838453330099583), ('watch', -0.06765829026699066), ('games', -0.08544928580522537), ('football', -0.08948154747486115), ('too', -0.11860241740942001), ('to', -0.13643866777420044)]
0.0640898


In [13]:
# The second positional parameter of most_similar is `negative`, so this ranks
# words close to 'John' and far from 'Mary'; keywords make the roles explicit.
print(model.wv.most_similar(positive=['John'], negative=['Mary']))

[('likes', 0.15334713459014893), ('football', 0.07839643210172653), ('also', 0.015055425465106964), ('too', 0.007465780712664127), ('movies', -0.006201202515512705), ('games', -0.07736953347921371), ('to', -0.12009607255458832), ('watch', -0.16032634675502777)]
