In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

In [3]:
# Toy corpus: four short Korean sentences shared by the vectorizer examples.
# The third sentence repeats one token ('바나나') twice.
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요',
]

In [21]:
vect = CountVectorizer()
vv1 =vect.fit_transform(docs)
vect.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [17]:
# List the (token, column index) pairs ordered by column index.
sorted(vect.vocabulary_.items(), key=lambda pair: pair[1])

[('과일이', 0),
 ('길고', 1),
 ('노란', 2),
 ('먹고', 3),
 ('바나나', 4),
 ('사과', 5),
 ('싶은', 6),
 ('저는', 7),
 ('좋아요', 8)]

In [19]:
# Dense view of the count matrix: one row per document, one column per vocabulary index.
vv1.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [8]:
# Tabulate the bag-of-words counts with one labelled column per token.
# The count matrix is `vv1`; `vv` is only assigned further down (TF-IDF section),
# so referencing it here fails when the notebook is run top to bottom.
# Columns are ordered by each token's vocabulary index so the labels line up
# with the matrix columns by construction.
df = pd.DataFrame(
    vv1.toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


#### TF(Term Frequency)
#### IDF(Inverse Document Frequency)  
    TF-IDF(Term Frequency – Inverse Document Frequency) 인코딩은 단어를 개수 그대로 카운트하지 않고, 모든 문서에 공통적으로 들어 있는 단어는 문서 구별 능력이 떨어진다고 보아 가중치를 축소하는 방법이다. 


    구체적으로는 문서 d(document)와 단어 t에 대해 다음과 같이 계산한다.

        $$\text{tf-idf}(d, t) = \text{tf}(d, t) \cdot \text{idf}(t)$$


    여기에서

        * $\text{tf}(d, t)$: term frequency. 특정한 단어의 빈도수
        * $\text{idf}(t)$: inverse document frequency. 특정한 단어가 들어 있는 문서의 수에 반비례하는 수

          $$\text{idf}(t) = \log \dfrac{n}{1 + \text{df}(t)}$$

    * $n$: 전체 문서의 수
    * $\text{df}(t)$: 단어 t를 가진 문서의 수

In [9]:
# Two-sentence English corpus for the TF-IDF example.
corpus_t = [
    'a new car, used car, car review',
    'a friend in need is a friend indeed',
]

In [10]:
# TF-IDF encoding of the English corpus. `vect` is rebound to this new vectorizer.
vect = TfidfVectorizer()
vv = vect.fit_transform(corpus_t)  # document-term matrix of TF-IDF weights

# Show the learned token -> column-index mapping.
vect.vocabulary_

{'new': 6,
 'car': 0,
 'used': 8,
 'review': 7,
 'friend': 1,
 'in': 2,
 'need': 5,
 'is': 4,
 'indeed': 3}

In [11]:
# List the (token, column index) pairs ordered by column index.
sorted(vect.vocabulary_.items(), key=lambda pair: pair[1])

[('car', 0),
 ('friend', 1),
 ('in', 2),
 ('indeed', 3),
 ('is', 4),
 ('need', 5),
 ('new', 6),
 ('review', 7),
 ('used', 8)]

In [12]:
# Dense view of the TF-IDF matrix: one row per document, one column per vocabulary index.
vv.toarray()

array([[0.8660254 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.28867513, 0.28867513, 0.28867513],
       [0.        , 0.70710678, 0.35355339, 0.35355339, 0.35355339,
        0.35355339, 0.        , 0.        , 0.        ]])

In [15]:
# Same TF-IDF matrix as a DataFrame, with the tokens as column labels.
df = pd.DataFrame(
    data=vv.toarray(),
    columns=sorted(vect.vocabulary_),
)
df

Unnamed: 0,car,friend,in,indeed,is,need,new,review,used
0,0.866025,0.0,0.0,0.0,0.0,0.0,0.288675,0.288675,0.288675
1,0.0,0.707107,0.353553,0.353553,0.353553,0.353553,0.0,0.0,0.0


In [24]:
# Fit a fresh TF-IDF vectorizer on the Korean corpus (fit only, no transform yet).
vect = TfidfVectorizer()
vect.fit(docs)

# `fit` hands back the vectorizer itself, so `vv` is a second name for `vect` here,
# not a matrix as in the earlier cells. The following cells call `vv.transform(...)`.
vv = vect

# Show the learned token -> column-index mapping.
vect.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [25]:
# `vv` holds the result of `vect.fit(docs)`; encode `docs` with it and show the dense TF-IDF matrix.
vv.transform(docs).toarray()

array([[0.        , 0.        , 0.        , 0.52640543, 0.        ,
        0.66767854, 0.52640543, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.57735027,
        0.        , 0.57735027, 0.        , 0.        ],
       [0.        , 0.47212003, 0.47212003, 0.        , 0.7444497 ,
        0.        , 0.        , 0.        , 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.57735027]])

In [26]:
# TF-IDF weights of the Korean corpus as a DataFrame, with the tokens as column labels.
df = pd.DataFrame(
    data=vv.transform(docs).toarray(),
    columns=sorted(vect.vocabulary_),
)
df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
1,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
2,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
3,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735
