<a href="https://colab.research.google.com/github/baikAnalyst/Seoul_Software_Academy/blob/main/05.NLP/002_NLP_BoW_TDM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TDM
: Term-Document Matrix — a matrix with one row per term and one column per document, where each cell holds the number of times that term appears in that document.

In [1]:
# Toy corpus: each string is one document whose tokens are separated by single spaces.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

## 1. tokenize by space

In [2]:
# Tokenize: split every document on the single-space character, giving one
# token list per document. (split(' ') would yield empty strings for
# consecutive spaces; the corpus here has none.)
doc_ls = [sentence.split(' ') for sentence in docs]
doc_ls

[['동물원', '코끼리'],
 ['동물원', '원숭이', '바나나'],
 ['엄마', '코끼리', '아기', '코끼리'],
 ['원숭이', '바나나', '코끼리', '바나나']]

## 2. generate dictionary for unique tokens
### 2.1 Create an empty dictionary

In [3]:
from collections import defaultdict

# Token -> integer id mapping. The default factory returns the current size of
# the dictionary, so the first lookup of an unseen token stores the next free
# id (0, 1, 2, ...); tokens that are already present keep their id.
word2id = defaultdict(lambda : len(word2id))
word2id

defaultdict(<function __main__.<lambda>()>, {})

### 2.2 Insert the unique tokens into the empty dictionary

In [4]:
# Looking a token up in the defaultdict is what registers it: an unseen token
# triggers the default factory and receives the next free integer id, while a
# token seen before keeps its id. A plain loop is used instead of a list
# comprehension so no throwaway list of ids is built just for the side effect.
for doc in doc_ls:
    for token in doc:
        word2id[token]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'동물원': 0, '코끼리': 1, '원숭이': 2, '바나나': 3, '엄마': 4, '아기': 5})

In [5]:
import numpy as np

# Allocate an all-zero integer count matrix with one row per vocabulary term
# and one column per document.
n_terms, n_docs = len(word2id), len(doc_ls)
TDM = np.zeros((n_terms, n_docs), dtype=int)

# Preview the token lists that the fill loop will walk over.
for doc_idx, tokens in enumerate(doc_ls):
    print(tokens)

['동물원', '코끼리']
['동물원', '원숭이', '바나나']
['엄마', '코끼리', '아기', '코끼리']
['원숭이', '바나나', '코끼리', '바나나']


In [6]:
import numpy as np

# Start from an all-zero matrix of shape (vocabulary size, number of documents);
# with this corpus that is 6 x 4.
TDM = np.zeros((len(word2id), len(doc_ls)), dtype=int)

for col, doc in enumerate(doc_ls):
    print(doc)
    for token in doc:
        # Rows are terms and columns are documents: the token's id selects the
        # row, the document position selects the column.
        row = word2id[token]
        TDM[row, col] += 1
        # Trace the matrix after every single increment.
        print(token)
        print(TDM)
        print('\t')

['동물원', '코끼리']
동물원
[[1 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
	
코끼리
[[1 0 0 0]
 [1 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
	
['동물원', '원숭이', '바나나']
동물원
[[1 1 0 0]
 [1 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
	
원숭이
[[1 1 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
	
바나나
[[1 1 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 0 0]
 [0 0 0 0]]
	
['엄마', '코끼리', '아기', '코끼리']
엄마
[[1 1 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 0]]
	
코끼리
[[1 1 0 0]
 [1 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 0]]
	
아기
[[1 1 0 0]
 [1 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]]
	
코끼리
[[1 1 0 0]
 [1 0 2 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]]
	
['원숭이', '바나나', '코끼리', '바나나']
원숭이
[[1 1 0 0]
 [1 0 2 0]
 [0 1 0 1]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]]
	
바나나
[[1 1 0 0]
 [1 0 2 0]
 [0 1 0 1]
 [0 1 0 1]
 [0 0 1 0]
 [0 0 1 0]]
	
코끼리
[[1 1 0 0]
 [1 0 2 1]
 [0 1 0 1]
 [0 1 0 1]
 [0 0 1 0]
 [0 0 1 0]]
	
바나나
[[1 1 0 0]
 [1 0 2 1]
 [0 1 0

In [9]:
TDM      # a TDM is a 2-D matrix (terms x documents); a single bag-of-words vector is 1-D

array([[1, 1, 0, 0],
       [1, 0, 2, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [11]:
TDM.ndim  # number of array dimensions of the count matrix

2

In [19]:
import pandas as pd

# One column label per document.
doc_names = ['문서' + str(i) for i in range(len(doc_ls))]
print('doc_names', doc_names)

# Pair every id with its token and sort by id, so the vocabulary order matches
# the row order of TDM.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
vocab = [token for _, token in sorted_vocab]
print(sorted_vocab)   # list of (id, token) pairs
print(vocab)

# Label the columns with document names, attach the tokens as a column, and
# display the frame indexed by token (df_TDM itself keeps '단어' as a column).
df_TDM = pd.DataFrame(TDM, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

doc_names ['문서0', '문서1', '문서2', '문서3']
[(0, '동물원'), (1, '코끼리'), (2, '원숭이'), (3, '바나나'), (4, '엄마'), (5, '아기')]
['동물원', '코끼리', '원숭이', '바나나', '엄마', '아기']


Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
코끼리,1,0,2,1
원숭이,0,1,0,1
바나나,0,1,0,2
엄마,0,0,1,0
아기,0,0,1,0


In [20]:
# sklearn: the same toy corpus, redefined so this section has its own input.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer learns the vocabulary from the corpus and then counts term
# occurrences per document, producing a document-term matrix (documents in
# rows, terms in columns).
count_vect = CountVectorizer()
count_vect.fit(docs)
DTM = count_vect.transform(docs)

# Convert to a dense array for display.
DTM.toarray()

array([[1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 2],
       [0, 2, 0, 0, 1, 1]])

In [22]:
DTM.toarray().T  # transpose the document-term matrix to get terms in rows, documents in columns

array([[1, 1, 0, 0],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 1],
       [1, 0, 2, 1]])

In [23]:
import pandas as pd

# Size the column labels from the matrix itself (one row of DTM per document)
# rather than from `doc_ls`, which belongs to the manual section above and
# would silently tie this cell to state from an unrelated part of the notebook.
doc_names = ['문서' + str(i) for i in range(DTM.shape[0])]

# Feature names come back in the same order as the columns of DTM, so after
# transposing they label the rows.
vocab = count_vect.get_feature_names_out()
print(vocab)

# Transpose documents x terms into terms x documents, attach the tokens as a
# column, and display the frame indexed by token.
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

['동물원' '바나나' '아기' '엄마' '원숭이' '코끼리']


Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
바나나,0,1,0,2
아기,0,0,1,0
엄마,0,0,1,0
원숭이,0,1,0,1
코끼리,1,0,2,1
