### 동시출현행렬 
- 특정 단어를 기준으로, 지정한 범위(window) 안의 주변에 어떤 단어가 몇 번 등장했는지 빈도를 기록한 행렬
- 예
    - '오늘 날씨가 너무 좋다', '내일 날씨가 조금 흐리다'
    - window(범위)가 1인 경우 
    - 날씨가 -> 주변의 단어들은 ['오늘', '너무', '내일', '조금']
- N-gram 차이 
    - N-gram : 단어 순서 반영 
    - 동시출현행렬 : 의미 관계를 반영 
### PMI
- 두 단어가 우연히 함께 등장한 것인가? 아니면 의미적으로 연관성이 있는가? 측정
- 측정 값이 클 수록 의미적으로 강하게 연결되었다는 의미
### PPMI
- PMI가 음수인 값은 실제로 활용되는 경우가 극히 드물다
- PPMI는 PMI의 음수 값을 0으로 대체한 값이다: PPMI = max(0, PMI)

In [1]:
import math 
from collections import Counter
import pandas as pd 
from konlpy.tag import Okt

In [2]:
# Toy corpus: four short Korean sentences used as input for the rest of
# the notebook.
docs = [
    '오늘 날씨가 매우 좋다', 
    '오늘 기분이 정말 좋다', 
    '내일 날씨가 조금 흐리다', 
    '기분이 매우 나쁘다'
]

In [34]:
# Number of tokens to inspect on each side (left and right) of a center word.
window_size = 2

In [35]:
# Tokenizer setup: create the konlpy Okt tagger once so the tokenize()
# function defined next can reuse it.
okt = Okt()

def tokenize(text):
    """Split ``text`` into morphemes with the module-level ``okt`` tagger.

    Args:
        text: Sentence to tokenize.

    Returns:
        The result of ``okt.morphs(text)``, passed through unchanged.
    """
    return okt.morphs(text)

# Tokenize every document; the result holds one token list per sentence.
tokens = list(map(tokenize, docs))
tokens

[['오늘', '날씨', '가', '매우', '좋다'],
 ['오늘', '기분', '이', '정말', '좋다'],
 ['내일', '날씨', '가', '조금', '흐리다'],
 ['기분', '이', '매우', '나쁘다']]

In [36]:
# Build the vocabulary: every distinct token found in `tokens`, sorted.
# The set comprehension flattens the nested lists and removes duplicates in
# one linear pass. (sum(tokens, []) also flattens, but it copies the growing
# result list once per sentence, which is quadratic in the corpus size.)
vocab = sorted({word for sentence in tokens for word in sentence})
# list.sort() sorts in place and returns None, so chaining it onto the
# constructor -- list(...).sort() -- would bind None to the name.
# Build the list first, then sort it on a separate line.
vocab2 = list({word for sentence in tokens for word in sentence})
vocab2.sort()

In [37]:
# Print both vocabularies to compare sorted(...) with the in-place
# list.sort() approach.
print(vocab)
print(vocab2)

['가', '기분', '나쁘다', '날씨', '내일', '매우', '오늘', '이', '정말', '조금', '좋다', '흐리다']
['가', '기분', '나쁘다', '날씨', '내일', '매우', '오늘', '이', '정말', '조금', '좋다', '흐리다']


In [38]:
# Map each vocabulary word to its position in `vocab`; that position is
# used as the word's row/column index in the matrices.
vocab_index = dict(zip(vocab, range(len(vocab))))
vocab_index

{'가': 0,
 '기분': 1,
 '나쁘다': 2,
 '날씨': 3,
 '내일': 4,
 '매우': 5,
 '오늘': 6,
 '이': 7,
 '정말': 8,
 '조금': 9,
 '좋다': 10,
 '흐리다': 11}

In [39]:
# Allocate the co-occurrence matrix as a len(vocab) x len(vocab) grid of
# zeros. Every row is built as its own list so the rows never alias.
co_metrix = [[0 for _ in vocab] for _ in vocab]
co_metrix

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [40]:
# Count co-occurrences inside a +/- window_size token window around each word.
for sentence in tokens:
    sentence_len = len(sentence)
    for pos, center in enumerate(sentence):
        # Matrix row that belongs to the current center word.
        center_row = co_metrix[vocab_index[center]]
        # Clamp the window to the sentence bounds. range() excludes its upper
        # bound, hence the + 1 so that pos + window_size is still visited.
        lo = max(pos - window_size, 0)
        hi = min(pos + window_size + 1, sentence_len)
        for ctx_pos in range(lo, hi):
            # The center position is not counted as its own context.
            if ctx_pos == pos:
                continue
            center_row[vocab_index[sentence[ctx_pos]]] += 1


In [41]:
# Wrap the count matrix in a DataFrame labeled with the vocabulary on both
# axes so the table can be read by word instead of by index.
co_df = pd.DataFrame(co_metrix, index=vocab, columns=vocab)
co_df

Unnamed: 0,가,기분,나쁘다,날씨,내일,매우,오늘,이,정말,조금,좋다,흐리다
가,0,0,0,2,1,1,1,0,0,1,1,1
기분,0,0,0,0,0,1,1,2,1,0,0,0
나쁘다,0,0,0,0,0,1,0,1,0,0,0,0
날씨,2,0,0,0,1,1,1,0,0,1,0,0
내일,1,0,0,1,0,0,0,0,0,0,0,0
매우,1,1,1,1,0,0,0,1,0,0,1,0
오늘,1,1,0,1,0,0,0,1,0,0,0,0
이,0,2,1,0,0,1,1,0,1,0,1,0
정말,0,1,0,0,0,0,0,1,0,0,1,0
조금,1,0,0,1,0,0,0,0,0,0,0,1


In [42]:
# PMI & PPMI computation

# Grand total of all co-occurrence counts in the matrix (the same value as
# co_df.values.sum()). It is the denominator for the probability estimates.
total_count = sum(map(sum, co_metrix))

In [43]:
# Marginal probability of each word as a center word: its row total
# divided by the grand total of co-occurrence counts.
p_word = list(map(lambda counts: sum(counts) / total_count, co_metrix))
p_word

[0.15384615384615385,
 0.09615384615384616,
 0.038461538461538464,
 0.11538461538461539,
 0.038461538461538464,
 0.11538461538461539,
 0.07692307692307693,
 0.1346153846153846,
 0.057692307692307696,
 0.057692307692307696,
 0.07692307692307693,
 0.038461538461538464]

In [44]:
# Marginal probability of each word as a context word: its column total
# divided by the grand total. zip(*co_metrix) walks the matrix column by
# column.
p_context = [sum(column) / total_count for column in zip(*co_metrix)]
p_context

[0.15384615384615385,
 0.09615384615384616,
 0.038461538461538464,
 0.11538461538461539,
 0.038461538461538464,
 0.11538461538461539,
 0.07692307692307693,
 0.1346153846153846,
 0.057692307692307696,
 0.057692307692307696,
 0.07692307692307693,
 0.038461538461538464]

In [45]:
# PMI formula
def calc_pmi(i, j):
    """Return the pointwise mutual information of vocabulary words i and j.

    Computes ``log2(P(w, c) / (P(w) * P(c)))`` from the module-level
    ``co_metrix``, ``total_count``, ``p_word`` and ``p_context``.

    Args:
        i: Row (center word) index into ``co_metrix`` and ``p_word``.
        j: Column (context word) index into ``co_metrix`` and ``p_context``.

    Returns:
        The PMI as a float. A pair that never co-occurs returns ``0.0``
        instead of the mathematical value of negative infinity.
    """
    pair_count = co_metrix[i][j]
    if pair_count == 0:
        return 0.0
    p_wc = pair_count / total_count
    # A non-zero pair count makes both the row and the column total non-zero,
    # so the ratio is strictly positive and needs no epsilon; adding one
    # inside the log would only shift every result slightly.
    return math.log2(p_wc / (p_word[i] * p_context[j]))

# Evaluate PMI for every (center, context) index pair, row by row.
pmi_metrix = [
    [calc_pmi(row, col) for col, _ in enumerate(vocab)]
    for row, _ in enumerate(vocab)
]

# Label both axes with the vocabulary and render the table with the
# 'Blues' background gradient.
pmi_df = pd.DataFrame(data=pmi_metrix, index=vocab, columns=vocab)
pmi_df.style.background_gradient(cmap='Blues')

Unnamed: 0,가,기분,나쁘다,날씨,내일,매우,오늘,이,정말,조금,좋다,흐리다
가,0.0,0.0,0.0,1.115477,1.70044,0.115477,0.70044,0.0,0.0,1.115477,0.70044,1.70044
기분,0.0,0.0,0.0,0.0,0.0,0.793549,1.378512,1.571157,1.793549,0.0,0.0,0.0
나쁘다,0.0,0.0,0.0,0.0,0.0,2.115477,0.0,1.893085,0.0,0.0,0.0,0.0
날씨,1.115477,0.0,0.0,0.0,2.115477,0.530515,1.115477,0.0,0.0,1.530515,0.0,0.0
내일,1.70044,0.0,0.0,2.115477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
매우,0.115477,0.793549,2.115477,0.530515,0.0,0.0,0.0,0.308122,0.0,0.0,1.115477,0.0
오늘,0.70044,1.378512,0.0,1.115477,0.0,0.0,0.0,0.893085,0.0,0.0,0.0,0.0
이,0.0,1.571157,1.893085,0.0,0.0,0.308122,0.893085,0.0,1.308122,0.0,0.893085,0.0
정말,0.0,1.793549,0.0,0.0,0.0,0.0,0.0,1.308122,0.0,0.0,2.115477,0.0
조금,1.115477,0.0,0.0,1.530515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.115477
