## Co-occurrence Matrix

### import example data

In [16]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

import collections, itertools
from itertools import chain, combinations

### 1. corpus의 단어들을 pair로 만드는 과정: () tuple 형식으로
* 지금은 중심 단어 기준으로 window를 정하고, 그 window 안에 있는 애들을 co-occurrence 단어라고 보는 단계는 아님
* 나이브하게 한 문장에 나타난 애들이 모두 co-occurr했다고 보는 것

In [17]:
# Toy corpus: five short sentences, each tokenised by splitting on single spaces.
corpus = [
    sentence.split(' ')
    for sentence in (
        'this is the first document',
        'ts is the second second document',
        'and the third moment document moment document',
        'Is this the first document?',
        'The last document?',
    )
]

def pairwise(arr):
    """Return every unordered pair of distinct tokens found in ``arr``.

    Repeated tokens are collapsed, the remaining tokens are sorted, and all
    2-combinations are returned as a list of tuples, so each tuple is itself
    in sorted order.
    """
    unique_sorted = sorted(set(arr))
    return list(combinations(unique_sorted, 2))

# Flat list of the sentence-level pairs from every sentence in the corpus.
pairs = []

for sent in tqdm(corpus):
    for pair in pairwise(sent):
        pairs.append(pair)

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 4960.15it/s]


In [18]:
# Example: the first ten pairs
pairs[:10]

[('document', 'first'),
 ('document', 'is'),
 ('document', 'the'),
 ('document', 'this'),
 ('first', 'is'),
 ('first', 'the'),
 ('first', 'this'),
 ('is', 'the'),
 ('is', 'this'),
 ('the', 'this')]

In [19]:
# Count how many times each pair was collected.
corpus_count = collections.Counter(pairs)

### Gephi 형식으로 co-occurrence를 변환하기
* 이렇게 co-oc 행렬을 저장하지 않고 graph 파일로 변환하는 방법이 있다.

In [57]:
import networkx as nx

def make_graph(co_occurr, output_path):
    """Write a co-occurrence mapping to ``output_path`` as a GraphML file.

    Each key of ``co_occurr`` is read as a pair ``(key[0], key[1])`` and
    becomes an undirected edge whose ``weight`` attribute is the value
    stored under that key.
    """
    network = nx.Graph()
    for pair in tqdm(co_occurr.keys()):
        source, target = pair[0], pair[1]
        network.add_edge(source, target, weight=co_occurr.get(pair))

    nx.write_graphml(network, output_path)
    
make_graph(corpus_count, 'count_all_pos.graphml')

## 2. 중심단어와 window를 정의해서 co-occurrence 행렬 구하기
* counter_to_cooccurr 함수: corpus의 co-occurr 단어가 counter() 형식으로 되어 있을 때 이를 co-occurr 행렬로 변환해줌.
* cal_occ 함수: 대부분 이것을 사용할 것임. corpus가 주어졌을 때, window를 통해 co-occurrence 단어를 정하는 것. 이것이 더 현실적인 가정인데, 중심단어와 가까울수록 같이 일어났다고 보는 것이 올바름

In [20]:
def counter_to_cooccurr(counter):
    """Convert a mapping of word pairs to counts into a symmetric matrix.

    Parameters
    ----------
    counter : mapping
        Keys are pairs ``(word1, word2)`` (only ``key[0]`` and ``key[1]``
        are read); values are the counts for that pair and are converted
        with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Square integer frame whose index and columns are the sorted set of
        all words seen in the keys. Cell ``[a, b]`` and cell ``[b, a]``
        both hold the count stored for the pair; pairs that never appear
        are 0. An empty ``counter`` yields an empty (0 x 0) frame.
    """
    vocab_list = sorted({word for key in counter for word in (key[0], key[1])})
    # Dict lookup instead of list.index: constant time per word rather than
    # a linear scan of the vocabulary for every pair.
    position = {word: idx for idx, word in enumerate(vocab_list)}
    n = len(vocab_list)

    # Build the integer matrix directly; unseen pairs stay 0.
    co_occurr = np.zeros((n, n), dtype=np.int64)
    for key, count in counter.items():
        row, col = position[key[0]], position[key[1]]
        value = int(count)
        # Mirror the value so the matrix is symmetric.
        co_occurr[row, col] = value
        co_occurr[col, row] = value

    return pd.DataFrame(co_occurr, index=vocab_list, columns=vocab_list)

In [21]:
co_occurr = counter_to_cooccurr(corpus_count)

In [22]:
co_occurr

Unnamed: 0,Is,The,and,document,document?,first,is,last,moment,second,the,third,this,ts
Is,0,0,0,0,1,1,0,0,0,0,1,0,1,0
The,0,0,0,0,1,0,0,1,0,0,0,0,0,0
and,0,0,0,1,0,0,0,0,1,0,1,1,0,0
document,0,0,1,0,0,1,2,0,1,1,3,1,1,1
document?,1,1,0,0,0,1,0,1,0,0,1,0,1,0
first,1,0,0,1,1,0,1,0,0,0,2,0,2,0
is,0,0,0,2,0,1,0,0,0,1,2,0,1,1
last,0,1,0,0,1,0,0,0,0,0,0,0,0,0
moment,0,0,1,1,0,0,0,0,0,0,1,1,0,0
second,0,0,0,1,0,0,1,0,0,0,1,0,0,1


In [24]:
# Building a numpy array over the full corpus vocabulary raises a memory error.
# There are usually specific words of interest anyway, so naming them up front
# also avoids that memory error.
words_selected = ['document', 'moment', 'first']

# Window size used for the window-based co-occurrence count.
window = 2

def cal_occ(corpus, window, words_selected):
    """Count window-based co-occurrences among a chosen set of words.

    For every token position ``i`` in every sentence, each token at a
    position ``j`` with ``i - window <= j <= i + window`` (clipped to the
    sentence bounds) is paired with the token at ``i``. A pair is counted
    only when both tokens are in ``words_selected``. The diagonal is reset
    to 0 afterwards, so a word is never counted with itself.

    Parameters
    ----------
    corpus : iterable of sequences
        Tokenised sentences; tokens must be hashable.
    window : int
        Number of positions to look at on each side of the centre token.
    words_selected : iterable
        Words to keep as rows/columns. Duplicates are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by the sorted unique
        selected words.
    """
    vocab = sorted(set(words_selected))
    # Size the matrix from the de-duplicated vocabulary so that its shape
    # always matches the row/column labels, even if words_selected repeats
    # a word.
    n = len(vocab)
    position = {word: idx for idx, word in enumerate(vocab)}

    co_occurr = np.zeros([n, n])
    for sent in tqdm(corpus):
        n_sent = len(sent)
        for i, word in enumerate(sent):
            row_idx = position.get(word)
            if row_idx is None:
                # Centre token is not a selected word: nothing to count.
                continue
            for j in range(max(i - window, 0), min(i + window + 1, n_sent)):
                col_idx = position.get(sent[j])
                if col_idx is not None:
                    co_occurr[row_idx, col_idx] += 1

    # The window includes the centre position itself (j == i); drop those
    # self-counts along with any same-word neighbours.
    np.fill_diagonal(co_occurr, 0)

    return pd.DataFrame(co_occurr, columns=vocab, index=vocab)

# toy example
cal_occ(corpus,2, ['document','first'])

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5006.33it/s]


Unnamed: 0,document,first
document,0.0,1.0
first,1.0,0.0
