### **GloVe**  
임베딩 벡터의 내적이 말뭉치 전체에서의 동시출현(co-occurrence) 확률 값이 되는 목적 함수를 갖는다.  
이를 통해 임베딩 벡터간 유사도 측정을 수월하게 하면서도 말뭉치 전체의 통계 정보를 반영할 수 있다. 

> **동시 출현 (Co-occurence)** *이란, 한 문장, 문단 또는 텍스트 단위에서 같이 출현한 단어를 가리다. 언어학적 의미에서 의미적 근접성을 가리킨다.*  




In [34]:
import os
import re
import csv
from glob import iglob
from pathlib import Path

In [35]:
BASE_DIR = "/data/ksb/TestSampleDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, "articles")

ORIGIN_PATH = os.path.join(DATA_BASE_DIR,"Origin-Data")
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Preprocessed-Data")
PRETTY_PATH = os.path.join(DATA_BASE_DIR,"Pretty-Data")
SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Summary-Preprocessed-Data")
SWORDS_PATH = os.path.join(DATA_BASE_DIR, "StopWordList.txt")
MODEL_PATH = os.path.join(os.path.join(Path(os.getcwd()).parent, "Word-Embedding-Model"))

In [36]:
MIN_COUNT = 3

In [37]:
def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


In [38]:
def append_to_dict(word):
    
    if word in wordDict:
        wordDict[word] += 1
    else :
        wordDict[word] = 1
        
    return wordDict

In [39]:
media_list = os.listdir(PREPROCESSED_PATH)

result = []
forCount = []
wordDict = {}

for idx, proc_article_path in enumerate(iglob(os.path.join(PREPROCESSED_PATH, '**.csv'), recursive=False)):
    
    f_proc= open(proc_article_path, 'r', newline="\n", encoding="utf-8")
    for [idx, title, contents] in csv.reader(f_proc):
        if contents is '': continue

        cont_list = contents.split("\t")
        forCount += [token for sent in cont_list for token in sent.split()]
        result += [sent.split() for sent in cont_list]
        list(map(append_to_dict, [token for sent in cont_list for token in sent.split()]))
        
    f_proc.close()

In [40]:
print("전체 token의 개수 : {len}".format(len=len(forCount)))
print("중복되지 않은 token의 개수 : {len}".format(len=len(list(set(forCount)))))

전체 token의 개수 : 8131868
중복되지 않은 token의 개수 : 617344


In [41]:
wordDict

{'이태원': 968,
 '클럽': 979,
 '발': 310,
 '감염의': 196,
 '영향으로': 292,
 '코로나19': 14057,
 '확진자가': 5331,
 '계속': 3950,
 '늘고': 592,
 '있습니다': 14336,
 '지자체는': 185,
 '감염': 2917,
 '확산을': 1301,
 '막기': 1539,
 '위해': 16537,
 '유흥시설': 301,
 '집합금지': 925,
 '행정': 500,
 '명령을': 975,
 '추가로': 2028,
 '내렸습니다': 301,
 '연결합니다': 228,
 '감염자': 268,
 '증가세가': 69,
 '좀처럼': 119,
 '꺾이지': 51,
 '않는': 2881,
 '군요': 2,
 '어제': 3031,
 '하루': 2428,
 '29명': 51,
 '늘어': 292,
 '누적': 1280,
 '확진자는': 2524,
 '10': 981,
 '991명이': 2,
 '됐습니다': 720,
 '가운데': 7509,
 '20명이': 89,
 '관련': 12065,
 '신규': 1754,
 '확진자입니다': 13,
 '감염이': 1241,
 '확산세를': 119,
 '보이는': 1061,
 '3차': 753,
 '의심': 1042,
 '사례도': 351,
 '나왔습니다': 604,
 '서울': 10045,
 '도봉구에서': 3,
 '지난': 27171,
 '12일': 2013,
 '확진': 3809,
 '판정을': 4019,
 '받은': 5796,
 '10대': 700,
 '남성이': 3106,
 '7일': 2240,
 '코인노래방을': 18,
 '방문한': 1023,
 '뒤': 12110,
 '증상이': 1471,
 '나타났는데': 12,
 '확진자와': 786,
 '접촉한': 832,
 '감염자가': 385,
 '같은': 10222,
 '시간대에': 105,
 '이곳을': 145,
 '방문했던': 103,
 '것으로': 39817,
 '확인됐습니다': 640,
 '서대문구와': 2,

In [42]:
rmMinCount ={}
for key, val in wordDict.items():
    if val < MIN_COUNT : continue
    rmMinCount[key]=val
    
rmMinCount

{'이태원': 968,
 '클럽': 979,
 '발': 310,
 '감염의': 196,
 '영향으로': 292,
 '코로나19': 14057,
 '확진자가': 5331,
 '계속': 3950,
 '늘고': 592,
 '있습니다': 14336,
 '지자체는': 185,
 '감염': 2917,
 '확산을': 1301,
 '막기': 1539,
 '위해': 16537,
 '유흥시설': 301,
 '집합금지': 925,
 '행정': 500,
 '명령을': 975,
 '추가로': 2028,
 '내렸습니다': 301,
 '연결합니다': 228,
 '감염자': 268,
 '증가세가': 69,
 '좀처럼': 119,
 '꺾이지': 51,
 '않는': 2881,
 '어제': 3031,
 '하루': 2428,
 '29명': 51,
 '늘어': 292,
 '누적': 1280,
 '확진자는': 2524,
 '10': 981,
 '됐습니다': 720,
 '가운데': 7509,
 '20명이': 89,
 '관련': 12065,
 '신규': 1754,
 '확진자입니다': 13,
 '감염이': 1241,
 '확산세를': 119,
 '보이는': 1061,
 '3차': 753,
 '의심': 1042,
 '사례도': 351,
 '나왔습니다': 604,
 '서울': 10045,
 '도봉구에서': 3,
 '지난': 27171,
 '12일': 2013,
 '확진': 3809,
 '판정을': 4019,
 '받은': 5796,
 '10대': 700,
 '남성이': 3106,
 '7일': 2240,
 '코인노래방을': 18,
 '방문한': 1023,
 '뒤': 12110,
 '증상이': 1471,
 '나타났는데': 12,
 '확진자와': 786,
 '접촉한': 832,
 '감염자가': 385,
 '같은': 10222,
 '시간대에': 105,
 '이곳을': 145,
 '방문했던': 103,
 '것으로': 39817,
 '확인됐습니다': 640,
 '마포구': 243,
 '클럽을': 188,
 '방문했다가':

In [43]:
len(rmMinCount)

192624

In [44]:
media_list = os.listdir(SUMMARY_PREPROCESSED_PATH)

summary_result = []
summary_forCount = []
wordDict = {}

for idx, proc_article_path in enumerate(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False)):
    
    f_proc= open(proc_article_path, 'r', newline="\n", encoding="utf-8")
    for [idx, title, contents] in csv.reader(f_proc):
        if contents is '': continue

        cont_list = contents.split("\t")
        summary_forCount += [token for sent in cont_list for token in sent.split()]
        summary_result += [sent.split() for sent in cont_list]
        list(map(append_to_dict, [token for sent in cont_list for token in sent.split()]))
        
    f_proc.close()

In [45]:
rmMinCount_summary ={}
for key, val in wordDict.items():
    if val < MIN_COUNT : continue
    rmMinCount_summary[key]=val
    
rmMinCount_summary

{'김': 148,
 '판사는': 9,
 '수사기관이': 7,
 '피의자의': 5,
 '주거지': 3,
 '휴대전화': 42,
 '번호': 6,
 '등을': 249,
 '파악하고': 15,
 '있었고': 5,
 '피의자가': 8,
 '잠을': 7,
 '자고': 7,
 '있어': 51,
 '증거를': 9,
 '아니었다': 6,
 '며': 600,
 '요건을': 3,
 '갖추지': 3,
 '못한': 30,
 '해당한다': 8,
 '이어': 92,
 '있었다': 112,
 '긴급체포': 3,
 '못한다': 9,
 '고': 1148,
 '강조했다': 38,
 '사고': 77,
 '직후': 30,
 '낸': 32,
 '대책': 14,
 '추진': 12,
 '따르면': 399,
 '인공': 4,
 '수초섬이': 4,
 '급류에': 9,
 '보고가': 3,
 '것은': 111,
 '이날': 355,
 '오전': 264,
 '것으로': 615,
 '확인됐다': 76,
 '당시': 254,
 '받은': 128,
 '담당': 13,
 '말고': 8,
 '한': 597,
 '알려졌다': 76,
 '경찰과': 42,
 '소방당국은': 16,
 '춘천시': 5,
 '인근에': 12,
 '설치하고': 4,
 '실종된': 19,
 '나머지': 14,
 '대한': 421,
 '수색': 12,
 '작업을': 17,
 '벌였다': 15,
 '7일': 67,
 '이후': 125,
 '방침이다': 27,
 '텔레그램': 52,
 '박사방': 38,
 '운영자': 16,
 '조주빈': 4,
 '사건과': 39,
 '관련': 215,
 '윤석열': 25,
 '검찰총장은': 7,
 '검찰의': 42,
 '모든': 73,
 '역량을': 4,
 '근본적인': 7,
 '엄정': 10,
 '대응을': 10,
 '지시했다': 8,
 '서울중앙지검은': 7,
 '성착취': 12,
 '불법': 68,
 '영상물': 5,
 '유포': 7,
 '사건에': 44,
 '위해': 273,
 '디지털': 48,
 '성범죄'

In [46]:
len(rmMinCount_summary)

8058

In [47]:
START_TOKEN = ['<SOS>']
END_TOKEN = ['<EOS>']

In [48]:
rmMinCount[START_TOKEN[0]] = rmMinCount[END_TOKEN[0]] = len(result)
rmMinCount_summary[START_TOKEN[0]] = rmMinCount_summary[END_TOKEN[0]] = len(result)

In [49]:
result = list(map(lambda content : START_TOKEN + content + END_TOKEN, result))
summary_result = list(map(lambda content : START_TOKEN + content + END_TOKEN, summary_result))

In [50]:
rmMinCountList = list(map(lambda content : [token for token in content if token in rmMinCount], result))
rmMinCountSummaryList = list(map(lambda content : [token for token in content if token in rmMinCount_summary], summary_result))


In [51]:
rmMinCountList

[['<SOS>',
  '이태원',
  '클럽',
  '발',
  '감염의',
  '영향으로',
  '코로나19',
  '확진자가',
  '계속',
  '늘고',
  '있습니다',
  '<EOS>'],
 ['<SOS>',
  '지자체는',
  '감염',
  '확산을',
  '막기',
  '위해',
  '유흥시설',
  '집합금지',
  '행정',
  '명령을',
  '추가로',
  '내렸습니다',
  '<EOS>'],
 ['<SOS>', '연결합니다', '<EOS>'],
 ['<SOS>', '이태원', '클럽', '발', '감염자', '증가세가', '좀처럼', '꺾이지', '않는', '<EOS>'],
 ['<SOS>',
  '어제',
  '하루',
  '코로나19',
  '확진자가',
  '29명',
  '늘어',
  '누적',
  '확진자는',
  '10',
  '됐습니다',
  '<EOS>'],
 ['<SOS>', '29명', '가운데', '20명이', '이태원', '클럽', '관련', '신규', '확진자입니다', '<EOS>'],
 ['<SOS>',
  '이태원',
  '클럽',
  '발',
  '감염이',
  '확산세를',
  '보이는',
  '가운데',
  '3차',
  '감염',
  '의심',
  '사례도',
  '나왔습니다',
  '<EOS>'],
 ['<SOS>',
  '서울',
  '도봉구에서',
  '지난',
  '12일',
  '확진',
  '판정을',
  '받은',
  '10대',
  '남성이',
  '지난',
  '7일',
  '코인노래방을',
  '방문한',
  '뒤',
  '증상이',
  '나타났는데',
  '이태원',
  '클럽',
  '확진자와',
  '접촉한',
  '감염자가',
  '같은',
  '시간대에',
  '이곳을',
  '방문했던',
  '것으로',
  '확인됐습니다',
  '<EOS>'],
 ['<SOS>', '서울', '마포구', '확진자가', '나왔습니다', '<EOS>'],
 ['<SOS>',
  '이태원',


In [52]:
rmMinCountSummaryList

[['<SOS>',
  '김',
  '판사는',
  '수사기관이',
  '피의자의',
  '주거지',
  '휴대전화',
  '번호',
  '등을',
  '파악하고',
  '있었고',
  '피의자가',
  '잠을',
  '자고',
  '있어',
  '증거를',
  '아니었다',
  '며',
  '요건을',
  '갖추지',
  '못한',
  '해당한다',
  '이어',
  '수사기관이',
  '피의자의',
  '주거지',
  '휴대전화',
  '번호',
  '등을',
  '파악하고',
  '있었다',
  '며',
  '긴급체포',
  '요건을',
  '못한다',
  '고',
  '강조했다',
  '<EOS>'],
 ['<SOS>',
  '사고',
  '직후',
  '낸',
  '사고',
  '대책',
  '추진',
  '따르면',
  '인공',
  '수초섬이',
  '급류에',
  '보고가',
  '것은',
  '이날',
  '오전',
  '것으로',
  '확인됐다',
  '당시',
  '인공',
  '수초섬이',
  '급류에',
  '받은',
  '담당',
  '말고',
  '고',
  '한',
  '것으로',
  '알려졌다',
  '<EOS>'],
 ['<SOS>',
  '경찰과',
  '소방당국은',
  '춘천시',
  '인근에',
  '설치하고',
  '실종된',
  '나머지',
  '대한',
  '수색',
  '작업을',
  '벌였다',
  '경찰과',
  '소방당국은',
  '7일',
  '오전',
  '이후',
  '수색',
  '작업을',
  '방침이다',
  '<EOS>'],
 ['<SOS>',
  '텔레그램',
  '박사방',
  '운영자',
  '조주빈',
  '사건과',
  '관련',
  '윤석열',
  '검찰총장은',
  '검찰의',
  '모든',
  '역량을',
  '근본적인',
  '며',
  '엄정',
  '대응을',
  '지시했다',
  '이날',
  '서울중앙지검은',
  '성착취',
  '불법',
  '영상물',
  '유포',
 

In [53]:
from glove import Corpus, Glove

corpus = Corpus() 
corpus.fit(rmMinCountList, window=5)

corpus_summary = Corpus() 
corpus_summary.fit(rmMinCountSummaryList, window=5)

In [54]:
glove = Glove(no_components=256, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [55]:
lambda x : glove.word_vectors[corpus.dictionary[x]]

<function __main__.<lambda>(x)>

In [56]:
mkdir_p(MODEL_PATH)

input_corpus_path = os.path.join(MODEL_PATH, "input-corpus-256.model")
summary_corpus_path = os.path.join(MODEL_PATH, "summary-corpus-256.model")
model_path = os.path.join(MODEL_PATH, 'glove-256.model')

In [57]:
glove.save(model_path)
corpus.save(input_corpus_path)
corpus_summary.save(summary_corpus_path)