# Word2Vec
## 1. Word2Vec 모델 생성하기
- Doc2Vec 모델 생성시 사용한 코퍼스 뭉치를 사용하여 Word2Vec 모델을 생성한다.
- 작성 일시: 2018-06-12
- 수정 일시: 2018-06-12
- 작성자: 부현경 (hyunkyung.boo@gmail.com)

#### 1. DB로부터 코퍼스 불러오기

In [None]:
import codecs
from konlpy.tag import Twitter
import pandas as pd
import mysql.connector
from mysql.connector.errors import Error


# Connection settings for the MySQL database that stores the review corpus.
# NOTE(review): credentials are hard-coded here; consider moving them to
# an environment variable or a config file that is not committed.
table_config = {
    'user': 'root',
    'password': '1234',
    'host': 'localhost',
    'port': 3306,
    'database': 'db_test',
    'raise_on_warnings': True,
    'charset': 'utf8',
}

# Both queries skip rows whose tokenized review is the empty string.
sql_select_data1 = "select idx, tokenized_user_review from naver_movie_info where data_type = 'Train' and tokenized_user_review != \"\""
sql_select_data2 = "select idx, tokenized_user_review from naver_movie_info where data_type = 'Validate' and tokenized_user_review != \"\""

# Start from None so the cleanup in ``finally`` is safe even when
# ``connect`` itself fails (otherwise ``conn`` would be undefined there).
conn = None
try:
    conn = mysql.connector.connect(**table_config)

    df1 = pd.read_sql(sql_select_data1, con=conn)
    df2 = pd.read_sql(sql_select_data2, con=conn)

    print("데이터 프레임 변환 완료!")

except Exception as e:
    # Report the failure, then re-raise: the code below cannot run
    # without df1/df2, so continuing would only hide the real error
    # behind a NameError.
    print(e)
    raise

finally:
    if conn is not None:
        conn.close()

# Used for training the model.
T_tokenized_df = df1['tokenized_user_review']
# Used for evaluating model accuracy.
V_tokenized_df = df2['tokenized_user_review']

#### 2. Word2Vec 모델 생성하기

In [None]:
from gensim.models import Word2Vec
import logging
import time

# Training can be parallelised through the 'workers' setting, e.g.:
# import multiprocessing
# 'workers': multiprocessing.cpu_count()

In [None]:
def readRows(df):
    """Convert a column of tokenized reviews into a list of token lists.

    Args:
        df: A pandas Series (or any object whose ``items()`` yields
            ``(index, value)`` pairs) where each value is a string of
            tokens separated by ``', '``.

    Returns:
        A list with one entry per non-empty value; each entry is the list
        of tokens obtained by splitting that value on ``', '``.
    """
    # ``Series.iteritems()`` was removed in pandas 2.0; ``items()`` is the
    # equivalent that exists in both old and new pandas versions.
    return [text.split(', ') for _, text in df.items() if text != '']


def create_word2vec_model(sentences_vocab, config):
    """Build and train a Word2Vec model, printing the elapsed time.

    Args:
        sentences_vocab: Sized collection of token lists; it is used both
            to build the vocabulary and as the training corpus.
        config: Keyword arguments forwarded unchanged to ``Word2Vec(...)``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    start = time.time()
    model = Word2Vec(**config)
    model.build_vocab(sentences_vocab)
    # Read the epoch count from ``model.epochs`` (as update_word2vec_model
    # does) instead of the deprecated ``model.iter`` alias.
    model.train(
        sentences_vocab,
        total_examples=len(sentences_vocab),
        epochs=model.epochs,
    )
    end = time.time()
    print("During Time: {}".format(end - start))
    return model


def load_word2vec_model(model_path):
    """Load a previously saved Word2Vec model from ``model_path`` and return it."""
    return Word2Vec.load(model_path)
    

def getModels(sentences_vocab, s_list, w_list, e_list, mc_list):
    """Train and save one Word2Vec model per hyper-parameter combination.

    Every combination of the four lists is tried (nested in the order
    size, window, iterations, min_count). Each model is trained with
    ``create_word2vec_model`` and saved immediately under the fixed output
    directory. To vary other parameters the config below must be edited.

    Args:
        sentences_vocab: Sized collection of token lists used for training.
        s_list: Candidate vector sizes.
        w_list: Candidate window sizes.
        e_list: Candidate iteration (epoch) counts.
        mc_list: Candidate ``min_count`` thresholds.

    Returns:
        None. The models are written to disk and not kept in memory.
    """
    # Backslashes are escaped explicitly; the path value is unchanged, but
    # "\W" on its own is an invalid escape sequence.
    save_root = "D:\\Word2Vec_model_20180612\\"
    cnt = 0
    for s in s_list:
        for w in w_list:
            for e in e_list:
                for mc in mc_list:
                    config = {
                        'sg': 1,
                        'window': w,  # distance between the predicted word and context words
                        'size': s,  # vector size
                        'batch_words': 10000,
                        # Number of training iterations. The key must be
                        # exactly 'iter' (as in the other configs in this
                        # file); the former 'epochs ' key, with a trailing
                        # space, is not a valid keyword argument.
                        'iter': e,
                        'min_count': mc,  # ignore words with a lower frequency
                        'workers': 1,  # 1 worker is slower but uses the least memory
                    }

                    cnt += 1
                    label = "s{0}_w{1}_e{2}_mc{3}".format(str(s), str(w), str(e), str(mc))
                    name = "W2V_{0}_setting_{1}.model".format(cnt, label)
                    model = create_word2vec_model(sentences_vocab, config)
                    model_save_path = save_root + name
                    model.save(model_save_path)
                    print("저장 완료! ", "저장 위치:", model_save_path)



# sentences_train = readRows(T_tokenized_df)

# # 9 * 5 * 3 * 8 =  1,080
# size = [5, 10, 15, 20, 40, 60, 100, 200, 300]
# window = [1, 3, 5, 8, 10]
# iter_count = [3, 5, 10]
# min_count = [1, 2, 3, 4, 5, 10, 20, 50]

# # 모델 생성&저장
# getModels(sentences_train, size, window, iter_count, min_count)
# # print(model.wv.vocab.keys())

In [None]:
# Show gensim's INFO-level progress messages while training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences_train = readRows(T_tokenized_df)

# Model 1: 200-dimensional vectors, min_count 100, 10 iterations.
config = {
    'sg': 1,
    'window': 10,  # distance between the predicted word and context words
    'size': 200,  # vector size
    'batch_words': 10000,
    'iter': 10,  # number of training iterations, similar to epochs in deep learning
    'min_count': 100,  # ignore words with a lower frequency
    'workers': 1,  # 1 worker is slower but uses the least memory
}

name = "W2V.model"
model = create_word2vec_model(sentences_train, config)
# Backslashes are escaped explicitly; the path value is unchanged, but
# "\W" on its own is an invalid escape sequence.
save_root = "D:\\Word2Vec_model_20180612\\"
model_save_path = save_root + name
model.save(model_save_path)
print("저장 완료! ", "저장 위치:", model_save_path)


In [None]:
# Show gensim's INFO-level progress messages while training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences_train = readRows(T_tokenized_df)

# Model 2: 300-dimensional vectors, min_count 100, 10 iterations.
config = {
    'sg': 1,
    'window': 10,  # distance between the predicted word and context words
    'size': 300,  # vector size
    'batch_words': 10000,
    'iter': 10,  # number of training iterations, similar to epochs in deep learning
    'min_count': 100,  # ignore words with a lower frequency
    'workers': 1,  # 1 worker is slower but uses the least memory
}

name = "W2V2.model"
model = create_word2vec_model(sentences_train, config)
# Backslashes are escaped explicitly; the path value is unchanged, but
# "\W" on its own is an invalid escape sequence.
save_root = "D:\\Word2Vec_model_20180612\\"
model_save_path = save_root + name
model.save(model_save_path)
print("저장 완료! ", "저장 위치:", model_save_path)


In [None]:
# Show gensim's INFO-level progress messages while training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences_train = readRows(T_tokenized_df)

# Model 3: 200-dimensional vectors, min_count 50, 10 iterations.
config = {
    'sg': 1,
    'window': 10,  # distance between the predicted word and context words
    'size': 200,  # vector size
    'batch_words': 10000,
    'iter': 10,  # number of training iterations, similar to epochs in deep learning
    'min_count': 50,  # ignore words with a lower frequency
    'workers': 1,  # 1 worker is slower but uses the least memory
}

name = "W2V3.model"
model = create_word2vec_model(sentences_train, config)
# Backslashes are escaped explicitly; the path value is unchanged, but
# "\W" on its own is an invalid escape sequence.
save_root = "D:\\Word2Vec_model_20180612\\"
model_save_path = save_root + name
model.save(model_save_path)
print("저장 완료! ", "저장 위치:", model_save_path)


In [None]:
# Show gensim's INFO-level progress messages while training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences_train = readRows(T_tokenized_df)

# Model 4: 200-dimensional vectors, min_count 1 (keep every word), 10 iterations.
config = {
    'sg': 1,
    'window': 10,  # distance between the predicted word and context words
    'size': 200,  # vector size
    'batch_words': 10000,
    'iter': 10,  # number of training iterations, similar to epochs in deep learning
    'min_count': 1,  # ignore words with a lower frequency
    'workers': 1,  # 1 worker is slower but uses the least memory
}

name = "W2V4.model"
model = create_word2vec_model(sentences_train, config)
# Backslashes are escaped explicitly; the path value is unchanged, but
# "\W" on its own is an invalid escape sequence.
save_root = "D:\\Word2Vec_model_20180612\\"
model_save_path = save_root + name
model.save(model_save_path)
print("저장 완료! ", "저장 위치:", model_save_path)


In [None]:
# Show gensim's INFO-level progress messages while training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences_train = readRows(T_tokenized_df)

# Model 5: 200-dimensional vectors, min_count 1 (keep every word), 15 iterations.
config = {
    'sg': 1,
    'window': 10,  # distance between the predicted word and context words
    'size': 200,  # vector size
    'batch_words': 10000,
    'iter': 15,  # number of training iterations, similar to epochs in deep learning
    'min_count': 1,  # ignore words with a lower frequency
    'workers': 1,  # 1 worker is slower but uses the least memory
}

name = "W2V5.model"
model = create_word2vec_model(sentences_train, config)
# Backslashes are escaped explicitly; the path value is unchanged, but
# "\W" on its own is an invalid escape sequence.
save_root = "D:\\Word2Vec_model_20180612\\"
model_save_path = save_root + name
model.save(model_save_path)
print("저장 완료! ", "저장 위치:", model_save_path)


In [25]:
# Load the five saved models for a side-by-side comparison.
# Raw strings keep the Windows backslashes literal; the path values are
# unchanged, but a bare "\W" is an invalid escape sequence.
model1 = load_word2vec_model(r"D:\Word2Vec_model_20180612\W2V.model")
model2 = load_word2vec_model(r"D:\Word2Vec_model_20180612\W2V2.model")
model3 = load_word2vec_model(r"D:\Word2Vec_model_20180612\W2V3.model")
model4 = load_word2vec_model(r"D:\Word2Vec_model_20180612\W2V4.model")
model5 = load_word2vec_model(r"D:\Word2Vec_model_20180612\W2V5.model")


models = [model1, model2, model3, model4, model5]

# For each model (numbered from 1) print its memory estimate, the result of
# a positive/negative similarity query, and the nearest neighbours of one word.
for cnt, model in enumerate(models, start=1):
    print(cnt, model.estimate_memory())
    print(cnt, model.wv.most_similar(positive=['공포', '로맨스'], negative=['재미있다'], topn=3))
    print(cnt, model.wv.most_similar('공포', topn=3))
    print("---------------------------------------------------------------------------------------------------")
# Optional accuracy check against a validation set (left disabled):
# model.wv.log_accuracy('D:\\Word2Vec_model_20180612\\validationSet.txt')


2018-06-12 18:42:55,147 : INFO : loading Word2Vec object from D:\Word2Vec_model_20180612\W2V.model
2018-06-12 18:42:55,166 : INFO : loading wv recursively from D:\Word2Vec_model_20180612\W2V.model.wv.* with mmap=None
2018-06-12 18:42:55,167 : INFO : setting ignored attribute vectors_norm to None
2018-06-12 18:42:55,168 : INFO : loading vocabulary recursively from D:\Word2Vec_model_20180612\W2V.model.vocabulary.* with mmap=None
2018-06-12 18:42:55,169 : INFO : loading trainables recursively from D:\Word2Vec_model_20180612\W2V.model.trainables.* with mmap=None
2018-06-12 18:42:55,171 : INFO : setting ignored attribute cum_table to None
2018-06-12 18:42:55,171 : INFO : loaded D:\Word2Vec_model_20180612\W2V.model
2018-06-12 18:42:55,174 : INFO : loading Word2Vec object from D:\Word2Vec_model_20180612\W2V2.model
2018-06-12 18:42:55,208 : INFO : loading wv recursively from D:\Word2Vec_model_20180612\W2V2.model.wv.* with mmap=None
2018-06-12 18:42:55,211 : INFO : setting ignored attribute vec

1 {'vocab': 652000, 'vectors': 1043200, 'syn1neg': 1043200, 'total': 2738400}
1 [('스릴러', 0.4708366394042969), ('멜로', 0.4661795496940613), ('공포영화', 0.37364476919174194)]
1 [('공포영화', 0.5705064535140991), ('스릴러', 0.49120429158210754), ('무섭다', 0.4490625858306885)]
---------------------------------------------------------------------------------------------------
2 {'vocab': 652000, 'vectors': 1564800, 'syn1neg': 1564800, 'total': 3781600}
2 [('멜로', 0.46827614307403564), ('스릴러', 0.4577628970146179), ('공포영화', 0.3604389727115631)]
2 [('공포영화', 0.5403527617454529), ('스릴러', 0.4729056656360626), ('깜짝', 0.4339043200016022)]
---------------------------------------------------------------------------------------------------
3 {'vocab': 1122500, 'vectors': 1796000, 'syn1neg': 1796000, 'total': 4714500}
3 [('멜로', 0.4420172870159149), ('스릴러', 0.40668985247612), ('장르', 0.38747286796569824)]
3 [('공포영화', 0.5405879020690918), ('공포물', 0.5004182457923889), ('미스테리', 0.49281299114227295)]
---------------------

In [None]:
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt
import gensim 
import gensim.models as g

# Font setup: take the font family name from the Malgun Gothic file and make
# it the default (presumably so Korean word labels render - confirm on
# non-Windows machines, where this font path will not exist).
mpl.rcParams['axes.unicode_minus'] = False
font_name = mpl.font_manager.FontProperties(fname='C:/Windows/Fonts/malgun.ttf').get_name()
mpl.rc('font', family=font_name)

# Number of vocabulary entries plotted per model.
N_WORDS = 150

for model in models:
    print(model)
    vocab = list(model.wv.vocab)

    # Visualise only the first N_WORDS words; look up just those vectors
    # instead of materialising the whole embedding matrix first.
    words = vocab[:N_WORDS]
    X = model.wv[words]

    # Project the selected word vectors down to two dimensions.
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)

    df = pd.DataFrame(X_tsne, index=words, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(40, 20)
    ax = fig.add_subplot(1, 1, 1)

    ax.scatter(df['x'], df['y'])

    # Label every point with its word.
    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=30)
    plt.show()
    print("------------------------------------")

#### 3. 단어장 및 모델 갱신

In [None]:
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)


def update_word2vec_model(model, sentences_updated):
    """Extend an existing model's vocabulary and continue training it.

    Args:
        model: Model object exposing ``build_vocab``, ``train`` and ``epochs``.
        sentences_updated: Sized collection of token lists used both to
            update the vocabulary and as the training corpus.

    Returns:
        The same ``model`` object after the additional training.
    """
    # update=True adds to the existing vocabulary instead of rebuilding it.
    model.build_vocab(sentences_updated, update=True)
    n_examples = len(sentences_updated)
    n_epochs = model.epochs
    model.train(sentences_updated, total_examples=n_examples, epochs=n_epochs)
    return model


# Combine the training and validation sentences into one corpus for the update.
sentences_train = readRows(T_tokenized_df)
sentences_add = readRows(V_tokenized_df)
sentences_new = sentences_train + sentences_add

# Raw / escaped strings keep the Windows backslashes literal; the path
# values are unchanged, but a bare "\W" is an invalid escape sequence.
model = load_word2vec_model(r"D:\Word2Vec_model_20180612\W2V4.model")
model = update_word2vec_model(model, sentences_new)
save_root = "D:\\Word2Vec_model_20180612\\"
model_save_path = save_root + 'update_W2V.model'
model.save(model_save_path)
print("저장 완료! ", "저장 위치:", model_save_path)

In [28]:
# Inspect the updated model: memory estimate, a positive/negative
# similarity query, and the ten nearest neighbours of a single word.
memory_estimate = model.estimate_memory()
print(memory_estimate)
analogy_hits = model.wv.most_similar(positive=['공포', '로맨스'], negative=['재미있다'], topn=10)
print(analogy_hits)
nearest_words = model.wv.most_similar('공포', topn=10)
print(nearest_words)

2018-06-12 18:46:24,828 : INFO : estimated required memory for 35335 words and 200 dimensions: 74203500 bytes


{'vocab': 17667500, 'vectors': 28268000, 'syn1neg': 28268000, 'total': 74203500}
[('무속신앙', 0.42442092299461365), ('스릴러', 0.4134938716888428), ('환타지', 0.4115058183670044), ('멜로', 0.40173518657684326), ('호러물', 0.39570868015289307), ('월씬', 0.3922324776649475), ('매개', 0.39122796058654785), ('공포영화', 0.3878144323825836), ('옥죄', 0.3769144117832184), ('호러', 0.3763198256492615)]
[('공포영화', 0.6274300217628479), ('무속신앙', 0.6033227443695068), ('스텐바이미', 0.5815481543540955), ('넘쳣', 0.5712491273880005), ('드래그미투헬', 0.5686925649642944), ('감관', 0.5659478902816772), ('옥죄', 0.5618089437484741), ('월씬', 0.5522574186325073), ('호러물', 0.5518032312393188), ('뻐근하네', 0.5506236553192139)]
