# Word2Vec model
> * Positive or Negative

In [1]:
import pickle
import numpy as np
from tqdm import tqdm
from numba import jit

## Word2Vec model 생성을 위한 함수 정의

In [2]:
import multiprocessing
cores = multiprocessing.cpu_count()  # passed as `workers` by the training cells below

# sg (int {1, 0}) - training algorithm selector: 1 selects skip-gram, any other value selects CBOW.
def Make_Word2Vec_Model(data, size, epoch, sg, window,min_count, cbow_mean, workers, negative, hs, tagger):
    """Train a gensim Word2Vec model on `data`, save it under ./model/ and return it.

    Parameters
    ----------
    data : re-iterable collection of token lists; it is iterated once to build
        the vocabulary and again for training.
    size, sg, window, min_count, cbow_mean, workers, negative, hs :
        forwarded unchanged to ``word2vec.Word2Vec``.
    epoch : number of training passes (forwarded as ``iter`` and ``epochs``).
    tagger : label that is only used to build the output file name.

    Returns
    -------
    The trained Word2Vec model (the same object that was saved).
    """
    import os
    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")
    from datetime import datetime
    from gensim.models import word2vec
    start = datetime.now()
    modelPath = './model/'
    modelName = 'word2vec_size-{}_epoch-{}_window-{}_negative-{}_hs-{}_sg-{}_cbow_mean-{}_min_count-{}_by-{}.model'.format(
        size, epoch, window, negative, hs, sg, cbow_mean, min_count, tagger)
    modelName = modelPath+modelName
    print (modelName)
    # Create the output directory up front so a missing folder fails (or is
    # fixed) before training rather than at save time.
    os.makedirs(modelPath, exist_ok=True)
    w2v_model = word2vec.Word2Vec(size = size, sg = sg, cbow_mean = cbow_mean, negative = negative,
                                  hs = hs, window = window, workers = workers, iter=epoch, min_count = min_count)
    w2v_model.build_vocab(tqdm(data))
    w2v_model.train(tqdm(data), total_examples=w2v_model.corpus_count, epochs=epoch)
    end = datetime.now()
    w2v_model.save(modelName)
    print ("Total running time: ", end-start)
    # The original returned the undefined name `d2v_model`, raising NameError
    # after the whole training run had finished.
    return w2v_model

## rawdata

In [3]:
import pandas as pd

In [4]:
# Read the raw sentiment corpus. header=None treats the first line as data and
# gives the columns integer labels; later cells read them as 0 and 1.
rawdata = pd.read_csv('./data/sentiment_data/raw_data_for_sentiment.txt',header=None,encoding='utf-8')
print (rawdata.shape)

(491510, 2)


In [5]:
# Preview the first rows of the raw data.
rawdata.head()

Unnamed: 0,0,1
0,주택조합 승인 자체가 몹시 어려웠던 시절 한국기자협회 주택조합 형식으로 추진된 이 ...,1
1,"서 의원은 여의도 회견을 열어 “압도적으로 승리할 것이고 국민, 당원이 주문하신 ...",1
2,박근혜 대통령이 14일 열린 새누리당 전당대회에 참석한 것은 그만큼 아직 여권에서 ...,1
3,"“살기 좋은 전북 만들기, 중앙정부와 연결고리 최선” “월드컵에 출전한 우리 국가대...",1
4,"""여러분 너무나 고맙고 감사합니다. 도와주신 여러분께 저의 마음속의 큰 절을 받아주...",1


### STOPWORDS 

In [6]:
# Load the stopword list: one entry per line, surrounding whitespace stripped.
# The context manager closes the file; the original open(...).readlines() left
# the handle open.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

### Word2Vec 포맷으로 만들기

In [7]:
def Tagging(sentence, tagger, stopwords):
    """Tokenise `sentence` with `tagger.pos` and drop tokens found in `stopwords`.

    Only the first element of each item returned by `tagger.pos` is kept.
    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for tagged in tagger.pos(sentence):
        surface = tagged[0]
        if surface not in stopwords:
            kept.append(surface)
    return kept

In [8]:
def Tagging2(sentence, tagger, stopwords):
    """pandas-based variant of Tagging: tokenise `sentence` and drop stopwords.

    Only the first element of each item returned by `tagger.pos` is kept.
    Returns the surviving tokens as a plain list, in their original order.

    The numba @jit decorator was removed: the body only manipulates Python
    objects and a pandas Series, which numba cannot compile.
    """
    # dtype=object keeps the Series usable with the .str accessor even when the
    # tagger returns no tokens (an empty Series may otherwise not be object-typed).
    pos = pd.Series(tagger.pos(sentence), dtype=object).str[0]
    pos = pos[~pos.isin(stopwords)]
    return pos.tolist()

In [9]:
def Make_Pre_Data_Sub(series, tagger, stopwords):
    """Convert one row into a gensim TaggedDocument.

    `series[0]` is tokenised with Tagging2 and becomes the document's words;
    `series[1]` is wrapped in a single-element list and becomes its tags.
    """
    from gensim.models.doc2vec import TaggedDocument
    tokens = Tagging2(series[0], tagger, stopwords)
    return TaggedDocument(words=tokens, tags=[series[1]])

In [10]:
def Make_Pre_Data(rawdata, tagger, stopwords):
    """Tokenise every row of `rawdata` in an explicit loop with a progress bar.

    For each index label, column 0 is tokenised with Tagging2 and paired with
    the value in column 1. Returns a list of ``[tokens, label]`` lists. These
    are plain lists, not TaggedDocument objects as produced by
    Make_Pre_Data_Sub.

    The numba @jit decorator was removed: the loop works on a DataFrame, a
    tqdm iterator and Python lists, none of which numba can compile.
    """
    return [
        [Tagging2(rawdata.loc[idx, 0], tagger, stopwords), rawdata.loc[idx, 1]]
        for idx in tqdm(rawdata.index)
    ]

#### Tagging Twitter

In [11]:
from ckonlpy.tag import Twitter as ctwitter
# Tagger instance handed to Make_Pre_Data_Sub as `tagger` in the next cell.
ct = ctwitter()

In [12]:
%%time
# Register DataFrame.progress_apply with tqdm, then turn every row into a
# TaggedDocument using the `ct` tagger.
tqdm.pandas(desc="progress")
pre_data = rawdata.progress_apply(lambda x: Make_Pre_Data_Sub(x, ct, stopwords), axis = 1).tolist()

progress: 100%|██████████| 491510/491510 [2:20:05<00:00, 58.48it/s]   


CPU times: user 2h 10min 35s, sys: 3min 3s, total: 2h 13min 38s
Wall time: 2h 20min 6s


In [13]:
# Cache the tagged documents on disk. The context manager closes (and flushes)
# the file, which the inline open(...) never did explicitly.
with open('./data/pre_data/tagged_data/pre_data__for_word2vec_sentiment_by_ct.pickled', 'wb') as pre_data_file:
    pickle.dump(pre_data, pre_data_file)

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the documents for testing. random_state pins the shuffle so
# the same split is produced on every run; the original split was unseeded.
train, test = train_test_split(pre_data, test_size = 0.1, random_state = 42)

In [15]:
# Unbind the full document list; only the train/test splits are used from here on.
del pre_data

In [16]:
# Persist both splits. Each file is opened in a context manager so the handle
# is closed as soon as the dump finishes.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_ct.pickled', 'wb') as train_file:
    pickle.dump(train, train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_ct.pickled', 'wb') as test_file:
    pickle.dump(test, test_file)

In [None]:
# Reload the cached splits. pickle.load can execute arbitrary code, so only
# point this at files written by this notebook.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_ct.pickled', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_ct.pickled', 'rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Separate each document into its token list (.words) and its tag list (.tags).
# Make_Pre_Data_Sub stores the label as a one-element list, so every y entry is
# a list of length one.
x_train = [ x.words for x in tqdm(train)] 
y_train = [ x.tags for x in tqdm(train)] 
x_test = [ x.words for x in tqdm(test)] 
y_test = [ x.tags for x in tqdm(test)] 

In [None]:
# CBOW (sg = 0) averaging the context vectors (cbow_mean = 1), ckonlpy tokens.
# min_count is a required parameter of Make_Word2Vec_Model and was missing from
# this call (TypeError); 5 is gensim's own Word2Vec default.
model = Make_Word2Vec_Model(data = x_train, size = 3000, epoch = 20,
                   sg = 0, window = 10, min_count = 5, workers = cores, negative = 7,
                    cbow_mean = 1, hs = 0, tagger = 'ct')

In [None]:
# Unbind the trained model before the next configuration is trained; it has
# already been saved to disk inside Make_Word2Vec_Model.
del model

In [None]:
# CBOW (sg = 0) summing the context vectors (cbow_mean = 0), ckonlpy tokens.
# min_count is a required parameter of Make_Word2Vec_Model and was missing from
# this call (TypeError); 5 is gensim's own Word2Vec default.
model = Make_Word2Vec_Model(data = x_train, size = 3000, epoch = 20,
                   sg = 0, window = 10, min_count = 5, workers = cores, negative = 7,
                    cbow_mean = 0, hs = 0, tagger = 'ct')

In [None]:
# Unbind the trained model before the next configuration is trained; it has
# already been saved to disk inside Make_Word2Vec_Model.
del model

In [None]:
# Skip-gram (sg = 1), ckonlpy tokens; cbow_mean only applies to CBOW training.
# min_count is a required parameter of Make_Word2Vec_Model and was missing from
# this call (TypeError); 5 is gensim's own Word2Vec default.
model = Make_Word2Vec_Model(data = x_train, size = 3000, epoch = 20,
                   sg = 1, window = 10, min_count = 5, workers = cores, negative = 7,
                    cbow_mean = 0, hs = 0, tagger = 'ct')

In [None]:
# Unbind the last ckonlpy-based model; it has already been saved to disk
# inside Make_Word2Vec_Model.
del model

In [None]:
# Unbind the ckonlpy tagger; the cells below switch to Mecab.
del ct

#### Tagging Mecab

In [18]:
from konlpy.tag import Mecab
# Tagger instance handed to the preprocessing helpers as `tagger` in the next cells.
mecab = Mecab()

In [None]:
%%time
# Register DataFrame.progress_apply with tqdm, then turn every row into a
# TaggedDocument using the `mecab` tagger.
tqdm.pandas(desc="progress-bar")
pre_data = rawdata.progress_apply(lambda x: Make_Pre_Data_Sub(x, mecab, stopwords), axis = 1).tolist()

progress-bar: 100%|██████████| 491510/491510 [22:27<00:00, 364.82it/s]   


CPU times: user 18min 31s, sys: 3min 16s, total: 21min 48s
Wall time: 22min 31s


In [None]:
%%time
pre_data = Make_Pre_Data(rawdata, mecab, stopwords)

 23%|██▎       | 113689/491510 [06:41<22:13, 283.37it/s]

In [None]:
# Cache the tagged documents on disk. The context manager closes (and flushes)
# the file, which the inline open(...) never did explicitly.
with open('./data/pre_data/tagged_data/pre_data__for_word2vec_sentiment_by_mecab.pickled', 'wb') as pre_data_file:
    pickle.dump(pre_data, pre_data_file)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the documents for testing. random_state pins the shuffle so
# the same split is produced on every run; the original split was unseeded.
train, test = train_test_split(pre_data, test_size = 0.1, random_state = 42)

In [None]:
# Unbind the full document list; only the train/test splits are used from here on.
del pre_data

In [None]:
# Persist both splits. Each file is opened in a context manager so the handle
# is closed as soon as the dump finishes.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_mecab.pickled', 'wb') as train_file:
    pickle.dump(train, train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_mecab.pickled', 'wb') as test_file:
    pickle.dump(test, test_file)

In [None]:
# Reload the cached splits. pickle.load can execute arbitrary code, so only
# point this at files written by this notebook.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_mecab.pickled', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_mecab.pickled', 'rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Separate each document into its token list (.words) and its tag list (.tags).
# Make_Pre_Data_Sub stores the label as a one-element list, so every y entry is
# a list of length one.
x_train = [ x.words for x in tqdm(train)] 
y_train = [ x.tags for x in tqdm(train)] 
x_test = [ x.words for x in tqdm(test)] 
y_test = [ x.tags for x in tqdm(test)] 

In [None]:
# CBOW (sg = 0) averaging the context vectors (cbow_mean = 1), Mecab tokens.
# min_count is a required parameter of Make_Word2Vec_Model and was missing from
# this call (TypeError); 5 is gensim's own Word2Vec default.
model = Make_Word2Vec_Model(data = x_train, size = 3000, epoch = 20,
                   sg = 0, window = 10, min_count = 5, workers = cores, negative = 7,
                    cbow_mean = 1, hs = 0, tagger = 'mecab')

In [None]:
# Unbind the trained model before the next configuration is trained; it has
# already been saved to disk inside Make_Word2Vec_Model.
del model

In [None]:
# CBOW (sg = 0) summing the context vectors (cbow_mean = 0), Mecab tokens.
# min_count is a required parameter of Make_Word2Vec_Model and was missing from
# this call (TypeError); 5 is gensim's own Word2Vec default.
model = Make_Word2Vec_Model(data = x_train, size = 3000, epoch = 20,
                   sg = 0, window = 10, min_count = 5, workers = cores, negative = 7,
                    cbow_mean = 0, hs = 0, tagger = 'mecab')

In [None]:
# Unbind the trained model before the next configuration is trained; it has
# already been saved to disk inside Make_Word2Vec_Model.
del model

In [None]:
# Skip-gram (sg = 1), Mecab tokens; cbow_mean only applies to CBOW training.
# min_count is a required parameter of Make_Word2Vec_Model and was missing from
# this call (TypeError); 5 is gensim's own Word2Vec default.
model = Make_Word2Vec_Model(data = x_train, size = 3000, epoch = 20,
                   sg = 1, window = 10, min_count = 5, workers = cores, negative = 7,
                    cbow_mean = 0, hs = 0, tagger = 'mecab')

In [None]:
# Unbind the last Mecab-based model; it has already been saved to disk inside
# Make_Word2Vec_Model.
del model

In [None]:
# Unbind the Mecab tagger; no later cell tokenises text.
del mecab

In [None]:
# Interactive 2-D map of word vectors: bokeh for plotting, t-SNE for the projection.
# NOTE(review): `w2v_model` is never assigned at notebook level in the visible
# cells (it is a local inside Make_Word2Vec_Model, and the training cells bind
# and then delete `model`). A saved model presumably has to be loaded into
# `w2v_model` before this cell runs -- confirm.
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# Render into the notebook and create an axis-less figure with a hover tool.
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# Collect the vectors of the first 10000 vocabulary entries.
# NOTE(review): the previous comment said 200 dimensions, but every visible
# training call uses size = 3000 -- confirm which model is meant.
word_vectors = [w2v_model[w] for w in list(w2v_model.wv.vocab.keys())[:10000]]

# Reduce the vectors to two dimensions with t-SNE (seeded via random_state=0).
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)
# Put the coordinates and the matching words (same slice as above) in a dataframe.
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(w2v_model.wv.vocab.keys())[:10000]

# Scatter plot; the hover tooltip shows the `words` column for each point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
print ('building tf-idf matrix ...')
# The documents are already token lists, so the analyzer passes them through
# unchanged; tokens seen in fewer than 10 documents are dropped (min_df=10).
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform(x_train)
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Map each retained token to its inverse document frequency.
tfidf = dict(zip(feature_names, vectorizer.idf_))
print ('vocab size :', len(tfidf))

In [None]:
def buildWordVector(tokens, size):
    """Build one (1, size) document vector from `tokens`.

    Each token's vector is looked up in the module-level `w2v_model`, scaled by
    its weight in the module-level `tfidf` mapping, and accumulated. The sum is
    divided by the number of tokens that were found in both. Tokens missing
    from either lookup (KeyError) are skipped; if none are found, the result is
    all zeros.
    """
    total = np.zeros((1, size))
    n_used = 0.
    for token in tokens:
        try:
            weighted = w2v_model[token].reshape((1, size)) * tfidf[token]
        except KeyError:
            # token absent from the embedding vocabulary or from the tf-idf table
            continue
        total += weighted
        n_used += 1.
    if n_used != 0:
        total /= n_used
    return total

In [None]:
from sklearn.preprocessing import scale, StandardScaler
# One 3000-dimensional tf-idf weighted vector per document, stacked row-wise.
# Iterating the lists directly (instead of through an identity map) lets tqdm
# show the total.
train_vecs_w2v = np.concatenate([buildWordVector(z, 3000) for z in tqdm(x_train)])
# Standardise with statistics learned from the training set only...
scaler = StandardScaler().fit(train_vecs_w2v)
train_vecs_w2v = scaler.transform(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 3000) for z in tqdm(x_test)])
# ...and apply the same transform to the test set. The original scaled the test
# set with its own mean/std, so train and test features were on different scales.
test_vecs_w2v = scaler.transform(test_vecs_w2v)

In [None]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, embeddings, merge
# Small feed-forward classifier on the 3000-dimensional document vectors:
# one hidden ReLU layer of 32 units and a single sigmoid output unit.
model = Sequential([
    Dense(32, activation='relu', input_dim=3000),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Labels are converted to an array before fitting for 100 epochs.
train_labels = np.array(y_train)
model.fit(train_vecs_w2v, train_labels, epochs=100, verbose=2)

In [None]:
# Evaluate on the held-out vectors and print the second entry of the result;
# with metrics=['accuracy'] in the compile call this is presumably the accuracy.
score = model.evaluate(test_vecs_w2v, np.array(y_test), verbose=2)
print (score[1])

Test accuracy (`score[1]`) by Word2Vec vector size:

* 1500: 0.835
* 3000: 0.8513

In [None]:
# Save the trained Keras classifier.
# NOTE(review): the file name says window8, but every visible Word2Vec training
# call uses window = 10 -- confirm which embedding this classifier was built on.
model.save('./model/keras_model_size3000_window8_iter20_from_word2vec.model')