In [1]:
import pandas as pd
from tqdm import tqdm
from gensim.models import doc2vec



In [2]:
model = doc2vec.Doc2Vec.load('doc2vec_nsmc.model')

In [3]:
model

<gensim.models.doc2vec.Doc2Vec at 0x2082cf6cc18>

In [4]:
model.wv.most_similar('공포/Noun')

[('공포영화/Noun', 0.5339546799659729),
 ('스릴러/Noun', 0.5142008662223816),
 ('미스터리/Noun', 0.5078734159469604),
 ('서스펜스/Noun', 0.43593716621398926),
 ('호러/Noun', 0.43546515703201294),
 ('공포물/Noun', 0.41218090057373047),
 ('귀신/Noun', 0.40097883343696594),
 ('미스테리/Noun', 0.39496350288391113),
 ('호러영화/Noun', 0.37969350814819336),
 ('SF/Alpha', 0.37460654973983765)]

In [5]:
train = pd.read_csv("ratings_train.txt",
                    header=0,
                    delimiter="\t",
                    quoting=3)
test = pd.read_csv("ratings_test.txt",
                    header=0,
                    delimiter="\t",
                    quoting=3)

In [6]:
train.dropna(axis=0, how='any', inplace=True)
test.dropna(axis=0, how='any', inplace=True)

print(train.isnull().sum())
print(test.isnull().sum())

print("train %s, test %s" %
      (train.shape, test.shape))

id          0
document    0
label       0
dtype: int64
id          0
document    0
label       0
dtype: int64
train (149995, 3), test (49997, 3)


In [7]:
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize(doc):
    # norm, stem은 optional
    try:
        return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
    except:
        print(doc)

train_docs = [(tokenize(row[1]), row[2]) for row in tqdm(train.values)]
test_docs = [(tokenize(row[1]), row[2]) for row in tqdm(test.values)]

100%|██████████████████████████████████████████████████████████████████████████████████| 149995/149995 [02:07<00:00, 1172.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 49997/49997 [00:42<00:00, 1173.95it/s]


In [8]:
from collections import namedtuple
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
# 여기서는 15만개 training documents 전부 사용함
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in tqdm(train_docs)]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in tqdm(test_docs)]

train_x = [model.infer_vector(doc.words) for doc in tqdm(tagged_train_docs)]
train_y = [doc.tags[0] for doc in tagged_train_docs]

test_x = [model.infer_vector(doc.words) for doc in tqdm(tagged_test_docs)]
test_y = [doc.tags[0] for doc in tagged_test_docs]

100%|████████████████████████████████████████████████████████████████████████████████| 149995/149995 [00:00<00:00, 356596.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 49997/49997 [00:00<00:00, 256457.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 149995/149995 [00:34<00:00, 4288.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 49997/49997 [00:11<00:00, 4443.27it/s]


In [9]:
from sklearn.cluster import KMeans
import time

start = time.time()

word_vectors = model.wv.vectors
num_clusters = word_vectors.shape[0] // 5

kmeans_clustering = KMeans(n_clusters = num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: %d seconds" % elapsed)

Time taken for K Means clustering: 668 seconds


In [10]:
# kmean과 word2vec index(word) 합치기
word_centroid_map = dict(zip(model.wv.index2word, idx))

In [11]:
for cluster in range(0, 10):
    print("\nCluster %d" % cluster)
    words = []
    for i in range(0, len(word_centroid_map.values())):
        if(idx[i] == cluster):
            words.append(model.wv.index2word[i])
    print(words)


Cluster 0
['일부/Noun', '산다/Noun', '주먹/Noun', '뿌리/Noun', '해대/Noun', '지하/Noun', '벌이/Noun', '부시/Noun', '누리/Noun', '설치/Noun']

Cluster 1
['만약/Noun', '도무지/Noun', '절대로/Noun', '조차/Noun', '차마/Noun', '고로/Noun', '눈뜨다/Verb', '내/VerbPrefix', '보장/Noun', '한일/Noun', '자가/Noun', '동의/Noun', '은희/Noun', '사정/Noun', '장담/Noun', '위인/Noun', '대중성/Noun', '한눈/Noun', '갈아타다/Verb', '쉴드/Noun', '망정/Noun', '알아듣다/Verb', '사족/Noun', '갈피/Noun', '잠도/Noun', '만도/Noun', '백년/Noun', '본전/Noun', '당분간/Noun', '가늠/Noun', '그러기에/Conjunction', '대항/Noun', '생기다/Adjective', '타국/Noun']

Cluster 2
['차갑다/Adjective', '덩어리/Noun', '인척/Noun', '걸치다/Verb', '불안/Noun']

Cluster 3
['용다/Verb', '밝다/Verb', '에서만/Josa', '상이/Noun', '흐뭇하다/Adjective', '하듯/Josa', '최대한/Noun', '런닝/Noun', '만이라도/Josa', '라기보다는/Josa', '열악하다/Adjective']

Cluster 4
['어딘가/Noun', '흔적/Noun', '내기/Noun', '그려지다/Verb', '치기/Noun', '미술/Noun', '현대인/Noun', '여실히/Noun', '이라기/Josa', '잡기/Noun', '정성/Noun', '쌀/Noun', '두기/Noun']

Cluster 5
['경악/Noun', '우롱/Noun', '출현/Noun', '조명/Noun', '퇴보/Noun', '섭외/Nou

In [12]:
num_clusters

2961

In [13]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
    
    return bag_of_centroids

In [14]:
import numpy as np
train_centroids = np.zeros((len(train_x), num_clusters), dtype="float32")

counter = 0
for doc, tag in tqdm(train_docs):
    train_centroids[counter] = create_bag_of_centroids(doc, word_centroid_map)
    counter += 1

test_centroids = np.zeros((len(test_x), num_clusters), dtype="float32")

counter=0
for doc, tag in tqdm(test_docs):
    test_centroids[counter] = create_bag_of_centroids(doc, word_centroid_map)
    counter += 1

100%|██████████████████████████████████████████████████████████████████████████████████| 149995/149995 [00:53<00:00, 2811.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 49997/49997 [00:17<00:00, 2821.44it/s]


In [15]:
print(train_centroids.shape, test_centroids.shape)

(149995, 2961) (49997, 2961)


In [16]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [17]:
tf.reset_default_graph()

# hyper Parameter
learning_rate = 0.001
training_epochs = 100
feature_count = train_centroids.shape[1]
hidden_layers = feature_count // 2
label_count = 1

In [18]:
X = tf.placeholder(tf.float32, [None, feature_count])
Y = tf.placeholder(tf.int32, [None])
print(Y)
Y_onehot = tf.one_hot(Y, 1)
print(Y_onehot)
# Y_onehot = tf.reshape(Y_onehot, [-1, 1])
# print(Y_onehot)

Tensor("Placeholder_1:0", shape=(?,), dtype=int32)
Tensor("one_hot:0", shape=(?, 1), dtype=float32)


In [19]:
# models
initializer = tf.contrib.layers.xavier_initializer()
h0 = tf.layers.dense(X, hidden_layers, activation=tf.nn.relu, kernel_initializer=initializer)
h0 = tf.nn.dropout(h0, 0.95)
h1 = tf.layers.dense(h0, label_count, activation=None)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y_onehot, logits=h1)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

predicted = tf.nn.sigmoid(h1)
correct_pred = tf.equal(tf.round(predicted), Y_onehot)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Instructions for updating:
Use the retry module or similar alternatives.


In [20]:
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [21]:
# Train Model
for step in range(training_epochs + 1):
    sess.run(optimizer, feed_dict={X: train_centroids, Y: train_y})
    loss, _, acc = sess.run([cost, optimizer, accuracy], feed_dict={
                             X: train_centroids, Y: train_y})
    if step % 10 == 0:
        correct_prediction = tf.equal(tf.round(predicted), Y_onehot)
        test_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        test_accuracy_result = sess.run(test_accuracy, feed_dict={X:test_centroids, Y:test_y})
        print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}\tTest Acc: {:.2%}".format(
            step, loss, acc, test_accuracy_result))

Step:     0	Loss: 0.658	Acc: 74.27%	Test Acc: 77.03%
Step:    10	Loss: 0.374	Acc: 84.29%	Test Acc: 82.79%
Step:    20	Loss: 0.315	Acc: 86.68%	Test Acc: 83.77%
Step:    30	Loss: 0.272	Acc: 89.00%	Test Acc: 84.40%
Step:    40	Loss: 0.227	Acc: 91.62%	Test Acc: 84.78%
Step:    50	Loss: 0.182	Acc: 94.13%	Test Acc: 85.13%
Step:    60	Loss: 0.143	Acc: 95.99%	Test Acc: 85.28%
Step:    70	Loss: 0.112	Acc: 97.15%	Test Acc: 85.37%
Step:    80	Loss: 0.090	Acc: 97.85%	Test Acc: 85.43%
Step:    90	Loss: 0.073	Acc: 98.27%	Test Acc: 85.42%
Step:   100	Loss: 0.061	Acc: 98.55%	Test Acc: 85.46%


In [22]:
correct_prediction = tf.equal(tf.round(predicted), Y_onehot)
test_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('accuray: ', sess.run(test_accuracy, feed_dict={X:test_centroids, Y:test_y}))

accuray:  0.8546713


## doc2vec DL

In [35]:
tf.reset_default_graph()

# hyper Parameter
learning_rate = 0.001
training_epochs = 1000
feature_count = len(train_x[0])
hidden_layers = feature_count // 2
label_count = 1

In [36]:
X = tf.placeholder(tf.float32, [None, feature_count])
Y = tf.placeholder(tf.int32, [None])
print(Y)
Y_onehot = tf.one_hot(Y, 1)
print(Y_onehot)
# Y_onehot = tf.reshape(Y_onehot, [-1, 1])
# print(Y_onehot)

Tensor("Placeholder_1:0", shape=(?,), dtype=int32)
Tensor("one_hot:0", shape=(?, 1), dtype=float32)


In [37]:
# models
initializer = tf.contrib.layers.xavier_initializer()
h0 = tf.layers.dense(X, hidden_layers, activation=tf.nn.relu, kernel_initializer=initializer)
h0 = tf.nn.dropout(h0, 0.95)
h1 = tf.layers.dense(h0, label_count, activation=None)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y_onehot, logits=h1)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

predicted = tf.nn.sigmoid(h1)
correct_pred = tf.equal(tf.round(predicted), Y_onehot)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [38]:
# Initialize
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [39]:
# Train Model
for step in range(training_epochs + 1):
    sess.run(optimizer, feed_dict={X: train_x, Y: train_y})
    loss, _, acc = sess.run([cost, optimizer, accuracy], feed_dict={
                             X: train_x, Y: train_y})
    if step % 100 == 0:
        correct_prediction = tf.equal(tf.round(predicted), Y_onehot)
        test_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        test_accuracy_result = sess.run(test_accuracy, feed_dict={X:test_x, Y:test_y})
        print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}\tTest Acc: {:.2%}".format(
            step, loss, acc, test_accuracy_result))

Step:     0	Loss: 0.692	Acc: 51.73%	Test Acc: 54.78%
Step:   100	Loss: 0.574	Acc: 68.51%	Test Acc: 68.05%
Step:   200	Loss: 0.530	Acc: 72.79%	Test Acc: 69.98%
Step:   300	Loss: 0.503	Acc: 74.96%	Test Acc: 70.37%
Step:   400	Loss: 0.485	Acc: 76.25%	Test Acc: 70.04%
Step:   500	Loss: 0.471	Acc: 77.13%	Test Acc: 70.16%
Step:   600	Loss: 0.461	Acc: 77.76%	Test Acc: 70.06%
Step:   700	Loss: 0.453	Acc: 78.27%	Test Acc: 69.91%
Step:   800	Loss: 0.447	Acc: 78.56%	Test Acc: 69.67%
Step:   900	Loss: 0.441	Acc: 78.99%	Test Acc: 69.53%
Step:  1000	Loss: 0.435	Acc: 79.32%	Test Acc: 69.23%


In [40]:
correct_prediction = tf.equal(tf.round(predicted), Y_onehot)
test_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('accuray: ', sess.run(test_accuracy, feed_dict={X:test_x, Y:test_y}))

accuray:  0.69464165
