In [10]:
import os
import time
import datetime
from tensorflow import flags
import tensorflow as tf
import numpy as np
import util.text as tool
import sys

tf.reset_default_graph()

In [11]:
# data loading
test_csv = '/home/beomgon2/medical_chart/movie/nsmc/test.csv'
train_csv = '/home/beomgon2/medical_chart/movie/nsmc/train.csv'
files = [train_csv, test_csv]

# work_dir = '/home/beomgon2/medical_chart/movie/nsmc/'
# data_path = work_dir + 'train.csv'
contents, points = tool.loading_rdata(files, eng=True, num=True, punc=False)
contents = tool.cut(contents,cut=2)

# tranform document to vector
max_document_length = 50
x, vocabulary, vocab_size = tool.make_input(contents,max_document_length)
print('사전단어수 : %s' % (vocab_size))
y = tool.make_output(points,threshold=0.5)

# divide dataset into train/test set
x_train, x_test, y_train, y_test = tool.divide(x,y,train_prop=0.8)

0 docs / 1 save
100000 docs / 99939 save
사전단어수 : 49128


In [9]:
vocab_size

49128

In [4]:
len(contents)

149907

In [12]:
#         - sequence_length: 최대 문장 길이
#         - num_classes: 클래스 개수
#         - vocab_size: 등장 단어 수
#         - embedding_size: 각 단어에 해당되는 임베디드 벡터의 차원
#         - filter_sizes: convolutional filter들의 사이즈 (= 각 filter가 몇 개의 단어를 볼 것인가?) (예: "3, 4, 5")
#         - num_filters: 각 filter size 별 filter 수
#         - l2_reg_lambda: 각 weights, biases에 대한 l2 regularization 정도

sequence_length = max_document_length
num_classes = 2
embedding_size = 64
filter_sizes = 3
num_filters = 128
l2_reg_lambda = 0
none = None
batch_size = 16
num_epochs = 5
evaluate_every = 100
dropout_keep_p = 0.5


In [13]:
# Placeholders for input, output and dropout
tf.reset_default_graph()
input_x = tf.placeholder(tf.int32, [none, sequence_length], name="input_x")
input_y = tf.placeholder(tf.float32, [none, num_classes], name="input_y")
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
l2_loss = tf.constant(0.0)



In [14]:
#with tf.device('/gpu:0'), tf.name_scope("embedding"):
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    W = tf.Variable(
        tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
        name="W")
    embedded_chars = tf.nn.embedding_lookup(W, input_x)
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
                                             

pooled_outputs = []
filter_size = filter_sizes
with tf.name_scope("conv-maxpool-%s" % filter_size):
    # Convolution Layer
    filter_shape = [filter_size, embedding_size, 1, num_filters]
    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
    conv = tf.nn.conv2d(
        embedded_chars_expanded,
        W,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")
    # Apply nonlinearity
    h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
    # Maxpooling over the outputs
    pooled = tf.nn.max_pool(
        h,
        ksize=[1, sequence_length - filter_size + 1, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',
        name="pool")
    pooled_outputs.append(pooled)
                                             

# Combine all the pooled features
num_filters_total = num_filters * 1 #len(filter_sizes)
print(num_filters_total)
h_pool = tf.concat(pooled_outputs, axis=1)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

# Add dropout
with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)


# Final (unnormalized) scores and predictions
with tf.name_scope("output"):
#     W = tf.get_variable(
#         "W",
#         shape=[num_filters_total, num_classes],
#         initializer=tf.contrib.layers.xavier_initializer())
    W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
    l2_loss += tf.nn.l2_loss(W)
    l2_loss += tf.nn.l2_loss(b)
    scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
    predictions = tf.argmax(scores, 1, name="predictions")

# Calculate Mean cross-entropy loss

#with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits(labels=input_y, logits=scores)
loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

# Accuracy
#with tf.name_scope("accuracy"):
correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
    

128
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [29]:
(loss)

<tf.Tensor 'add:0' shape=() dtype=float32>

In [15]:
# Define Training procedure
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(0.0002)
train_op = optimizer.minimize(loss)
#train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

In [17]:
import sys
# 3. train the model and test
#with tf.Graph().as_default():
session_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=session_config)
#sess = tf.Session()
with sess.as_default():
    sess.run(tf.global_variables_initializer())

    def batch_iter(data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.
        """
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    total_num = len(list(zip(x_train, y_train)))
    num_batches_per_epoch = int((total_num - 1)/batch_size) +1

    batches = batch_iter(
            list(zip(x_train, y_train)), batch_size, num_epochs)
    
    test_batches = batch_iter(
            list(zip(x_test, y_test)), 256, num_epochs=1)    
    
    print("debug ",batches) # batches is generator

    print("num_batches_per_epoch", num_batches_per_epoch)
    #sys.exit()
    for epoch in range(num_epochs):
        totloss = 0
        totacc = 0
        print("************epoch number************* ", epoch)
        for iter in range(num_batches_per_epoch):
            denom  = iter+1
            x_batch, y_batch = zip(*next(batches)) 
            #print("debug: ", y_batch)
            x_batch = np.array(x_batch)
            y_batch = np.array(y_batch)
            #print("debug: ", type(y_batch))

            feed_dict = {
                input_x: x_batch,
                input_y: y_batch,
                dropout_keep_prob: dropout_keep_p
            }   
            _, step, loss1, accuracy1 = sess.run(
                [train_op, global_step,  loss, accuracy], feed_dict) 
            totloss += loss1
            totacc += accuracy1
            if iter % 500 == 0:
                print("loss {} acc {}".format(totloss/denom, totacc/denom))
            if iter % 1000 == 0:
                x_batch, y_batch = zip(*next(test_batches)) 
                x_batch = np.array(x_batch)
                y_batch = np.array(y_batch)    
                feed_dict = {
                    input_x: x_batch,
                    input_y: y_batch,
                    dropout_keep_prob: dropout_keep_p
                }  
                _, step, loss1, accuracy1 = sess.run(
                    [train_op, global_step,  loss, accuracy], feed_dict)  
                print("#################################test loss {} acc {}".format(loss1, accuracy1))
                
    
    test_batches = batch_iter(
            list(zip(x_test, y_test)), batch_size, num_epochs=1)   
    total_num_batch = len(list(zip(x_test, y_test)))
    num_batches_per_epoch = int((total_num_batch - 1)/batch_size) +1
    totacc_test = 0
    totloss_test = 0
    for iter in range(num_batches_per_epoch):
        x_batch, y_batch = zip(*next(test_batches)) 
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)    
        feed_dict = {
            input_x: x_batch,
            input_y: y_batch,
            dropout_keep_prob: 1
        }     
        _, step, loss1, accuracy1 = sess.run(
            [train_op, global_step,  loss, accuracy], feed_dict)  
        totacc_test += accuracy1
        totloss_test += loss1
        
    print("###############################################total test loss {} acc {}".format(
        totloss_test/total_num_batch, totacc_test/num_batches_per_epoch))            
    

        
        


        


debug  <generator object batch_iter at 0x7f4c0e208410>
num_batches_per_epoch 9994
************epoch number*************  0
loss 0.6543869972229004 acc 0.6875
#################################test loss 1.216841459274292 acc 0.4375
loss 0.9035086342079672 acc 0.5081087824351297
loss 0.8514387085066213 acc 0.520541958041958
#################################test loss 0.8007152676582336 acc 0.51171875
loss 0.8154151566937159 acc 0.5330196535642905
loss 0.7871399691913915 acc 0.5446964017991005
#################################test loss 0.6875177621841431 acc 0.56640625
loss 0.7652619268025745 acc 0.5541783286685326
loss 0.7479073910544769 acc 0.5632914028657114
#################################test loss 0.6546580791473389 acc 0.6328125
loss 0.7329985172331112 acc 0.5714974293059126
loss 0.7195551649566532 acc 0.5805736065983504
#################################test loss 0.6123121976852417 acc 0.67578125
loss 0.7071109576454112 acc 0.5901049766718507
loss 0.6964075358789745 acc 0.59809288142

In [None]:
zip(*batches)