# 카글 텍스트 분류 - 합성곱 신경망 활용 접근방법

In [19]:
import sys
import os
import string
import tempfile
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
from functools import partial

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [20]:
DEFAULT_PATH ='~/.kaggle/competitions/word2vec-nlp-tutorial/'
FILE_DIR_PATH = './data_in/'
INPUT_TRAIN_DATA_FILE_NAME = 'train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'train_label.npy'
INPUT_TEST_DATA_FILE_NAME = 'test_input.npy'

DATA_CONFIGS_FILE_NAME = 'data_configs.json'

train_input_data = np.load(open(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'rb'))
train_label_data = np.load(open(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'rb'))
test_input_data = np.load(open(FILE_DIR_PATH + INPUT_TEST_DATA_FILE_NAME, 'rb'))

prepro_configs = None

with open(FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME, 'r') as f:
    prepro_configs = json.load(f)

In [22]:
# 파라메터 변수
RNG_SEED = 1234
BATCH_SIZE = 128
NUM_EPOCHS = 10
VOCAB_SIZE = len(prepro_configs)
EMB_SIZE = 128
VALID_SPLIT = 0.2
MAX_SEQ_LEN = 604 # 문장 최대 길이

input_train, input_valid, label_train, label_valid = train_test_split(train_input_data, train_label_data, test_size=VALID_SPLIT, random_state=RNG_SEED)

#문장 길이 구하는 값, 전처리로 같은 길이를 맞추어 놔서 의미 없음
len_train = np.array([min(len(x), MAX_SEQ_LEN) for x in input_train])
len_valid = np.array([min(len(x), MAX_SEQ_LEN) for x in input_valid])

## tf.data 세팅

In [23]:
def mapping_fn(X, Y=None):
    input, label = {'x': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
    dataset = dataset.shuffle(buffer_size=len(input_train))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)

    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
    dataset = dataset.shuffle(buffer_size=len(input_eval))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)

    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

## 모델 세팅

In [24]:
def model_fn(features, labels, mode, params):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    #embedding layer를 선언합니다.
    input_layer = tf.contrib.layers.embed_sequence(
                    features['x'],
                    VOCAB_SIZE,
                    EMB_SIZE,
                    initializer=params['embedding_initializer']
                    )
    
    embed_seq = layers.Reshape((MAX_SEQ_LEN, EMB_SIZE, 1))(input_layer)
    
    # 현재 모델이 학습모드인지 여부를 확인하는 변수입니다.
        
    # embedding layer에 대한 output에 대해 dropout을 취합니다.
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                   rate=0.5,
                                   training=training)
    
    
    conv = tf.layers.conv1d(
            inputs=dropout_emb,
            filters=32,
            kernel_size=3,
            padding='same',
            activation=tf.nn.relu)
    
    pool = tf.reduce_max(input_tensor=conv, axis=1)
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)
    dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1, name='logits')
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)
    
    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits)
            }
        )

In [17]:
params = {'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0)}

model_dir = os.path.join(os.getcwd(), "data_out/checkpoint/cnn/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()
config_tf._save_checkpoints_secs = 100
config_tf._keep_checkpoint_max =  2
config_tf._log_step_count_steps = 100

cnn_est = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf, params=params)

# class EarlyStoppingLossHook(tf.train.SessionRunHook):
#     def __init__(self, loss_tensor_name, value, threshold=3):
#         '''
#         A train hook to stop the training at specified train loss
#         Usage:
#         loss_monitor = EarlyStoppingLossHook("reduced_mean:0", 0.35, 3)
#         estimator.train(input_fn=train_input_fn, hooks=[loss_monitor], ...)

#         :param loss_tensor_name: Name of the loss tensor eg: loss:0
#         :param value: Value at which the trianing should stop
#         :param threshold: number of times to check for the loss value, before stopping the training
#         '''
#         self._best_loss = value
#         self.threshold = threshold
#         self.count  = 0
#         self.loss_tensor_name = loss_tensor_name
#         logging.info("Create EarlyStoppingLossHook for {}".format(self.loss_tensor_name))

#     def before_run(self, run_context):
#         graph = run_context.session.graph
#         tensor_name = self.loss_tensor_name
#         loss_tensor_name = graph.get_tensor_by_name(tensor_name)
#         return session_run_hook.SessionRunArgs(loss_tensor_name)

#     def after_run(self, run_context, run_values):
#         last_loss = run_values.results

#         if last_loss <= self._best_loss:
#             self.count += 1
#             if self.count == self.threshold:
#                 logging.info("EarlyStoppingHook: Request early stop")
#                 run_context sss.request_stop()

# early_stopping = EarlyStoppingLossHook('sigmoid_cross_entropy_loss/value:0', 0.1, 5)

# train_spec = tf.estimator.TrainSpec(input_fn=custom_input_fn(X=input_train, y=label_train, is_training=True), max_steps=NUM_EPOCHS, hooks=[early_stopping])
# eval_spec = tf.estimator.EvalSpec(input_fn=custom_input_fn(X=input_train, y=label_train, is_training=True))

# tf.estimator.train_and_evaluate(cnn_est, train_spec, eval_spec)

INFO:tensorflow:Using config: {'_model_dir': '/Users/sinseongjin/github/DeepNLP/7.NLPBOOK/4.TEXT_CLASSIFICATION/data_out/checkpoint/cnn/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 100, '_session_config': None, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x10bec7128>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
cnn_est.train(train_input_fn)
cnn_est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/sinseongjin/github/DeepNLP/7.NLPBOOK/4.TEXT_CLASSIFICATION/data_out/checkpoint/cnn/model.ckpt-278
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 278 into /Users/sinseongjin/github/DeepNLP/7.NLPBOOK/4.TEXT_CLASSIFICATION/data_out/checkpoint/cnn/model.ckpt.
INFO:tensorflow:loss = 0.87576187, step = 279
INFO:tensorflow:global_step/sec: 1.2192
INFO:tensorflow:loss = 0.8132618, step = 379 (82.023 sec)
INFO:tensorflow:Saving checkpoints for 394 into /Users/sinseongjin/github/DeepNLP/7.NLPBOOK/4.TEXT_CLASSIFICATION/data_out/checkpoint/cnn/model.ckpt.
INFO:tensorflow:global_step/sec: 1.14472
INFO:tensorflow:loss = 0.78982425, step = 479 (87.357 sec)
INFO:tensorflow:Saving checkpoints for 512 into /Users/sinseongjin/githu

# 평가하기

In [27]:
# 예측된 모델을 불러 체크포인트로 결과치를 불러온다.
prediction_fn = cnn_est.predict(input_fn=input_fn(X=test_input_data, y=None, is_training=False))
predictions = np.array([p['prob'][0] for p in cnn_est.predict(input_fn=input_fn(X=test_input_data, y=None, is_training=False))])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.


Exception ignored in: <generator object Estimator.predict at 0x112929db0>
Traceback (most recent call last):
  File "/Users/sinseongjin/tf110/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 571, in predict
    for key, value in six.iteritems(preds_evaluated)
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/Users/sinseongjin/tf110/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 5023, in get_controller
    yield g
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/Users/sinseongjin/tf110/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 4839, in get_controller
    type(default))
AssertionError: Nesting violated for default stack of <class 'tensorflow.pyt

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/sinseongjin/github/DeepNLP/7.NLPBOOK/4.TEXT_CLASSIFICATION/data_out/checkpoint/cnn/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [None]:
for epoch in range(NUM_EPOCHS):
    
    cnn_est.train(custom_input_fn(X=input_train, y=label_train, is_training=True))
    cnn_est.evaluate(custom_input_fn(X=input_valid, y=label_valid, is_training=True))
    
    # name scopes의 재사용을 위해 graph를 reset한다.
    tf.reset_default_graph()
    
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool),
                             num_thresholds=21)
    
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
        writer.add_summary(sess.run(pr), global_step=0)
        writer.close()

{'x': 'IteratorGetNext:0'}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/sinseongjin/github/DeepNLP/7.NLPBOOK/4.TEXT_CLASSIFICATION/data_out/checkpoint/cnn/model.ckpt-278
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [28]:
import pandas as pd
#테스트 데이터 로드
test = pd.read_csv(DEFAULT_PATH+"testData.tsv", header=0, delimiter="\t", quoting=3 )

print ("test dataset shape: {}".format(test.shape))

#알아보기 쉽게 데이터랑 붙여두는 편이 좋을 거 같습니다.
output = pd.DataFrame( data={"id":test["id"], "sentiment":list(predictions)} )

#지금까지 처리한 결과를 파일로 저장합니다.
output.to_csv("./data_out/Bag_of_Words_model_test.csv", index=False, quoting=3 )

test dataset shape: (25000, 2)


In [86]:
# 예측된 모델을 불러 체크포인트로 결과치를 불러온다.
test_input_data = np.load(open(FILE_DIR_PATH + INPUT_TEST_DATA_FILE_NAME, 'rb')) #테스트데이터 로드

predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x":test_input_data}, shuffle=False) #numpy 형태로 저장
cnn_classifier.predict(input_fn=predict_input_fn)

predictions = np.array([p['logits'][0] for p in cnn_classifier.predict(input_fn=predict_input_fn)])

INFO:tensorflow:Could not find trained model in model_dir: /Users/user/git/DeepNLP/7.NLPBOOK/5.TEXT_CLASSIFICATION/checkpoint/cnn_model/cnn, running initialization to predict.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
test dataset shape: (25000, 2)


In [68]:
def print_predictions(sentences):
    indexes = [text_to_index(sentence) for sentence in sentences]
    x = sequence.pad_sequences(indexes, 
                               maxlen=sentence_size, 
                               truncating='post',
                               padding='post',
                               value=pad_id)
    length = np.array([min(len(x), sentence_size) for x in indexes])
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": x, "len": length}, shuffle=False)
    predictions = {}
    for path, classifier in all_classifiers.items():
        predictions[path] = [p['logistic'][0] for p in classifier.predict(input_fn=predict_input_fn)]
    for idx, sentence in enumerate(sentences):
        print(sentence)
        for path in all_classifiers:
            print("\t{} {}".format(path, predictions[path][idx]))
#             predictions[path][idx]
    
    return predictions[path][idx]

In [59]:
# 예측된 모델을 불러 체크포인트로 결과치를 불러온다.

sentimental = []

for i in range(len(clean_test_reviews)):
    if ( (i+1) % 1000 == 0):
        print ("Current Progress %d \n" % (i+1))
    sentimental.append(print_predictions([clean_test_reviews[i]]))
    
#알아보기 쉽게 데이터랑 붙여두는 편이 좋을 거 같습니다.
output = pd.DataFrame( data={"id":test["id"], "sentiment":sentimental} )

#지금까지 처리한 결과를 파일로 저장합니다.
output.to_csv( "Bag_of_Words_model_test.csv", index=False, quoting=3 )

NameError: name 'word_index' is not defined

In [14]:
def custom_input_fn(X, y=None, is_training=False):

    def internal_input_fn(X, y=None, is_training=False):
        
        if (not isinstance(X, dict)):
            X = {"x": X}
        
        if (y is None):
            dataset = tf.data.Dataset.from_tensor_slices(X)
        else:
            dataset = tf.data.Dataset.from_tensor_slices((X, y))
        
        if (is_training):
            dataset = dataset.repeat().shuffle(len(X['x']))
            batch_size = BATCH_SIZE
        else:
            batch_size = 1

        dataset = dataset.batch(batch_size)
        dataset_iter = dataset.make_initializable_iterator()

        if (y is None):
            features = dataset_iter.get_next()
            labels = None
        else:
            features, labels = dataset_iter.get_next()

        input_tensor_map = dict()
        for input_name, tensor in features.items():
            input_tensor_map[input_name] = tensor.name
            
        print(input_tensor_map)

        with open(os.path.join(FILE_DIR_PATH, 'input_tensor_map.pickle'), 'wb') as f:
            pickle.dump(input_tensor_map, f, protocol=pickle.HIGHEST_PROTOCOL)
        tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, dataset_iter.initializer)
        
        return (features, labels) if (not labels is None) else features
    
    return partial(internal_input_fn, X=X, y=y, is_training=is_training)

custom_input_fn(input_train, label_train)

functools.partial(<function custom_input_fn.<locals>.internal_input_fn at 0x10be8ce18>, X=array([[4353,  728,    1, ...,    0,    0,    0],
       [4874, 6507,   62, ...,    0,    0,    0],
       [8817,  290, 3665, ...,    0,    0,    0],
       ...,
       [  16, 6306,    2, ...,    0,    0,    0],
       [1249, 1972,    1, ...,    0,    0,    0],
       [1243,  347,  462, ...,    0,    0,    0]], dtype=int32), y=array([1, 1, 0, ..., 1, 1, 0]), is_training=False)

# CNN Classification

CNN을 활용하여 text를 분류해보자, n-gram의 효과로 활용

https://www.semanticscholar.org/paper/Learning-to-Rank-Short-Text-Pairs-with-Deep-Neural-Severyn-Moschitti/452f7411af7d471dd3ba84c2b06b2aaffc38cdb9

Embedding Layer -> Dropout -> Conv1D -> GlobalMax1D -> Hidden Dense Layer -> Dropout -> Output Layer

In [None]:
all_classifiers = {}

def train_and_evaluate(classifier):
    # 예측 테스트를 위해 모델을 학습시키고 저장한다.
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps=1)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    predictions = np.array([p['logistic'][0] for p in classifier.predict(input_fn=eval_input_fn)])
    
    # name scopes의 재사용을 위해 graph를 reset한다.
    tf.reset_default_graph()
    
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool),
                             num_thresholds=21)
    
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
        writer.add_summary(sess.run(pr), global_step=0)
        writer.close()

In [23]:
#head: pre-made estimator로 평가를 할 때, 일정한 함수를 사용하게 세팅
head = tf.contrib.estimator.binary_classification_head()

def cnn_model_fn(features, labels, mode, params):
    #embedding layer를 선언한다.
    input_layer = tf.contrib.layers.embed_sequence(
                    features['x'],
                    vocab_size,
                    EMB_SIZE,
                    initializer=params['embedding_initializer']
                    )

    training = (mode == tf.estimator.ModeKeys.TRAIN)
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                   rate=0.2,
                                   training=training)

    conv = tf.layers.conv1d(
            inputs=dropout_emb,
            filters=32,
            kernel_size=3,
            padding='same',
            activation=tf.nn.relu)
    
    pool = tf.reduce_max(input_tensor=conv, axis=1)
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)  
    dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1)
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
    
    optimizer = tf.train.AdamOptimizer() #여러가지 Optimizer 활용가능
    
    def _train_op_fn(loss):
#         tf.summary('loss', loss)
        return optimizer.minimize(
                loss=loss,
                global_step=tf.train.get_global_step())

    
    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        train_op_fn=_train_op_fn)


cnn_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                        model_dir=os.path.join(model_dir, 'cnn'),
                                        params=params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/Users/user/git/DeepNLP/7.NLPBOOK/5.TEXT_CLASSIFICATION/checkpoint/cnn_model/cnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11776c978>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [24]:
#학습 후, 결과치를 tensorboard로 확인
# tensorboard --logdir=./checkpoint/cnn_classifier/
train_and_evaluate(cnn_classifier)

NameError: name 'train_and_evaluate' is not defined

In [22]:
predictions = np.array([p['logistic'][0] for p in cnn_classifier.predict(input_fn=eval_input_fn)])

NameError: name 'cnn_classifier' is not defined

In [None]:
# 직접 prediction으로 테스트 해 본다

def text_to_index(sentence):
    # Remove punctuation characters except for the apostrophe
    translator = str.maketrans('', '', string.punctuation.replace("'", ''))
    tokens = sentence.translate(translator).lower().split()
    return np.array([1] + [word_index[t] if t in word_index else oov_id for t in tokens])

def print_predictions(sentences):
    indexes = [text_to_index(sentence) for sentence in sentences]
    x = sequence.pad_sequences(indexes, 
                               maxlen=sentence_size, 
                               truncating='post',
                               padding='post',
                               value=pad_id)
    length = np.array([min(len(x), sentence_size) for x in indexes])
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": x, "len": length}, shuffle=False)
    predictions = {}
    for path, classifier in all_classifiers.items():
        predictions[path] = [p['logistic'][0] for p in classifier.predict(input_fn=predict_input_fn)]
    for idx, sentence in enumerate(sentences):
        print(sentence)
        for path in all_classifiers:
            print("\t{} {}".format(path, predictions[path][idx]))
#             predictions[path][idx]
    
    return predictions[path][idx]

In [None]:
print_predictions([
    'I do not like this movie'
])

In [None]:
print_predictions(['fuck you', 'this movie sucks'])

지금까지 했던 것을 모두 활용하여 제출용 데이터를 만들어봅시다.

In [None]:
import pandas as pd

default_path = '/Users/user/.kaggle/competitions/word2vec-nlp-tutorial/'

In [None]:
#테스트 데이터 로드
test = pd.read_csv(default_path+"testData.tsv", header=0, delimiter="\t", quoting=3 )

print ("test dataset shape: {}".format(test.shape))

# 불용어 제거 및 태그를 삭제 후, 데이터를 저장할 장소를 만들자
num_reviews = len(test["review"])
clean_test_reviews = []

print ("테스트 영화 리뷰 전처리 진행...\n")
for i in range(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

In [None]:
#테스트 파일은 이렇게 생겼다고 합니다.
print (test.head())

#이 파일은 "sentiment" 행이 없습니다.

In [None]:
# 예측된 모델을 불러 체크포인트로 결과치를 불러온다.

sentimental = []

for i in range(len(clean_test_reviews)):
    if ( (i+1) % 1000 == 0):
        print ("Current Progress %d \n" % (i+1))
    sentimental.append(print_predictions([clean_test_reviews[i]]))
    
#알아보기 쉽게 데이터랑 붙여두는 편이 좋을 거 같습니다.
output = pd.DataFrame( data={"id":test["id"], "sentiment":sentimental} )

#지금까지 처리한 결과를 파일로 저장합니다.
output.to_csv( "Bag_of_Words_model_test.csv", index=False, quoting=3 )

In [24]:
#알아보기 쉽게 데이터랑 붙여두는 편이 좋을 거 같습니다.
output = pd.DataFrame( data={"id":test["id"], "sentiment":final_result} )

#결과값 저장
output.to_csv( "final_bof.csv", index=False, quoting=3 )'

#0.5 기준으로 값들을 변환

def correct_val(x):
    if x >= 0.5:
        x = 1
    else:
        x = 0
    
    return x

final_result = output['sentiment'].apply(correct_val)

SyntaxError: EOL while scanning string literal (<ipython-input-24-45b20891aaa6>, line 5)

In [None]:
def model_fn(features, labels, mode, params):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    #embedding layer를 선언합니다.
    input_layer = tf.contrib.layers.embed_sequence(
                    features['x'],
                    VOCAB_SIZE,
                    EMB_SIZE,
                    initializer=params['embedding_initializer']
                    )
    # 현재 모델이 학습모드인지 여부를 확인하는 변수입니다.
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    # embedding layer에 대한 output에 대해 dropout을 취합니다.
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                   rate=0.5,
                                   training=training)

    ## filters = 32이고 kernel_size = 3이면, 길이가 3인 32개의 다른 필터를 생성합니다. 32개의 컨볼루션들을 생성합니다.
    ## conv1d는 (배치사이즈, 길이, 채널)로 입력값을 받는데, 배치사이즈: 문장 숫자 | 길이: 각 문장의 단어의 개수 | 채널: 임베딩 출력 차원수임
    conv = tf.layers.conv1d(
            inputs=dropout_emb,
            filters=32,
            kernel_size=3,
            padding='same',
            activation=tf.nn.relu)
    
    pool = tf.reduce_max(input_tensor=conv, axis=1)
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)
    dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1, name='logits')
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)
    
    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits)
            }
        )