# 카글 텍스트 분류 - 합성곱 신경망 활용 접근방법


- 이번 장은 앞서 00장에서 간략하게 설명하였던 합성곱 신경망을 활용하여 텍스트 분류 문제를 풀어보고자 한다. 합성곱 신경망은 주로 이미지에서 특징을 추출하여 이미지 판단을 하는 역할로 큰 성능을 이루었는데, 텍스트에서도 좋은 효과를 낼 수 있다는 점을 Yoon Kim (2014) 박사가 "Convolution Neural Network for Sentence Classificaion" (http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf) 활용하여 입증하였다.

<img src="./Fig/fig1-cnn_text_classification.png"> 

**Fig1. Yoon Kim's Text Classification**

- RNN이 단어의 입력의 순서를 중요하게 반영한다면, CNN은 문장의 지역 정보를 보존하면서 각 문장 성분의 등장 정보들을 학습에 반영하는 구조로 풀어가고 있습니다. 학습을 하면서 각 필터 사이즈를 조절하면서 언어의 특징 값을 추출하게 되는데, 기존의 N-gram (2 그램, 3 그램) 방식과 유사하다고 볼 수 있습니다.

- 예를 들어 "나는 배가 고프다" 라는 문장을 2그램을 사용한다면, "나 는 / 는 배 / 배 가 / 가 고프 / 고프 다/" 로 각각 문장의 단어 성분을 쪼개어 활용 하는 접근방법을, 단어를 각 백터값을 투영하여 컨볼루션 필터값에 적용하는 원리입니다.

# 코드 설명

 - 이전 내용에서 기본적인 Kaggle 연동 및 데이터 분석 및 전처리를 진행하여, 여기에서는 관련된 주제 위주로 설명을 하겠습니다. Full Code는 http://Kaggle/BagOfWordsMeetsBagsOfPopcorn/cnn_text_classification-bagofwords-book_ver.ipynb 를 참조하시기 바랍니다. 

In [15]:
import os
from datetime import datetime
import tensorflow as tf

In [2]:
import numpy as np
import json

FILE_DIR_PATH = './data/'
INPUT_TRAIN_DATA_FILE_NAME = 'input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'label.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

input_data = np.load(open(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'rb'))
label_data = np.load(open(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'rb'))
prepro_configs = None

from sklearn.model_selection import train_test_split

TEST_SPLIT = 0.1
RNG_SEED = 13371447

input_train, input_test, label_train, label_test = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

In [21]:
BATCH_SIZE = 16
NUM_EPOCHS = 10
vocab_size = 74065
embedding_size = 128

def mapping_fn(X, Y):
    input, label = {'text': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
    dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_test, label_test))
    dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [26]:
def model_fn(features, labels, mode, params):
    #embedding layer를 선언합니다.
    input_layer = tf.contrib.layers.embed_sequence(
                    features['text'],
                    vocab_size,
                    embedding_size,
                    initializer=params['embedding_initializer']
                    )
    # 현재 모델이 학습모드인지 여부를 확인하는 변수입니다.
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    # embedding layer에 대한 output에 대해 dropout을 취합니다.
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                   rate=0.2,
                                   training=training)
    
    conv = tf.layers.conv1d(
            inputs=dropout_emb,
            filters=32,
            kernel_size=3,
            padding='same',
            activation=tf.nn.relu)
    
    pool = tf.reduce_max(input_tensor=conv, axis=1)
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)  
    dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1)
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'prob':tf.nn.sigmoid(logits)
                  })
    else:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)
    
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [27]:
params = {'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0)}

model_dir = os.path.join(os.getcwd(), "checkpoint/cnn_model")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()
config_tf._save_checkpoints_steps = 100
config_tf._save_checkpoints_secs = None
config_tf._keep_checkpoint_max =  2
config_tf._log_step_count_steps = 100

In [28]:
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

est = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf, params=params)
est.train(train_input_fn)

# prediction = est.predict(eval_input_fn)
# for i, p in enumerate(prediction):
#     print(i, p['q_sem'])

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))


1.8.0
Experiment started at 05:51:36
.......................................
INFO:tensorflow:Using config: {'_model_dir': '/Users/user/git/DeepNLP/Kaggle/BagOfWordsMeetsBagsOfPopcorn/checkpoint/cnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11cdab8d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INF

KeyboardInterrupt: 

In [2]:
import os
import string
import tempfile
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing import sequence

from tensorboard import summary as summary_lib

#전처리 lib
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords

import re

ModuleNotFoundError: No module named 'bs4'

In [5]:
vocab_size = 150000
sentence_size = 200
embedding_size = 50
# model_dir = tempfile.mkdtemp()
model_dir = './checkpoint/cnn_classifier'


# we assign the first indices in the vocabulary to special tokens that we use
# for padding, as start token, and for indicating unknown words
pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable, 
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
x_test = sequence.pad_sequences(x_test_variable, 
                                maxlen=sentence_size,
                                truncating='post',
                                padding='post', 
                                value=pad_id)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

Pad sequences (samples x time)
x_train shape: (20000, 200)
x_test shape: (5000, 200)


In [15]:
for i in range(0, 10):
    print(i, word_inverted_index[i])
  
def index_to_text(indexes):
    return ' '.join([word_inverted_index[i] for i in indexes])

print(index_to_text(variables[0]))

0 <PAD>
1 <START>
2 <OOV>
3 movie
4 film
5 one
6 like
7 good
8 time
9 even
stories look age korda happy lonely done back manages taking together fanshawe together knoxville dvd us well form jungle music enough would hilarious uwe dvd made less oscar aliens painful knoxville find testament find actual <OOV> tell look story title constant scary touching monkey korda fight c brosnan bad james giving redeeming much im context solid although wanted fashion begin nazi film korda type look feature makes women thought turned korda kazakos aymeric script <START> korda based time end first based maybe would idea working actual <OOV> whole dialogue car far kristel thrills carry local killing lordi tension mukhsin casting mysterious extreme close korda help much sequence korda halleck cartoons baloo killing lordi think journeymen turn get best frewer redeeming gave dyer dvd rick korda done surprised hilarious quite film korda industry kill circumstances least favor mexican local bad years right un

In [8]:
#input function

#각 문장의 길이를 계산한다, max 길이는 200

x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])

def parser(x, length, y):
    features = {"x": x, "len": length}
    return features, y

#len을 활용하여 기존 전처리 이후의 길이를 보존
#from_tensor_slices를 활용하면 numpy 데이터 구조에서 쉽게 변환

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(x_train_variable))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

# CNN Classification

CNN을 활용하여 text를 분류해보자, n-gram의 효과로 활용

https://www.semanticscholar.org/paper/Learning-to-Rank-Short-Text-Pairs-with-Deep-Neural-Severyn-Moschitti/452f7411af7d471dd3ba84c2b06b2aaffc38cdb9

Embedding Layer -> Dropout -> Conv1D -> GlobalMax1D -> Hidden Dense Layer -> Dropout -> Output Layer

In [9]:
all_classifiers = {}

def train_and_evaluate(classifier):
    # 예측 테스트를 위해 모델을 학습시키고 저장한다.
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps=1)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    predictions = np.array([p['logistic'][0] for p in classifier.predict(input_fn=eval_input_fn)])
    
    # name scopes의 재사용을 위해 graph를 reset한다.
    tf.reset_default_graph()
    
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool),
                             num_thresholds=21)
    
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
        writer.add_summary(sess.run(pr), global_step=0)
        writer.close()

In [25]:
#head: pre-made estimator로 평가를 할 때, 일정한 함수를 사용하게 세팅
head = tf.contrib.estimator.binary_classification_head()

def cnn_model_fn(features, labels, mode, params):
    #embedding layer를 선언한다.
    input_layer = tf.contrib.layers.embed_sequence(
                    features['x'],
                    vocab_size,
                    embedding_size,
                    initializer=params['embedding_initializer']
                    )

    training = (mode == tf.estimator.ModeKeys.TRAIN)
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                   rate=0.2,
                                   training=training)

    conv = tf.layers.conv1d(
            inputs=dropout_emb,
            filters=32,
            kernel_size=3,
            padding='same',
            activation=tf.nn.relu)
    
    pool = tf.reduce_max(input_tensor=conv, axis=1)
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)  
    dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1)
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
    
    optimizer = tf.train.AdamOptimizer() #여러가지 Optimizer 활용가능
    
    def _train_op_fn(loss):
#         tf.summary('loss', loss)
        return optimizer.minimize(
                loss=loss,
                global_step=tf.train.get_global_step())

    
    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        train_op_fn=_train_op_fn)


cnn_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                        model_dir=os.path.join(model_dir, 'cnn'),
                                        params=params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/Users/user/git/DeepNLP/Kaggle/BagOfWordsMeetsBagsOfPopcorn/checkpoint/cnn_model/cnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11d325748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
#학습 후, 결과치를 tensorboard로 확인
# tensorboard --logdir=./checkpoint/cnn_classifier/
train_and_evaluate(cnn_classifier)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101002
INFO:tensorflow:Saving checkpoints for 101003 into ./checkpoint/cnn_classifier/cnn/model.ckpt.
INFO:tensorflow:loss = 6.933278e-09, step = 101003
INFO:tensorflow:Loss for final step: 6.933278e-09.
INFO:tensorflow:Starting evaluation at 2018-05-13-11:33:43
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
INFO:tensorflow:Finished evaluation at 2018-05-13-11:33:44
INFO:tensorflow:Saving dict for global step 101003: accuracy = 0.8706, accuracy_baseline = 0.5024, auc = 0.89279175, auc_precision_recall = 0.91575927, average_loss = 2.343806, global_step = 101003, label/mean = 0.5024, loss = 234.3806, prediction/mean = 0.49696568
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003


In [12]:
# 직접 prediction으로 테스트 해 본다

def text_to_index(sentence):
    # Remove punctuation characters except for the apostrophe
    translator = str.maketrans('', '', string.punctuation.replace("'", ''))
    tokens = sentence.translate(translator).lower().split()
    return np.array([1] + [word_index[t] if t in word_index else oov_id for t in tokens])

def print_predictions(sentences):
    indexes = [text_to_index(sentence) for sentence in sentences]
    x = sequence.pad_sequences(indexes, 
                               maxlen=sentence_size, 
                               truncating='post',
                               padding='post',
                               value=pad_id)
    length = np.array([min(len(x), sentence_size) for x in indexes])
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": x, "len": length}, shuffle=False)
    predictions = {}
    for path, classifier in all_classifiers.items():
        predictions[path] = [p['logistic'][0] for p in classifier.predict(input_fn=predict_input_fn)]
    for idx, sentence in enumerate(sentences):
        print(sentence)
        for path in all_classifiers:
            print("\t{} {}".format(path, predictions[path][idx]))
#             predictions[path][idx]
    
    return predictions[path][idx]

In [32]:
print_predictions([
    'I do not like this movie'
])

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101002
I do not like this movie
	./checkpoint/cnn_classifier/cnn 0.9999970197677612


0.999997

In [33]:
print_predictions(['fuck you', 'this movie sucks'])

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101002
fuck you
	./checkpoint/cnn_classifier/cnn 0.1322726458311081
this movie sucks
	./checkpoint/cnn_classifier/cnn 2.31997950535464e-19


2.3199795e-19

지금까지 했던 것을 모두 활용하여 제출용 데이터를 만들어봅시다.

In [13]:
#테스트 데이터를 읽어봅시다.
test = pd.read_csv(default_path+"testData.tsv", header=0, delimiter="\t", quoting=3 )

#테스트 파일은 이렇게 생겼다고 합니다.
print (test.head())

#이 파일은 "sentiment" 행이 없습니다.
print (test.shape)

#불용어 제거도하고 태그들도 지우고 난후 데이터를 저장할 장소를 만들어둡니다.
num_reviews = len(test["review"])
clean_test_reviews = []

print ("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

           id                                             review
0  "12311_10"  "Naturally in a film who's main themes are of ...
1    "8348_2"  "This movie is a disaster within a disaster fi...
2    "5828_4"  "All in all, this is a movie for kids. We saw ...
3    "7186_2"  "Afraid of the Dark left me with the impressio...
4   "12128_7"  "A very accurate depiction of small time mob l...
(25000, 2)
Cleaning and parsing the test set movie reviews...

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Revi

In [14]:
# 예측된 모델을 불러 체크포인트로 결과치를 불러온다.

sentimental = []

for i in range(len(clean_test_reviews)):
    if ( (i+1) % 1000 == 0):
        print ("Current Progress %d \n" % (i+1))
    sentimental.append(print_predictions([clean_test_reviews[i]]))

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty
	./checkpoint/cnn_classifier/cnn 1.0
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
movie disaster within disaster film full great action scenes meaningful throw away sense reality let see word wise lava burns steam burns stand next lava diverting minor lava flow difficult let alone significant one scares think might 

saw film phoenix film festival today loved synopsis listed program old shakespearean actor invites three children suicide party sure going see read liked idea suicide party sounded interesting old shakespearean actor worried film would kind dry boring decided give try glad dry boring least dialogue great funny clever way pretentious difficult understand peter falk terrific role stole show also pleasantly surprised laura san giacomo performance usually bugs enjoyed watching film much think judge reinhold part could done better another actor times seemed kind cheesy looked like acting like watching character movie good able forgive one actor awkwardness would recommend film anyone already told people see soon available general public knew suicide could hilarious
	./checkpoint/cnn_classifier/cnn 1.0
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
love letter one movies could really clever wasted focusing letter wreaking havoc small town movie st

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
one worst sandra bullock movie since speed quite bad really lost blue special effect guys insomniac go girl see movie give three sleepies
	./checkpoint/cnn_classifier/cnn 7.357160721118613e-30
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
watched flick saturday afternoon cable man drag got metaphors symbolism stuff care one way another sexuality characters pacing story scripting almost put sleep ruth marshall got naked breast man homo phobic may want rent ruth lesbian sex scene pretty hot hetero sex scene notch higher standard movie fare jiggly cups made film worth watch mighty avatar
	./checkpoint/cnn_classifier/cnn 1.0262750649303598e-08
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
went see tkia high expectations might influence opinion seen dogme films tkia far worst story intertwines themes shakespea

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
alas another costner movie hour long credible performances script go hurry get first offered unrelated string events story script center randall wife randall fischer fischer thomas end real front story ever develops characters artificially propped monologues third parties singer explains randall randall explains fischer finally long care anymore learn something script meetings three endings doubt proffered one could make decision end result three used one another another hang past th yawn able pick despite transparent attempt gain points dedication coast guard one washed first day
	./checkpoint/cnn_classifier/cnn 0.011258931830525398
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
	./checkpoint/cnn_classifier/cnn 1.5396114938504013e-11
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
still living parents aired

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
footprints interesting movie somewhat difficult categorize psychological thriller appropriate description think female protagonist alice cespi discovers remember anything last three days clue torn photo hotel also haunted recurring vivid dream science fiction movie believes saw many years ago pursuit truth behind amnesia trust anyone little little becomes obvious visited town hotel located exciting flick whose main virtue virtually impossible predict events unfold particularly end unusual loneliness main character unreliability everyone else ensure good old paranoid feeling present throughout film whereas beautiful colors spectacularly filmed sequences make visually attractive movie well important part one nicoletta elmi everyone time favorite redheaded obnoxious child star italian horror extra bonus
	./checkpoint/cnn_classifier/cnn 1.0
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_

wonderful family movie beautiful horse movie entertainment casey buddy kelly marsh interesting lovable characters horses real beauties horse racing backdrop showing luck sometimes nothing good commonsense shows kids stupid things stupid reasons shows adults stupid things selfish reasons realistically portrayed characters transform unrealistic theme film something everyone relate andrew rubin puts wonderful performance buddy sensible elder brother somewhat reminded aidan quinn eyes speech delivery facial appearance casey makes fall love character earnestness sarah blue also nicely portrayed alexis smith lloyd bourdelle father played walter matthau character though room improvement movie enjoyable feel good movie
	./checkpoint/cnn_classifier/cnn 0.9999998807907104
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
really film two halves first detailing lives friendship two boys one privileged pashtun trodden hazara late afghanistan invasion ussr w

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
tangier wealthy american woman two children kidnapped berbers murderous desert pirates scorn moroccan government kidnap american pestilence attracts attention u president theodore roosevelt fictitious historical epic less grand adventure peculiar somewhat exhaustive throwback desert sheik films bit king interjected besides portraying cloaked mustachioed bloodthirsty leader snippy haughty captive sean connery candice bergen could acting two entirely different movies neither one seems know far carry camp elements characters dialogue seem singularly without proper direction various anonymous slashings beheadings occur arbitrary know victims big action scenes become blurry noisy montages sand swept violence horseback pluses much lauded music score jerry goldsmith oscar nominated loser john williams jaws fine location shooting cinematography
	./checkpoint/cnn_classifier/cnn 4.626404552254826e-05
INFO

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
review contains spoilers expectations movie pulled video store rack movie white noise first credits stupid movie run minutes pacing start finish slooooow main heroines like wear bra director appears enjoy jiggle effect anna paquin descends stairs like movies boobies one low level buzz factor second nice movie rips elements lovecraft horror genre mechanisms better movies least rational consistently irrational behavior stinker tries establish sense modernity reality situations one calls police even though uncovered treasure trove potentially incriminating forensic evidence otherworldly rituals nicely spelled comprehensive book otherworldly rituals like buffy waiting miles show give consultation slay certain demon type premise possible open age darkness creatures crawl ceiling cut throat turn meat grinder effect ho hum need sacrificial circle seven kids must throat cuts people love opens world age 

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
surprised much enjoyed sure bit slow moving parts else would one expect rollin also plenty nudity nothing wrong particularly includes lots gorgeous brigitte lahaie also spectacularly eroticised female dead bit dodgey perhaps effective also sci fi like storyline brief explanation end bother much interesting exploration memory effect memory loss extent one still alive without memory dvd sleeve mentions david cronenberg whilst perhaps quite good best films similarity particularly great use seemingly menacing architecture effective creepy use inside space tried indicate means rip roaring thriller captivating nightmare like movie makes locations including stunning railway setting end
	./checkpoint/cnn_classifier/cnn 0.9999995231628418
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
picture scene bored student empty day ahead video shop special offer video w

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
film started got feeling going something special acting camera work undoubtedly good also liked characters could grown empathise film good atmosphere hint fantasy however film went plot never appeared takeoff rolled scene scene unable understand connection stories could see characters occasionally bumping references ships bottles without connection left unremarkable short stories surprised well cannes
	./checkpoint/cnn_classifier/cnn 0.9999971389770508
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
consider huge horror movie fan one night wanted rent newer horror movies make fun rented milo surprised movie scared hell outta something say often usually halloween scares two friends watched left disturbed feeling good feeling ask watching horror movie unaffected point movie original follow normal guidelines horror movie movies like valentine bombed cours

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
aside fact women film stunningly beautiful camp prisoners fat film rings true chaos post war beautiful photography powerful national expression polish national character slow points entire pacing different american western european films quite refreshing lead actors good job dvd version see interviews principal actors crew lead actress stanislawa celinska gained lbs lost beauty stunner
	./checkpoint/cnn_classifier/cnn 1.0
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
cant put simpler terrible film worked industry made several short films okay standard pretty high seriously absolutely hate film made comment imdb hated film much literally come warn others piece sh writer director idiot idea make write good film writing skills adolescent teenager characters unrealistic lead woman think taking policeman pistol yet resourceful enough improvise molotov coc

INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
usually difficult time watching tv movie extra long commercial breaks break concentration give find good book one however made put adds stay end realize movie based true story brought took long find denny name would presume social security number move around lot would seem would found soon number entered job etc actors seemed bit old part buried metal object dug rust technical glitches take file lifetime movie better
	./checkpoint/cnn_classifier/cnn 0.9979889392852783
INFO:tensorflow:Restoring parameters from ./checkpoint/cnn_classifier/cnn/model.ckpt-101003
movie painful probably best way describe minutes life never able get back well actually like minutes way anyone would want sit credits stinking pile dog feces immediately tell movie producer mortal kombat due thumping annoying techno soundtrack drains laughably enjoyable moments movie give rest drained completely uninteresting annoying chara

KeyboardInterrupt: 

In [None]:
#알아보기 쉽게 데이터랑 붙여두는 편이 좋을 거 같습니다.
output = pd.DataFrame( data={"id":test["id"], "sentiment":sentimental} )

#지금까지 처리한 결과를 파일로 저장합니다.
output.to_csv( "Bag_of_Words_model_test.csv", index=False, quoting=3 )

In [114]:
#0.5 기준으로 값들을 변환

def correct_val(x):
    if x >= 0.5:
        x = 1
    else:
        x = 0
    
    return x

final_result = output['sentiment'].apply(correct_val)

In [115]:
#알아보기 쉽게 데이터랑 붙여두는 편이 좋을 거 같습니다.
output = pd.DataFrame( data={"id":test["id"], "sentiment":final_result} )

#지금까지 처리한 결과를 파일로 저장합니다.
output.to_csv( "final_bof.csv", index=False, quoting=3 )

#최종 점수는 0.8344가 나왔습니다. 단순한 cnn 1 layer 모델 기준. 향후 파인 튜닝 하면 더 증가 할 가능성이 있ㄴ에ㅛ