In [1]:
import os
from os import listdir, remove
from os.path import isfile, join
import numpy as np

## Download Dataset

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/aksdmj/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

## Preprocessing

In [3]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

In [4]:
# divide filenames by its sentiment
# error-avoiding method
documents = defaultdict(list)
for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

In [5]:
documents.keys()

dict_keys(['neg', 'pos'])

In [6]:
print(len(documents['pos']), len(documents['neg']))

1000 1000


In [7]:
# calculate maximum length of text
lens = [len(mr.words(i)) for i  in mr.fileids()]
max_num_word = max(lens)
max_num_word

2879

In [8]:
# count the number of unique words used in all texts
unique_words = len(set(mr.words()))
unique_words

39768

In [9]:
mr.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [10]:
# map from index to word
vocab_dict = {i+1:v for i,v in enumerate(set(mr.words()))}
test_vocab = vocab_dict[10495]
test_vocab

'detailing'

In [11]:
# for padding token
vocab_dict[0] = "PAD"

In [12]:
# map from word to index
vocab_dict_inv = {v:i for i,v in vocab_dict.items()}
vocab_dict_inv[test_vocab]

10495

In [13]:
# add padding
unique_words+=1

## Word2Vec pretrained

In [14]:
from gensim.models import KeyedVectors
from google_drive_downloader import GoogleDriveDownloader as gdd

In [15]:
# if you already have pretrained wordvector weights, replace weightpath variable with a path of it
weight_dir = os.path.join(os.path.abspath('..'),"word2vec")
weight_path = os.path.join(weight_dir, "GoogleNews-vectors-negative300.bin")

In [16]:
if not os.path.exists(weight_dir):
    os.makedirs(weight_dir)
else:
    print("Directory is already exist")

Directory is already exist


In [17]:
if not os.path.isfile(weight_path):
    print("No pretrained weight file, start download...")
    gdd.download_file_from_google_drive(file_id='0B7XkCwpI5KDYNlNUTTlSS21pQmM',
                                        dest_path=weight_path + '.gz',
                                        unzip=False)
    inF = gzip.open(weight_path + '.gz', 'rb')
    outF = open(weight_path, 'wb')
    outF.write(inF.read())
    inF.close()
    outF.close()

    remove(w2v_path + '.gz')

    print("Done")
else:
    print("pretrained weight is already exist")


pretrained weight is already exist


In [18]:
# load pretrained vector
w2v = KeyedVectors.load_word2vec_format(weight_path, binary=True)

In [19]:
# make w2v matrix for our dataset's words
weights = np.array([w2v[v] if v in w2v else np.zeros(w2v.vector_size) for i ,v in vocab_dict.items()])
# (number of words, dimension of wordvectors)
weights.shape

(39769, 300)

## Data Preprocess

In [20]:
def words2indexs(words):
    return np.array([vocab_dict_inv[word] for word in words])

# make index array with size of (max_num_word), with 0("PAD" word) padding
def preprocess(document):
    indexs = words2indexs(mr.words(document))
    return np.concatenate([indexs, np.zeros([max_num_word - indexs.shape[0]], dtype="int64")])

In [21]:
X = list()
y = list()

# 0 label for negative , 1 for positive
for i in documents['neg']:
    tx = preprocess(i)
    X.append(tx)
    y.append(0)
    
for i in documents['pos']:
    tx = preprocess(i)
    X.append(tx)
    y.append(1)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Model

In [23]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
tfe.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)

In [24]:
from cnn_sentence import CNN_classification_single, CNN_classification_multi

In [None]:
device = 'gpu:0' if tfe.num_gpus() > 0 else 'cpu:0'

In [25]:
cnn_classfication_single_static = CNN_classification_single(
    num_words=unique_words, in_dim=max_num_word, out_dim=2, is_static=True, device_name=device)

In [26]:
cnn_classfication_single_static.copy_pretrained(weights)

In [27]:
cnn_classfication_single_static(tf.convert_to_tensor(X_train[:1]), True)
cnn_classfication_single_static.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  11930700  
_________________________________________________________________
conv11 (Conv1D)              multiple                  90100     
_________________________________________________________________
conv12 (Conv1D)              multiple                  120100    
_________________________________________________________________
conv13 (Conv1D)              multiple                  150100    
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
out (Dense)                  multiple                  602       
Total para

In [28]:
# if you get OOM error, use smaller batch_size
cnn_classfication_single_static.fit(X_train, y_train, X_val, y_val, batch_size =32, epochs=10, verbose=2, tqdm_option="normal")

TRAIN   1: 100%|██████████| 50/50 [00:12<00:00,  9.85it/s]
VAL     1: 100%|██████████| 13/13 [00:10<00:00,  1.27s/it]


[EPOCH 1 / STEP 1]
TRAIN loss   : 24.3521
VAL   loss   : 20.8941
VAL   acc    : 69.7500%


TRAIN   2: 100%|██████████| 50/50 [00:14<00:00,  3.47it/s]
VAL     2: 100%|██████████| 13/13 [00:10<00:00,  1.29s/it]


[EPOCH 2 / STEP 2]
TRAIN loss   : 16.9989
VAL   loss   : 19.4675
VAL   acc    : 75.0000%


TRAIN   3: 100%|██████████| 50/50 [00:11<00:00,  9.86it/s]
VAL     3: 100%|██████████| 13/13 [00:10<00:00,  1.28s/it]
TRAIN   4: 100%|██████████| 50/50 [00:14<00:00,  3.45it/s]
VAL     4: 100%|██████████| 13/13 [00:10<00:00,  1.28s/it]


[EPOCH 4 / STEP 4]
TRAIN loss   : 9.4260
VAL   loss   : 17.8212
VAL   acc    : 78.2500%


TRAIN   5: 100%|██████████| 50/50 [00:11<00:00, 10.02it/s]
VAL     5: 100%|██████████| 13/13 [00:10<00:00,  1.27s/it]
TRAIN   6: 100%|██████████| 50/50 [00:11<00:00,  9.94it/s]
VAL     6: 100%|██████████| 13/13 [00:10<00:00,  1.28s/it]


[EPOCH 6 / STEP 6]
TRAIN loss   : 5.7961
VAL   loss   : 16.5816
VAL   acc    : 79.0000%


TRAIN   7: 100%|██████████| 50/50 [00:11<00:00,  9.87it/s]
VAL     7: 100%|██████████| 13/13 [00:10<00:00,  1.28s/it]
TRAIN   8: 100%|██████████| 50/50 [00:11<00:00,  9.82it/s]
VAL     8: 100%|██████████| 13/13 [00:10<00:00,  1.26s/it]


[EPOCH 8 / STEP 8]
TRAIN loss   : 4.0197
VAL   loss   : 15.9876
VAL   acc    : 80.5000%


TRAIN   9: 100%|██████████| 50/50 [00:11<00:00,  9.79it/s]
VAL     9: 100%|██████████| 13/13 [00:10<00:00,  1.28s/it]
TRAIN  10: 100%|██████████| 50/50 [00:11<00:00,  9.87it/s]
VAL    10: 100%|██████████| 13/13 [00:10<00:00,  1.31s/it]


[EPOCH 10 / STEP 10]
TRAIN loss   : 3.3394
VAL   loss   : 15.8101
VAL   acc    : 80.0000%


In [29]:
cnn_classfication_single_non_static = CNN_classification_single(
    num_words=unique_words, in_dim=max_num_word, out_dim=2, is_static=False, device_name=device)
cnn_classfication_single_non_static.copy_pretrained(weights)
cnn_classfication_single_non_static(tf.convert_to_tensor(X_train[:1]), True)
cnn_classfication_single_non_static.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  11930700  
_________________________________________________________________
conv11 (Conv1D)              multiple                  90100     
_________________________________________________________________
conv12 (Conv1D)              multiple                  120100    
_________________________________________________________________
conv13 (Conv1D)              multiple                  150100    
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
out (Dense)                  multiple                  602       
Total para

In [30]:
# if you get OOM error, use smaller batch_size
cnn_classfication_single_non_static.fit(X_train, y_train, X_val, y_val, batch_size =32, epochs=10, verbose=2, tqdm_option="normal")

TRAIN   1: 100%|██████████| 50/50 [00:24<00:00,  2.78it/s]
VAL     1: 100%|██████████| 13/13 [00:10<00:00,  1.28s/it]


[EPOCH 1 / STEP 1]
TRAIN loss   : 23.0523
VAL   loss   : 20.5473
VAL   acc    : 66.0000%


TRAIN   2: 100%|██████████| 50/50 [00:24<00:00,  2.82it/s]
VAL     2: 100%|██████████| 13/13 [00:10<00:00,  1.33s/it]


[EPOCH 2 / STEP 2]
TRAIN loss   : 13.9169
VAL   loss   : 19.3215
VAL   acc    : 70.7500%


TRAIN   3: 100%|██████████| 50/50 [00:25<00:00,  2.80it/s]
VAL     3: 100%|██████████| 13/13 [00:10<00:00,  1.31s/it]
TRAIN   4: 100%|██████████| 50/50 [00:24<00:00,  2.83it/s]
VAL     4: 100%|██████████| 13/13 [00:10<00:00,  1.27s/it]


[EPOCH 4 / STEP 4]
TRAIN loss   : 5.3293
VAL   loss   : 17.0537
VAL   acc    : 79.0000%


TRAIN   5: 100%|██████████| 50/50 [00:24<00:00,  2.69it/s]
VAL     5: 100%|██████████| 13/13 [00:10<00:00,  1.29s/it]
TRAIN   6: 100%|██████████| 50/50 [00:24<00:00,  2.80it/s]
VAL     6: 100%|██████████| 13/13 [00:10<00:00,  1.31s/it]


[EPOCH 6 / STEP 6]
TRAIN loss   : 3.3064
VAL   loss   : 16.2035
VAL   acc    : 79.7500%


TRAIN   7: 100%|██████████| 50/50 [00:24<00:00,  2.78it/s]
VAL     7: 100%|██████████| 13/13 [00:10<00:00,  1.31s/it]
TRAIN   8: 100%|██████████| 50/50 [00:24<00:00,  2.80it/s]
VAL     8: 100%|██████████| 13/13 [00:10<00:00,  1.29s/it]


[EPOCH 8 / STEP 8]
TRAIN loss   : 2.6412
VAL   loss   : 15.6427
VAL   acc    : 82.7500%


TRAIN   9: 100%|██████████| 50/50 [00:24<00:00,  2.82it/s]
VAL     9: 100%|██████████| 13/13 [00:10<00:00,  1.32s/it]
TRAIN  10: 100%|██████████| 50/50 [00:25<00:00,  2.74it/s]
VAL    10: 100%|██████████| 13/13 [00:10<00:00,  1.31s/it]


[EPOCH 10 / STEP 10]
TRAIN loss   : 2.2993
VAL   loss   : 15.2547
VAL   acc    : 80.7500%


In [31]:
cnn_classfication_multi = CNN_classification_multi(
    num_words=unique_words, in_dim=max_num_word, out_dim=2, device_name=device)
cnn_classfication_multi.copy_pretrained(weights)
cnn_classfication_multi(tf.convert_to_tensor(X_train[:1]), True)
cnn_classfication_multi.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
w2v_trainable (Embedding)    multiple                  11930700  
_________________________________________________________________
w2v_nontrainable (Embedding) multiple                  11930700  
_________________________________________________________________
conv11 (Conv1D)              multiple                  90100     
_________________________________________________________________
conv12 (Conv1D)              multiple                  120100    
_________________________________________________________________
conv13 (Conv1D)              multiple                  150100    
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dropout (Dropout)            multiple                  0         
__________

In [32]:
# if you get OOM error, use smaller batch_size
cnn_classfication_multi.fit(X_train, y_train, X_val, y_val, batch_size =32, epochs=10, verbose=2, tqdm_option="normal")

TRAIN   1: 100%|██████████| 50/50 [00:30<00:00,  2.13it/s]
VAL     1: 100%|██████████| 13/13 [00:11<00:00,  1.07s/it]


[EPOCH 1 / STEP 1]
TRAIN loss   : 27.0469
VAL   loss   : 19.7353
VAL   acc    : 68.7500%


TRAIN   2: 100%|██████████| 50/50 [00:30<00:00,  2.12it/s]
VAL     2: 100%|██████████| 13/13 [00:11<00:00,  1.07s/it]


[EPOCH 2 / STEP 2]
TRAIN loss   : 11.4859
VAL   loss   : 18.4603
VAL   acc    : 76.0000%


TRAIN   3: 100%|██████████| 50/50 [00:30<00:00,  2.16it/s]
VAL     3: 100%|██████████| 13/13 [00:11<00:00,  1.09s/it]
TRAIN   4: 100%|██████████| 50/50 [00:30<00:00,  2.08it/s]
VAL     4: 100%|██████████| 13/13 [00:11<00:00,  1.07s/it]


[EPOCH 4 / STEP 4]
TRAIN loss   : 4.6140
VAL   loss   : 16.9017
VAL   acc    : 79.5000%


TRAIN   5: 100%|██████████| 50/50 [00:30<00:00,  2.16it/s]
VAL     5: 100%|██████████| 13/13 [00:11<00:00,  1.06s/it]
TRAIN   6: 100%|██████████| 50/50 [00:30<00:00,  2.08it/s]
VAL     6: 100%|██████████| 13/13 [00:11<00:00,  1.07s/it]


[EPOCH 6 / STEP 6]
TRAIN loss   : 3.2240
VAL   loss   : 15.9325
VAL   acc    : 81.0000%


TRAIN   7: 100%|██████████| 50/50 [00:30<00:00,  2.14it/s]
VAL     7: 100%|██████████| 13/13 [00:11<00:00,  1.08s/it]
TRAIN   8: 100%|██████████| 50/50 [00:30<00:00,  2.11it/s]
VAL     8: 100%|██████████| 13/13 [00:11<00:00,  1.04s/it]


[EPOCH 8 / STEP 8]
TRAIN loss   : 2.6693
VAL   loss   : 15.4559
VAL   acc    : 79.2500%


TRAIN   9: 100%|██████████| 50/50 [00:30<00:00,  2.12it/s]
VAL     9: 100%|██████████| 13/13 [00:11<00:00,  1.02it/s]
TRAIN  10: 100%|██████████| 50/50 [00:27<00:00,  2.41it/s]
VAL    10: 100%|██████████| 13/13 [00:11<00:00,  1.04s/it]


[EPOCH 10 / STEP 10]
TRAIN loss   : 2.3960
VAL   loss   : 15.2340
VAL   acc    : 80.5000%
