In [1]:
import os
from os import listdir, remove
from os.path import isfile, join
import numpy as np

## Download Dataset

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/aksdmj/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

## Preprocessing

In [3]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

In [4]:
# divide filenames by its sentiment
# error-avoiding method
documents = defaultdict(list)
for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

In [7]:
documents.keys()

dict_keys(['pos', 'neg'])

In [10]:
print(len(documents['pos']), len(documents['neg']))

1000 1000


In [11]:
# calculate maximum length of text
lens = [len(mr.words(i)) for i  in mr.fileids()]
max_num_word = max(lens)
max_num_word

2879

In [13]:
# count the number of unique words used in all texts
unique_words = len(set(mr.words()))
unique_words

39768

In [14]:
mr.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [23]:
# map from index to word
vocab_dict = {i:v for i,v in enumerate(set(mr.words()))}
vocab_dict[10495]

'hello'

In [20]:
# map from word to index
vocab_dict_inv = {v:i for i,v in vocab_dict.items()}
vocab_dict_inv['hello']

10495

## Word2Vec pretrained

In [24]:
from gensim.models import KeyedVectors

In [25]:
from google_drive_downloader import GoogleDriveDownloader as gdd

In [26]:
# if you already have pretrained wordvector weights, replace weightpath variable with a path of it
weight_dir = os.path.join(os.path.abspath('..'),"word2vec")
weight_path = os.path.join(weight_dir, "GoogleNews-vectors-negative300.bin")

In [27]:
if not os.path.exists(weight_dir):
    os.makedirs(weight_dir)
else:
    print("Directory is already exist")

Directory is already exist


In [28]:
if not os.path.isfile(weight_path):
    print("No pretrained weight file, start download...")
    gdd.download_file_from_google_drive(file_id='0B7XkCwpI5KDYNlNUTTlSS21pQmM',
                                        dest_path=weight_path + '.gz',
                                        unzip=False)
    inF = gzip.open(weight_path + '.gz', 'rb')
    outF = open(weight_path, 'wb')
    outF.write(inF.read())
    inF.close()
    outF.close()

    remove(w2v_path + '.gz')

    print("Done")
else:
    print("pretrained weight is already exist")


pretrained weight is already exist


In [29]:
# load pretrained vector
w2v = KeyedVectors.load_word2vec_format(weight_path, binary=True)

In [32]:
# make w2v matrix for our dataset's words
weights = np.array([w2v[v] if v in w2v else np.zeros(w2v.vector_size) for i ,v in vocab_dict.items()])
# (number of words, dimension of wordvectors)
weights.shape

(39768, 300)

## Model

In [33]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe

In [34]:
tfe.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)

## Data Preprocess

In [35]:
def words2indexs(words):
    return np.array([vocab_dict_inv[word] for word in words])

# make index array with size of (max_num_word), with -1("UNK" word) padding
def preprocess(document):
    indexs = words2indexs(mr.words(document))
    return np.concatenate([indexs, -1*np.ones([max_num_word - indexs.shape[0]], dtype="int64")])

In [36]:
X = list()
y = list()

# 0 label for negative , 1 for positive
for i in documents['neg']:
    tx = preprocess(i)
    X.append(tx)
    y.append(0)
    
for i in documents['pos']:
    tx = preprocess(i)
    X.append(tx)
    y.append(1)

In [40]:
# train-validation-test split with the ratio of (0.72, 0.18, 0.1)
from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)


In [41]:
data_train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
data_val = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(len(y_val))
data_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(len(y_test))


In [45]:
class CNN_Sentence(tf.keras.Model):
    """ Classifier for Movie Review dataset
    Args:
        num_words: uniwue words in dataset. including the words that is not in pretrained w2v model.
        in_dim: dimension of input array, which is maximum word counts among texts.
        w2v_dim: word2vectors representationo dimension. 300 for GoogleNews-vectors-negative300 weights.
        out_dim: softmax output dimension.
        is_static: if True, use static(no change in w2v pretrained weights), otherwise modify weights.
    """
    def __init__(self,
                 num_words=unique_words,
                 in_dim= max_num_word,
                 w2v_dim= 300,
                 out_dim=2,
                 learning_rate=0.001,
                 is_static=True,
                 checkpoint_directory="checkpoints/",
                 device_name="cpu:0"):
        super(CNN_Sentence, self).__init__()
        self.in_dim = in_dim
        self.num_words = num_words
        self.out_dim = out_dim
        self.is_static = is_static
        self.learning_rate = learning_rate
        self.checkpoint_directory = checkpoint_directory
        self.device_name = device_name

        self.w2v = tf.keras.layers.Embedding(num_words,w2v_dim)

        self.conv11 = tf.layers.Conv1D(filters=100, kernel_size=3, padding="valid")
        self.conv12 = tf.layers.Conv1D(100, 4, padding="valid")
        self.conv13 = tf.layers.Conv1D(100, 5, padding="valid")

        self.maxpool1 = tf.layers.MaxPooling1D(pool_size=2, strides=2)
        self.maxpool2 = tf.layers.MaxPooling1D(pool_size=2, strides=2)
        self.maxpool3 = tf.layers.MaxPooling1D(pool_size=2, strides=2)

        self.flatten = tf.layers.Flatten()
        self.dropout = tf.layers.Dropout(0.5)

        self.out = tf.layers.Dense(self.out_dim)        
        
        # optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        
        # global step
        self.global_step = 0
        
        # verbose logging
        self.epoch_loss = 0
        
        # plotting
        self.train_losses = []
        self.val_losses = []
        self.test_accuracies = []
        

    def copy_pretrained(self, weights):
        # init
        self.predict(tf.convert_to_tensor(np.zeros([1, self.num_words]), dtype="float64"), False)
        temp = tf.convert_to_tensor(weights.astype("float32"))
        tf.assign(self.w2v.weights[0], temp)

    def predict(self, X, training):
        
        X = self.w2v(X)

        x11 = self.conv11(X)
        x12 = self.conv12(X)
        x13 = self.conv13(X)

        x11m = tf.reduce_max(x11,1)
        x12m = tf.reduce_max(x12,1)
        x13m = tf.reduce_max(x13,1)

        xf1 = self.flatten(x11m)
        xf2 = self.flatten(x12m)
        xf3 = self.flatten(x13m)

        xf = tf.concat([xf1, xf2, xf3], axis=1)

        if training:
            xf = self.dropout(xf)

        pred = self.out(xf)
        return pred

    def loss(self, X, y, training):
        prediction = self.predict(X, training)
        loss_val = tf.losses.sparse_softmax_cross_entropy(y, prediction)
        loss_val += tf.nn.l2_loss(self.out.weights[0])
        self.epoch_loss += loss_val.numpy()
        
        return loss_val

    def grad(self, X, y, training):
        with tfe.GradientTape() as tape:
            loss_val = self.loss(X, y, training)
        return tape.gradient(loss_val, self.variables) 
        
    def fit(self, train_data, validation_data, test_data, epochs=1, verbose=1, 
            batch_size=50, saving=False, early_stopping=0):
    
        train_data_batch = train_data.shuffle(100).batch(batch_size)
        
        with tf.device(self.device_name):
            for i in range(epochs):
                self.epoch_loss = 0
                for X, y in tfe.Iterator(train_data_batch):
                    grads = self.grad(X, y, True)
                    if self.is_static:
                        self.optimizer.apply_gradients(zip(grads[1:], self.variables[1:]))
                    else :
                        self.optimizer.apply_gradients(zip(grads, self.variables))
                    
                self.global_step += 1
                
                self.train_losses.append(self.epoch_loss)
                
                for X, y in tfe.Iterator(validation_data):
                    self.val_losses.append(self.loss(X, y, False).numpy())
                    
                    
                accuracy = tfe.metrics.Accuracy('train_acc')

                for X, y in tfe.Iterator(test_data):
                    logits = self.predict(X=X, training=False)
                    predictions = tf.argmax(logits, axis=1, output_type="int32")
                    accuracy(predictions, y)     
                    
                self.test_accuracies.append(accuracy.result().numpy())

                if (i+1)%verbose == 0 :
                    print("[EPOCH %d / STEP %d]" % ((i + 1), self.global_step))
                    print("Train loss : %s" % self.train_losses[-1])
                    print("Val   loss : %s" % self.val_losses[-1])
                    print("TEST accuracy: %.4f%%" % (self.test_accuracies[-1] * 100))
                    print()
                          

                accuracy.init_variables()
                
                if early_stopping:
                    ok = False
                    if len(self.val_losses) <= early_stopping:
                        continue
                    for i in range(early_stopping):
                        if self.val_losses[-(i+1)] < self.val_losses[-(i+2)]:
                            ok = True
                            break
                    if ok:
                        continue
                    print("early stopping on step %s" % self.global_step)
                    break
                                              
                    
                    
    def save(self):
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=self.global_step)

    def load(self, global_step="latest"):
        # init
        self.predict(tf.convert_to_tensor(np.zeros([1, self.num_words]), dtype="float64"), False)

        saver = tfe.Saver(self.variables)
        if global_step == "latest":
            saver.restore(tf.train.latest_checkpoint(self.checkpoint_directory))
            self.global_step = int(tf.train.latest_checkpoint(self.checkpoint_directory).split('/')[-1][1:])
        else:
            saver.restore(self.checkpoint_directory + "-" + str(global_step))
            self.global_step = global_step

            

In [51]:
cnn_static = CNN_Sentence(device_name="gpu:0")
# if you don't have tensorflow-gpu, then erase device_name parameter
# cnn_static = CNN_Sentence()
cnn_static.copy_pretrained(weights)

In [52]:
cnn_static.fit(data_train, data_val,  data_test, epochs=10, verbose=2, early_stopping=False)

[EPOCH 2 / STEP 2]
Train loss : 38.306246876716614
Val   loss : 1.1973715
TEST accuracy: 66.0000%

[EPOCH 4 / STEP 4]
Train loss : 20.28722643852234
Val   loss : 0.771232
TEST accuracy: 75.0000%

[EPOCH 6 / STEP 6]
Train loss : 14.574689149856567
Val   loss : 0.6395987
TEST accuracy: 81.0000%

[EPOCH 8 / STEP 8]
Train loss : 12.121951282024384
Val   loss : 0.58996224
TEST accuracy: 82.0000%

[EPOCH 10 / STEP 10]
Train loss : 10.397034466266632
Val   loss : 0.56847554
TEST accuracy: 83.5000%



In [53]:
cnn_non_static = CNN_Sentence(device_name="gpu:0", is_static=False)
cnn_non_static.copy_pretrained(weights)

In [54]:
cnn_non_static.fit(data_train, data_val,  data_test, epochs=10, verbose=2, early_stopping=False)

[EPOCH 2 / STEP 2]
Train loss : 37.34070706367493
Val   loss : 1.2378596
TEST accuracy: 71.0000%

[EPOCH 4 / STEP 4]
Train loss : 17.38844782114029
Val   loss : 0.76585174
TEST accuracy: 83.0000%

[EPOCH 6 / STEP 6]
Train loss : 10.876617908477783
Val   loss : 0.6033069
TEST accuracy: 84.0000%

[EPOCH 8 / STEP 8]
Train loss : 7.60062512755394
Val   loss : 0.53781915
TEST accuracy: 82.0000%

[EPOCH 10 / STEP 10]
Train loss : 5.6968440264463425
Val   loss : 0.50722325
TEST accuracy: 83.0000%



## Task 2: regression on Amazon ratings

In [55]:
import pandas as pd
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [56]:
import glob

In [57]:
path_name = os.path.join(os.path.dirname(os.path.abspath("__file__")), "amazon_reviews")

In [58]:
# you can get dataset in 
# http://jmcauley.ucsd.edu/data/amazon/
df = getDF('../amazon_reviews/reviews_Beauty_5.json.gz')

In [59]:
df.head()

Unnamed: 0,summary,reviewerName,helpful,reviewText,reviewTime,unixReviewTime,overall,reviewerID,asin
0,Don't waste your money,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,"01 30, 2014",1391040000,1.0,A1YJEY40YUW4SE,7806397051
1,OK Palette!,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,"04 18, 2014",1397779200,3.0,A60XNB876KYML,7806397051
2,great quality,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,"09 6, 2013",1378425600,4.0,A3G6XNM240RMWA,7806397051
3,Do not work on my face,Norah,"[2, 2]",I really can't tell what exactly this thing is...,"12 8, 2013",1386460800,2.0,A1PQFP6SAJ6D80,7806397051
4,It's okay.,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...","10 19, 2013",1382140800,3.0,A38FVHZTNQ271F,7806397051


In [60]:
def text2wordcounts(text):
    words = text.lower().split()
    return np.array(len(words))

In [61]:
df['counts'] = df['reviewText'].apply(text2wordcounts)

In [62]:
df.head()

Unnamed: 0,summary,reviewerName,helpful,reviewText,reviewTime,unixReviewTime,overall,reviewerID,asin,counts
0,Don't waste your money,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,"01 30, 2014",1391040000,1.0,A1YJEY40YUW4SE,7806397051,28
1,OK Palette!,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,"04 18, 2014",1397779200,3.0,A60XNB876KYML,7806397051,27
2,great quality,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,"09 6, 2013",1378425600,4.0,A3G6XNM240RMWA,7806397051,102
3,Do not work on my face,Norah,"[2, 2]",I really can't tell what exactly this thing is...,"12 8, 2013",1386460800,2.0,A1PQFP6SAJ6D80,7806397051,35
4,It's okay.,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...","10 19, 2013",1382140800,3.0,A38FVHZTNQ271F,7806397051,65


In [64]:
wordlen = 500
print("%s rows among %s rows have length more than %s" % (df[df['counts']>wordlen].count()[0], len(df), wordlen))

1306 rows among 198502 rows have length more than 500


In [65]:
def text2words(text):
    words = np.array(text.lower().split())
    return words

In [68]:
X = df['reviewText'].apply(text2words).values
X[0]

array(['very', 'oily', 'and', 'creamy.', 'not', 'at', 'all', 'what', 'i',
       'expected...', 'ordered', 'this', 'to', 'try', 'to', 'highlight',
       'and', 'contour', 'and', 'it', 'just', 'looked', 'awful!!!',
       'plus,', 'took', 'forever', 'to', 'arrive.'], dtype='<U11')

In [69]:
# apply normalization on score from 1 ~ 5 to 0.2 ~ 1.0
y = np.array(df['overall'].values)/5.0
y[0]

0.2

In [78]:
# warning, it tooks more than 100GB for all preprocessed data.
# recommand to use minimized version using code below
# X = X[:2000]
# y = y[:2000]

In [71]:
from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.01, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.02, random_state=42)

In [75]:
class CNN_Regression(tf.keras.Model):
    def __init__(self,
                 in_dim= wordlen,
                 w2v_dim= 300,
                 out_dim=1,
                 learning_rate=0.001,
                 checkpoint_directory="checkpoints/",
                 device_name="cpu:0"):
        super(CNN_Regression, self).__init__()
        self.in_dim = in_dim
        self.w2v_dim = w2v_dim
        self.out_dim = out_dim
        self.learning_rate = learning_rate
        self. checkpoint_directory = checkpoint_directory
        self.device_name = device_name

        self.conv11 = tf.layers.Conv1D(filters=100, kernel_size=3, padding="valid")
        self.conv12 = tf.layers.Conv1D(100, 4, padding="valid")
        self.conv13 = tf.layers.Conv1D(100, 5, padding="valid")

        self.maxpool1 = tf.layers.MaxPooling1D(pool_size=2, strides=2)
        self.maxpool2 = tf.layers.MaxPooling1D(pool_size=2, strides=2)
        self.maxpool3 = tf.layers.MaxPooling1D(pool_size=2, strides=2)

        self.flatten = tf.layers.Flatten()
        self.dropout = tf.layers.Dropout(0.5)

        self.out = tf.layers.Dense(self.out_dim, activation=tf.nn.sigmoid)        
        
        # optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        
        # global step
        self.global_step = 0
        
        # verbose logging
        self.epoch_loss = 0
        
        # plotting
        self.train_losses = []
        self.val_losses = []
        self.test_losses = []
        
    def preprocess(self, X):
        x = np.zeros((len(X), self.in_dim, self.w2v_dim), dtype="float32")
        for i, batch in enumerate(X):
            for j, word in enumerate(batch):
                if j >= 500:
                    break
                if word in w2v:
                    x[i][j] = w2v[word]
        return tf.convert_to_tensor(x)
        
    def predict(self, X, training):
        
        x11 = self.conv11(X)
        x12 = self.conv12(X)
        x13 = self.conv13(X)

        x11m = tf.reduce_max(x11,1)
        x12m = tf.reduce_max(x12,1)
        x13m = tf.reduce_max(x13,1)

        xf1 = self.flatten(x11m)
        xf2 = self.flatten(x12m)
        xf3 = self.flatten(x13m)

        xf = tf.concat([xf1, xf2, xf3], axis=1)

        if training:
            xf = self.dropout(xf)

        pred = self.out(xf)
        return pred

    def loss(self, X, y, training):
        prediction = self.predict(X, training)
        loss_val = tf.losses.mean_squared_error(y, prediction)
#         loss_val += tf.nn.l2_loss(self.out.weights[0])
        self.epoch_loss += loss_val.numpy()
        
        return loss_val

    def grad(self, X, y, training):
        with tfe.GradientTape() as tape:
            loss_val = self.loss(X, y, training)
        return tape.gradient(loss_val, self.variables) 
        
    def batch_preprocess(self, X_train, y_train, batch_size):
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        batch_dir = os.path.abspath("/tmp/%s" % timestamp)
        os.makedirs(batch_dir)
        for i in range(len(X_train)//batch_size + 1):
            X = X_train[50*i:min(50*(i+1), len(X_train))]
            X = self.preprocess(X)
            y = y_train[50*i:min(50*(i+1), len(y_train))].reshape(-1,1)
            filenameX = os.path.join(batch_dir, "batchX_%s.npy" % i)
            filenamey = os.path.join(batch_dir, "batchy_%s.npy" % i)
            np.save(filenameX, X)
            np.save(filenamey, y)
        
        return batch_dir
            
        
    
    def fit(self, X_train, y_train, X_val, y_val, X_test, y_test, epochs=1, verbose=1, 
            batch_size=50, saving=False, early_stopping=0):
        import time
        start = time.time()
        print("preprocessing...  ", end="")
        batch_dir = self.batch_preprocess(X_train, y_train, batch_size)
        print("Done %s second" % (time.time()-start))
        
        X_val = self.preprocess(X_val)
        X_test = self.preprocess(X_test)
        
        with tf.device(self.device_name):
            for i in range(epochs):
                self.epoch_loss = 0
                for j in range(len(X_train)//batch_size + 1):
                    filenameX = os.path.join(batch_dir, "batchX_%s.npy" % j)
                    filenamey = os.path.join(batch_dir, "batchy_%s.npy" % j)
                    X = tf.convert_to_tensor(np.load(filenameX))
                    y = tf.convert_to_tensor(np.load(filenamey))
                    
                    grads = self.grad(X, y, True)

                    self.optimizer.apply_gradients(zip(grads, self.variables))
                    
                self.global_step += 1
                
                self.train_losses.append(self.epoch_loss)
                
                self.val_losses.append(self.loss(X_val, y_val.reshape(-1,1), False).numpy())
                    
                self.test_losses.append(self.loss(X_test, y_test.reshape(-1,1), False).numpy())
                

                if (i+1)%verbose == 0 :
                    print("[EPOCH %d / STEP %d]" % ((i + 1), self.global_step))
                    print("Train loss : %s" % self.train_losses[-1])
                    print("Val   loss : %s" % self.val_losses[-1])
                    print("Test  loss : %.4f" % (self.test_losses[-1]))
                    print()
                          

                
                if early_stopping:
                    ok = False
                    if len(self.val_losses) <= early_stopping:
                        continue
                    for i in range(early_stopping):
                        if self.val_losses[-(i+1)] < self.val_losses[-(i+2)]:
                            ok = True
                            break
                    if ok:
                        continue
                    print("early stopping on step %s" % self.global_step)
                    break
                                              
                    
                    
    def save(self):
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=self.global_step)

    def load(self, global_step="latest"):
        # init
        self.predict(tf.convert_to_tensor(np.zeros([1, self.in_dim]), dtype="float32"), False)

        saver = tfe.Saver(self.variables)
        if global_step == "latest":
            saver.restore(tf.train.latest_checkpoint(self.checkpoint_directory))
            self.global_step = int(tf.train.latest_checkpoint(self.checkpoint_directory).split('/')[-1][1:])
        else:
            saver.restore(self.checkpoint_directory + "-" + str(global_step))
            self.global_step = global_step

            

In [76]:
cnnr = CNN_Regression(device_name="gpu:0")

In [77]:
cnnr.fit(X_train, y_train, X_val, y_val, X_test, y_test, 5, 1)

preprocessing...  Done 827.2608127593994
[EPOCH 1 / STEP 1]
Train loss : 108.88072858471423
Val   loss : 0.024247907
Test  loss : 0.0240

[EPOCH 2 / STEP 2]
Train loss : 82.50371589511633
Val   loss : 0.023112873
Test  loss : 0.0230

[EPOCH 3 / STEP 3]
Train loss : 68.39071556646377
Val   loss : 0.023961807
Test  loss : 0.0240

[EPOCH 4 / STEP 4]
Train loss : 56.8140720189549
Val   loss : 0.025315262
Test  loss : 0.0249

[EPOCH 5 / STEP 5]
Train loss : 52.97851409902796
Val   loss : 0.025482649
Test  loss : 0.0255

