In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
import numpy as np
from numpy.random import seed
import tensorflow as tf

from tensorflow import set_random_seed

In [3]:
BATCH_SIZE = 32
LEARNING_RATE = 0.01
EPOCHS = 100
N_HIDDEN_HL1 = 10
RANDOM_STATE = 42

seed(RANDOM_STATE)
set_random_seed(RANDOM_STATE)

In [4]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

In [5]:
X_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytest.pickle')

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

In [7]:
X_tr = np.expand_dims(X_train, axis=0)
X_te = np.expand_dims(X_test, axis=0)

In [8]:
X_train = X_train.todense()

In [9]:
X_test = X_test.todense()

In [10]:
X_test.shape

(25000, 3686)

In [11]:
y_tr = np.reshape(y_train_original, (len(y_train_original), 1))
y_te = np.reshape(y_test_original, (len(y_test_original), 1))


In [12]:
y_tr.shape

(25000, 1)

### Start here

minimizing -cost is the same as maximizing cost <br>
    
https://github.com/Mazecreator/tensorflow-hints/tree/master/maximize

In [13]:
tf.reset_default_graph()

X_train_tensor = tf.placeholder(tf.float32, [None, X_train.shape[1]], name='review')
Y_train_tensor = tf.placeholder(tf.float32, [None, 1], name='label')

W = tf.get_variable(name='weights',
                   shape=(X_train.shape[1], 1), 
                   initializer=tf.initializers.glorot_uniform(seed=RANDOM_STATE))

b = tf.get_variable(name='bias', 
                   initializer=tf.constant(1.0))

# Final output logits
logits = tf.matmul(X_train_tensor, W) + b

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                             labels=Y_train_tensor))

optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

preds = tf.nn.sigmoid(logits)
correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0.5)), tf.float32), 
                         Y_train_tensor)

accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

writer = tf.summary.FileWriter('./graphs/imdb_simple', tf.get_default_graph())

# Start the session

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(EPOCHS):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X_train_tensor: X_train, Y_train_tensor: y_tr})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(accuracy, 
                             feed_dict={X_train_tensor: X_test, Y_train_tensor: y_te})
    
    print('Accuracy {0}'.format(accuracy_test))
writer.close()

AttributeError: module 'tensorflow.python.ops.initializers_ns' has no attribute 'glorot_uniform'

### Human term IMDB

In [None]:
def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

def generate_appearance(X_train_corpus, X_test_corpus, word_list, connotation):
    y_train_agreement = []
    for i in range(len(X_train_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_train_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(len(X_test_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_test_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)


In [None]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [None]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, X_train.shape[1]], name='review')
Y = tf.placeholder(tf.float32, [None, 1], name='label')

W = tf.get_variable(name='weights',
                   shape=(X_train.shape[1], 1), 
                   initializer=tf.initializers.glorot_uniform(seed=RANDOM_STATE))

b = tf.get_variable(name='bias', 
                   initializer=tf.constant(1.0))

# Final output logits
relu_op = tf.nn.relu(X)
logits = tf.matmul(relu_op, W) + b

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                             labels=Y))

optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

preds = tf.nn.sigmoid(logits)
correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0.5)), tf.float32), 
                         Y)

accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

writer = tf.summary.FileWriter('./graphs/imdb_simple', tf.get_default_graph())

# Start the session

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(EPOCHS):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X: X_train, Y: y_tr})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(accuracy, 
                             feed_dict={X: X_test, Y: y_te})
    
    print('Accuracy {0}'.format(accuracy_test))
writer.close()