In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
from numpy.random import seed
import tensorflow as tf
import re

from tensorflow import set_random_seed
seed(42)
set_random_seed(42)

In [1]:


import tensorflow as tf

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5464277124126046302
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7492209869
locality {
  bus_id: 1
}
incarnation: 5168416852241616085
physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1"
]


In [2]:
with tf.device('/device:GPU:2'):
  a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
  b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
  c = tf.matmul(a, b)
# Creates a session with allow_soft_placement and log_device_placement set
# to True.
sess = tf.Session(config=tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=True))
# Runs the op.
print(sess.run(c))

[[22. 28.]
 [49. 64.]]


In [5]:
BATCH_SIZE = 64
LEARNING_RATE = 0.01
EPOCHS = 100

N_HIDDEN_HL1 = 10
RANDOM_STATE = 42

seed(RANDOM_STATE)
set_random_seed(RANDOM_STATE)

In [6]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

In [7]:
X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

### Preprocess

In [12]:
# Truncate the vector by len = 80
# k = 40

word_list = []
connotation = {}
path = r'./imdb-unigrams.txt'

with open(path, 'r', encoding='utf8') as f:
    for line in f:
        word_list.append(line.strip())


In [13]:
len(word_list)

83

In [14]:
len(X_train_original)

25000

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

token_pattern = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 5, token_pattern=token_pattern, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

In [16]:
# make sure all the 'human-term' exists

def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2)) 

words = intersection(cv.get_feature_names(), word_list)
len(words)

83

In [17]:
cv_ht = CountVectorizer(token_pattern=token_pattern, vocabulary=word_list)
X_train_ht = cv.fit_transform(X_train_original)

### Placeholder here

In [18]:
# calculate log-ratio

'''
Count the negative and positive frequency
'''
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

'''
Count the ratio : log(#pos/#neg)
'''
def log_ratio_positive_negative(X, y, word_index):
    neg_count, pos_count = negative_positive_counts(X,y, word_index)
    log_ratio = np.log(pos_count+1)-np.log(neg_count+1)
    return log_ratio, neg_count, pos_count

'''
Sort top words w.r.t log ratio and write into file
'''
def sort_top_words_with_count(X, y, words,filename, top_k=10):
    log_ratio = []
    neg_count = []
    pos_count = []
    
    for i in range(0,len(words)):
        log_ratio_, neg_count_, pos_count_ = log_ratio_positive_negative(X, y, i)
        log_ratio.append(log_ratio_)
        neg_count.append(neg_count_)
        pos_count.append(pos_count_)
    
    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
#             n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), log_ratio[i], pos_count[i], neg_count[i]))
            w.write('\n')
        w.close()

In [19]:
sort_top_words_with_count(X_train_ht, y_train_original, word_list, 'human-terms-log-ratio', top_k=len(words))

In [20]:
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

log_ratio_list = load_list('human-terms-log-ratio.txt', '\t')

In [21]:
log_ratio_list

array([['2/10', '-3.87', '0', '47'],
       ['annoying', '-3.15', '9', '233'],
       ['badly', '-2.30', '0', '9'],
       ['best', '2.27', '328', '33'],
       ['funny', '1.99', '21', '2'],
       ['solid', '1.95', '6', '0'],
       ['waste', '1.95', '13', '1'],
       ['fantastic', '1.79', '5', '0'],
       ['awful', '-1.54', '2', '13'],
       ['subtle', '1.39', '27', '6'],
       ['8/10', '1.39', '7', '1'],
       ['obnoxious', '-1.25', '1', '6'],
       ['wasted', '1.25', '6', '1'],
       ['worse', '1.25', '6', '1'],
       ['1/10', '-1.15', '36', '116'],
       ['insult', '-1.05', '20', '59'],
       ['worst', '0.98', '7', '2'],
       ['6/10', '0.92', '14', '5'],
       ['7/10', '0.92', '4', '1'],
       ['brilliant', '-0.92', '1', '4'],
       ['forgettable', '-0.92', '1', '4'],
       ['refreshing', '0.92', '4', '1'],
       ['10/10', '0.92', '4', '1'],
       ['disappointing', '0.92', '4', '1'],
       ['unfortunately', '-0.92', '1', '4'],
       ['5/10', '0.88', '11', '4'],

In [22]:
X_tr = np.expand_dims(X_train, axis=0)
X_te = np.expand_dims(X_test, axis=0)

In [23]:
y_tr = np.reshape(y_train_original, (len(y_train_original), 1))
y_te = np.reshape(y_test_original, (len(y_test_original), 1))

In [24]:
y_ht_tr = np.zeros([X_train.shape[0], X_train.shape[1]])
y_ht_te = np.zeros([X_test.shape[0], X_train.shape[1]])

In [25]:
y_ht_tr.shape

(25000, 26266)

In [26]:
# maybe get intermediate layer of tanh h from Mitchell code.

def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

In [27]:
# get function index

vocabulary_ = cv.get_feature_names()
vocab_index = {}

for i,ht in enumerate(word_list):
    for j, voc in enumerate(vocabulary_):
        if voc == ht:
            vocab_index[ht] = j
            
inv_voc = {v: k for k, v in vocab_index.items()}

In [28]:
vocab_index

{'1/10': 11,
 '2/10': 185,
 '3/10': 225,
 '4/10': 250,
 '5/10': 268,
 '6/10': 285,
 '7/10': 302,
 '8/10': 320,
 '9/10': 338,
 '10/10': 18,
 'amazing': 1065,
 'annoying': 1262,
 'avoid': 1858,
 'awful': 1890,
 'bad': 1976,
 'badly': 1984,
 'beautiful': 2276,
 'beautifully': 2277,
 'best': 2474,
 'bland': 2656,
 'boring': 2942,
 'brilliant': 3096,
 'cheap': 3956,
 'disappointed': 6575,
 'disappointing': 6576,
 'disappointment': 6579,
 'dreadful': 7107,
 'dull': 7239,
 'enjoyable': 7836,
 'enjoyed': 7838,
 'excellent': 8205,
 'fails': 8481,
 'fantastic': 8551,
 'fascinating': 8589,
 'favorite': 8638,
 'forgettable': 9221,
 'fun': 9496,
 'funny': 9518,
 'funniest': 9516,
 'gem': 9709,
 'great': 10202,
 'horrible': 11280,
 'incredible': 11863,
 'insult': 12168,
 'lacks': 13259,
 'lame': 13292,
 'laughable': 13399,
 'lousy': 13996,
 'loved': 14007,
 'mediocre': 14778,
 'mess': 14905,
 'mst3k': 15498,
 'noir': 16018,
 'obnoxious': 16251,
 'pathetic': 17057,
 'perfect': 17246,
 'perfectly': 17

In [29]:
len(vocab_index)

83

In [30]:
def generate_appearance(X_train, X_test, word_index, connotation):
    y_train_agreement = []
    for i in range(X_train.shape[0]):
        doc_agreement = []
        for w,j in word_index.items():
            if X_train[i,j] == 1:
                if connotation[w] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(X_test.shape[0]):
        doc_agreement = []
        for w,j in word_index.items():
            if X_test[i,j] == 1:
                if connotation[w] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)

In [31]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [32]:
y_train_agreement, y_test_agreement = generate_appearance(X_train, X_test, word_index=vocab_index, connotation=connotation)

In [33]:
pos_ratio = {}
for i in range(len(log_ratio_list)):
    pos_ratio[log_ratio_list[i, 0]] = float(log_ratio_list[i, 1])

In [34]:
def get_ht_sum(y_agreement):
    tr_ge2 = np.zeros(y_agreement.shape[0])

    #X_reject_indices = np.squeeze(np.where(np.sum(np.absolute(y_agreement), axis=1)==0))
    X_ge2_indices = np.squeeze(np.where(np.sum(np.absolute(y_agreement), axis=1)>1))
    X_1_indices = np.squeeze(np.where(np.sum(np.absolute(y_agreement), axis=1)==1))

    tr_ge2[X_ge2_indices] = 2
    tr_ge2[X_1_indices] = 1
    
    return tr_ge2

def where_sample_ht_index(y_agreement, pos_ratio, word_list):
    
    ht_ge_one = get_ht_sum(y_agreement)
    
    ht_sample_index = []
    
    for i,y in enumerate(ht_ge_one):
        if y == 2:
            indices = np.squeeze(np.where(y_agreement[i, :] != 0))

            list_ = []
            for j in indices:
                try:
                    list_.append(pos_ratio[word_list[j]])
                except KeyError:
                    continue

            list_ = np.asarray(np.absolute(list_))
            ht_sample_index.append(indices[np.argmax(list_)])
        elif y == 1:
            ht_sample_index.append(np.where(y_agreement[i,:] != 0)[0][0])
        else:
            ht_sample_index.append(-1)
            
    return np.asarray(ht_sample_index)

In [35]:
def generate_sequence_sample(X_, y_agreement, pos_ratio, word_list, token_pattern=r"(?u)\b[\w\'/]+\b", input_seq_k=40, output_seq_t=2):
    # get which human terms to be extracted
    
    ht_sample_index = where_sample_ht_index(y_agreement, 
                                            pos_ratio, 
                                            word_list)
    X_sample = []
    y_target = []

    for idx, doc in enumerate(X_):
        if ht_sample_index[idx] == -1:
    #         X_sample.append('--NONE--')
    #         y_target.append('--NONE--')
    # discard doc with no human-terms
            continue

        join = '  '
        target = '  ' 
        token = re.findall(token_pattern, doc)

        for i,tok in enumerate(token):

            if tok==word_list[ht_sample_index[idx]]:
                # check if the length of document less than k
                # then just use the whole document
                if len(token) < input_seq_k and len(token)>(2*output_seq_t+1):
                    join = ' '.join(token)
                    target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])
                    break

                # less than k, less than EOF
                elif i < input_seq_k-1 and i<len(token)-1-input_seq_k:
                    join = ' '.join(token[:i+input_seq_k+1])

                    # define target
                    if output_seq_t > i:
                        target = ' '.join(token[:i+output_seq_t+1])
                    else:
                        target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])

                    break
                # more than k, more than EOF
                elif i>input_seq_k-1 and i>=len(token)-1-input_seq_k:
                    join = ' '.join(token[i-input_seq_k:])

                    #define target
                    if output_seq_t >= len(token)-1-output_seq_t:
                        target = ' '.join(token[i-output_seq_t:])

                    else:
                        target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])

                    break
                else:
                    join = ' '.join(token[i-input_seq_k:i+input_seq_k+1])
                    target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])
                    break

        X_sample.append(join)
        y_target.append(target)
    
    return X_sample, y_target

In [36]:
X_tr_sample, y_tr_target = generate_sequence_sample(X_train_original, y_train_agreement, pos_ratio, word_list)

In [37]:
X_te_sample, y_te_target = generate_sequence_sample(X_test_original, y_test_agreement, pos_ratio, word_list)

In [38]:
# # dump pickles
# import pickle



# pickle.dump(X_tr_sample, open('../data/imdb/X_tr_sample_original.pkl', 'wb'))
# pickle.dump(y_tr_target, open('../data/imdb/y_tr_target_original.pkl', 'wb'))
# pickle.dump(X_te_sample, open('../data/imdb/X_te_sample_original.pkl', 'wb'))
# pickle.dump(y_te_target, open('../data/imdb/y_te_target_original.pkl', 'wb'))

In [39]:
len(X_tr_sample[0])

405

In [40]:
X_tr_sample[0]

'even kind of a happy ending of sort whee a step up from part 4 but not much of one again brian yuzna is involved and screaming mad george so some decent special effect but not enough to make this great a few leftover from part 4 are hanging around too like clint howard and neith hunter but that does not really make any difference anyway i now have seeing the whole series out of my system now if i could'

In [41]:
len(X_te_sample)

22701

### Sample generated
#### Test on Seq2Seq architecture. 

Implement first in Keras <br>
Preprocess the sequence using one-hot representation (omit the embedding for this stage) <br> <br>

train, val, test : 25%, 25%, 50% <br> <br>

<b>DO NOT MODIFY TEST SAMPLES</b>

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_tr_matrix = cv.fit_transform(X_tr_sample)
X_te_matrix = cv.transform(X_te_sample)
y_tr_target_matrix = cv.transform(y_tr_target)
y_te_target_matrix = cv.transform(y_te_target)

In [45]:
words = cv.get_feature_names()

In [None]:
with open('cv_vocabulary.txt', 'w') as f:
    for w in words:
        f.write("%s\n" % item)

In [40]:
X_tr_ = X_tr_matrix.todense()
X_te_ = X_te_matrix.todense()
y_tr_ = y_tr_target_matrix.todense()
y_te_ = y_te_target_matrix.todense()

In [41]:
pickle.dump(X_tr_, open('X_train_sequence.pkl', 'wb'))
pickle.dump(y_tr_, open('y_train_target.pkl', 'wb'))
pickle.dump(X_te_, open('X_test_sequence.pkl', 'wb'))
pickle.dump(y_te_, open('y_test_target.pkl', 'wb'))

In [42]:
pickle.dump(X_tr_matrix, open('X_train_sequence_sparse.pkl', 'wb'))
pickle.dump(y_tr_target_matrix, open('y_train_target_sparse.pkl', 'wb'))
pickle.dump(X_te_matrix, open('X_test_sequence_sparse.pkl', 'wb'))
pickle.dump(y_te_target_matrix, open('y_test_target_sparse.pkl', 'wb'))

In [43]:
np.sum(y_te_[0])

5

In [44]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, X_tr_matrix.shape[1]], name='review')
Y = tf.placeholder(tf.float32, [None, y_tr_target_matrix.shape[1]], name='label')

W = tf.get_variable(name='weights',
                   shape=(X_tr_matrix.shape[1], y_tr_target_matrix.shape[1]), 
                   initializer=tf.glorot_uniform_initializer(seed=RANDOM_STATE))

b = tf.get_variable(name='bias', 
                   initializer=tf.constant(0.0))

# Final output logits
# relu_op = tf.nn.relu(X)
logits = tf.matmul(X, W) + b
preds = tf.contrib.sparsemax.sparsemax(logits)
loss = tf.reduce_mean(tf.contrib.sparsemax.sparsemax_loss(logits=logits,
                                                          sparsemax=preds,
                                                             labels=Y))

optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)


# correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0)), tf.float32), Y)

output_num = tf.cast(tf.greater(preds, tf.constant(0.0)), tf.float32)

# accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

writer = tf.summary.FileWriter('./graphs/imdb_simple', tf.get_default_graph())

# Start the session

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(10):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X: X_tr_, Y: y_tr_})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(preds, 
                             feed_dict={X: X_te_, Y: y_te_})
    
#     print('Accuracy {0}'.format(accuracy_test))
    print('output matrix', accuracy_test)
writer.close()

Epoch 0 out of 100 loss: 2.5474157
Epoch 1 out of 100 loss: 0.5094834
Epoch 2 out of 100 loss: -1.3256953
Epoch 3 out of 100 loss: -3.1212695
Epoch 4 out of 100 loss: -4.912244
Epoch 5 out of 100 loss: -6.7046037
Epoch 6 out of 100 loss: -8.501533
Epoch 7 out of 100 loss: -10.303265
Epoch 8 out of 100 loss: -12.108536
Epoch 9 out of 100 loss: -13.916003
output matrix [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [45]:
np.sum(np.sum(accuracy_test, axis=1)>0)

22701

In [46]:
np.squeeze(np.where(np.sum(accuracy_test, axis=1)>0))

array([    0,     1,     2, ..., 22698, 22699, 22700])

In [47]:
np.sum(np.sum(accuracy_test > 0, axis=1)>100)

115

In [48]:
idx=6

In [49]:
indices = np.squeeze(np.where(accuracy_test[idx] > 0))
accuracy_test[idx, indices], y_te_[idx, indices]

(array([0.02606535, 0.16891003, 0.01238585, 0.06271696, 0.09399891,
        0.01385641, 0.0420413 , 0.02645731, 0.08950949, 0.37073612,
        0.03328943, 0.06003714], dtype=float32),
 matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]))

In [50]:
# Let's see if they can predict the human terms
y_te_target_one_hot = []
for i in y_te_target:
    seq = re.findall(token_pattern,i)
    length = len(seq)
    if length%2==1:
        y_te_target_one_hot.append(seq[int((length/2))])
    elif length%2==0 and length != 0:
        y_te_target_one_hot.append(seq[int(length/2)-1])
    else:
        y_te_target_one_hot.append(' ')

In [51]:
## Now see the words

words = cv.get_feature_names()
ht_exist = np.zeros(accuracy_test.shape[0])

for i in range(100):
    indices = np.squeeze(np.where(accuracy_test[i]>0))
#     print(words[indices[0]])
#     print(y_te_target_one_hot[i])
    for j in indices:  
        if words[j] == y_te_target_one_hot[i]:
            print(i, words[j], y_te_target_one_hot[i])
            ht_exist[i] = 1
            break
    

0 excellent excellent
1 best best
3 best best
5 badly badly
10 best best
11 best best
24 funny funny
29 best best
31 great great
33 worst worst
34 funny funny
35 best best
39 funny funny
41 best best
43 funny funny
45 best best
47 best best
53 stupid stupid
55 best best
56 waste waste
60 best best
67 a a
69 bad bad
70 best best
78 best best
79 funny funny
81 subtle subtle
83 excellent excellent
88 excellent excellent
90 amazing amazing
94 excellent excellent
95 great great
98 boring boring


In [52]:
## Now see the words

words = cv.get_feature_names()
matched_target_sum = np.zeros(accuracy_test.shape[0])

for i in range(100):
    indices = np.squeeze(np.where(accuracy_test[i]>0))
#     print(words[indices[0]])
#     print(y_te_target_one_hot[i])
    for j in indices:  
        if words[j] == y_te_target_one_hot[i]:
            # modify the target sequence into some words
            print(i, words[j], y_te_target_one_hot[i])
            matched_target_sum[i] += 1
            break
    

0 excellent excellent
1 best best
3 best best
5 badly badly
10 best best
11 best best
24 funny funny
29 best best
31 great great
33 worst worst
34 funny funny
35 best best
39 funny funny
41 best best
43 funny funny
45 best best
47 best best
53 stupid stupid
55 best best
56 waste waste
60 best best
67 a a
69 bad bad
70 best best
78 best best
79 funny funny
81 subtle subtle
83 excellent excellent
88 excellent excellent
90 amazing amazing
94 excellent excellent
95 great great
98 boring boring


In [53]:
## Now see the words

words = cv.get_feature_names()
ht_exist = np.zeros(accuracy_test.shape[0])

for i in range(100):
    indices = np.squeeze(np.where(accuracy_test[i]>0))
#     print(words[indices[0]])
#     print(y_te_target_one_hot[i])
    for j in indices:  
        
        if words[j] == y_te_target_one_hot[i]:
            print(i, words[j], y_te_target_one_hot[i])
            ht_exist[i] = 1
            break
    

0 excellent excellent
1 best best
3 best best
5 badly badly
10 best best
11 best best
24 funny funny
29 best best
31 great great
33 worst worst
34 funny funny
35 best best
39 funny funny
41 best best
43 funny funny
45 best best
47 best best
53 stupid stupid
55 best best
56 waste waste
60 best best
67 a a
69 bad bad
70 best best
78 best best
79 funny funny
81 subtle subtle
83 excellent excellent
88 excellent excellent
90 amazing amazing
94 excellent excellent
95 great great
98 boring boring


In [54]:
np.sum(ht_exist)

33.0

In [55]:
y_te_target

['was an excellent show it',
 "of hollywood's best oriental actor",
 'so far wonderful world of',
 'average at best and i',
 'by the laughable effect of',
 'art film badly dubbed tend',
 'of the best study of',
 'small child great value without',
 'film such great fun the',
 'that final annoying moment when',
 'is the best i have',
 'underdog the best crime fighter',
 'made of solid gold and',
 'the bill perfectly a light',
 'is so bad about this',
 'it is funny even randy',
 'embarrassing and annoying that million',
 'a beautiful sensitive film',
 'catherine is subtle and sincerely',
 'did an amazing job of',
 'the most disappointing thing is',
 'found alan annoying i have',
 'i liked best about this',
 'do not waste your time',
 'it is funny though because',
 'not only funny he was',
 'it entirely 1/10',
 'only about 1/10 of the',
 'the story perfectly while the',
 'in her best camp lugosi',
 'honestly awful film bad',
 'the day great this may',
 'is a great way for',
 'made the wors

In [56]:
y_te_target[idx]

'of the best study of'

In [57]:
hist_count = np.zeros(100, dtype='int32')

for i in range(100):
    hist_count[i]=np.sum(np.sum(accuracy_test>0, axis=1)==i+1)

In [58]:
y_sum = np.sum(accuracy_test!=0, axis=1)
y_sum

array([10, 10, 11, ...,  9, 10, 12])

In [59]:
y_sum[y_sum>100] = 0

In [60]:
hist_count

array([   0,    0,    2,   17,  108,  384,  992, 1938, 2919, 3318, 3360,
       3016, 2275, 1677,  973,  599,  357,  175,  116,   51,   44,   30,
         19,   15,   18,   18,   15,   16,   17,   10,    8,    8,    6,
          6,    7,    8,    2,    5,    1,    1,    2,    4,    6,    4,
          2,    4,    1,    0,    2,    2,    0,    0,    0,    1,    1,
          3,    1,    5,    1,    5,    3,    2,    0,    2,    2,    0,
          0,    0,    1,    1,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [61]:
## matplotlib seaborn for histogram
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# the histogram of the data

n, bins, patches = plt.hist(y_sum, 100)

plt.xlabel('Non zero words')
plt.ylabel('Num Documents')
plt.title('Non zero count words of sparsemax')
plt.axis([1,25,0,4000])
plt.grid(True)

plt.show()


<Figure size 640x480 with 1 Axes>

In [62]:
indices = np.where(y_te_ == 1)

In [63]:
sum_list = []
idx = -1
sl = 0
for i in range(len(indices[0])):
    
    if idx != indices[0][i]:
        sum_list.append(sl)
        sl = 0
    
    sl += (int)(accuracy_test[indices[0][i], indices[1][i]] == y_te_[indices[0][i], indices[1][i]])
    
    idx = indices[0][i]
    

In [64]:
np.sum(sum_list != 0)

1

In [65]:
np.where(sum_list != 0)

(array([0]),)

In [66]:
y_te_target[0]

'was an excellent show it'

In [67]:
target_sum = np.sum(y_te_target_matrix.todense(),axis=0)[0]

for i in target_sum:
    print(i)

[[ 21  83  24 ... 252   5   9]]


In [68]:
def layer_split(x):
    return tf.split(x,num_or_size_splits=human_terms_len,axis=1)

In [69]:
## Let's try in Keras for now.

from keras.initializers import Constant, glorot_uniform
from keras.layers import Input, Dense
from keras.models import Model

input_layer = Input(shape=(X_tr_.shape[1],))
middle_layer = Dense(100, activation='tanh', kernel_initializer=glorot_uniform(seed=42))(input_layer)
# output = Dense(y_tr_target_matrix.shape[1], activation='softmax', kernel_initializer=glorot_uniform(seed=42))(input_layer)

output_layer = []
for i in range(y_tr_target_matrix.shape[1]):
    output_layer.append(Dense(1, activation='sigmoid', kernel_initializer=glorot_uniform(seed=42))(middle_layer))


Using TensorFlow backend.


In [70]:
## not good representation without hidden layer. 

model = Model(inputs=input_layer, outputs=output_layer)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1315)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 100)          131600      input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_4 (D

dense_583 (Dense)               (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_584 (Dense)               (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_585 (Dense)               (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_586 (Dense)               (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_587 (Dense)               (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_588 

dense_1083 (Dense)              (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_1084 (Dense)              (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_1085 (Dense)              (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_1086 (Dense)              (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_1087 (Dense)              (None, 1)            101         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_1088

In [71]:
model.compile(loss='binary_crossentropy',
             metrics=['acc'],
             optimizer='rmsprop')

In [72]:
y_tr_target_matrix.shape

(22752, 1315)

In [73]:
y_split_col = np.hsplit(y_tr_target_matrix.todense().astype(float), y_tr_target_matrix.shape[1])

In [74]:
y_split_te = np.hsplit(y_te_target_matrix.todense().astype(float), y_te_target_matrix.shape[1])

In [75]:
x = np.arange(20.0).reshape(5, 4)
print(x.shape)
print(x)
print(np.hsplit(x, 4))

(5, 4)
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]
 [16. 17. 18. 19.]]
[array([[ 0.],
       [ 4.],
       [ 8.],
       [12.],
       [16.]]), array([[ 1.],
       [ 5.],
       [ 9.],
       [13.],
       [17.]]), array([[ 2.],
       [ 6.],
       [10.],
       [14.],
       [18.]]), array([[ 3.],
       [ 7.],
       [11.],
       [15.],
       [19.]])]


In [76]:
preds_matrix = model.predict(X_te_matrix)

In [77]:
len(preds_matrix)

1315

In [78]:
preds_matrix

[array([[0.5266149 ],
        [0.46278775],
        [0.55079746],
        ...,
        [0.55462074],
        [0.5017936 ],
        [0.37824714]], dtype=float32), array([[0.5266149 ],
        [0.46278775],
        [0.55079746],
        ...,
        [0.55462074],
        [0.5017936 ],
        [0.37824714]], dtype=float32), array([[0.5266149 ],
        [0.46278775],
        [0.55079746],
        ...,
        [0.55462074],
        [0.5017936 ],
        [0.37824714]], dtype=float32), array([[0.5266149 ],
        [0.46278775],
        [0.55079746],
        ...,
        [0.55462074],
        [0.5017936 ],
        [0.37824714]], dtype=float32), array([[0.5266149 ],
        [0.46278775],
        [0.55079746],
        ...,
        [0.55462074],
        [0.5017936 ],
        [0.37824714]], dtype=float32), array([[0.5266149 ],
        [0.46278775],
        [0.55079746],
        ...,
        [0.55462074],
        [0.5017936 ],
        [0.37824714]], dtype=float32), array([[0.5266149 ],
        [0.4

In [79]:
preds_m = np.squeeze(np.array(preds_matrix))

In [80]:
preds_m.shape

(1315, 22701)

In [81]:
## cast preds_m to 0, 1

preds_cast = (preds_m >= 0.5).astype(float)

In [82]:
y_te_target_matrix.T.shape

(1315, 22701)

### Tensorflow

In [83]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, X_tr_matrix.shape[1]], name='review')
Y = tf.placeholder(tf.float32, [None, y_tr_target_matrix.shape[1]], name='label')

W = {
    'hidden': tf.Variable(tf.random_normal([X_tr_matrix.shape[1], 100])),
    'output': tf.Variable(tf.random_normal([100,y_tr_target_matrix.shape[1]]))
}
biases = {
    'hidden': tf.Variable(tf.random_normal([100], mean=1.0)),
    'output': tf.Variable(tf.random_normal([y_tr_target_matrix.shape[1]], mean=1.0))
}


hidden = tf.matmul(X, W['hidden']) + biases['hidden']  # hidden layer
hidden = tf.nn.relu(hidden)

output_ = tf.matmul(hidden, W['output']) + biases['output']  # outputs
# split_layer = tf.split(output_,num_or_size_splits=N_OUTPUTS,axis=1)
preds = tf.nn.sigmoid(output_)

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output_,
                                                             labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)

correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0.5)), tf.float32), 
                         Y)

accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

# writer = tf.summary.FileWriter('./graphs/imdb_simple', tf.get_default_graph())

# Start the session

saver = tf.train.Saver()

In [84]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(100):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X: X_tr_matrix.todense(), Y: y_tr_target_matrix.todense()})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(accuracy, 
                             feed_dict={X: X_te_matrix.todense(), Y: y_te_target_matrix.todense()})
    
    prediction = sess.run(preds, 
                         feed_dict={X: X_te_matrix.todense()})
    print('Accuracy {0}'.format(accuracy_test))
    print('output matrix', prediction)
    save_path = saver.save(sess, "./fully_connected_model.ckpt")
# writer.close()

Epoch 0 out of 100 loss: 19.204245
Epoch 1 out of 100 loss: 18.963303
Epoch 2 out of 100 loss: 18.725641
Epoch 3 out of 100 loss: 18.491291
Epoch 4 out of 100 loss: 18.26027
Epoch 5 out of 100 loss: 18.032583
Epoch 6 out of 100 loss: 17.808245
Epoch 7 out of 100 loss: 17.587254
Epoch 8 out of 100 loss: 17.369616
Epoch 9 out of 100 loss: 17.15532
Epoch 10 out of 100 loss: 16.944366
Epoch 11 out of 100 loss: 16.736746
Epoch 12 out of 100 loss: 16.532442
Epoch 13 out of 100 loss: 16.331444
Epoch 14 out of 100 loss: 16.133734
Epoch 15 out of 100 loss: 15.939284
Epoch 16 out of 100 loss: 15.748074
Epoch 17 out of 100 loss: 15.560069
Epoch 18 out of 100 loss: 15.375242
Epoch 19 out of 100 loss: 15.193559
Epoch 20 out of 100 loss: 15.014986
Epoch 21 out of 100 loss: 14.839493
Epoch 22 out of 100 loss: 14.667037
Epoch 23 out of 100 loss: 14.49759
Epoch 24 out of 100 loss: 14.331107
Epoch 25 out of 100 loss: 14.167546
Epoch 26 out of 100 loss: 14.006863
Epoch 27 out of 100 loss: 13.849012
Epoch

In [85]:
with tf.Session() as sess:
    saver.restore(sess, "./fully_connected_model.ckpt")
    prediction = sess.run(preds,
                       feed_dict={X: X_te_matrix.todense()})
    
pred_cast = (prediction >= 0.5).astype(float)
sum_pred = 0
sum_list = []
for i in range(pred_cast.shape[0]):
    indices = np.squeeze(np.argwhere(pred_cast[i, :]==1))
    sum_list.append(np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
    if np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]) > 0:
#         print(i, np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
        sum_pred += 1
print(sum_pred)
sum_list = np.asarray(sum_list)
for i in range(6):
    print(i, np.sum(sum_list==i))

INFO:tensorflow:Restoring parameters from ./fully_connected_model.ckpt
18165
0 4536
1 8626
2 6458
3 2554
4 487
5 40


### Different activation ---> tanh

In [86]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, X_tr_matrix.shape[1]], name='review')
Y = tf.placeholder(tf.float32, [None, y_tr_target_matrix.shape[1]], name='label')

W = {
    'hidden': tf.Variable(tf.random_normal([X_tr_matrix.shape[1], 100])),
    'output': tf.Variable(tf.random_normal([100,y_tr_target_matrix.shape[1]]))
}
biases = {
    'hidden': tf.Variable(tf.random_normal([100], mean=1.0)),
    'output': tf.Variable(tf.random_normal([y_tr_target_matrix.shape[1]], mean=1.0))
}

hidden = tf.matmul(X, W['hidden']) + biases['hidden']  # hidden layer
hidden = tf.nn.tanh(hidden)

output_ = tf.matmul(hidden, W['output']) + biases['output']  # outputs
preds = tf.nn.sigmoid(output_)

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output_,
                                                             labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)
correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0.5)), tf.float32), 
                         Y)

accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))
saver = tf.train.Saver()

In [87]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(100):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X: X_tr_matrix.todense(), Y: y_tr_target_matrix.todense()})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(accuracy, 
                             feed_dict={X: X_te_matrix.todense(), Y: y_te_target_matrix.todense()})
    
    prediction = sess.run(preds, 
                         feed_dict={X: X_te_matrix.todense()})
    print('Accuracy {0}'.format(accuracy_test))
    print('output matrix', prediction)
    save_path = saver.save(sess, "./fully_connected_model_tanh.ckpt")
writer.close()

Epoch 0 out of 100 loss: 4.4061265
Epoch 1 out of 100 loss: 4.381152
Epoch 2 out of 100 loss: 4.3563857
Epoch 3 out of 100 loss: 4.3318286
Epoch 4 out of 100 loss: 4.3074713
Epoch 5 out of 100 loss: 4.2833014
Epoch 6 out of 100 loss: 4.259304
Epoch 7 out of 100 loss: 4.2354608
Epoch 8 out of 100 loss: 4.2117558
Epoch 9 out of 100 loss: 4.188171
Epoch 10 out of 100 loss: 4.164688
Epoch 11 out of 100 loss: 4.1412888
Epoch 12 out of 100 loss: 4.1179557
Epoch 13 out of 100 loss: 4.0946684
Epoch 14 out of 100 loss: 4.0714097
Epoch 15 out of 100 loss: 4.0481596
Epoch 16 out of 100 loss: 4.0249
Epoch 17 out of 100 loss: 4.0016117
Epoch 18 out of 100 loss: 3.9782763
Epoch 19 out of 100 loss: 3.9548767
Epoch 20 out of 100 loss: 3.9313948
Epoch 21 out of 100 loss: 3.9078143
Epoch 22 out of 100 loss: 3.884119
Epoch 23 out of 100 loss: 3.8602934
Epoch 24 out of 100 loss: 3.836323
Epoch 25 out of 100 loss: 3.812193
Epoch 26 out of 100 loss: 3.7878916
Epoch 27 out of 100 loss: 3.7634053
Epoch 28 out

In [88]:
with tf.Session() as sess:
    saver.restore(sess, "./fully_connected_model_tanh.ckpt")
    prediction = sess.run(preds,
                       feed_dict={X: X_te_matrix.todense()})
    
pred_cast = (prediction >= 0.5).astype(float)
sum_pred = 0
sum_list = []

for i in range(pred_cast.shape[0]):
    indices = np.squeeze(np.argwhere(pred_cast[i, :]==1))
    sum_list.append(np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
    if np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]) > 0:
#         print(i, np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
        sum_pred += 1
print(sum_pred)
sum_list = np.asarray(sum_list)
for i in range(6):
    print(i, np.sum(sum_list==i))

INFO:tensorflow:Restoring parameters from ./fully_connected_model_tanh.ckpt
15402
0 7299
1 9360
2 4640
3 1233
4 160
5 9


In [89]:
sum_list.shape

(22701,)

In [90]:
np.sum(pred_cast, axis=1)[:50]

array([341., 374., 363., 294., 330., 317., 358., 332., 296., 387., 302.,
       393., 314., 324., 336., 321., 347., 303., 295., 365., 313., 302.,
       289., 626., 256., 325., 470., 338., 318., 333., 375., 316., 324.,
       353., 323., 377., 324., 295., 315., 281., 318., 363., 314., 288.,
       318., 336., 306., 360., 358., 398.])

In [None]:
pred_cast.shape

(22701, 1315)

### Less Hidden Unit

In [None]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, X_tr_matrix.shape[1]], name='review')
Y = tf.placeholder(tf.float32, [None, y_tr_target_matrix.shape[1]], name='label')

W = {
    'hidden': tf.Variable(tf.random_normal([X_tr_matrix.shape[1], 10])),
    'output': tf.Variable(tf.random_normal([10,y_tr_target_matrix.shape[1]]))
}
biases = {
    'hidden': tf.Variable(tf.random_normal([10], mean=1.0)),
    'output': tf.Variable(tf.random_normal([y_tr_target_matrix.shape[1]], mean=1.0))
}

hidden = tf.matmul(X, W['hidden']) + biases['hidden']  # hidden layer
hidden = tf.nn.tanh(hidden)

output_ = tf.matmul(hidden, W['output']) + biases['output']  # outputs
preds = tf.nn.sigmoid(output_)

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output_,
                                                             labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)
correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0.5)), tf.float32), 
                         Y)

accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))
saver = tf.train.Saver()

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(100):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X: X_tr_matrix.todense(), Y: y_tr_target_matrix.todense()})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(accuracy, 
                             feed_dict={X: X_te_matrix.todense(), Y: y_te_target_matrix.todense()})
    
    prediction = sess.run(preds, 
                         feed_dict={X: X_te_matrix.todense()})
    print('Accuracy {0}'.format(accuracy_test))
    print('output matrix', prediction)
    save_path = saver.save(sess, "./fully_connected_model_tanh_10hu.ckpt")
writer.close()

Epoch 0 out of 100 loss: 1.9461802
Epoch 1 out of 100 loss: 1.9429775
Epoch 2 out of 100 loss: 1.9397755
Epoch 3 out of 100 loss: 1.9365743
Epoch 4 out of 100 loss: 1.9333745
Epoch 5 out of 100 loss: 1.9301745
Epoch 6 out of 100 loss: 1.926974
Epoch 7 out of 100 loss: 1.9237719
Epoch 8 out of 100 loss: 1.9205663
Epoch 9 out of 100 loss: 1.9173563
Epoch 10 out of 100 loss: 1.9141403
Epoch 11 out of 100 loss: 1.910917
Epoch 12 out of 100 loss: 1.9076838
Epoch 13 out of 100 loss: 1.9044397
Epoch 14 out of 100 loss: 1.9011828
Epoch 15 out of 100 loss: 1.8979117
Epoch 16 out of 100 loss: 1.8946247
Epoch 17 out of 100 loss: 1.89132
Epoch 18 out of 100 loss: 1.8879964
Epoch 19 out of 100 loss: 1.8846531
Epoch 20 out of 100 loss: 1.8812877
Epoch 21 out of 100 loss: 1.8778995
Epoch 22 out of 100 loss: 1.8744874
Epoch 23 out of 100 loss: 1.8710498
Epoch 24 out of 100 loss: 1.8675854
Epoch 25 out of 100 loss: 1.8640933
Epoch 26 out of 100 loss: 1.8605717
Epoch 27 out of 100 loss: 1.8570197
Epoch 

In [None]:
with tf.Session() as sess:
    saver.restore(sess, "./fully_connected_model_tanh_10hu.ckpt")
    prediction = sess.run(preds,
                       feed_dict={X: X_te_matrix.todense()})
    
pred_cast = (prediction >= 0.5).astype(float)
sum_pred = 0
sum_list = []

for i in range(pred_cast.shape[0]):
    indices = np.squeeze(np.argwhere(pred_cast[i, :]==1))
    sum_list.append(np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
    if np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]) > 0:
#         print(i, np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
        sum_pred += 1
print(sum_pred)
sum_list = np.asarray(sum_list)
for i in range(6):
    print(i, np.sum(sum_list==i))

INFO:tensorflow:Restoring parameters from ./fully_connected_model_tanh_10hu.ckpt
21442
0 1259
1 5292
2 8133
3 5754
4 1960
5 303


### More epochs

In [None]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, X_tr_matrix.shape[1]], name='review')
Y = tf.placeholder(tf.float32, [None, y_tr_target_matrix.shape[1]], name='label')

W = {
    'hidden': tf.Variable(tf.random_normal([X_tr_matrix.shape[1], 100])),
    'output': tf.Variable(tf.random_normal([100,y_tr_target_matrix.shape[1]]))
}
biases = {
    'hidden': tf.Variable(tf.random_normal([100], mean=1.0)),
    'output': tf.Variable(tf.random_normal([y_tr_target_matrix.shape[1]], mean=1.0))
}

hidden = tf.matmul(X, W['hidden']) + biases['hidden']  # hidden layer
hidden = tf.nn.tanh(hidden)

output_ = tf.matmul(hidden, W['output']) + biases['output']  # outputs
preds = tf.nn.sigmoid(output_)

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output_,
                                                             labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)
correct_preds = tf.equal(tf.cast(tf.greater(preds, tf.constant(0.5)), tf.float32), 
                         Y)

accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(1000):
        total_loss = 0
        
        _, loss_per_epoch = sess.run([optimizer, loss], 
                           feed_dict={X: X_tr_matrix.todense(), Y: y_tr_target_matrix.todense()})
        
        print('Epoch', epoch, 'out of', EPOCHS, 'loss:', loss_per_epoch)
    
    accuracy_test = sess.run(accuracy, 
                             feed_dict={X: X_te_matrix.todense(), Y: y_te_target_matrix.todense()})
    
    prediction = sess.run(preds, 
                         feed_dict={X: X_te_matrix.todense()})
    print('Accuracy {0}'.format(accuracy_test))
    print('output matrix', prediction)
    save_path = saver.save(sess, "./fully_connected_model_tanh_1000_epochs.ckpt")
# writer.close()

Epoch 0 out of 100 loss: 4.274069
Epoch 1 out of 100 loss: 4.2491217
Epoch 2 out of 100 loss: 4.224275
Epoch 3 out of 100 loss: 4.199536
Epoch 4 out of 100 loss: 4.1748962
Epoch 5 out of 100 loss: 4.1503477
Epoch 6 out of 100 loss: 4.125879
Epoch 7 out of 100 loss: 4.1014795
Epoch 8 out of 100 loss: 4.077136
Epoch 9 out of 100 loss: 4.0528345
Epoch 10 out of 100 loss: 4.028562
Epoch 11 out of 100 loss: 4.0043025
Epoch 12 out of 100 loss: 3.980042
Epoch 13 out of 100 loss: 3.9557648
Epoch 14 out of 100 loss: 3.9314547
Epoch 15 out of 100 loss: 3.9070966
Epoch 16 out of 100 loss: 3.8826737
Epoch 17 out of 100 loss: 3.858172
Epoch 18 out of 100 loss: 3.8335748
Epoch 19 out of 100 loss: 3.8088682
Epoch 20 out of 100 loss: 3.7840374
Epoch 21 out of 100 loss: 3.7590683
Epoch 22 out of 100 loss: 3.7339475
Epoch 23 out of 100 loss: 3.7086623
Epoch 24 out of 100 loss: 3.6832004
Epoch 25 out of 100 loss: 3.6575503
Epoch 26 out of 100 loss: 3.6317008
Epoch 27 out of 100 loss: 3.6056416
Epoch 28 o

Epoch 224 out of 100 loss: 0.13024282
Epoch 225 out of 100 loss: 0.12870282
Epoch 226 out of 100 loss: 0.12719922
Epoch 227 out of 100 loss: 0.12573102
Epoch 228 out of 100 loss: 0.1242972
Epoch 229 out of 100 loss: 0.12289673
Epoch 230 out of 100 loss: 0.1215287
Epoch 231 out of 100 loss: 0.120192155
Epoch 232 out of 100 loss: 0.1188862
Epoch 233 out of 100 loss: 0.11760999
Epoch 234 out of 100 loss: 0.11636266
Epoch 235 out of 100 loss: 0.11514339
Epoch 236 out of 100 loss: 0.11395139
Epoch 237 out of 100 loss: 0.112785906
Epoch 238 out of 100 loss: 0.11164617
Epoch 239 out of 100 loss: 0.11053149
Epoch 240 out of 100 loss: 0.10944113
Epoch 241 out of 100 loss: 0.10837445
Epoch 242 out of 100 loss: 0.107330754
Epoch 243 out of 100 loss: 0.10630946
Epoch 244 out of 100 loss: 0.10530991
Epoch 245 out of 100 loss: 0.104331546
Epoch 246 out of 100 loss: 0.10337376
Epoch 247 out of 100 loss: 0.102436006
Epoch 248 out of 100 loss: 0.10151775
Epoch 249 out of 100 loss: 0.100618474
Epoch 250

Epoch 438 out of 100 loss: 0.048594203
Epoch 439 out of 100 loss: 0.048512682
Epoch 440 out of 100 loss: 0.04843171
Epoch 441 out of 100 loss: 0.048351277
Epoch 442 out of 100 loss: 0.04827136
Epoch 443 out of 100 loss: 0.048191987
Epoch 444 out of 100 loss: 0.048113126
Epoch 445 out of 100 loss: 0.048034776
Epoch 446 out of 100 loss: 0.047956944
Epoch 447 out of 100 loss: 0.047879603
Epoch 448 out of 100 loss: 0.047802757
Epoch 449 out of 100 loss: 0.047726396
Epoch 450 out of 100 loss: 0.047650535
Epoch 451 out of 100 loss: 0.047575146
Epoch 452 out of 100 loss: 0.047500234
Epoch 453 out of 100 loss: 0.047425784
Epoch 454 out of 100 loss: 0.047351807
Epoch 455 out of 100 loss: 0.047278285
Epoch 456 out of 100 loss: 0.047205217
Epoch 457 out of 100 loss: 0.04713259
Epoch 458 out of 100 loss: 0.047060415
Epoch 459 out of 100 loss: 0.046988677
Epoch 460 out of 100 loss: 0.046917364
Epoch 461 out of 100 loss: 0.046846498
Epoch 462 out of 100 loss: 0.04677604
Epoch 463 out of 100 loss: 0.

Epoch 651 out of 100 loss: 0.038046654
Epoch 652 out of 100 loss: 0.038015243
Epoch 653 out of 100 loss: 0.037983935
Epoch 654 out of 100 loss: 0.037952717
Epoch 655 out of 100 loss: 0.037921585
Epoch 656 out of 100 loss: 0.037890557
Epoch 657 out of 100 loss: 0.037859622
Epoch 658 out of 100 loss: 0.037828777
Epoch 659 out of 100 loss: 0.03779803
Epoch 660 out of 100 loss: 0.037767366
Epoch 661 out of 100 loss: 0.0377368
Epoch 662 out of 100 loss: 0.037706316
Epoch 663 out of 100 loss: 0.03767593
Epoch 664 out of 100 loss: 0.037645634
Epoch 665 out of 100 loss: 0.037615426
Epoch 666 out of 100 loss: 0.037585303
Epoch 667 out of 100 loss: 0.037555266
Epoch 668 out of 100 loss: 0.037525322
Epoch 669 out of 100 loss: 0.037495468
Epoch 670 out of 100 loss: 0.03746569
Epoch 671 out of 100 loss: 0.03743601
Epoch 672 out of 100 loss: 0.03740641
Epoch 673 out of 100 loss: 0.037376896
Epoch 674 out of 100 loss: 0.037347462
Epoch 675 out of 100 loss: 0.03731812
Epoch 676 out of 100 loss: 0.0372

Epoch 863 out of 100 loss: 0.032906964
Epoch 864 out of 100 loss: 0.032887913
Epoch 865 out of 100 loss: 0.0328689
Epoch 866 out of 100 loss: 0.032849915
Epoch 867 out of 100 loss: 0.032830972
Epoch 868 out of 100 loss: 0.032812066
Epoch 869 out of 100 loss: 0.03279319
Epoch 870 out of 100 loss: 0.03277435
Epoch 871 out of 100 loss: 0.032755554
Epoch 872 out of 100 loss: 0.032736782
Epoch 873 out of 100 loss: 0.032718047
Epoch 874 out of 100 loss: 0.032699354
Epoch 875 out of 100 loss: 0.03268069
Epoch 876 out of 100 loss: 0.03266206
Epoch 877 out of 100 loss: 0.032643467
Epoch 878 out of 100 loss: 0.032624908
Epoch 879 out of 100 loss: 0.032606382
Epoch 880 out of 100 loss: 0.032587893
Epoch 881 out of 100 loss: 0.032569434
Epoch 882 out of 100 loss: 0.032551013
Epoch 883 out of 100 loss: 0.032532625
Epoch 884 out of 100 loss: 0.03251427
Epoch 885 out of 100 loss: 0.03249595
Epoch 886 out of 100 loss: 0.03247766
Epoch 887 out of 100 loss: 0.032459404
Epoch 888 out of 100 loss: 0.03244

In [None]:
with tf.Session() as sess:
    saver.restore(sess, "./fully_connected_model_tanh_1000_epochs.ckpt")
    prediction = sess.run(preds,
                       feed_dict={X: X_te_matrix.todense()})
    
pred_cast = (prediction >= 0.5).astype(float)
sum_pred = 0
sum_list = []

for i in range(pred_cast.shape[0]):
    indices = np.squeeze(np.argwhere(pred_cast[i, :]==1))
    sum_list.append(np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
    if np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]) > 0:
#         print(i, np.sum(pred_cast[i, indices] == y_te_target_matrix[i,indices]))
        sum_pred += 1
print(sum_pred)
sum_list = np.asarray(sum_list)
for i in range(6):
    print(i, np.sum(sum_list==i))

INFO:tensorflow:Restoring parameters from ./fully_connected_model_tanh_1000_epochs.ckpt
3359
0 19342
1 2923
2 386
3 49
4 1
5 0


### Different initialization

### Sequence

In [None]:
# vectorize the data

# https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/

# count number of words with sets
# or simply use the tokenizer update in Keras

from keras.preprocessing.text import text_to_word_sequence, one_hot

train_sequence = []
test_sequence = []
for sample in X_tr_sample:
    train_sequence.append(text_to_word_sequence(sample))
for sample in X_te_sample:
    test_sequence.append(text_to_word_sequence(sample))
    
train_target = []
test_target = []

for target in y_tr_target:
    train_target.append(text_to_word_sequence(target))
for target in y_te_target:
    test_target.append(text_to_word_sequence(target))

In [None]:
len(train_sequence)
len(test_sequence)

22701

In [None]:
## So let preprocess it with one hot which provided by Keras

In [None]:
len(train_sequence[2])

81

In [None]:
# get dictionary

from keras.preprocessing.text import Tokenizer

MAX_NUM_WORDS = 10000
t = Tokenizer(num_words=MAX_NUM_WORDS)

t.fit_on_texts(X_tr_sample)

In [None]:
X_tr_sequence = t.texts_to_sequences(X_tr_sample)
X_te_sequence = t.texts_to_sequences(X_te_sample)

y_tr_sequence = t.texts_to_sequences(y_tr_target)
y_te_sequence = t.texts_to_sequences(y_te_target)

In [None]:
word_index = t.word_index

In [None]:
# can't use one-hot.
# Use embedding instead
len(train_sequence[0])

In [None]:
encoder_inputs = Input(shape=(None, ))