In [1]:
from keras.layers import Input, Dense, TimeDistributed, Embedding
from keras.layers import Concatenate, Reshape, Lambda, Multiply, multiply, concatenate
from keras.models import Model
from keras import backend as K

import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import tensorflow as tf
import numpy as np

Using TensorFlow backend.


In [2]:
# make first model

def build_base_model(input_shape):
    input_layer = Input(shape=(input_shape,))
    tanh_output = Dense(1, activation='tanh', name='tanh_output')(input_layer)
    
    model = Model(inputs=input_layer, outputs=tanh_output)
    model.summary()
    return model

In [3]:
# load data
# make sure that the first shape is the IMDB training data. 

def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

In [4]:
# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

In [5]:
def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

def generate_appearance(X_train_corpus, X_test_corpus, word_list, connotation):
    y_train_agreement = []
    for i in range(len(X_train_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_train_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(len(X_test_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_test_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)

# 'imdb-unigrams.txt'

In [7]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [8]:
y_train_agreement, y_test_agreement = generate_appearance(X_train_original, X_test_original, 
                                                          word_list, connotation)

In [9]:
def accuracy_reject(combined, X, y_agreement, y):
    
    # define model which get the input from combined model
    # output the value after relu
    human_terms_relu_model = Model(inputs=combined.input,
                                    outputs=combined.get_layer('concatenate').output)
    predict_relu = human_terms_relu_model.predict([X, y_agreement])
    accept_indices = np.where(np.sum(predict_relu, axis=1)!=0)
    accept_indices = accept_indices[0]
    total_reject = X.shape[0] - len(accept_indices)
    rejection_rate = total_reject/X.shape[0]

    test_eval = combined.evaluate([X[accept_indices], y_agreement[accept_indices]], y[accept_indices])
    
    return test_eval, rejection_rate


In [10]:
def layer_split(x):
    return tf.split(x,num_or_size_splits=human_terms_len,axis=1)

def layer_concat(x):
    return tf.concat(x, axis=1)

In [11]:
# build the combined model
# Combined model
human_terms_len = len(word_list)

base_model = build_base_model(X_train.shape[1])

combined_input_layer = Input(shape=(X_train.shape[1],))

# build the hard coded weight for human terms
ht_input_layer = Input(shape=(human_terms_len,))

# split = Lambda( lambda x: tf.split(x,num_or_size_splits=human_terms_len,axis=1))(ht_input_layer)
split = Lambda(layer_split)(ht_input_layer)


# get the document prediction
label_layer = base_model(combined_input_layer)

# stack the multiply layer
dense_layer = []
for i in range(human_terms_len):
    dense_layer.append(Dense(1, activation='relu',use_bias=False, kernel_initializer='ones')(Multiply()([split[i], label_layer])))

# concat all the result   
# concat = Lambda( lambda x: tf.concat(x, axis=1), name='concatenate')(dense_layer)
concat = Lambda(layer_concat, name='concatenate')(dense_layer)


# pass it to sigmoid layer
output_layer = Dense(1, activation='sigmoid')(concat)

combined_model = Model(inputs=[combined_input_layer, ht_input_layer], outputs=output_layer)
combined_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3686)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3687      
Total params: 3,687
Trainable params: 3,687
Non-trainable params: 0
_________________________________________________________________
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 83)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 3686)         0                                            
_________________________________________

In [12]:
base_model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['acc'])

combined_model.compile(loss='mse',
                      optimizer='adam',
                      metrics=['acc'])

In [13]:
y_train_tanh = y_train_original
y_train_tanh[y_train_tanh == 0] = -1

In [14]:
y_train_agreement.shape

(25000, 83)

In [37]:
base_model_history = base_model.fit(X_train[:16667], y_train_tanh[:16667], 
                                    validation_data=(X_train[16667:], y_train_tanh[16667:]),
                                    batch_size=1, epochs=1)

Train on 16667 samples, validate on 8333 samples
Epoch 1/1


In [38]:
combined_model_history = combined_model.fit([X_train[:16667],y_train_agreement[:16667]], y_train_original[:16667], 
                                            validation_data=([X_train[16667:], y_train_agreement[16667:]], y_train_original[16667:]),
                                            batch_size=1, epochs=1)

Train on 16667 samples, validate on 8333 samples
Epoch 1/1


In [48]:
score = combined_model.evaluate([X_test, y_test_agreement], y_test_original)
score



[0.2038297988319397, 0.73308]

In [49]:
accuracy_reject(combined_model, X_test, y_test_agreement, y_test_original)



([0.16293317903452306, 0.8066815568037441], 0.15828)

In [50]:
score = combined_model.evaluate([X_train, y_train_agreement], y_train_original)
score



[0.7307313213539124, 0.25808]

In [51]:
accuracy_reject(combined_model, X_train, y_train_agreement, y_train_original)



([0.69519420602289, 0.304483246817348], 0.1524)

### Report

In [20]:
# Print report on the word transparency
index = [9, 19]
def report():
    for i in index:
        print()
        bm = base_model.predict(X_test[i])
        
        cm = combined_model.predict([X_test[i], 
                                np.reshape(y_test_agreement[i], (1,y_test_agreement.shape[1]))])
        
#         document_output = 'multiply'
#         document_predict = Model(inputs=combined_model.input,
#                                      outputs=combined_model.get_layer(document_output).output)
#         doc_output = document_predict.predict([np.reshape(data[i], (1,5)), 
#                                       ht_1_input[i], 
#                                       ht_2_input[i], 
#                                       ht_3_input[i], 
#                                       ht_4_input[i]])
        
        layer_name = 'concatenate'
        concat_after_relu = Model(inputs=combined_model.input,
                                     outputs=combined_model.get_layer(layer_name).output)
        concat_output = concat_after_relu.predict([X_test[i], 
                                np.reshape(y_test_agreement[i], (1,y_test_agreement.shape[1]))])
        
        print(X_test_original[i], '\n\n actual label : ', y_test_original[i], '\n predict from base model : ', bm.flatten(), '\n predict label : ', cm.flatten())
    
        for i,output in enumerate(concat_output.flatten()):
            if output != 0:
                print(word_list[i], output)

In [21]:
report()


hilarious, clean, light-hearted, and quote-worthy. what else can you ask for in a film? this is my all-time, number one favorite movie. ever since i was a little girl, i have dreamed of owning a blue van with flame and an observation bubble.the cliché character in ridiculous situation are what make this film such great fun. the wonderful comedic chemistry between stephen furst (harold) and andy tennant (melio) make up most of my favorite part of the movie. and who did not love the hopeless awkwardness of flynch? do not forget the airport antic of leon's crony, dressed up as hari krishna: dancing, chanting and playing the tambourine--unbeatable! the clue are genius, the location are classic, and the plot is timeless.a word to the wise, if you did not watch this film when you were little, it probably will not win a place in your heart today. but nevertheless give it a chance, you may find that "it does not matter what you say, it does not matter what you do, you have gotta play." 

 act

In [14]:
score = combined_model.evaluate([X_test, y_test_agreement], y_test_original)



In [15]:
score

[0.16272140228033066, 0.78232]

### Let's see the weight when the trainable is false

In [18]:
def build_combined_model():
    # build the combined model
    # Combined model
    human_terms_len = len(word_list)

    base_model = build_base_model(X_train.shape[1])

    combined_input_layer = Input(shape=(X_train.shape[1],))

    # build the hard coded weight for human terms
    ht_input_layer = Input(shape=(human_terms_len,))

    split = Lambda( lambda x: tf.split(x,num_or_size_splits=human_terms_len,axis=1))(ht_input_layer)

    # get the document prediction
    label_layer = base_model(combined_input_layer)

    # stack the multiply layer
    dense_layer = []
    for i in range(human_terms_len):
        dense_layer.append(Dense(1, activation='relu',use_bias=False, kernel_initializer='ones', trainable=True)(Multiply()([split[i], label_layer])))

    # concat all the result   
    concat = Lambda( lambda x: tf.concat(x, axis=1), name='concatenate')(dense_layer)

    # pass it to sigmoid layer
    output_layer = Dense(1, activation='sigmoid')(concat)

    combined_model = Model(inputs=[combined_input_layer, ht_input_layer], outputs=output_layer)
    combined_model.summary()
    
    return base_model, combined_model

In [19]:
false_base_model, false_combined_model = build_combined_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 3686)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3687      
Total params: 3,687
Trainable params: 3,687
Non-trainable params: 0
_________________________________________________________________
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 83)           0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 3686)         0                                            
_________________________________________

In [20]:
false_base_model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['acc'])

false_base_model.trainable=False

false_combined_model.compile(loss='mse',
                      optimizer='adam',
                      metrics=['acc'])

base_model_history_train_false = false_base_model.fit(X_train[:16667], y_train_tanh[:16667], 
                                    validation_data=(X_train[16667:], y_train_tanh[16667:]),
                                    batch_size=1, epochs=1)

combined_model_history_train_false = false_combined_model.fit([X_train[:16667],y_train_agreement[:16667]], y_train_original[:16667], batch_size=1, epochs=1)

  'Discrepancy between trainable weights and collected trainable'


Train on 16667 samples, validate on 8333 samples
Epoch 1/1
Epoch 1/1


In [23]:
# Evaluate model

score = false_combined_model.evaluate([X_test, y_test_agreement], y_test_original)
score



[0.13706384876012803, 0.81436]

In [52]:
accuracy_reject(false_combined_model, X_test, y_test_agreement, y_test_original)



([0.10435561584943832, 0.8663397802488687], 0.11896)

In [24]:
score = false_combined_model.evaluate([X_train, y_train_agreement], y_train_original)
score



[0.6858638257026672, 0.35624]

In [53]:
accuracy_reject(false_combined_model, X_train, y_train_agreement, y_train_original)



([0.6450690616898749, 0.4036988350496261], 0.11756)