In [3]:
import numpy as np
from numpy.random import seed
seed(42)
# load data
# make sure that the first shape is the IMDB training data. 

def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

def generate_appearance(X_train_corpus, X_test_corpus, word_list, connotation):
    y_train_agreement = []
    for i in range(len(X_train_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_train_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(len(X_test_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_test_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)

# 'imdb-unigrams.txt'

X_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytest.pickle')

# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

y_train_agreement, y_test_agreement = generate_appearance(X_train_original, X_test_original, 
                                                          word_list, connotation)

In [None]:
import tensorflow as tf

from tensorflow import set_random_seed
seed(42)
set_random_seed(42)

from keras.layers import Input, Dense, TimeDistributed, Embedding
from keras.layers import Concatenate, Reshape, Lambda, Multiply, multiply, concatenate
from keras.models import Model
from keras import backend as K



def accuracy_reject(combined, X, y_agreement, y):
    
    # define model which get the input from combined model
    # output the value after relu
    human_terms_relu_model = Model(inputs=combined.input,
                                    outputs=combined.get_layer('concatenate').output)
    predict_relu = human_terms_relu_model.predict([X, y_agreement])
    accept_indices = np.where(np.sum(predict_relu, axis=1)!=0)
    accept_indices = accept_indices[0]
    total_reject = X.shape[0] - len(accept_indices)
    rejection_rate = total_reject/X.shape[0]

    test_eval = combined.evaluate([X[accept_indices], y_agreement[accept_indices]], y[accept_indices])
    
    return test_eval, rejection_rate

# make first model

def build_base_model(input_shape):
    input_layer = Input(shape=(input_shape,))
#     tanh_output = Dense(1, activation='tanh', name='tanh_output')(input_layer)
    tanh_output = Dense(1, activation='sigmoid')(input_layer)
    
    model = Model(inputs=input_layer, outputs=tanh_output)
    model.summary()
    return model


def layer_split(x):
    return tf.split(x,num_or_size_splits=human_terms_len,axis=1)

def layer_concat(x):
    return tf.concat(x, axis=1)

# build the combined model
# Combined model
human_terms_len = len(word_list)

base_model = build_base_model(X_train.shape[1])

combined_input_layer = Input(shape=(X_train.shape[1],))

# build the hard coded weight for human terms
ht_input_layer = Input(shape=(human_terms_len,))

# split = Lambda( lambda x: tf.split(x,num_or_size_splits=human_terms_len,axis=1))(ht_input_layer)
split = Lambda(layer_split)(ht_input_layer)


# get the document prediction
label_layer = base_model(combined_input_layer)

# do normalize of bipolar sigmoid


# stack the multiply layer
dense_layer = []
for i in range(human_terms_len):
    dense_layer.append(Dense(1, activation='relu',use_bias=False, kernel_initializer='ones')(Multiply()([split[i], label_layer])))

# concat all the result   
# concat = Lambda( lambda x: tf.concat(x, axis=1), name='concatenate')(dense_layer)
concat = Lambda(layer_concat, name='concatenate')(dense_layer)


# pass it to sigmoid layer
output_layer = Dense(1, activation='sigmoid')(concat)

combined_model = Model(inputs=[combined_input_layer, ht_input_layer], outputs=output_layer)
# combined_model.summary()

base_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

# combined_model.compile(loss='mse',
#                       optimizer='adam',
#                       metrics=['acc'])

y_train_tanh = y_train_original
y_train_tanh[y_train_tanh == 0] = -1

y_test_tanh = y_test_original
y_test_tanh[y_test_tanh == 0] = -1

base_model_history = base_model.fit(X_train[:16667], y_train_original[:16667], 
                                    validation_data=(X_train[16667:], y_train_original[16667:]),
                                    batch_size=1, epochs=1)

# combined_model_history = combined_model.fit([X_train[:16667],y_train_agreement[:16667]], y_train_original[:16667], 
#                                             validation_data=([X_train[16667:], y_train_agreement[16667:]], y_train_original[16667:]),
#                                             batch_size=1, epochs=1)

In [None]:
def binary_accuracy(y_true, y_pred):
    return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)

In [None]:
score = combined_model.evaluate([X_test, y_test_agreement], y_test_original)
score

In [17]:
accuracy_reject(combined_model, X_test, y_test_agreement, y_test_original)



([0.6911189293496206, 0.3853985141717785], 0.12776)

In [18]:
score = combined_model.evaluate([X_train, y_train_agreement], y_train_original)
score



[0.6867102897262574, 0.35676]

In [19]:
accuracy_reject(combined_model, X_train, y_train_agreement, y_train_original)



([0.6436229766920029, 0.4079308452250274], 0.12544)

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)

clf.fit(X_train, y_train_original)
print(clf.score(X_test, y_test_original))
print(clf.score(X_train, y_train_original))

### Report

In [None]:
# Print report on the word transparency
index = [9, 19]
def report():
    for i in index:
        print()
        bm = base_model.predict(X_test[i])
        
        cm = combined_model.predict([X_test[i], 
                                np.reshape(y_test_agreement[i], (1,y_test_agreement.shape[1]))])
        
#         document_output = 'multiply'
#         document_predict = Model(inputs=combined_model.input,
#                                      outputs=combined_model.get_layer(document_output).output)
#         doc_output = document_predict.predict([np.reshape(data[i], (1,5)), 
#                                       ht_1_input[i], 
#                                       ht_2_input[i], 
#                                       ht_3_input[i], 
#                                       ht_4_input[i]])
        
        layer_name = 'concatenate'
        concat_after_relu = Model(inputs=combined_model.input,
                                     outputs=combined_model.get_layer(layer_name).output)
        concat_output = concat_after_relu.predict([X_test[i], 
                                np.reshape(y_test_agreement[i], (1,y_test_agreement.shape[1]))])
        
        print(X_test_original[i], '\n\n actual label : ', y_test_original[i], '\n predict from base model : ', bm.flatten(), '\n predict label : ', cm.flatten())
    
        for i,output in enumerate(concat_output.flatten()):
            if output != 0:
                print(word_list[i], output)

In [None]:
report()

In [None]:
score = combined_model.evaluate([X_test, y_test_agreement], y_test_original)

In [None]:
score

### Let's see the weight when the trainable is false

In [None]:
def build_combined_model():
    # build the combined model
    # Combined model
    human_terms_len = len(word_list)

    base_model = build_base_model(X_train.shape[1])

    combined_input_layer = Input(shape=(X_train.shape[1],))

    # build the hard coded weight for human terms
    ht_input_layer = Input(shape=(human_terms_len,))

    split = Lambda( lambda x: tf.split(x,num_or_size_splits=human_terms_len,axis=1))(ht_input_layer)

    # get the document prediction
    label_layer = base_model(combined_input_layer)

    # stack the multiply layer
    dense_layer = []
    for i in range(human_terms_len):
        dense_layer.append(Dense(1, activation='relu',use_bias=False, kernel_initializer='ones', trainable=True)(Multiply()([split[i], label_layer])))

    # concat all the result   
    concat = Lambda( lambda x: tf.concat(x, axis=1), name='concatenate')(dense_layer)

    # pass it to sigmoid layer
    output_layer = Dense(1, activation='sigmoid')(concat)

    combined_model = Model(inputs=[combined_input_layer, ht_input_layer], outputs=output_layer)
    combined_model.summary()
    
    return base_model, combined_model

In [None]:
false_base_model, false_combined_model = build_combined_model()

In [None]:
false_base_model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['acc'])

false_base_model.trainable=False

false_combined_model.compile(loss='mse',
                      optimizer='adam',
                      metrics=['acc'])

base_model_history_train_false = false_base_model.fit(X_train[:16667], y_train_tanh[:16667], 
                                    validation_data=(X_train[16667:], y_train_tanh[16667:]),
                                    batch_size=1, epochs=1)

combined_model_history_train_false = false_combined_model.fit([X_train[:16667],y_train_agreement[:16667]], y_train_original[:16667], batch_size=1, epochs=1)

In [None]:
# Evaluate model

score = false_combined_model.evaluate([X_test, y_test_agreement], y_test_original)
score

In [None]:
accuracy_reject(false_combined_model, X_test, y_test_agreement, y_test_original)

In [None]:
score = false_combined_model.evaluate([X_train, y_train_agreement], y_train_original)
score

In [None]:
accuracy_reject(false_combined_model, X_train, y_train_agreement, y_train_original)