In [1]:
from keras.layers import Input, Dense, TimeDistributed, Embedding
from keras.layers import Concatenate, Reshape, Lambda, Multiply, multiply, concatenate
from keras.models import Model
from keras import backend as K

import os
# os.environ['CUDA_VISIBLE_DEVICES'] = ''

import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load data
# make sure that the first shape is the IMDB training data. 

def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

In [3]:
# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df = 100)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

In [4]:
def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

def generate_appearance(X_train_corpus, X_test_corpus, word_list, connotation):
    y_train_agreement = []
    for i in range(len(X_train_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_train_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(len(X_test_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_test_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)

# 'imdb-unigrams.txt'

In [30]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [31]:
y_train_agreement, y_test_agreement = generate_appearance(X_train_original, X_test_original, 
                                                          word_list, connotation)

In [37]:
def build_base_model(optimizer='Adagrad'):
    input_layer = Input(shape=(input_shape,))
    tanh_output = Dense(1, activation='tanh', name='tanh_output')(input_layer)
    
    model = Model(inputs=input_layer, outputs=tanh_output)
    model.summary()
    
    return model

#input_shape, human_terms_shape
def build_combined_model(optimizer='adam'):

    # input for base model
#     base_model = build_base_model(input_shape)
    combined_input_layer = Input(shape=(input_shape,))

    # build the hard coded weight for human terms and split the input 
    ht_input_layer = Input(shape=(human_terms_shape,))
    split = Lambda( lambda x: tf.split(x,num_or_size_splits=human_terms_shape,axis=1))(ht_input_layer)

    # get the document prediction
    label_layer = base_model(combined_input_layer)
    
    # multiply the predicion and the human terms absence -> pass it to relu
    dense_layer = []
    for i in range(human_terms_len):
        dense_layer.append(Dense(
            1, 
            activation='relu', 
            use_bias=False, 
            kernel_initializer='ones')(Multiply()([split[i], label_layer])))

    # concat all the result and pass it to sigmoid layer
    concat = Lambda( lambda x: tf.concat(x, axis=1), name='concatenate')(dense_layer)
    output_layer = Dense(1, activation='sigmoid')(concat)

    # build model
    combined_model = Model(inputs=[combined_input_layer, ht_input_layer], outputs=output_layer)
    combined_model.summary()
    

    combined_model.compile(loss='mse',
                      optimizer=optimizer,
                      metrics=['mae','acc'])
    
    return base_model, combined_model

In [10]:
y_train_tanh = y_train_original
y_train_tanh[y_train_tanh == 0] = -1

In [18]:
y_train_agreement.shape

(25000, 83)

In [17]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

seed = 42
np.random.seed(seed)

input_shape=X_train.shape[1]
# create model
model = KerasClassifier(build_fn=build_base_model, epochs=10, batch_size=1, verbose=0)

optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train_tanh)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 3641)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3642      
Total params: 3,642
Trainable params: 3,642
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 3641)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3642      
Total params: 3,642
Trainable params: 3,642
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        (None, 3641)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3642      
Total params: 3,642
Trainable params: 3,642
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 3641)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3642      
Total params: 3,642
Trainable params: 3,642
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer 

In [19]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    
# Best: 0.833200 using {'optimizer': 'Adagrad'}
# 0.522400 (0.006081) with: {'optimizer': 'SGD'}
# 0.605120 (0.004240) with: {'optimizer': 'RMSprop'}
# 0.833200 (0.004663) with: {'optimizer': 'Adagrad'}
# 0.690640 (0.008747) with: {'optimizer': 'Adadelta'}
# 0.639200 (0.003653) with: {'optimizer': 'Adam'}
# 0.791840 (0.004609) with: {'optimizer': 'Adamax'}
# 0.509160 (0.011387) with: {'optimizer': 'Nadam'}

Best: 0.833200 using {'optimizer': 'Adagrad'}
0.522400 (0.006081) with: {'optimizer': 'SGD'}
0.605120 (0.004240) with: {'optimizer': 'RMSprop'}
0.833200 (0.004663) with: {'optimizer': 'Adagrad'}
0.690640 (0.008747) with: {'optimizer': 'Adadelta'}
0.639200 (0.003653) with: {'optimizer': 'Adam'}
0.791840 (0.004609) with: {'optimizer': 'Adamax'}
0.509160 (0.011387) with: {'optimizer': 'Nadam'}


In [23]:
base_model = build_base_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 3641)              0         
_________________________________________________________________
tanh_output (Dense)          (None, 1)                 3642      
Total params: 3,642
Trainable params: 3,642
Non-trainable params: 0
_________________________________________________________________


In [24]:
base_model_history = base_model.fit(X_train, y_train_tanh, batch_size=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# combined_model_history = combined_model.fit([X_train,y_train_agreement], y_train_original, batch_size=1, epochs=2)

In [27]:
base_model.trainable=False

In [38]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier



seed = 42
np.random.seed(seed)

input_shape=X_train.shape[1]
human_terms_shape=len(word_list)
# create model
model = KerasClassifier(build_fn = build_combined_model, epochs=10, batch_size=1, verbose=0)

optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit([X_train, y_train_agreement], y_train_original)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

ValueError: Found input variables with inconsistent numbers of samples: [2, 25000]

In [36]:
y_train_original.shape

(25000,)