In [1]:
import numpy as np
import pandas as pd
import re
import jsonlines
import os
import random

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate
from sklearn.metrics import accuracy_score

from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.imagenet_utils import preprocess_input

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers import Embedding, Merge, Dropout, Flatten, LSTM, Bidirectional
from keras.models import Model
import AttentionwithContext as ac
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras.optimizers import RMSprop

import tensorflow as tf
from config import Config
from cnn_model import cnn_model
import nn

Using Theano backend.


In [2]:
MAX_SEQUENCE_LENGTH = 200 #30
MAX_CONTENT_LENGTH = 200
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100

In [3]:
def imgModel(vals_df):
    img_features = []
    config = Config()
    images = tf.placeholder(dtype=tf.float32, shape=[config.batch_size] + config.image_shape)
    sess = tf.Session()

    model = cnn_model(config)
    features = model.build_vgg16(images)
    model.load_cnn(sess,config.vgg16_file)

    for entry in vals_df.values:
        img_path = entry[1][0]
        try:
            img = image.load_img(img_path, target_size=(224, 224,3))
        except OSError:
            continue
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        imgList = img_data
        vgg16_feature = sess.run(features,feed_dict={images:img_data})
        img_features.append(vgg16_feature[0])

    tf.reset_default_graph()
    return img_features

In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()

In [5]:
def get_title_df(vals):
    titles_df = []
    for i in range(len(vals)): ## For titles
        text = []
        k = vals[i][5]
        text.append(k)
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        titles_df+=[words]
    return titles_df

In [6]:
def get_content_df(vals):
    content_df = []
    for i in range(len(vals)): ## For content
        text = []
        for j in range(2, 7):
            if(j==5):
                continue
            else:
                k = vals[i][j]
                if(j==7):
                    text.append(k)
                else:
                    text += (k)
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        content_df += [words]
    return content_df

In [7]:
def get_padded_sequences(df):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(df)
    sequences = tokenizer.texts_to_sequences(df)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_CONTENT_LENGTH)
    return data, word_index

In [8]:
def get_padded_sequences_content(df):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(df)
    sequences = tokenizer.texts_to_sequences(df)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_CONTENT_LENGTH)
    return data, word_index

In [9]:
count = 0
full_count = 0
train_val_data = []
test_data = []
complete_data = []

with jsonlines.open('instances.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        complete_data.append(obj)
random.shuffle(complete_data)
complete_length = len(complete_data)
print(complete_length)

80013


In [10]:
final_vals = []
data_df = pd.DataFrame.from_dict(complete_data)
# train = pd.merge(data_df, truth_data_df, on="id")
features = ["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription"]
vals = data_df[features]
vals = vals.values.tolist()
# length = 5000
count =0
for i in range(len(vals)):
    if vals[i][1] != []:
        final_vals.append([vals[i][0], [vals[i][1][0]], vals[i][2], vals[i][3], vals[i][4], vals[i][5], vals[i][6], vals[i][7]])
#         count+=1

In [11]:
vals_df = pd.DataFrame(final_vals, columns=["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription"])
print("Final vals length", len(final_vals))

Final vals length 36042


In [12]:
image_features = imgModel(vals_df)

Loading the CNN from ./vgg16_no_fc.npy...


 31%|███       | 4/13 [00:00<00:00, 38.96it/s]

conv3_3
conv1_2
conv3_1
conv5_2
conv1_1
conv4_3
conv4_1
conv5_3


100%|██████████| 13/13 [00:00<00:00, 36.83it/s]

conv2_1
conv2_2
conv3_2
conv5_1
conv4_2
26 tensors loaded.





In [13]:
print(len(image_features))

36033


In [14]:
title_unlabeled = get_title_df(vals_df.values.tolist())
title_unlabeled_df, t_word_index = get_padded_sequences(title_unlabeled)

Found 27823 unique tokens.


In [15]:
content_unlabeled = get_content_df(vals_df.values.tolist())
content_unlabeled_df, c_word_index = get_padded_sequences(content_unlabeled)

Found 517052 unique tokens.


In [None]:
print(content_unlabeled[0])

In [16]:
count = 0  ### LABELED DATA
full_count = 0
train_val_data = []
test_data = []
all_data = []

with jsonlines.open('instances_train.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
#         count += 1
#         full_count+=1
#         if (count > 17600):
#             test_data.append(obj)
#         if(count<=17600):
#             train_val_data.append(obj)
        all_data.append(obj)
random.shuffle(all_data)
# all_length = len(all_data)
# print(all_length) 
# train_val_data = all_data[:int(0.9*complete_length)]
# test_data = all_data[int(0.9*complete_length)+1:]
count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)

In [17]:
train_vals = []
data_df = pd.DataFrame.from_dict(all_data)
truth_data_df = pd.DataFrame.from_dict(truth_data)
train = pd.merge(data_df, truth_data_df, on="id")
features = ["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"]
vals_train = train[features].values.tolist()
for i in range(len(vals_train)):
    if vals_train[i][1] != []:
        train_vals.append([vals_train[i][0], [vals_train[i][1][0]], vals_train[i][2], vals_train[i][3], vals_train[i][4], vals_train[i][5], vals_train[i][6], vals_train[i][7], vals_train[i][8]])

In [18]:
vals_train_df = pd.DataFrame(train_vals, columns=["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"])
print("Training data length", len(train_vals))

Training data length 10306


In [19]:
tf.reset_default_graph()

In [20]:
image_features_train = imgModel(vals_train_df)

  0%|          | 0/13 [00:00<?, ?it/s]

Loading the CNN from ./vgg16_no_fc.npy...
conv3_3
conv1_2


 15%|█▌        | 2/13 [00:00<00:01,  7.21it/s]

conv3_1
conv5_2
conv1_1
conv4_3
conv4_1


100%|██████████| 13/13 [00:00<00:00, 17.52it/s]


conv5_3
conv2_1
conv2_2
conv3_2
conv5_1
conv4_2
26 tensors loaded.


In [21]:
print(len(image_features_train))

10306


In [22]:
def get_labels(vals_df):
    labels = []
    for i in vals_df.values:
        if(i[8]=="clickbait"):
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [23]:
labels = get_labels(vals_train_df)

In [24]:
print(len(labels))

10306


In [25]:
title_train = get_title_df(vals_train_df.values.tolist())
title_train_df, t_train_word_index = get_padded_sequences(title_train)

Found 16652 unique tokens.


In [26]:
content_train = get_content_df(vals_train_df.values.tolist())
content_train_df, c_train_word_index = get_padded_sequences(content_train)

Found 272530 unique tokens.


In [None]:
print(content_train[0])

In [None]:
labels = to_categorical(np.asarray(labels))
print('Shape of label tensor:', labels.shape)

In [27]:
labels = to_categorical(np.asarray(labels), num_classes=3)
print('Shape of label tensor:', labels.shape)

Shape of label tensor: (10306, 3)


In [31]:
train_length = len(title_train_df)

In [28]:
TRAIN_SPLIT = 0.8
TRAIN_VAL_SPLIT = 0.9

In [29]:
indices = np.arange(title_train_df.shape[0])
np.random.shuffle(indices)
data = title_train_df[indices]
contentdata = content_train_df[indices]
labels = labels[indices]
images = np.asarray(image_features_train)[indices]
nb_validation_samples = int(TRAIN_SPLIT * title_train_df.shape[0])

In [32]:
x_train = data[:nb_validation_samples]
c_train = contentdata[:nb_validation_samples]
y_train = labels[:nb_validation_samples]
x_val = data[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
c_val = contentdata[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
y_val = labels[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
x_test = data[int(TRAIN_VAL_SPLIT*train_length):]
c_test = contentdata[int(TRAIN_VAL_SPLIT*train_length):]
y_test = labels[int(TRAIN_VAL_SPLIT*train_length):]

In [33]:
image_train = images[:nb_validation_samples]
image_val = images[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
image_test = images[int(TRAIN_VAL_SPLIT*train_length):]

In [34]:
print(len(x_train))

8244


In [35]:
print(len(image_train))

8244


In [None]:
print(len(x_val))

In [None]:
print(len(image_val))

In [36]:
print(len(x_test))

1031


In [None]:
print(len(image_test))

In [37]:
embeddings_index = {}
f=open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [38]:
t_embedding_matrix = np.random.random((len(t_train_word_index) + 1, EMBEDDING_DIM)) ##Titles
for word, i in t_train_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        t_embedding_matrix[i] = embedding_vector

In [39]:
c_embedding_matrix = np.random.random((len(c_train_word_index) + 1, EMBEDDING_DIM)) ##Content
for word, i in c_train_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        c_embedding_matrix[i] = embedding_vector

In [40]:
weights_matrix = np.concatenate((t_embedding_matrix, c_embedding_matrix), axis = 0)

In [41]:
embedding_layer = Embedding(len(t_train_word_index)+ len(c_train_word_index) + 2, EMBEDDING_DIM, weights=[weights_matrix], input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [42]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='float32')
content_data_input = Input(shape=(MAX_CONTENT_LENGTH,),dtype='float32')
image_data_input = Input(shape=(100352,), dtype='float32')

In [43]:
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)

  'RNN dropout is no longer supported with the Theano backend '


In [44]:
content_embedded_sequences = embedding_layer(content_data_input)
l_lstm_content = Bidirectional(LSTM(100))(content_embedded_sequences)

  'RNN dropout is no longer supported with the Theano backend '


In [45]:
preds_title = Dense(1, activation='softmax')(l_lstm) 

preds_content = Dense(1, activation='softmax')(l_lstm_content)

preds_image = Dense(1, activation='softmax')(image_data_input)

In [46]:
model1 = Model(sequence_input, preds_title)
model2 = Model(content_data_input, preds_content)
model3 = Model(image_data_input, preds_image)
# merged = Merge([model1, model3], mode='concat', name="merged")
merged = Merge([model1, model2, model3], mode='concat', name="merged")
final_model = Sequential()
final_model.add(merged)

  """


In [47]:
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merged (Merge)               (None, 3)                 0         
Total params: 29,340,755
Trainable params: 29,340,755
Non-trainable params: 0
_________________________________________________________________


In [48]:
checkpoint = ModelCheckpoint("weights-title+img-{epoch:02d}-{val_acc:.2f}.hdf5")
callbacks_list = [checkpoint]
final_model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['acc'])

In [None]:
final_model.fit([x_train, np.array(image_train)], y_train, validation_data=([x_val, np.array(image_val)], y_val), epochs=10, batch_size=32, callbacks=callbacks_list)

In [None]:
final_model.fit([x_train, c_train, np.asarray(image_train)], y_train, validation_data=([x_val, c_val, np.asarray(image_val)], y_val), epochs=2, batch_size=32, callbacks=callbacks_list)

In [None]:
print(c_train[0])

In [None]:
final_model.fit([x_train, c_train], y_train, validation_data=([x_val, c_val], y_val), epochs=2, batch_size=32, callbacks=callbacks_list)

In [49]:
final_model.load_weights('weights-title+contentimg-01-0.81.hdf5')

In [80]:
def findmaxindices(final_model, number, x_u, image_features_u):
    test_proba = final_model.predict_classes([x_u[:36032], np.array(image_features_u)][:36032], batch_size=32, verbose=1)
    max_indices = np.argpartition(test_proba,-number,axis=0)[-number:]
    return test_proba, max_indices

In [None]:
def findmaxindicesC(final_model, number, x_u, content):
    test_proba = final_model.predict_classes([x_u[:36032], content[:36032]], batch_size=32, verbose=1)
    max_indices = np.argpartition(test_proba,-number,axis=0)[-number:]
    return test_proba, max_indices

In [50]:
def findmaxindicesCI(final_model, number, x_u, content, image_features_u):
    test_proba = final_model.predict_classes([x_u[:36032], content[:36032], np.array(image_features_u)[:36032]], batch_size=32, verbose=1)
    max_indices = np.argpartition(test_proba,-number,axis=0)[-number:]
    return test_proba, max_indices

In [81]:
def addthistotrain(indices, xtest, imgtest, ytest, xtrain, imgtrain, ytrain):
        # getting all the y's
#         ytest = self.get_y(xtest)
        ytest = to_categorical(np.asarray(ytest), num_classes =2)
        
        # adding the min and max indices populations to xtrain and ytrain there
        print("Ytest shape is ",ytest.shape)
        print("Xtrain shape is",xtrain.shape)
        print("Xtest shape is ",xtest.shape)
        print("Ytrain shape is ",ytrain.shape)
        print("Indices shape is ",indices.shape)
        print("xtest[indices] shape is",xtest[indices].shape)
        
        xtrain = np.concatenate((xtrain,xtest[indices]),axis=0)
        imgtrain = np.concatenate((imgtrain,imgtest[indices]),axis=0)
        ytrain = np.concatenate((ytrain,ytest[indices]),axis=0)
#         ytrain[ytrain>=0.5]=1
#         ytrain[ytrain<0.5]=0
        
        print("################################")
        print("Ytest shape is ",ytest.shape)
        print("Xtrain shape is",xtrain.shape)
        print("Xtest shape is ",xtest.shape)
        print("Ytrain shape is ",ytrain.shape)
        print("Indices shape is ",indices.shape)
        print("xtest[indices] shape is",xtest[indices].shape)
        
        return (xtrain, imgtrain, ytrain)

In [None]:
def addthistotrainC(indices, xtest, contenttest, ytest, xtrain, contenttrain, ytrain):
        # getting all the y's
#         ytest = self.get_y(xtest)
        ytest = to_categorical(np.asarray(ytest), num_classes =2)
        
        # adding the min and max indices populations to xtrain and ytrain there
        print("Ytest shape is ",ytest.shape)
        print("Xtrain shape is",xtrain.shape)
        print("Xtest shape is ",xtest.shape)
        print("Ytrain shape is ",ytrain.shape)
        print("Indices shape is ",indices.shape)
        print("xtest[indices] shape is",xtest[indices].shape)
        
        xtrain = np.concatenate((xtrain,xtest[indices]),axis=0)
        contenttrain = np.concatenate((contenttrain,contenttest[indices]),axis=0)
        ytrain = np.concatenate((ytrain,ytest[indices]),axis=0)
#         ytrain[ytrain>=0.5]=1
#         ytrain[ytrain<0.5]=0
        
        print("################################")
        print("Ytest shape is ",ytest.shape)
        print("Xtrain shape is",xtrain.shape)
        print("Xtest shape is ",xtest.shape)
        print("Ytrain shape is ",ytrain.shape)
        print("Indices shape is ",indices.shape)
        print("xtest[indices] shape is",xtest[indices].shape)
        
        return (xtrain, contenttrain, ytrain)

In [51]:
def addthistotrainCI(indices, xtest, contenttest, imgtest, ytest, xtrain, contenttrain, imgtrain, ytrain):
        # getting all the y's
#         ytest = self.get_y(xtest)
        ytest = to_categorical(np.asarray(ytest), num_classes =3)
        
        # adding the min and max indices populations to xtrain and ytrain there
        print("Ytest shape is ",ytest.shape)
        print("Xtrain shape is",xtrain.shape)
        print("Xtest shape is ",xtest.shape)
        print("Ytrain shape is ",ytrain.shape)
        print("Indices shape is ",indices.shape)
        print("xtest[indices] shape is",xtest[indices].shape)
        
        xtrain = np.concatenate((xtrain,xtest[indices]),axis=0)
        contenttrain = np.concatenate((contenttrain,contenttest[indices]),axis=0)
        imgtrain = np.concatenate((imgtrain,imgtest[indices]),axis=0)
        ytrain = np.concatenate((ytrain,ytest[indices]),axis=0)
#         ytrain[ytrain>=0.5]=1
#         ytrain[ytrain<0.5]=0
        
        print("################################")
        print("Ytest shape is ",ytest.shape)
        print("Xtrain shape is",xtrain.shape)
        print("Xtest shape is ",xtest.shape)
        print("Ytrain shape is ",ytrain.shape)
        print("Indices shape is ",indices.shape)
        print("xtest[indices] shape is",xtest[indices].shape)
        
        return (xtrain, contenttrain, imgtrain, ytrain)

In [72]:
ytest, max_indices = findmaxindicesCI(final_model, 20000, title_unlabeled_df, content_unlabeled_df, image_features)



In [82]:
ytest, max_indices = findmaxindices(final_model, 20000, title_unlabeled_df, image_features)

MemoryError: 

In [None]:
ytest, max_indices = findmaxindicesC(final_model, 20000, title_unlabeled_df, content_unlabeled_df)

In [None]:
print(ytest[max_indices])

In [None]:
xtrain = x_train
imagetrain = image_train
ytrain = y_train

In [77]:
xtrain = data
contenttrain = contentdata
imagetrain = images
ytrain = labels

In [None]:
print(title_unlabeled_df[max_indices])

In [None]:
newXtrain, newImgtrain, newYtrain = addthistotrain(max_indices, title_unlabeled_df, np.array(image_features), ytest, xtrain, np.array(imagetrain), ytrain)

In [None]:
newXtrain, newContenttrain, newYtrain = addthistotrainC(max_indices, title_unlabeled_df, content_unlabeled_df, ytest, xtrain, contenttrain, ytrain)

In [83]:
newXtrain, newContenttrain, newImgtrain, newYtrain = addthistotrainCI(max_indices, title_unlabeled_df, content_unlabeled_df, np.array(image_features), ytest, xtrain, contenttrain, np.array(imagetrain), ytrain)

MemoryError: 

In [None]:
print(len(xtrain))

In [None]:
print(len(imagetrain))

In [None]:
print(len(title_unlabeled_df))

In [None]:
print(ytrain.shape)

In [None]:
print(ytest.shape)

In [None]:
print(len(image_features))

In [None]:
def shufflensplit(xtrain, imgtrain, ytrain):
    indices = np.arange(xtrain.shape[0])
    np.random.shuffle(indices)
    data = xtrain[indices]
    labels = ytrain[indices]
    images = np.asarray(imgtrain)[indices]
    nb_validation_samples = int(TRAIN_SPLIT * len(xtrain))
    train_length = len(xtrain)
    x_train = data[0:nb_validation_samples]
    y_train = labels[0:nb_validation_samples]
    x_val = data[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    y_val = labels[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    x_test = data[int(TRAIN_VAL_SPLIT*train_length):]
    y_test = labels[int(TRAIN_VAL_SPLIT*train_length):]
    image_train = images[0:nb_validation_samples]
    image_val = images[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    image_test = images[int(TRAIN_VAL_SPLIT*train_length):]
    return(x_train, y_train, x_val, y_val, x_test, y_test, image_train, image_val, image_test)
    

In [None]:
def shufflensplitC(xtrain, contenttrain, ytrain):
    indices = np.arange(xtrain.shape[0])
    np.random.shuffle(indices)
    data = xtrain[indices]
    content = contenttrain[indices]
    labels = ytrain[indices]
#     images = np.asarray(imgtrain)[indices]
    nb_validation_samples = int(TRAIN_SPLIT * len(xtrain))
    train_length = len(xtrain)
    x_train = data[0:nb_validation_samples]
    c_train = content[0:nb_validation_samples]
    y_train = labels[0:nb_validation_samples]
    x_val = data[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    c_val = content[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    y_val = labels[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    x_test = data[int(TRAIN_VAL_SPLIT*train_length):]
    c_test = content[int(TRAIN_VAL_SPLIT*train_length):]
    y_test = labels[int(TRAIN_VAL_SPLIT*train_length):]
#     image_train = images[0:nb_validation_samples]
#     image_val = images[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
#     image_test = images[int(TRAIN_VAL_SPLIT*train_length):]
    return(x_train, y_train, x_val, y_val, x_test, y_test, c_train, c_val, c_test)
    

In [55]:
def shufflensplitCI(xtrain, contenttrain, imgtrain, ytrain):
    indices = np.arange(xtrain.shape[0])
    np.random.shuffle(indices)
    data = xtrain[indices]
    content = contenttrain[indices]
    labels = ytrain[indices]
    images = np.asarray(imgtrain)[indices]
    nb_validation_samples = int(TRAIN_SPLIT * len(xtrain))
    train_length = len(xtrain)
    x_train = data[0:nb_validation_samples]
    c_train = content[0:nb_validation_samples]
    y_train = labels[0:nb_validation_samples]
    x_val = data[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    c_val = content[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    y_val = labels[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    x_test = data[int(TRAIN_VAL_SPLIT*train_length):]
    c_test = content[int(TRAIN_VAL_SPLIT*train_length):]
    y_test = labels[int(TRAIN_VAL_SPLIT*train_length):]
    image_train = images[0:nb_validation_samples]
    image_val = images[nb_validation_samples:int(TRAIN_VAL_SPLIT*train_length)]
    image_test = images[int(TRAIN_VAL_SPLIT*train_length):]
    return(x_train, y_train, x_val, y_val, x_test, y_test, c_train, c_val, c_test, image_train, image_val, image_test)
    

In [None]:
new_Xtrain, new_Ytrain, new_Xval, new_Yval, x_test, y_test, new_Ctrain, new_Cval, c_test, new_Imgtrain, new_Imgval, image_test = shufflensplitCI(newXtrain, newContenttrain, newImgtrain, newYtrain)

In [None]:
print(len(new_Ctrain))

In [None]:
print(len(new_Xval))

In [None]:
print(len(x_test))

In [57]:
semi_supervised_model = Sequential()
semi_supervised_model.add(merged)
semi_supervised_model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['acc'])

In [59]:
checkpoint_semi = ModelCheckpoint("weights-semi+title+contentimg-{epoch:02d}-{val_acc:.2f}.hdf5")
callbacks_list_semi = [checkpoint_semi]

In [None]:
semi_supervised_model.fit([new_Xtrain, new_Imgtrain], new_Ytrain, validation_data=([new_Xval, np.array(new_Imgval)], new_Yval), epochs=3, batch_size=32, callbacks=callbacks_list_semi)

In [None]:
semi_supervised_model.fit([new_Xtrain, new_Ctrain], new_Ytrain, validation_data=([new_Xval, new_Cval], new_Yval), epochs=2, batch_size=32, callbacks=callbacks_list_semi)

In [None]:
semi_supervised_model.fit([new_Xtrain, new_Ctrain, new_Imgtrain], new_Ytrain, validation_data=([new_Xval, new_Cval, np.array(new_Imgval)], new_Yval), epochs=2, batch_size=32, callbacks=callbacks_list_semi)

In [69]:
semi_supervised_model.load_weights('weights-semi+title+contentimg-01-0.89.hdf5')

In [None]:
preds = semi_supervised_model.predict_classes([x_test, np.asarray(image_test)], batch_size=32, verbose=1)

In [None]:
preds = semi_supervised_model.predict_classes([x_test, c_test], batch_size=32, verbose=1)

In [70]:
preds = semi_supervised_model.predict_classes([x_test, c_test, np.asarray(image_test)], batch_size=32, verbose=1)



In [None]:
print(np.argmax(y_test,1))

In [71]:
print("Accuracy score on Test data ", accuracy_score(np.argmax(y_test,1), preds))

Accuracy score on Test data  0.899064500246
