In [2]:
import numpy as np
import pandas as pd
import re
import jsonlines
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate
from sklearn.metrics import accuracy_score
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.imagenet_utils import preprocess_input

from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Input
from keras.layers import Embedding, Merge, Dropout, LSTM, Bidirectional
from keras.models import Model
import AttentionwithContext as ac

import tensorflow as tf
from config import Config
from cnn_model import cnn_model

Using Theano backend.


In [3]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1111806

count = 0
full_count = 0
train_val_data = []
test_data = []

with jsonlines.open('instances.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        full_count+=1
        if (count > 17600):
            test_data.append(obj)
        if(count<=17600):
            train_val_data.append(obj)

count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)

In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()

In [5]:
def imgModel(vals_df):
    # model = VGG16(weights='imagenet', include_top=False)
    # model.summary()
    img_features = []
    config = Config()
    images = tf.placeholder(
        dtype=tf.float32,
        shape=[config.batch_size] + config.image_shape)

    sess = tf.Session()

    model = cnn_model(config)
    features = model.build_vgg16(images)
    model.load_cnn(sess,config.vgg16_file)

    for entry in vals_df.values:
        img_path = entry[1][0]
        # print(img_path)
        img = image.load_img(img_path, target_size=(224, 224,3))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)

        # vgg16_feature = model.predict(img_data)

        vgg16_feature = sess.run(features,feed_dict={images:img_data})
        img_features.append(vgg16_feature[0])
    tf.reset_default_graph()
    return img_features

In [6]:
def get_padded_sequences(df):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(df)
    sequences = tokenizer.texts_to_sequences(df)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return data, word_index

In [7]:
def get_content_df(vals):
    content_df = []
    for i in range(len(vals)): ## For content
        text = []
        for j in range(2, 7):
            if(j==5):
                continue
            else:
                k = vals[i][j]
                if(j==6):
                    text.append(k)
                else:
                    text += (k)
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        content_df += [words]
    return content_df

In [8]:
def get_title_df(vals):
    titles_df = []
    for i in range(len(vals)): ## For titles
        text = []
        k = vals[i][5]
        text+=(k)
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        titles_df+=[words]
    return titles_df

In [9]:
def get_labels(vals_df):
    labels = []
    for i in vals_df.values:
        if(i[8]=="clickbait"):
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [10]:
final_vals = []
data_df = pd.DataFrame.from_dict(train_val_data)
truth_data_df = pd.DataFrame.from_dict(truth_data)
train = pd.merge(data_df, truth_data_df, on="id")
features = ["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"]
vals = train[features]
vals = vals.values.tolist()
for i in range(len(vals)):
    if vals[i][1] != []:
        final_vals.append([vals[i][0], [vals[i][1][0]], vals[i][2], vals[i][3], vals[i][4], vals[i][5], vals[i][6], vals[i][7], vals[i][8]])

vals_df = pd.DataFrame(final_vals, columns=["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"])
print("Final vals length", len(final_vals))

Final vals length 9295


In [11]:
finalTestvals = []
test_data_df = pd.DataFrame.from_dict(test_data)
test = pd.merge(test_data_df, truth_data_df, on="id")
test_vals = test[features].values.tolist()
for i in range(len(test_vals)):
    if test_vals[i][1] != []:
        finalTestvals.append([test_vals[i][0], [test_vals[i][1][0]], test_vals[i][2], test_vals[i][3], test_vals[i][4], test_vals[i][5], test_vals[i][6], test_vals[i][7], test_vals[i][8]])

test_vals_df = pd.DataFrame(finalTestvals, columns=["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"])
print("finalTestVals length", len(finalTestvals))

finalTestVals length 1011


In [11]:
image_features = imgModel(vals_df)

Loading the CNN from ./vgg16_no_fc.npy...


 31%|███       | 4/13 [00:00<00:00, 37.88it/s]

conv4_3
conv3_3
conv1_1
conv3_2
conv4_2
conv4_1
conv2_1
conv1_2


100%|██████████| 13/13 [00:00<00:00, 36.60it/s]

conv5_2
conv5_3
conv2_2
conv3_1
conv5_1
26 tensors loaded.





In [16]:
tf.reset_default_graph()

In [17]:
image_features_test = imgModel(test_vals_df)

  0%|          | 0/13 [00:00<?, ?it/s]

Loading the CNN from ./vgg16_no_fc.npy...
conv1_2
conv4_2
conv4_1
conv4_3


 85%|████████▍ | 11/13 [00:00<00:00, 40.00it/s]

conv3_1
conv3_3
conv1_1
conv2_1
conv3_2
conv2_2
conv5_1
conv5_2
conv5_3


100%|██████████| 13/13 [00:00<00:00, 36.32it/s]


26 tensors loaded.


In [12]:
labels = get_labels(vals_df)
tlabels = get_labels(test_vals_df)

In [13]:
title_train = get_title_df(vals_df.values.tolist())
content_train = get_content_df(vals_df.values.tolist())

title_train_df, t_word_index = get_padded_sequences(title_train)
content_train_df, c_word_index = get_padded_sequences(content_train)

Found 9283 unique tokens.
Found 252166 unique tokens.


In [14]:
title_test = get_title_df(test_vals_df.values.tolist())
content_test = get_content_df(test_vals_df.values.tolist())

title_test_df = get_padded_sequences(title_test)
content_test_df = get_padded_sequences(content_test)

Found 1036 unique tokens.
Found 55792 unique tokens.


In [15]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', title_train_df.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(title_train_df.shape[0])
np.random.shuffle(indices)
data = title_train_df[indices]
content_data = content_train_df[indices]
# image_data = image_features[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

Shape of data tensor: (9295, 1000)
Shape of label tensor: (9295, 2)


In [16]:
x_title_train = data[:-nb_validation_samples]
x_content_train = content_data[:-nb_validation_samples]
# x_image_train = image_features[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]

x_title_val = data[-nb_validation_samples:]
x_content_val = content_data[-nb_validation_samples:]
# x_image_val = image_features[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

x_title_test = title_test_df
x_content_test = content_test_df
x_image_test = image_features_test
y_test = tlabels

NameError: name 'image_features_test' is not defined

In [None]:
print('Training and validation sets')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

embeddings_index = {}
f=open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

In [None]:
t_embedding_matrix = np.random.random((len(t_word_index) + 1, EMBEDDING_DIM)) ##Titles
for word, i in t_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        t_embedding_matrix[i] = embedding_vector

c_embedding_matrix = np.random.random((len(c_word_index) + 1, EMBEDDING_DIM)) ##Content
for word, i in c_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        c_embedding_matrix[i] = embedding_vector


In [None]:
embedding_layer = Embedding(len(t_word_index) + 1, EMBEDDING_DIM, weights=[t_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

content_embedding_layer = Embedding(len(c_word_index) + 1, EMBEDDING_DIM, weights=[c_embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='float32')
content_data_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='float32')
image_data_input = Input(shape=(100352,), dtype='float32')

embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)

content_embedded_sequences = content_embedding_layer(content_data_input)
l_lstm_content = Bidirectional(LSTM(100))(content_embedded_sequences)

In [None]:
print("-----------------------------")
print(l_lstm.shape)

preds_title = Dense(2, activation='softmax')(l_lstm)

preds_content = Dense(2,activation='softmax')(l_lstm_content)

preds_image = Dense(2, activation='softmax')(image_data_input)

preds_add = concatenate([preds_title, preds_content, preds_image], axis =-1)

preds = Dense(2)(preds_add)

model = Model([sequence_input, content_data_input, image_data_input], preds)
# model1.add_update(ac.AttentionWithContext()) ###############

checkpoint = ModelCheckpoint("weights-text-{epoch:02d}-{val_acc:.2f}.hdf5")
callbacks_list = [checkpoint]
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [25]:
print("model fitting - Bidirectional LSTM with titles and content")
model.summary()
print('------')

model fitting - Bidirectional LSTM with titles and content
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 100)    928400      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1000, 100)    25216700    input_2[0][0]                    
__________________________________________________

In [30]:
 model.fit([x_title_train, x_content_train, np.asarray(x_image_train)], y_train, validation_data=([x_title_val, x_content_val, np.asarray(x_image_val)], y_val), epochs=10, batch_size=50, callbacks=callbacks_list)

Train on 8262 samples, validate on 1033 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f38b87b82e8>

In [26]:
model.load_weights('weights-text-02-0.20.hdf5')

In [1]:
preds = model.predict([x_title_test, x_content_test, np.asarray(x_image_test)], batch_size=50, verbose=1)

NameError: name 'model' is not defined

In [None]:
preds_new = []
for i in range(len(preds)):
    preds_new.append(preds[i][0] + preds[i][1])
print("Accuracy score on Test data ", accuracy_score(y_test, np.asarray(preds_new).round()))

In [35]:
print(len(x_image_test))

1011
