In [2]:
import numpy as np
import pandas as pd
import re
import jsonlines
import json

import sys
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate

from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.imagenet_utils import preprocess_input

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, LSTM, GRU, Bidirectional
from keras.models import Model

Using Theano backend.


In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()

In [4]:
def imgModel(vals_df):
    model = VGG16(weights='imagenet', include_top=False)
    model.summary()
    vgg16_features = []
    for entry in vals_df.values:
        img_path = entry[1][0]
        # print(img_path)
        img = image.load_img(img_path, target_size=(224, 224))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        vgg16_f= model.predict(img_data)
        vgg16_features.append(vgg16_f[0])
#         print(vgg16_features)
    
    return vgg16_features

In [5]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1111806

count = 0
train_val_data = []
test_data = []

with jsonlines.open('instances.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        if (count > 9275):
            test_data.append(obj)
        if(count<=9275):
            train_val_data.append(obj)

count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)


In [6]:
final_vals = []
data_df = pd.DataFrame.from_dict(train_val_data)
truth_data_df = pd.DataFrame.from_dict(truth_data)
train = pd.merge(data_df, truth_data_df, on="id")
features = ["id", "postMedia", "targetTitle", "truthClass"]
vals = train[features]
vals = vals.values.tolist()
for i in range(len(vals)):
    if vals[i][1] != []:
        final_vals.append([vals[i][0], [vals[i][1][0]], [vals[i][2]], vals[i][3]])

vals_df = pd.DataFrame(final_vals, columns=["id", "file_path", "title", "truthClass"])

image_features = imgModel(vals_df)
finalTestvals = []
test_data_df = pd.DataFrame.from_dict(test_data)
test = pd.merge(test_data_df, truth_data_df, on="id")
test_vals = test[features].values.tolist()
for i in range(len(test_vals)):
    if test_vals[i][1] != []:
        finalTestvals.append([test_vals[i][0], [test_vals[i][1][0]], [test_vals[i][2]], test_vals[i][3]])
tdata = test[features].values
tdata = test_data_df.values

labels = []
tlabels = []
df = []

for i in vals_df.values:
    if(i[3]=="clickbait"):
        labels.append(1)
    else:
        labels.append(0)

for i in tdata:
    if (i[3] == "clickbait"):
        tlabels.append(1)
    else:
        tlabels.append(0)

for i in range(vals_df.values.shape[0]):
    text = []
    for j in range(1,5):
        k = vals_df.values[i][2]
        text+=(k)
    words = ""
    for string in text:
        string = clean_str(string)
        words +=" ".join(string.split())
    df+=[words]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
__________

In [7]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df)
sequences = tokenizer.texts_to_sequences(df)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
x_test = tdata
y_test = tlabels

print('Training and validation sets')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Found 15739 unique tokens.
Shape of data tensor: (4843, 1000)
Shape of label tensor: (4843, 2)
Training and validation sets
[ 3369.   936.]
[ 425.  113.]


In [8]:
embeddings_index = {}
f=open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

Total 400000 word vectors.


In [9]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(2, activation='relu')(l_lstm)
model1 = Model(sequence_input, preds)

  'RNN dropout is no longer supported with the Theano backend '


In [16]:
import tensorflow as tf
from theano.tensor import _shared

In [23]:
model2_in = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
image_features_np = _shared(np.asarray(image_features, np.float32))
model2_inter = embedding_layer(image_features_np)
model2_out = Dense(2, activation='relu')(model2_inter)
model2 = Model(model2_in, model2_out)

TypeError: object of type 'NoneType' has no len()