In [2]:
import numpy as np
import pandas as pd
import re
import jsonlines
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [3]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
# VALIDATION_SPLIT = 0.1111806
train_val_data =[]
truth_data = []
test_data = []
count= 0
full_count=0
with jsonlines.open('instances.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        full_count+=1
        if (count > 17600):
            test_data.append(obj)
        if(count<=17600):
            train_val_data.append(obj)

count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)

In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()

In [5]:
def get_labels(vals_df):
    labels = []
    for i in vals_df.values:
        if(i[3]=="clickbait"):
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [7]:
def get_title_df(vals):
    titles_df = []
    for i in range(len(vals)): ## For titles
        text = []
        k = vals[i][2]
        text+=(k)
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        titles_df+=[words]
    return titles_df

In [8]:
def get_caption_df(vals):
    titles_df = []
    for i in range(len(vals)): ## For titles
        text = []
        k = vals[i][4]
        text+=(k)
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        titles_df+=[words]
    return titles_df

In [9]:
def get_content_df(vals):
    content_df = []
    for i in range(len(vals)): ## For content
        text = []
        for j in range(2, 6):
            if(j==5):
                continue
            else:
                k = vals[i][j]
                if(j==6):
                    text.append(k)
                else:
                    text += (k)
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        content_df += [words]
    return content_df

In [10]:
def get_padded_sequences(df):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(df)
    sequences = tokenizer.texts_to_sequences(df)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return data, word_index

In [12]:
final_vals = []
data_df = pd.DataFrame.from_dict(train_val_data)
truth_data_df = pd.DataFrame.from_dict(truth_data)
train_captions_df = pd.read_csv('./total_captions.csv')
train_captions_df['id'] = train_captions_df['id'].apply(str)
# features = ["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
#                 "targetDescription", "truthClass"]
train_1 = pd.merge(data_df, truth_data_df, on="id")
train = pd.merge(train_1,train_captions_df,on="id")
print(len(train))
features = ["id","postMedia","targetTitle",  "truthClass","caption"]
vals = train[features]
vals = vals.values.tolist()
for i in range(len(vals)):
    if vals[i][1] != []:
        final_vals.append([vals[i][0],vals[i][1],vals[i][2], vals[i][3], vals[i][4]])

9295


In [13]:
vals_df = pd.DataFrame(final_vals, columns=["id", "postMedia","targetTitle",  "truthClass","caption"])
print("Final vals length", len(final_vals))

Final vals length 9295


In [14]:
finalTestvals = []
test_data_df = pd.DataFrame.from_dict(test_data)
test_1 = pd.merge(test_data_df, truth_data_df, on="id")
test = pd.merge(test_1,train_captions_df,on="id")
test_vals = test[features].values.tolist()
for i in range(len(test_vals)):
    if test_vals[i][1] != []:
        finalTestvals.append([test_vals[i][0],test_vals[i][1],test_vals[i][2], test_vals[i][3], test_vals[i][4]])
test_vals_df = pd.DataFrame(finalTestvals, columns=["id", "postMedia","targetTitle",  "truthClass","caption"])
print("finalTestVals length", len(finalTestvals))

finalTestVals length 1011


In [15]:
labels = get_labels(vals_df)
tlabels = get_labels(test_vals_df)

In [16]:
title_train = get_title_df(vals_df.values.tolist())
content_train = get_caption_df(vals_df.values.tolist())

title_train_df, t_word_index = get_padded_sequences(title_train)
content_train_df, c_word_index = get_padded_sequences(content_train)

Found 9283 unique tokens.
Found 3456 unique tokens.


In [17]:
title_test = get_title_df(test_vals_df.values.tolist())
content_test = get_caption_df(test_vals_df.values.tolist())

title_test_df, title_test_index = get_padded_sequences(title_test)
content_test_df, content_test_index = get_padded_sequences(content_test)

Found 1036 unique tokens.
Found 631 unique tokens.


In [18]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', title_train_df.shape)
print('Shape of label tensor:', labels.shape)
VALIDATION_SPLIT = 0.1111806
indices = np.arange(title_train_df.shape[0])
np.random.shuffle(indices)
data = title_train_df[indices]
content_data = content_train_df[indices]
# image_data = image_features[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

Shape of data tensor: (9295, 1000)
Shape of label tensor: (9295, 2)


In [22]:
x_title_train = data[:-nb_validation_samples]
x_content_train = content_data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]

x_title_val = data[-nb_validation_samples:]
x_content_val = content_data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [23]:
x_title_test = title_test_df
x_content_test = content_test_df
y_test = tlabels

In [24]:
print('Training and validation sets')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

embeddings_index = {}
f=open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Training and validation sets
[6512. 1750.]
[806. 227.]
Total 400000 word vectors.


In [25]:
# t_embedding_matrix = np.random.random((len(t_word_index) + 1, EMBEDDING_DIM)) ##Titles
# for word, i in t_word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         t_embedding_matrix[i] = embedding_vector

# c_embedding_matrix = np.random.random((len(c_word_index) + 1, EMBEDDING_DIM)) ##Content
# for word, i in c_word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         c_embedding_matrix[i] = embedding_vector

In [26]:
t_embedding_matrix = np.random.random((len(title_test_index) + 1, EMBEDDING_DIM)) ##Titles
for word, i in title_test_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        t_embedding_matrix[i] = embedding_vector

c_embedding_matrix = np.random.random((len(content_test_index) + 1, EMBEDDING_DIM)) ##Content
for word, i in content_test_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        c_embedding_matrix[i] = embedding_vector

In [None]:
# embedding_layer = Embedding(len(t_word_index) + 1, EMBEDDING_DIM, weights=[t_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
#                             trainable=True)

# content_embedding_layer = Embedding(len(c_word_index) + 1, EMBEDDING_DIM, weights=[c_embedding_matrix],
#                             input_length=MAX_SEQUENCE_LENGTH,
#                             trainable=True)

In [27]:
embedding_layer = Embedding(len(title_test_index) + 1, EMBEDDING_DIM, weights=[t_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

content_embedding_layer = Embedding(len(content_test_index) + 1, EMBEDDING_DIM, weights=[c_embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [28]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='float32')
content_data_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='float32')

embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)

content_embedded_sequences = content_embedding_layer(content_data_input)
l_lstm_content = Bidirectional(LSTM(100))(content_embedded_sequences)

In [29]:
print("-----------------------------")
print(l_lstm.shape)

preds_title = Dense(2, activation='softmax')(l_lstm)

preds_content = Dense(2,activation='softmax')(l_lstm_content)

preds_add = concatenate([preds_title, preds_content], axis =-1)

preds = Dense(2)(preds_add)

model = Model([sequence_input, content_data_input], preds)
# model1.add_update(ac.AttentionWithContext()) ###############

checkpoint = ModelCheckpoint("weights-text-{epoch:02d}-{val_acc:.2f}.hdf5")
callbacks_list = [checkpoint]
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

-----------------------------
Shape.0


In [30]:
print("model fitting - Bidirectional LSTM with titles and content")
model.summary()
print('------')

model fitting - Bidirectional LSTM with titles and content
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 100)    103700      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1000, 100)    63200       input_2[0][0]                    
__________________________________________________

In [None]:
 model.fit([x_title_train, x_content_train], y_train, validation_data=([x_title_val, x_content_val], y_val), epochs=10, batch_size=50, callbacks=callbacks_list)

In [31]:
model.load_weights('weights-text-10-0.79.hdf5')

In [32]:
preds = model.predict([x_title_test, x_content_test], batch_size=50, verbose=1)



In [35]:
preds_new = []
for i in range(len(preds)):
    preds_new.append(preds[i][0] + preds[i][1])
print("Accuracy score on Test data ", accuracy_score(y_test, np.asarray(preds_new).round()))

Accuracy score on Test data  0.2354104846686449


In [34]:
from sklearn.metrics import accuracy_score