In [1]:
import numpy as np
import pandas as pd
import re
import jsonlines
import os

os.environ['KERAS_BACKEND'] = 'theano'
import AttentionwithContext as ac
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from sklearn.metrics import accuracy_score

Using Theano backend.


In [2]:
MAX_SEQUENCE_LENGTH = 200
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1111806
train_val_data =[]
truth_data = []
test_data = []
count= 0
full_count=0
with jsonlines.open('instances_train.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        full_count+=1
        if (count > 17600):
            test_data.append(obj)
        if(count<=17600):
            train_val_data.append(obj)

count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()

In [4]:
def get_labels(vals_df):
    labels = []
    for i in vals_df.values:
        if(i[7]=="clickbait"):
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [5]:
def get_title_df(vals):
    titles_df = []
    for i in range(len(vals)): ## For titles
        text = []
        k = vals[i][4]
        text.append(k)
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        titles_df+=[words]
    return titles_df

In [6]:
def get_content_df(vals):
    content_df = []
    for i in range(len(vals)): ## For content
        text = []
        for j in range(2, 6):
            if(j==4):
                continue
            else:
                k = vals[i][j]
                if(j==6):
                    text.append(k)
                else:
                    text += (k)
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        content_df += [words]
    return content_df

In [7]:
def get_padded_sequences(df):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(df)
    sequences = tokenizer.texts_to_sequences(df)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return data, word_index

In [8]:
final_vals = []
data_df = pd.DataFrame.from_dict(train_val_data)
truth_data_df = pd.DataFrame.from_dict(truth_data)
train = pd.merge(data_df, truth_data_df, on="id")
features = ["id", "postMedia", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"]
vals = train[features]
vals = vals.values.tolist()
for i in range(len(vals)):
    if vals[i][1] != []:
        final_vals.append([vals[i][0], vals[i][2], vals[i][3], vals[i][4], vals[i][5], vals[i][6], vals[i][7], vals[i][8]])

In [9]:
vals_df = pd.DataFrame(final_vals, columns=["id", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"])
print("Final vals length", len(final_vals))

Final vals length 9295


In [10]:
finalTestvals = []
test_data_df = pd.DataFrame.from_dict(test_data)
test = pd.merge(test_data_df, truth_data_df, on="id")
test_vals = test[features].values.tolist()
for i in range(len(test_vals)):
    if test_vals[i][1] != []:
        finalTestvals.append([test_vals[i][0], test_vals[i][2], test_vals[i][3], test_vals[i][4], test_vals[i][5], test_vals[i][6], test_vals[i][7], test_vals[i][8]])

test_vals_df = pd.DataFrame(finalTestvals, columns=["id", "postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                "targetDescription", "truthClass"])
print("finalTestVals length", len(finalTestvals))

finalTestVals length 1011


In [11]:
labels = get_labels(vals_df)

In [12]:
tlabels = get_labels(test_vals_df) #For testing

In [13]:
title_train = get_title_df(vals_df.values.tolist())
content_train = get_content_df(vals_df.values.tolist())

title_train_df, t_word_index = get_padded_sequences(title_train)
content_train_df, c_word_index = get_padded_sequences(content_train)

Found 15837 unique tokens.
Found 245526 unique tokens.


In [14]:
title_test = get_title_df(test_vals_df.values.tolist()) ##For testing
content_test = get_content_df(test_vals_df.values.tolist())

title_test_df, title_test_index = get_padded_sequences(title_test)
content_test_df, content_test_index = get_padded_sequences(content_test)

Found 4424 unique tokens.
Found 54783 unique tokens.


In [15]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', title_train_df.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(title_train_df.shape[0])
np.random.shuffle(indices)
data = title_train_df[indices]
content_data = content_train_df[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

Shape of data tensor: (9295, 200)
Shape of label tensor: (9295, 2)


In [16]:
x_title_train = data[:-nb_validation_samples]
x_content_train = content_data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]

x_title_val = data[-nb_validation_samples:]
x_content_val = content_data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [17]:
x_title_test = title_test_df ##For testing
x_content_test = content_test_df
y_test = tlabels

In [18]:
print('Training and validation sets')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

embeddings_index = {}
f=open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Training and validation sets
[ 6513.  1749.]
[ 805.  228.]
Total 400000 word vectors.


In [19]:
t_embedding_matrix = np.random.random((len(t_word_index) + 1, EMBEDDING_DIM)) ##Titles
for word, i in t_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        t_embedding_matrix[i] = embedding_vector

c_embedding_matrix = np.random.random((len(c_word_index) + 1, EMBEDDING_DIM)) ##Content
for word, i in c_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        c_embedding_matrix[i] = embedding_vector

In [20]:
weights_matrix = np.concatenate((t_embedding_matrix, c_embedding_matrix), axis = 0)

In [21]:
embedding_layer = Embedding(len(t_word_index)+ len(c_word_index) + 2, EMBEDDING_DIM, weights=[weights_matrix], input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [22]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='float32') #Run again while testing
content_data_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='float32')

embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)

content_embedded_sequences = embedding_layer(content_data_input)
l_lstm_content = Bidirectional(LSTM(100))(content_embedded_sequences)

  'RNN dropout is no longer supported with the Theano backend '
  'RNN dropout is no longer supported with the Theano backend '


In [24]:
from keras.layers import merge

In [None]:
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
attention_probs = Dense(MAX_SEQUENCE_LENGTH, activation='softmax', name='attention_probs')(inputs)
attention_mul = merge([inputs, attention_probs], output_shape=32, name='attention_mul', mode='mul')

In [None]:
# import keras.backend as K

# def mean_pred(y_true, y_pred):
#     return K.mean(y_pred)

In [37]:
preds_title = Dense(1, activation='softmax')(l_lstm) #Run again while testing
attention_probs_title = Dense(MAX_SEQUENCE_LENGTH, activation='softmax', name='attention_probs')(preds_title)
attention_mul = merge([sequence_input, attention_probs_title], output_shape=32, name='attention_mul', mode='mul')
preds_title = Dense(1, activation='softmax')(attention_mul)

preds_content = Dense(1,activation='softmax')(l_lstm_content)

# preds_add = concatenate([preds_title, preds_content], axis =-1)

# preds = Dense(2)(preds_add)

# model = Model([sequence_input, content_data_input], preds)
# model2.add(ac.AttentionWithContext()) ###############

# attention_vector = get_activations(m, testing_inputs_1, print_shape_only=True)[1].flatten()

model1 = Model(sequence_input, preds_title)
model2 = Model(content_data_input, preds_content)
merged = Merge([model1, model2], mode='concat', name="merged")
final_model = Sequential()
final_model.add(merged)
# model = Sequential()
# model.add(merged)
# model3 = Model(inputs, attention_mul)
# final_model = Sequential()
# final_model.add(Merge([model, model3], mode = 'concat'))
# final_model.add(attention_mul)

from keras.optimizers import RMSprop
rmsprop = RMSprop(lr=0.005, rho=0.9, epsilon=None, decay=0.0004)
checkpoint = ModelCheckpoint("weights-content-{epoch:02d}-{val_acc:.2f}.hdf5")
callbacks_list = [checkpoint]
final_model.compile(loss='mean_squared_error', optimizer=rmsprop, metrics=['acc'])

  This is separate from the ipykernel package so we can avoid doing imports until
  name=name)


In [38]:
print("model fitting - Bidirectional LSTM with titles and content")
final_model.summary()
print('------')

model fitting - Bidirectional LSTM with titles and content
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merged (Merge)               (None, 2)                 0         
Total params: 26,459,103
Trainable params: 26,459,103
Non-trainable params: 0
_________________________________________________________________
------


In [None]:
final_model.fit([x_title_train, x_content_train], y_train, validation_data=([x_title_val, x_content_val], y_val), epochs=3, batch_size=50, callbacks=callbacks_list)

Train on 8262 samples, validate on 1033 samples
Epoch 1/3

In [None]:
final_model.load_weights('weights-content2-01-0.78.hdf5')

In [None]:
preds = final_model.predict_classes([x_title_test, x_content_test], batch_size=50, verbose=1)

In [None]:
print("Accuracy score on Test data ", accuracy_score(y_test, preds))