In [330]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Dense,Embedding,RepeatVector,Bidirectional
from keras.models import Model
import re
import numpy as np
from scipy import stats
# import demoji
import tokenizer
from nltk.corpus import stopwords
stopwords_list = list(set(stopwords.words('english')))
from keras.utils import to_categorical


In [2]:
domain1_path = "./dEFEND/gossipcop_content_no_ignore.tsv"
domain2_path = "./dEFEND/politifact_content_no_ignore.tsv"

In [3]:
domain1_frame = pd.read_csv(domain1_path,delimiter="\t").set_index('id')
domain2_frame = pd.read_csv(domain2_path,delimiter="\t").set_index('id')

### Counts for each label for the 2 domain

#### First Domain

In [4]:
domain1_frame

Unnamed: 0_level_0,label,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1
gossipcop-9096198130,1,Sarah Jessica Parker is getting candid about h...
gossipcop-6982710185,1,Many celebrities have been sharing their thoug...
gossipcop-7887456921,1,He reportedly hasn't seen her in over four yea...
gossipcop-1594778479,1,The fashion crowd is speaking out about Kim Ka...
gossipcop-8172018375,1,What term do you want to search? Search with g...
...,...,...
gossipcop-854842,0,Aisha Tyler‘s divorce from Jeffrey Tietjens ha...
gossipcop-843491,0,All four of Queen Elizabeth and Prince Philip'...
gossipcop-897778,0,Theresa Caputo is adjusting to her new life af...
gossipcop-899849,0,Follow Us on Twitter Nominations for the 25th...


In [5]:
domain1_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,3586
1,2230


#### Second Domain

In [6]:
domain2_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,145
1,270


### Cleaning the Text

In [7]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[()\"_#/@;*%:{}<>`+=~|.!?,'$-\[\]]", "", text)
    text = re.sub(r"[0-9]", "", text)
    
#     for words in stopwords_list:
#         text = re.sub(r"\b{}\b".format(words),"",text)
    
    return text

In [8]:
def tagger(decoder_input_sentence):
    start = "<BOS> "
    end = " <EOS>"
    final_target = [start + text + end for text in decoder_input_sentence] 
    return final_target

In [9]:
# def tokenizer(text_lists):
#     return [line.split(" ") for line in text_lists]

In [10]:
def create_vocab(text_lists):
    tokenizer = Tokenizer(oov_token="<UNK>")
    tokenizer.fit_on_texts(text_lists)
    
    dictionary = tokenizer.word_index
    word2idx = {}
    idx2word = {}
    for k,v in dictionary.items():
        word2idx[k]=v
        idx2word[v]=k
    
    return word2idx,idx2word,tokenizer

#### Encoder Training Data

In [11]:
encoder_inputs = domain1_frame['content'].apply(lambda x: clean_text(x)).values.tolist()+ domain2_frame['content'].apply(lambda x: clean_text(x)).values.tolist()
# encoder_inputs = 
decoder_inputs = tagger(encoder_inputs[:])


#### Fake news detection training data

In [338]:
X_CC = domain1_frame['content'].apply(lambda x: clean_text(x)).values.tolist()+ domain2_frame['content'].apply(lambda x: clean_text(x)).values.tolist()
Y_CC = np.array(domain1_frame['label'].apply(lambda x:int(x)).values.tolist()+ domain2_frame['label'].apply(lambda x:int(x)).values.tolist())
Y_CC_oh = to_categorical(Y_CC)

#### Domain classification Data

In [352]:
X_DC = domain1_frame['content'].apply(lambda x: clean_text(x)).values.tolist()+ domain2_frame['content'].apply(lambda x: clean_text(x)).values.tolist()
Y_DC = np.array([0]*len(domain1_frame) + [1]*len(domain2_frame))
Y_DC_oh = to_categorical(Y_DC)

In [12]:
word2idx,idx2word,tokenizer = create_vocab(encoder_inputs)

### Due to ambiguity with regards to Keras Tokenizer num_words, below is a good enough fix, though it changes the tokenizer word_index outside of the class

In [13]:
num_words = 10000

sorted_by_word_count = sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)
tokenizer.word_index = {}
word2idx = {}
idx2word = {}
i = 0
for word,count in sorted_by_word_count:
    if i == num_words:
        break

    tokenizer.word_index[word] = i + 1    # <= because tokenizer is 1 indexed
    word2idx[word] = i+1
    idx2word[i+1]=word
    i += 1





In [14]:
tokenizer.word_index[tokenizer.oov_token] = num_words+1
word2idx[tokenizer.oov_token] = num_words+1
idx2word[num_words+1]=tokenizer.oov_token

In [577]:
# idx = len(tokenizer.word_index)+1
# tokenizer.word_index["<BOS>"] = idx
# word2idx["<BOS>"] = idx
# idx2word[idx] = "<BOS>"

# idx = len(tokenizer.word_index)+1
# tokenizer.word_index["<EOS>"] = idx
# word2idx["<EOS>"] = idx
# idx2word[idx] = "<EOS>"

In [15]:
word2idx["<UNK>"]

10001

In [16]:

t_encoder_inputs = tokenizer.texts_to_sequences(encoder_inputs)
t_decoder_inputs = tokenizer.texts_to_sequences(decoder_inputs)


# 
t_encoder_inputs = pad_sequences(t_encoder_inputs,maxlen=10,padding='post', truncating='post')

# t_decoder_inputs = pad_sequences(t_encoder_inputs,maxlen=100,padding='post', truncating='post')
# t_decoder_inputs = np.insert(t_decoder_inputs,0,word2idx["<BOS>"],axis=1)
# t_decoder_inputs = np.insert(t_decoder_inputs,t_decoder_inputs.shape[1],word2idx["<EOS>"],axis=1)


max_encoder_len = max([len(val) for val in t_encoder_inputs])
max_decoder_len = max([len(val) for val in t_decoder_inputs])


In [382]:
def test_generator(X,Y,Y_CC,Y_DC,batch_size=128,max_len=5):
    
        
    y_cc = to_categorical(Y_CC)
    y_dc = to_categorical(Y_DC)
    for idx in range(0,len(X),batch_size):

        encoder_input = np.zeros((batch_size,max_encoder_len))
#         decoder_input = np.zeros((batch_size,max_decoder_len))
        decoder_target = np.zeros((batch_size,max_encoder_len,len(word2idx)+1))
        for j,input_seq in enumerate(X[idx:idx+batch_size]):
            for i,word_idx in enumerate(input_seq):
                encoder_input[j,i]= word_idx
                decoder_target[j,i,word_idx] = 1
           
        yield [encoder_input,[decoder_target,y_cc[idx:idx+batch_size],y_dc[idx:idx+batch_size]]]

        

In [383]:
def all_data_generator(X,Y,Y_CC,Y_DC):
    encoder_input = np.zeros((len(X),max_encoder_len))
    decoder_input = np.zeros((len(X),max_decoder_len))
    decoder_target = np.zeros((len(X),max_encoder_len,len(word2idx)+1)) ## Extra index for padding, word2idx is 1 indexed
    for j,(input_seq,target_seq) in enumerate(zip(X,Y)):
        for i,word_idx in enumerate(input_seq):
            encoder_input[j,i]= word_idx
            decoder_target[j,i,word_idx] = 1
    
    y_cc = to_categorical(Y_CC)
    y_dc = to_categorical(Y_DC)

    return [encoder_input,[decoder_target,y_cc,y_dc]]

In [379]:
vocab_len= len(word2idx)

### Loading Glove Word Vectors

In [255]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [256]:
embedding_matrix = np.zeros((vocab_len+1, 100))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i+1] = embedding_vector

In [257]:
embedding_matrix.shape

(10002, 100)

### Building the Final model

In [398]:
inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")

encoder_embedding = Embedding(vocab_len+1,100,trainable=True,weights=[embedding_matrix],input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

encoder_inputs = encoder_embedding(inputs)


encoder = LSTM(64,return_state=True)


encoder_outputs, state_h, state_c = encoder(encoder_inputs)


encoder_states = [state_h,state_c]


########## Content Classification Part ###############

fully_connected_CC = Dense(128,
                       activation="tanh",
                       name="non_linear_CC")

logits_CC = fully_connected_CC(encoder_outputs)

softmax_layer_CC = Dense(2,
                     activation="softmax",
                     name="softmax_layer_CC")

output_CC = softmax_layer_CC(logits_CC)


######### Domain Classification Part ##############

fully_connected_DC = Dense(128,
                       activation="tanh",
                       name="non_linear_DC")

logits_DC = fully_connected_DC(encoder_outputs)

softmax_layer_DC = Dense(2,
                     activation="softmax",
                     name="softmax_layer_DC")

output_DC = softmax_layer_DC(logits_DC)


########### Autoencoder PART #############
decoder_inputs = RepeatVector(max_encoder_len)(encoder_outputs)


decoder_lstm = LSTM(64, 
                         return_state=True,
                        return_sequences=True,
                         name = 'decoder_lstm')


decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)


decoder_dense = Dense(vocab_len+1, 
                      activation='softmax', 
                      name = 'decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)



model = Model(inputs,[decoder_outputs,output_CC,output_DC])

In [399]:
model.summary()

Model: "model_25"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 10)           0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 10, 100)      1000200     encoder_inputs[0][0]             
__________________________________________________________________________________________________
lstm_28 (LSTM)                  [(None, 64), (None,  42240       encoder_embedding[0][0]          
__________________________________________________________________________________________________
repeat_vector_18 (RepeatVector) (None, 10, 64)       0           lstm_28[0][0]                    
___________________________________________________________________________________________

In [400]:
generator = test_generator(t_encoder_inputs,t_decoder_inputs,Y_CC,Y_DC,batch_size=1000)
encoder_input_data,[decoder_target_data,y_cc,y_dc] = next(generator)

In [401]:
model.compile(optimizer='rmsprop', loss=['categorical_crossentropy', 'binary_crossentropy', 'binary_crossentropy'],
             loss_weights=[0.1,0.6,-0.1])
history = model.fit(encoder_input_data, 
                    [decoder_target_data,y_cc,y_dc],
                    batch_size=128,
                    validation_split=0.1,
                    shuffle=True,
                    epochs=800)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 900 samples, validate on 100 samples
Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800


Epoch 28/800
Epoch 29/800
Epoch 30/800


KeyboardInterrupt: 

### Building the Autoencoder

In [261]:
inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")

encoder_embedding = Embedding(vocab_len+1,100,trainable=True,weights=[embedding_matrix],input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

encoder_inputs = encoder_embedding(inputs)

# encoder_inputs = LSTM(64,return_sequences=True)(encoder_inputs)
encoder = LSTM(64,return_state=True)

# encoder = Bidirectional(LSTM(64, 
#                     return_state=True, 
#                     name = 'encoder'))

# encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# # We discard `encoder_outputs` and only keep the states.

# encoder_states = [forward_h, backward_h]
encoder_states = [state_h,state_c]

# # Set up the decoder, using `encoder_states` as initial state.



decoder_inputs = RepeatVector(max_encoder_len)(encoder_outputs)

# # We set up our decoder to return full output sequences,
# # and to return internal states as well. We don't use the
# # return states in the training model, but we will use them in inference.

decoder_lstm = LSTM(64, 
                         return_state=True,
                        return_sequences=True,
                         name = 'decoder_lstm')


# # The inital_state call argument, specifying the initial state(s) of a RNN. 
# # This is used to pass the encoder states to the decoder as initial states.
# # Basically making the first memory of the decoder the encoded semantics
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)
# decoder_outputs = LSTM(128,return_sequences=True)(decoder_outputs)

# decoder_outputs = Dense(64,activation="tanh")(decoder_outputs)
decoder_dense = Dense(vocab_len+1, 
                      activation='softmax', 
                      name = 'decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# # Define the model that will turn
# # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(inputs,decoder_outputs)

In [262]:
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 10)           0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 10, 100)      1000200     encoder_inputs[0][0]             
__________________________________________________________________________________________________
lstm_13 (LSTM)                  [(None, 64), (None,  42240       encoder_embedding[0][0]          
__________________________________________________________________________________________________
repeat_vector_11 (RepeatVector) (None, 10, 64)       0           lstm_13[0][0]                    
___________________________________________________________________________________________

In [267]:
generator = test_generator(t_encoder_inputs,t_decoder_inputs,batch_size=128)

In [268]:
# [encoder_input_data,decoder_input_data],decoder_target_data = next(generator)
encoder_input_data,decoder_target_data = next(generator)

In [265]:
encoder_input_data,decoder_target_data = all_data_generator(t_encoder_inputs,t_decoder_inputs)

In [269]:
encoder_input_data.shape

(128, 10)

In [270]:
encoder_input_data[127]

array([  61.,  248.,  198., 2020., 9961.,   43.,   15., 1232.,   19.,
        848.])

In [271]:
# [encoder_input_data,decoder_input_data],decoder_target_data = all_data_generator(t_encoder_inputs,t_decoder_inputs)
# encoder_input_data,decoder_target_data = all_data_generator(t_encoder_inputs,t_decoder_inputs)

In [775]:
# decoder_target_data.shape

### Training the Model

In [272]:
encoder_input_data[0]

array([ 952., 1064., 2305.,    8.,  269., 4031.,   35.,   13.,  136.,
         12.])

In [274]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
history = model.fit(encoder_input_data, 
                    decoder_target_data,
                    batch_size=128,
                    epochs=800)
#                     validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78

Epoch 100/800
Epoch 101/800
Epoch 102/800
Epoch 103/800
Epoch 104/800
Epoch 105/800
Epoch 106/800
Epoch 107/800
Epoch 108/800
Epoch 109/800
Epoch 110/800
Epoch 111/800
Epoch 112/800
Epoch 113/800
Epoch 114/800
Epoch 115/800
Epoch 116/800
Epoch 117/800
Epoch 118/800
Epoch 119/800
Epoch 120/800
Epoch 121/800
Epoch 122/800
Epoch 123/800
Epoch 124/800
Epoch 125/800
Epoch 126/800
Epoch 127/800
Epoch 128/800
Epoch 129/800
Epoch 130/800
Epoch 131/800
Epoch 132/800
Epoch 133/800
Epoch 134/800
Epoch 135/800
Epoch 136/800
Epoch 137/800
Epoch 138/800
Epoch 139/800
Epoch 140/800
Epoch 141/800
Epoch 142/800
Epoch 143/800
Epoch 144/800
Epoch 145/800
Epoch 146/800
Epoch 147/800
Epoch 148/800
Epoch 149/800
Epoch 150/800
Epoch 151/800
Epoch 152/800
Epoch 153/800
Epoch 154/800
Epoch 155/800
Epoch 156/800
Epoch 157/800
Epoch 158/800
Epoch 159/800
Epoch 160/800
Epoch 161/800
Epoch 162/800
Epoch 163/800
Epoch 164/800
Epoch 165/800
Epoch 166/800
Epoch 167/800
Epoch 168/800
Epoch 169/800
Epoch 170/800
Epoch 

Epoch 198/800
Epoch 199/800
Epoch 200/800
Epoch 201/800
Epoch 202/800
Epoch 203/800
Epoch 204/800
Epoch 205/800
Epoch 206/800
Epoch 207/800
Epoch 208/800
Epoch 209/800
Epoch 210/800
Epoch 211/800
Epoch 212/800
Epoch 213/800
Epoch 214/800
Epoch 215/800
Epoch 216/800
Epoch 217/800
Epoch 218/800
Epoch 219/800
Epoch 220/800
Epoch 221/800
Epoch 222/800
Epoch 223/800
Epoch 224/800
Epoch 225/800
Epoch 226/800
Epoch 227/800
Epoch 228/800
Epoch 229/800
Epoch 230/800
Epoch 231/800
Epoch 232/800
Epoch 233/800
Epoch 234/800
Epoch 235/800
Epoch 236/800
Epoch 237/800
Epoch 238/800
Epoch 239/800
Epoch 240/800
Epoch 241/800
Epoch 242/800
Epoch 243/800
Epoch 244/800
Epoch 245/800
Epoch 246/800
Epoch 247/800
Epoch 248/800
Epoch 249/800
Epoch 250/800
Epoch 251/800
Epoch 252/800
Epoch 253/800
Epoch 254/800
Epoch 255/800
Epoch 256/800
Epoch 257/800
Epoch 258/800
Epoch 259/800
Epoch 260/800
Epoch 261/800
Epoch 262/800
Epoch 263/800
Epoch 264/800
Epoch 265/800
Epoch 266/800
Epoch 267/800
Epoch 268/800
Epoch 

KeyboardInterrupt: 

### Testing the Model

In [281]:
for idx in encoder_input_data[25]:
    print(idx2word[idx],end=" ")

the rumour meghan markle had a secret first husband where 

In [282]:
# model.compile('rmsprop', 'mse')
output_array = model.predict([encoder_input_data])

In [283]:
for idx in output_array[25]:
    lookup = np.argmax(idx)
#     print(lookup)
    if lookup==0:
        break
    else:
        print(idx2word[lookup],end=" ")

the the one had had secret secret husband where where 

In [187]:
for idx in output_array[100]:
    print(idx[np.argmax(20)])
    break
    lookup = np.argmax(idx)
#     print(lookup)
    if lookup==0:
        break
    else:
        print(idx2word[lookup],end=" ")

1.9437695e-07
