In [1]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Dense,Embedding,RepeatVector
from keras.models import Model
import re
import numpy as np
from scipy import stats
# import demoji
import tokenizer

Using TensorFlow backend.


In [7]:
domain1_path = "./dEFEND/gossipcop_content_no_ignore.tsv"
domain2_path = "./dEFEND/politifact_content_no_ignore.tsv"

In [8]:
domain1_frame = pd.read_csv(domain1_path,delimiter="\t").set_index('id')
domain2_frame = pd.read_csv(domain2_path,delimiter="\t").set_index('id')

### Counts for each label for the 2 domain

#### First Domain

In [9]:
domain1_frame

Unnamed: 0_level_0,label,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1
gossipcop-9096198130,1,Sarah Jessica Parker is getting candid about h...
gossipcop-6982710185,1,Many celebrities have been sharing their thoug...
gossipcop-7887456921,1,He reportedly hasn't seen her in over four yea...
gossipcop-1594778479,1,The fashion crowd is speaking out about Kim Ka...
gossipcop-8172018375,1,What term do you want to search? Search with g...
...,...,...
gossipcop-854842,0,Aisha Tyler‘s divorce from Jeffrey Tietjens ha...
gossipcop-843491,0,All four of Queen Elizabeth and Prince Philip'...
gossipcop-897778,0,Theresa Caputo is adjusting to her new life af...
gossipcop-899849,0,Follow Us on Twitter Nominations for the 25th...


In [10]:
domain1_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,3586
1,2230


#### Second Domain

In [11]:
domain2_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,145
1,270


### Cleaning the Text

In [12]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[()\"_#/@;*%:<>{}`+=~|.!?,'$-\[\]]", "", text)
    text = re.sub(r"[0-9]", "", text)
    
    return text

In [13]:
def tagger(decoder_input_sentence):
    start = "<BOS> "
    end = " <EOS>"
    final_target = [start + text + end for text in decoder_input_sentence] 
    return final_target

In [14]:
# def tokenizer(text_lists):
#     return [line.split(" ") for line in text_lists]

In [160]:
def create_vocab(text_lists):
    tokenizer = Tokenizer(num_words=20000,oov_token="<UNK>")
    tokenizer.fit_on_texts(text_lists)
    
    dictionary = tokenizer.word_index
    word2idx = {}
    idx2word = {}
    for k,v in dictionary.items():
        word2idx[k]=v
        idx2word[v]=k
    
    return word2idx,idx2word,tokenizer

In [161]:
encoder_inputs = domain1_frame['content'].apply(lambda x: clean_text(x)).values.tolist()+ domain2_frame['content'].apply(lambda x: clean_text(x)).values.tolist()
# encoder_inputs = 
decoder_inputs = tagger(encoder_inputs[:])


In [163]:
word2idx,idx2word,tokenizer = create_vocab(encoder_inputs)

### Due to ambiguity with regards to Keras Tokenizer num_words, below is a good enough fix, though it changes the tokenizer word_index outside of the class

In [214]:
num_words = 5000

sorted_by_word_count = sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)
tokenizer.word_index = {}
word2idx = {}
idx2word = {}
i = 0
for word,count in sorted_by_word_count:
    if i == num_words:
        break
    tokenizer.word_index[word] = i + 1    # <= because tokenizer is 1 indexed
    word2idx[word] = i+1
    idx2word[i+1]=word
    i += 1


In [215]:
tokenizer.word_index[tokenizer.oov_token] = num_words+1
word2idx[tokenizer.oov_token] = num_words+1
idx2word[num_words+1]=tokenizer.oov_token

In [216]:

t_encoder_inputs = tokenizer.texts_to_sequences(encoder_inputs)
t_decoder_inputs = tokenizer.texts_to_sequences(decoder_inputs)

t_encoder_inputs = pad_sequences(t_encoder_inputs,maxlen=50)

max_encoder_len = max([len(val) for val in t_encoder_inputs])
max_decoder_len = max([len(val) for val in t_decoder_inputs])


In [217]:
t_encoder_inputs[0]

array([ 952, 1064, 2305,  452,  340, 2960,  744,  881,    3,  420, 2256,
          6,  574,    2,    1,  275,  248, 4427,  123,  952, 1064, 2305,
        124,   21,   45, 5001,   80,  574,    2,    1,  275, 5001,  952,
       1064, 2305,    2, 5001, 5001, 4601,  250,  248, 4427,  574,    2,
          1,  275,  398, 1167,  353, 1613], dtype=int32)

In [362]:
def test_generator(X,Y,batch_size=128,max_len=5):
    
        
    
    for idx in range(0,len(X),batch_size):

        encoder_input = np.zeros((batch_size,max_encoder_len))
#         decoder_input = np.zeros((batch_size,max_decoder_len))
        decoder_target = np.zeros((batch_size,max_encoder_len,len(word2idx)+1))
        for j,input_seq in enumerate(X[idx:idx+batch_size]):
            for i,word_idx in enumerate(input_seq):
                encoder_input[j,i]= word_idx
                decoder_target[j,i,word_idx] = 1
           
        yield [encoder_input,decoder_target]

        

In [406]:
def all_data_generator(X,Y):
    encoder_input = np.zeros((len(X),max_encoder_len))
    decoder_input = np.zeros((len(X),max_decoder_len))
    decoder_target = np.zeros((len(X),max_encoder_len,len(word2idx)+1))
    for j,(input_seq,target_seq) in enumerate(zip(X,Y)):
        for i,word_idx in enumerate(input_seq):
            encoder_input[j,i]= word_idx
            decoder_target[j,i,word_idx] = 1

#         for i,word_idx in enumerate(target_seq):
#             decoder_input[j,i] = word_idx-1

#             if i>0:
#                 decoder_target[j,i-1,word_idx-1] = 1
    
#     return [[encoder_input,decoder_input],decoder_target]
    return [encoder_input,decoder_target]

In [407]:
generator = test_generator(t_encoder_inputs,t_decoder_inputs,batch_size=128)

In [408]:
vocab_len= len(word2idx)

In [409]:
vocab_len

5001

In [410]:
next(generator)[0].shape

(128, 50)

In [411]:
len(t_encoder_inputs)

6231

In [412]:
vocab_len

5001

### Building the Embedding Layer

In [413]:
vocab_len

5001

In [459]:
inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")

encoder_embedding = Embedding(vocab_len+1,100,input_length=max_encoder_len,name="encoder_embedding")

encoder_inputs = encoder_embedding(inputs)



encoder = LSTM(128, 
                    return_state=True, 
                    name = 'encoder')

encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# # We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# # Set up the decoder, using `encoder_states` as initial state.



decoder_inputs = RepeatVector(max_encoder_len)(encoder_outputs)

# # We set up our decoder to return full output sequences,
# # and to return internal states as well. We don't use the
# # return states in the training model, but we will use them in inference.

decoder_lstm = LSTM(128, 
                         return_state=True,
                        return_sequences=True,
                         name = 'decoder_lstm')

# # The inital_state call argument, specifying the initial state(s) of a RNN. 
# # This is used to pass the encoder states to the decoder as initial states.
# # Basically making the first memory of the decoder the encoded semantics
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)

decoder_dense = Dense(vocab_len+1, 
                      activation='softmax', 
                      name = 'decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# # Define the model that will turn
# # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(inputs,decoder_outputs)

In [460]:
model.summary()

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 50)           0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 50, 100)      500200      encoder_inputs[0][0]             
__________________________________________________________________________________________________
encoder (LSTM)                  [(None, 128), (None, 117248      encoder_embedding[0][0]          
__________________________________________________________________________________________________
repeat_vector_12 (RepeatVector) (None, 50, 128)      0           encoder[0][0]                    
___________________________________________________________________________________________

In [517]:
!pip install pydot



In [522]:
from IPython.display import SVG
from keras.utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [486]:
generator = test_generator(t_encoder_inputs,t_decoder_inputs,batch_size=120)

In [487]:
# [encoder_input_data,decoder_input_data],decoder_target_data = next(generator)
encoder_input_data,decoder_target_data = next(generator)

In [488]:
encoder_input_data.shape

(120, 50)

In [489]:
encoder_input_data[1]

array([4.248e+03, 8.200e+01, 1.380e+02, 4.800e+01, 4.530e+02, 3.000e+00,
       1.210e+02, 1.600e+01, 4.600e+01, 5.000e+00, 1.821e+03, 2.030e+03,
       5.000e+00, 5.460e+02, 2.100e+01, 4.500e+01, 5.001e+03, 1.500e+01,
       8.800e+01, 2.015e+03, 9.000e+00, 6.490e+02, 2.985e+03, 1.740e+03,
       2.100e+01, 4.500e+01, 5.001e+03, 6.300e+01, 9.000e+00, 5.001e+03,
       3.000e+00, 1.150e+03, 5.001e+03, 3.000e+00, 2.853e+03, 4.600e+01,
       5.001e+03, 4.300e+01, 1.600e+01, 5.590e+02, 5.001e+03, 3.840e+02,
       1.250e+02, 5.200e+01, 3.550e+02, 3.787e+03, 3.000e+00, 2.576e+03,
       5.001e+03, 4.930e+02])

In [490]:
len(encoder_input_data)

120

In [491]:
for idx in encoder_input_data[0]:
    print(idx2word[idx],end=" ")

sarah jessica parker wants sexual beast ellen degeneres to play samantha in sex and the city kim cattrall says sarah jessica parker could have been <UNK> over sex and the city <UNK> sarah jessica parker and <UNK> <UNK> reunite following kim cattrall sex and the city drama pics related gallery 

In [492]:
# model.compile('rmsprop', 'mse')
output_array = model.predict([encoder_input_data])

In [493]:
output_array.shape

(120, 50, 5002)

In [494]:
for idx in decoder_target_data[0]:
    lookup = np.argmax(idx)
#     print(lookup)
    if lookup==0:
        break
    else:
        print(idx2word[lookup],end=" ")

sarah jessica parker wants sexual beast ellen degeneres to play samantha in sex and the city kim cattrall says sarah jessica parker could have been <UNK> over sex and the city <UNK> sarah jessica parker and <UNK> <UNK> reunite following kim cattrall sex and the city drama pics related gallery 

In [501]:
for idx in output_array[100]:
    print(idx[np.argmax(20)])
    break
    lookup = np.argmax(idx)
#     print(lookup)
    if lookup==0:
        break
    else:
        print(idx2word[lookup],end=" ")

0.0007085485


In [471]:
# [encoder_input_data,decoder_input_data],decoder_target_data = all_data_generator(t_encoder_inputs,t_decoder_inputs)
encoder_input_data,decoder_target_data = all_data_generator(t_encoder_inputs,t_decoder_inputs)

In [472]:
decoder_target_data.shape

(6231, 50, 5002)

### Training the Model

In [473]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
history = model.fit(encoder_input_data, 
                    decoder_target_data,
                    epochs=4)
#                     validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


### Building the Autoencoder

In [None]:
encoder_inputs = Input(shape=(None, 1), 
                       name = 'encoder_inputs')