In [2]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
from scipy import stats
# import demoji
import tokenizer

In [3]:
# Tab-separated content files for the two domains (gossipcop and politifact),
# given relative to the notebook's working directory.
domain1_path = "./dEFEND/gossipcop_content_no_ignore.tsv"
domain2_path = "./dEFEND/politifact_content_no_ignore.tsv"

In [4]:
# Read each domain's TSV file and use the `id` column as the row index.
domain1_frame = pd.read_csv(domain1_path, sep="\t").set_index("id")
domain2_frame = pd.read_csv(domain2_path, sep="\t").set_index("id")

### Label counts for each of the two domains

#### First Domain

In [68]:
# Display the first-domain frame (rows indexed by `id`).
domain1_frame

Unnamed: 0_level_0,label,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1
gossipcop-9096198130,1,Sarah Jessica Parker is getting candid about h...
gossipcop-6982710185,1,Many celebrities have been sharing their thoug...
gossipcop-7887456921,1,He reportedly hasn't seen her in over four yea...
gossipcop-1594778479,1,The fashion crowd is speaking out about Kim Ka...
gossipcop-8172018375,1,What term do you want to search? Search with g...
...,...,...
gossipcop-854842,0,Aisha Tyler‘s divorce from Jeffrey Tietjens ha...
gossipcop-843491,0,All four of Queen Elizabeth and Prince Philip'...
gossipcop-897778,0,Theresa Caputo is adjusting to her new life af...
gossipcop-899849,0,Follow Us on Twitter Nominations for the 25th...


In [5]:
# Number of rows per `label` value in the first domain.
domain1_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,3586
1,2230


#### Second Domain

In [6]:
# Number of rows per `label` value in the second domain.
domain2_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,145
1,270


### Cleaning the Text

In [7]:
def clean_text(text):
    '''Lower-case `text`, expand common English contractions and strip punctuation and digits.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lower-cased text with contractions expanded and the listed
        punctuation characters and the digits 0-9 removed. Whitespace is left
        untouched, so removed tokens can leave double or trailing spaces.
    '''
    text = text.lower()

    # Order matters: specific contractions ("won't", "can't") must be expanded
    # before the generic "n't" rule, and "n't" before the dropped-g rule "n'".
    contractions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"'ll", " will"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    # Every character to drop is listed explicitly. The hyphen is escaped so it
    # is a literal and not a range operator; "&" is listed because the previous
    # unescaped "$-[" range removed it too.
    text = re.sub(r"[()\"_#/@;*%:<>{}`+=~|.!?,'$&\-\[\]]", "", text)
    text = re.sub(r"[0-9]", "", text)

    return text

In [8]:
def tagger(decoder_input_sentence):
    '''Wrap every sentence in begin/end-of-sequence markers.

    Each string in `decoder_input_sentence` is returned as
    "<BOS> " + sentence + " <EOS>", in the same order, as a new list.
    '''
    bos_prefix = "<BOS> "
    eos_suffix = " <EOS>"

    tagged = []
    for sentence in decoder_input_sentence:
        tagged.append(bos_prefix + sentence + eos_suffix)
    return tagged

In [399]:
# def tokenizer(text_lists):
#     return [line.split(" ") for line in text_lists]

In [9]:
def create_vocab(text_lists):
    '''Fit a default `Tokenizer` on `text_lists` and expose its vocabulary.

    Returns a tuple `(word2idx, idx2word, tokenizer)`: `word2idx` is a copy of
    the fitted tokenizer's `word_index`, `idx2word` is the inverse mapping, and
    `tokenizer` is the fitted `Tokenizer` instance itself.

    NOTE(review): `Tokenizer()` is created with its default settings; with
    Keras' default `filters` the "<" and ">" characters are presumably stripped,
    so "<BOS>"/"<EOS>" would be indexed as "bos"/"eos" -- confirm.
    '''
    fitted = Tokenizer()
    fitted.fit_on_texts(text_lists)

    word2idx = dict(fitted.word_index)
    idx2word = {index: word for word, index in fitted.word_index.items()}

    return word2idx, idx2word, fitted

In [35]:
# Cleaned article texts: all of domain 1 followed by all of domain 2.
encoder_inputs = (
    domain1_frame['content'].apply(clean_text).values.tolist()
    + domain2_frame['content'].apply(clean_text).values.tolist()
)
# The same texts wrapped in "<BOS> ... <EOS>" markers.
decoder_inputs = tagger(encoder_inputs)


In [18]:
# Fit the vocabulary on the tagged sentences.
# NOTE: this rebinds the name `tokenizer`, which until now referred to the
# `tokenizer` module imported at the top of the notebook.
t_word2idx,t_idx2word,tokenizer = create_vocab(decoder_inputs)

In [37]:

# Convert both the plain and the tagged texts to sequences with the fitted tokenizer.
t_encoder_inputs = tokenizer.texts_to_sequences(encoder_inputs)
t_decoder_inputs = tokenizer.texts_to_sequences(decoder_inputs)

# Length of the longest sequence on each side.
max_encoder_len = max(map(len, t_encoder_inputs))
max_decoder_len = max(map(len, t_decoder_inputs))

In [64]:
def test_generator(X,Y,batch_size=3,max_len=5):
    '''Endlessly yield zero-padded encoder/decoder batches built from index sequences.

    Parameters
    ----------
    X : sequence of sequences of numbers
        Encoder sequences; each is truncated to `max_len` items.
    Y : sequence of sequences of numbers
        Decoder sequences; each is truncated to `max_len + 1` items.
    batch_size : int
        Number of rows in every yielded array. A final short batch keeps this
        row count; the unused rows stay all-zero.
    max_len : int
        Width of the encoder array; the decoder arrays are one column wider.

    Yields
    ------
    list
        `[[encoder_input, decoder_input], decoder_target]`, where
        `decoder_target` is `decoder_input` shifted one step to the left.

    Raises
    ------
    ValueError
        If `X` is empty (the generator would otherwise loop forever without
        yielding anything).
    '''
    if len(X) == 0:
        raise ValueError("test_generator needs at least one sample in X")

    while True:
        # `start` is the batch offset; it must not be reused as a row index,
        # otherwise the slice taken from Y no longer lines up with X.
        for start in range(0, len(X), batch_size):
            encoder_input = np.zeros((batch_size, max_len))
            for row, input_seq in enumerate(X[start:start + batch_size]):
                clipped = input_seq[:max_len]
                encoder_input[row, :len(clipped)] = clipped

            decoder_input = np.zeros((batch_size, max_len + 1))
            decoder_target = np.zeros((batch_size, max_len + 1))
            for row, target_seq in enumerate(Y[start:start + batch_size]):
                clipped = target_seq[:max_len + 1]
                decoder_input[row, :len(clipped)] = clipped
                # The target is the same sequence one timestep ahead.
                shifted = clipped[1:]
                decoder_target[row, :len(shifted)] = shifted

            yield [[encoder_input, decoder_input], decoder_target]
            
        

In [65]:
# Toy index sequences for trying out test_generator.
# NOTE: these assignments overwrite the tokenized corpus stored earlier under
# the same two names.
t_encoder_inputs = [[1,2,3,4]]
t_decoder_inputs = [[65,1,2,3,4,64]]

In [66]:
# Build the batch generator with its default batch_size=3 and max_len=5.
generator = test_generator(t_encoder_inputs,t_decoder_inputs)

In [67]:
# Pull a single batch to inspect the padded arrays.
next(generator)

[[array([[1., 2., 3., 4., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]), array([[65.,  1.,  2.,  3.,  4., 64.],
         [ 0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.]])],
 array([[ 1.,  2.,  3.,  4., 64.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.]])]

In [402]:
# Distinct tokens of the tagged sentences (split on single spaces), in sorted order.
vocab = sorted({word for sent in decoder_inputs for word in sent.split(" ")})

In [403]:
# Position of each token in the sorted vocabulary, and the inverse lookup.
# NOTE(review): indices start at 0, which is also the value the batch arrays
# are padded with -- confirm that this overlap is intended.
idx2word = dict(enumerate(vocab))
word2idx = {word: idx for idx, word in idx2word.items()}

In [406]:
# Number of entries in the word -> index mapping.
vocab_len = len(word2idx)

In [412]:
# Token count of the longest tagged sentence.
# NOTE: this counts with split() (any whitespace run), whereas the vocabulary
# and the batch generator split on single spaces with split(" ").
max_len_input = max([len(sent.split()) for sent in decoder_inputs])

In [440]:
# Replaces the computed maximum with a fixed window of 5 tokens.
# NOTE(review): looks like a debugging value -- confirm before training.
max_len_input =5

### Padding the sequences

In [469]:
def generate_batch(X,Y,batch_size=3,max_len=None,vocab=None):
    '''Endlessly yield zero-padded encoder/decoder index batches built from raw text.

    Parameters
    ----------
    X : sequence of str
        Encoder texts; each is split on single spaces and truncated to
        `max_len` tokens.
    Y : sequence of str
        Decoder texts; each is split on single spaces and truncated to
        `max_len + 1` tokens.
    batch_size : int
        Number of rows in every yielded array. A final short batch keeps this
        row count; the unused rows stay all-zero.
    max_len : int, optional
        Encoder width. Defaults to the notebook-level `max_len_input`.
    vocab : dict, optional
        Token -> index mapping; must contain "<EOS>". Defaults to the
        notebook-level `word2idx`.

    Yields
    ------
    list
        `[[encoder_input_data, decoder_input_data], decoder_target_data]`, all
        float32 arrays. The target is the decoder input one timestep ahead.
        When a decoder text had to be truncated, the last target slot is set
        to the "<EOS>" index.

    Raises
    ------
    ValueError
        If `X` is empty (the generator would otherwise loop forever without
        yielding anything).
    KeyError
        If a token (or "<EOS>") is missing from `vocab`.
    '''
    if max_len is None:
        max_len = max_len_input
    if vocab is None:
        vocab = word2idx
    if len(X) == 0:
        raise ValueError("generate_batch needs at least one sample in X")

    decoder_width = max_len + 1
    eos_idx = vocab["<EOS>"]

    while True:
        for start in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_len), dtype='float32')
            decoder_input_data = np.zeros((batch_size, decoder_width), dtype="float32")
            decoder_target_data = np.zeros((batch_size, decoder_width), dtype="float32")

            batch_pairs = zip(X[start:start + batch_size], Y[start:start + batch_size])
            for i, (input_text, target_text) in enumerate(batch_pairs):
                for t, word in enumerate(input_text.split(" ")[:max_len]):
                    encoder_input_data[i, t] = vocab[word]

                # Decoder target data is one timestep ahead of decoder input data.
                target_words = target_text.split(" ")
                for t, word in enumerate(target_words[:decoder_width]):
                    decoder_input_data[i, t] = vocab[word]
                    if t > 0:
                        decoder_target_data[i, t - 1] = vocab[word]

                # Truncation cuts off the real end marker, so close the target
                # with "<EOS>". Shorter rows already contain it and keep their
                # zero padding.
                if len(target_words) > decoder_width:
                    decoder_target_data[i, -1] = eos_idx

            # Yield once per batch; yielding after the loop would only ever
            # emit the final batch of each pass.
            yield [[encoder_input_data, decoder_input_data], decoder_target_data]

In [470]:
# Batch generator over the cleaned texts and their tagged counterparts.
# NOTE: this rebinds `generator`, which earlier held the test_generator instance.
generator = generate_batch(encoder_inputs,decoder_inputs)