In [330]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
from scipy import stats
import demoji
import tokenizer

In [245]:
domain1_path = "./dEFEND/gossipcop_content_no_ignore.tsv"
domain2_path = "./dEFEND/politifact_content_no_ignore.tsv"

In [246]:
domain1_frame = pd.read_csv(domain1_path,delimiter="\t").set_index('id')
domain2_frame = pd.read_csv(domain2_path,delimiter="\t").set_index('id')

### Counts for each label for the 2 domain

#### First Domain

In [247]:
domain1_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,3586
1,2230


#### Second Domain

In [248]:
domain2_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,145
1,270


### Cleaning the Text

In [397]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[()\"_#/@;*%:<>{}`+=~|.!?,'$-\[\]]", "", text)
    text = re.sub(r"[0-9]", "", text)
    
    return text

In [398]:
def tagger(decoder_input_sentence):
    start = "<BOS> "
    end = " <EOS>"
    final_target = [start + text + end for text in decoder_input_sentence] 
    return final_target

In [399]:
# def tokenizer(text_lists):
#     return [line.split(" ") for line in text_lists]

In [472]:
def create_vocab(text_lists):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_lists)
    
    dictionary = tokenizer.word_index
    word2idx = {}
    idx2word = {}
    for k,v in dictionary.items():
        word2idx[k]=v
        idx2word[v]=k
    
    return word2idx,idx2word,tokenizer

In [474]:
t_word2idx,t_idx2word,tokenizer = create_vocab(decoder_inputs)

In [481]:
t_encoder_inputs = tokenizer.texts_to_sequences(encoder_inputs)
t_decoder_inputs = tokenizer.texts_to_sequences(decoder_inputs)

In [543]:
def test_generator(X,Y,batch_size=3,max_len=5):
    
    while True:
        X = pad_sequences(X,maxlen=max_len,padding="post",truncating="post")
        Y = pad_sequences(Y,maxlen=max_len+1,padding="post",truncating="post")
        for j in range(0,len(X),batch_size):
            encoder_input = X[j:j+batch_size]

            decoder_input = []
            decoder_target = []

            for target_seq in Y[j:j+batch_size]:
                decoder_input.append(target_seq)
                decoder_target.append(np.append(target_seq[1:],t_word2idx["eos"]))

        yield [[encoder_input,decoder_input],decoder_target]
            
        

In [544]:
generator = test_generator(t_encoder_inputs,t_decoder_inputs)

In [550]:
next(generator)

[[array([[ 3029,   503,   514, 29098,  4799],
         [ 2389,   683,     1,  1178,     5],
         [    4,  3300,   723, 13699,  3891]], dtype=int32),
  [array([   65,  3029,   503,   514, 29098,  4799], dtype=int32),
   array([  65, 2389,  683,    1, 1178,    5], dtype=int32),
   array([   65,     4,  3300,   723, 13699,  3891], dtype=int32)]],
 [array([ 3029,   503,   514, 29098,  4799,    64]),
  array([2389,  683,    1, 1178,    5,   64]),
  array([    4,  3300,   723, 13699,  3891,    64])]]

In [414]:
encoder_inputs = domain1_frame['content'].apply(lambda x: clean_text(x)).values.tolist()+ domain2_frame['content'].apply(lambda x: clean_text(x)).values.tolist()
decoder_inputs = tagger(encoder_inputs)


In [402]:
vocab = set()

for sent in decoder_inputs:
    for word in sent.split(" "):
        if word not in vocab:
            vocab.add(word)

vocab = sorted(vocab)

In [403]:
word2idx = {}
idx2word = {}

for idx,word in enumerate(vocab):
    word2idx[word]=idx
    idx2word[idx]=word

In [406]:
vocab_len = len(word2idx)

In [412]:
max_len_input = max([len(sent.split()) for sent in decoder_inputs])

In [440]:
max_len_input =5

### Padding the sequences

In [469]:
def generate_batch(X,Y,batch_size=3):
    while True:
        for j in range(0,len(X),batch_size):
            encoder_input_data = np.zeros((batch_size,max_len_input),dtype='float32')
            decoder_input_data = np.zeros((batch_size,max_len_input+1),dtype="float32")
            decoder_target_data = np.zeros((batch_size,max_len_input+1),dtype="float32")
            
            for i,(input_text,target_text) in enumerate(zip(X[j:j+batch_size],Y[j:j+batch_size])):
                for t,word in enumerate(input_text.split(" ")[:max_len_input]):
                    encoder_input_data[i,t] = word2idx[word]
                    
                ### Decoder target data is one timestep ahead of decoder input data
                
                for t,word in enumerate(target_text.split(" ")[:max_len_input+1]):
                    if t<len(target_text.split(" ")[:max_len_input+1]):
                        decoder_input_data[i,t] = word2idx[word]
                    if t>0:
                        decoder_target_data[i,t-1] = word2idx[word]
                    decoder_target_data[i,-1] = word2idx["<EOS>"]
        
        yield [[encoder_input_data,decoder_input_data],decoder_target_data]

In [470]:
generator = generate_batch(encoder_inputs,decoder_inputs)