<a href="https://colab.research.google.com/github/arnavc1712/Cross-Domain-Fake-News-Detection/blob/master/Cross_Domain_Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Dense,Embedding,RepeatVector,Bidirectional
from keras.models import Model
import re
import numpy as np
from scipy import stats
# import demoji
# import tokenizer
# from nltk.corpus import stopwords
# stopwords_list = list(set(stopwords.words('english')))
from keras.utils import to_categorical
import os
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

Using TensorFlow backend.


In [0]:
domain1_path = "./gossipcop_content_no_ignore.tsv"
domain2_path = "./politifact_content_no_ignore.tsv"

In [0]:
domain1_frame = pd.read_csv(domain1_path,delimiter="\t").set_index('id')
domain2_frame = pd.read_csv(domain2_path,delimiter="\t").set_index('id')

### Exploring the Dataset

#### First Domain

In [4]:
domain1_frame

Unnamed: 0_level_0,label,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1
gossipcop-9096198130,1,Sarah Jessica Parker is getting candid about h...
gossipcop-6982710185,1,Many celebrities have been sharing their thoug...
gossipcop-7887456921,1,He reportedly hasn't seen her in over four yea...
gossipcop-1594778479,1,The fashion crowd is speaking out about Kim Ka...
gossipcop-8172018375,1,What term do you want to search? Search with g...
...,...,...
gossipcop-854842,0,Aisha Tyler‘s divorce from Jeffrey Tietjens ha...
gossipcop-843491,0,All four of Queen Elizabeth and Prince Philip'...
gossipcop-897778,0,Theresa Caputo is adjusting to her new life af...
gossipcop-899849,0,Follow Us on Twitter Nominations for the 25th...


In [5]:
domain1_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,3586
1,2230


#### Second Domain

In [6]:
domain2_frame.groupby(['label'])[['label']].count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,145
1,270


### Preprocessing and Feature Engineering

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
nltk.download('punkt')
nltk.download('wordnet')

class Preprocess:

  def __init__(self):
    self.wordnet_lemmatizer = WordNetLemmatizer()
    self.porter = PorterStemmer()

  def clean_text(self,text):
      '''Clean text by removing unnecessary characters and altering the format of words.'''

      text = text.lower()
      text = re.sub(r"i'm", "i am", text)
      text = re.sub(r"he's", "he is", text)
      text = re.sub(r"she's", "she is", text)
      text = re.sub(r"it's", "it is", text)
      text = re.sub(r"that's", "that is", text)
      text = re.sub(r"what's", "that is", text)
      text = re.sub(r"where's", "where is", text)
      text = re.sub(r"how's", "how is", text)
      text = re.sub(r"\'ll", " will", text)
      text = re.sub(r"\'ve", " have", text)
      text = re.sub(r"\'re", " are", text)
      text = re.sub(r"\'d", " would", text)
      text = re.sub(r"\'re", " are", text)
      text = re.sub(r"won't", "will not", text)
      text = re.sub(r"can't", "cannot", text)
      text = re.sub(r"n't", " not", text)
      text = re.sub(r"n'", "ng", text)
      text = re.sub(r"'bout", "about", text)
      text = re.sub(r"'til", "until", text)
      text = re.sub(r"[()\"_#/@;*%:{}<>`+=~|.!?,'$-\[\]]", "", text)
      text = re.sub(r"[0-9]", "", text)
      
      return text
    
  def lemmatizer(self,text):
    sentence_words = nltk.word_tokenize(text)
    ret_text = []
    for word in sentence_words:
      ret_text.append(self.wordnet_lemmatizer.lemmatize(word))
    
    return " ".join(ret_text)


  def stemmer(self,text):
    sentence_words = nltk.word_tokenize(text)
    ret_text = []
    for word in sentence_words:
      ret_text.append(self.porter.stem(word))
    
    return " ".join(ret_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preparing Data for the Model

## Encoder data

In [0]:
def encoder_data(df,lemmatize=False,stem=False):
  preprocessor = Preprocess()
  encoder_inputs = df['content'].apply(lambda x: preprocessor.clean_text(x))
  if lemmatize:
    encoder_inputs = df['content'].apply(lambda x: preprocessor.lemmatizer(x))
  if stem:
    encoder_inputs = df['content'].apply(lambda x: preprocessor.stemmer(x))
  encoder_inputs = np.array(encoder_inputs.values.tolist())
  return encoder_inputs


## Fake news detection training data

In [0]:
def fake_news_target(df):
  y_cc = np.array(df['label'].apply(lambda x:int(x)).values.tolist())
  return y_cc


## Domain classification Data

In [0]:
def domain_target(df):
  y_dc = np.array([0]*len(domain1_frame) + [1]*len(domain2_frame))
  return y_dc


## Shuffling the data

In [0]:
class Shuffle:

  def __init__(self,data_len):
    self.idx = np.arange(data_len)
    np.random.shuffle(self.idx)
  
  def shuffle(self,data):
    return np.array(data)[self.idx]

## Creating word to index mapping and vice versa

In [0]:
def create_vocab(text_lists,vocab_len): ## Creating the word2idx, idx2word mapping using the Keras Tokenizer
    tokenizer = Tokenizer(oov_token="<UNK>")
    tokenizer.fit_on_texts(text_lists)
    ## Due to ambiguity with regards to Keras Tokenizer num_words, below is a good enough fix, 
    ## though it changes the tokenizer word_index outside of the class
    num_words = vocab_len

    sorted_by_word_count = sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)
    tokenizer.word_index = {}
    word2idx = {}
    idx2word = {}
    i = 0
    for word,count in sorted_by_word_count:
        if i == num_words:
            break

        tokenizer.word_index[word] = i + 1    # <= because tokenizer is 1 indexed
        word2idx[word] = i+1
        idx2word[i+1]=word
        i += 1
    
    tokenizer.word_index[tokenizer.oov_token] = num_words+1
    word2idx[tokenizer.oov_token] = num_words+1
    idx2word[num_words+1]=tokenizer.oov_token
    
    return word2idx,idx2word,tokenizer

## Tokenizing and Padding/Truncating Data

In [0]:
def pad_tokenize_data(encoder_inputs,max_sentence_length,tokenizer):

  t_encoder_inputs = tokenizer.texts_to_sequences(encoder_inputs)
  t_encoder_inputs = pad_sequences(t_encoder_inputs,maxlen=max_sentence_length,padding='post', truncating='post')


  return t_encoder_inputs


## Defining Data Generators

In [0]:
def batch_generator(X,Y_CC,Y_DC,max_sentence_length,word2idx,batch_size=128):
    
        
    y_cc = to_categorical(Y_CC)
    y_dc = to_categorical(Y_DC)
    for idx in range(0,len(X),batch_size):
        encoder_input = np.zeros((batch_size,max_sentence_length))
        decoder_target = np.zeros((batch_size,max_sentence_length,len(word2idx)+1))
        for j,input_seq in enumerate(X[idx:idx+batch_size]):
            for i,word_idx in enumerate(input_seq):
                encoder_input[j,i]= word_idx
                decoder_target[j,i,word_idx] = 1
           
        yield [encoder_input,[decoder_target,y_cc[idx:idx+batch_size],y_dc[idx:idx+batch_size]]]

In [0]:
def all_data_generator(X,Y_CC,Y_DC,max_sentence_length,word2idx):
    encoder_input = np.zeros((len(X),max_sentence_length))
    decoder_target = np.zeros((len(X),max_sentence_length,len(word2idx)+1)) ## Extra index for padding, word2idx is 1 indexed
    for j,input_seq in enumerate(X):
        for i,word_idx in enumerate(input_seq):
            encoder_input[j,i]= word_idx
            decoder_target[j,i,word_idx] = 1
    
    y_cc = to_categorical(Y_CC)
    y_dc = to_categorical(Y_DC)

    return [encoder_input,[decoder_target,y_cc,y_dc]]

## Loading Glove Word Vectors

In [0]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [0]:
# import zipfile
# zip_ref = zipfile.ZipFile("./glove.6B.zip", 'r')
# zip_ref.extractall("./glove/")
# zip_ref.close()

In [0]:
embeddings_index = dict()
f = open('./glove/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [0]:
def generate_glove_matrix(vocab_len):
  embedding_matrix = np.zeros((vocab_len+1, 100))
  for word, i in word2idx.items():
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          embedding_matrix[i+1] = embedding_vector
  return embedding_matrix

# Building our models

## Building domain specific classification Model

In [0]:
def classification_model(max_encoder_len,embedding_dim,latent_dim,include_glove=False):
  inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")

  if include_glove:
    encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=False,weights=[embedding_matrix],input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  else:
    encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=True,input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  encoder_inputs = encoder_embedding(inputs)


  encoder = LSTM(latent_dim,return_state=True)


  encoder_outputs, state_h, state_c = encoder(encoder_inputs)

  fully_connected_CC = Dense(128,
                        activation="tanh",
                        name="non_linear_CC")

  logits_CC = fully_connected_CC(encoder_outputs)

  softmax_layer_CC = Dense(2,
                      activation="softmax",
                      name="softmax_layer_CC")

  output_CC = softmax_layer_CC(logits_CC)

  model = Model(inputs,output_CC)

  return model




## Building domain independent model with domain loss

In [0]:
def classification_domain_model(max_encoder_len,embedding_dim,latent_dim,include_glove=False):
  inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")


  if include_glove:
    encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=False,weights=[embedding_matrix],input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  else:
    encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=True,input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  encoder_inputs = encoder_embedding(inputs)


  encoder = LSTM(latent_dim,return_state=True)


  encoder_outputs, state_h, state_c = encoder(encoder_inputs)


  encoder_states = [state_h,state_c]


  ########## Content Classification Part ###############

  fully_connected_CC = Dense(128,
                        activation="tanh",
                        name="non_linear_CC")

  logits_CC = fully_connected_CC(encoder_outputs)

  softmax_layer_CC = Dense(2,
                      activation="softmax",
                      name="softmax_layer_CC")

  output_CC = softmax_layer_CC(logits_CC)


  ######### Domain Classification Part ##############

  fully_connected_DC = Dense(128,
                        activation="tanh",
                        name="non_linear_DC")

  logits_DC = fully_connected_DC(encoder_outputs)

  softmax_layer_DC = Dense(2,
                      activation="softmax",
                      name="softmax_layer_DC")

  output_DC = softmax_layer_DC(logits_DC)


  model = Model(inputs,[output_CC,output_DC])

  return model



## Building Plain Autoencoder

In [0]:
def autoencoder(max_encoder_len,embedding_dim,latent_dim):
  inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")


  encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=True,input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  encoder_inputs = encoder_embedding(inputs)


  encoder = LSTM(latent_dim,return_state=True)


  encoder_outputs, state_h, state_c = encoder(encoder_inputs)


  encoder_states = [state_h,state_c]

  decoder_inputs = RepeatVector(max_encoder_len)(encoder_outputs)


  decoder_lstm = LSTM(64, 
                          return_state=True,
                          return_sequences=True,
                          name = 'decoder_lstm')


  decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)


  decoder_dense = Dense(vocab_len+1, 
                        activation='softmax', 
                        name = 'decoder_dense')
  decoder_outputs = decoder_dense(decoder_outputs)



  model = Model(inputs,decoder_outputs)

  return model


## Building the Final model

In [0]:
def final_model(max_encoder_len,embedding_dim,latent_dim,include_glove=False):
  inputs = Input(shape=(max_encoder_len,),name="encoder_inputs")


  if include_glove:
    encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=False,weights=[embedding_matrix],input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  else:
    encoder_embedding = Embedding(vocab_len+1,embedding_dim,trainable=True,input_length=max_encoder_len,mask_zero=True,name="encoder_embedding")

  encoder_inputs = encoder_embedding(inputs)


  encoder = LSTM(latent_dim,return_state=True)


  encoder_outputs, state_h, state_c = encoder(encoder_inputs)


  encoder_states = [state_h,state_c]


  ########## Content Classification Part ###############

  fully_connected_CC = Dense(128,
                        activation="tanh",
                        name="non_linear_CC")

  logits_CC = fully_connected_CC(encoder_outputs)

  softmax_layer_CC = Dense(2,
                      activation="softmax",
                      name="softmax_layer_CC")

  output_CC = softmax_layer_CC(logits_CC)


  ######### Domain Classification Part ##############

  fully_connected_DC = Dense(128,
                        activation="tanh",
                        name="non_linear_DC")

  logits_DC = fully_connected_DC(encoder_outputs)

  softmax_layer_DC = Dense(2,
                      activation="softmax",
                      name="softmax_layer_DC")

  output_DC = softmax_layer_DC(logits_DC)


  ########### Autoencoder PART #############
  decoder_inputs = RepeatVector(max_encoder_len)(encoder_outputs)


  decoder_lstm = LSTM(64, 
                          return_state=True,
                          return_sequences=True,
                          name = 'decoder_lstm')


  decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)


  decoder_dense = Dense(vocab_len+1, 
                        activation='softmax', 
                        name = 'decoder_dense')
  decoder_outputs = decoder_dense(decoder_outputs)



  model = Model(inputs,[decoder_outputs,output_CC,output_DC])

  return model

In [0]:
def print_model_summary(model):
  model.summary()

In [0]:
# from keras import backend as K

# def recall_m(y_true, y_pred):
#         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#         recall = true_positives / (possible_positives + K.epsilon())
#         return recall

# def precision_m(y_true, y_pred):
#         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#         precision = true_positives / (predicted_positives + K.epsilon())
#         return precision

# def f1_m(y_true, y_pred):
#     precision = precision_m(y_true, y_pred)
#     recall = recall_m(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Experiments

## Experiment 1: Domain Specific Classification

In [0]:
def prepare_training_data(df,vocab_size):
  encoder_inputs = encoder_data(df)
  y_cc = fake_news_target(df)
  y_dc = domain_target(df)

  shuffle = Shuffle(len(encoder_inputs)) ## Initializing the shuffle class instance
  encoder_inputs = shuffle.shuffle(encoder_inputs)
  y_cc = shuffle.shuffle(y_cc)
  y_dc = shuffle.shuffle(y_dc) ## Not used

  word2idx,idx2word,tokenizer = create_vocab(encoder_inputs,vocab_size)

  vocab_len = len(word2idx)

  encoder_inputs = pad_tokenize_data(encoder_inputs,max_sentence_length,tokenizer)

  encoder_inputs,[decoder_target,y_cc,y_dc] = all_data_generator(encoder_inputs,y_cc,y_dc,max_sentence_length,word2idx)

  train_X, test_X, train_Y, test_Y = train_test_split(encoder_inputs,y_cc,test_size=0.1,random_state=42)

  return [[train_X, test_X, train_Y, test_Y],[word2idx,idx2word,tokenizer],vocab_len]

  

In [0]:
def prepare_testing_data(df,word2idx):
  encoder_inputs = encoder_data(df)
  y_cc = fake_news_target(df)
  y_dc = domain_target(df)

  encoder_inputs = pad_tokenize_data(encoder_inputs,max_sentence_length,tokenizer)

  encoder_inputs,[decoder_target,y_cc,y_dc] = all_data_generator(encoder_inputs,y_cc,y_dc,max_sentence_length,word2idx)

  return [encoder_inputs,[decoder_target,y_cc,y_dc]]



### Training

In [0]:
max_sentence_length = 100
embedding_dim = 100
latent_dim=64
vocab_size=5000


In [0]:
[train_X, test_X, train_Y, test_Y],[word2idx,idx2word,tokenizer],vocab_len = prepare_training_data(domain1_frame,vocab_size=vocab_size)

In [0]:
embedding_matrix = generate_glove_matrix(vocab_len)

In [31]:
model = classification_model(max_sentence_length,embedding_dim,latent_dim,include_glove=False)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [32]:
print_model_summary(model)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_inputs (InputLayer)  (None, 100)               0         
_________________________________________________________________
encoder_embedding (Embedding (None, 100, 100)          500200    
_________________________________________________________________
lstm_1 (LSTM)                [(None, 64), (None, 64),  42240     
_________________________________________________________________
non_linear_CC (Dense)        (None, 128)               8320      
_________________________________________________________________
softmax_layer_CC (Dense)     (None, 2)                 258       
Total params: 551,018
Trainable params: 551,018
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.compile(optimizer="rmsprop", loss='binary_crossentropy',metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', patience=5,verbose=1)
history = model.fit(train_X, 
                    train_Y,
                    batch_size=256,
                    validation_split=0.1,
#                     shuffle=True,
                    callbacks=[es],
                    epochs=30)






Train on 4710 samples, validate on 524 samples
Epoch 1/30





Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 00007: early stopping


### Testing

In [0]:
y_pred = model.predict(test_X)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in test_Y])

In [35]:
print(f"Prediction acuracy on same domain is {round(accuracy_score(y_true,y_pred),2)}")
print(f"Precision on same domain is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on same domain is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on same domain is {round(f1_score(y_true,y_pred),2)}")
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]), 
    index=['true:Fake', 'true:Real'], 
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)

Prediction acuracy on same domain is 0.71
Precision on same domain is 0.66
Recall on same domain is 0.54
F1 on same domain is 0.6


           pred:Fake  pred:Real
true:Fake        123        103
true:Real         64        292


In [0]:
d2_encoder_inputs,[decoder_target,d2_y_cc,d2_y_dc] = prepare_testing_data(domain2_frame,word2idx)

In [0]:
y_pred = model.predict(d2_encoder_inputs)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in d2_y_cc])

In [38]:
print(f"Prediction acuracy on different domain is {round(accuracy_score(y_true,y_pred),2)}")
print(f"Precision on different domain is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on different domain is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on different domain is {round(f1_score(y_true,y_pred),2)}")
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]), 
    index=['true:Fake', 'true:Real'], 
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)

Prediction acuracy on different domain is 0.49
Precision on different domain is 0.72
Recall on different domain is 0.34
F1 on different domain is 0.47


           pred:Fake  pred:Real
true:Fake         93        177
true:Real         36        109


## Experiment 2: Domain Independent (No AutoEncoder)

In [0]:
def prepare_training_data(df,vocab_size):
  encoder_inputs = encoder_data(df)
  y_cc = fake_news_target(df)
  y_dc = domain_target(df)

  shuffle = Shuffle(len(encoder_inputs)) ## Initializing the shuffle class instance
  encoder_inputs = shuffle.shuffle(encoder_inputs)
  y_cc = shuffle.shuffle(y_cc)
  y_dc = shuffle.shuffle(y_dc) 

  word2idx,idx2word,tokenizer = create_vocab(encoder_inputs,vocab_size)

  vocab_len = len(word2idx)

  encoder_inputs = pad_tokenize_data(encoder_inputs,max_sentence_length,tokenizer)

  encoder_inputs,[decoder_target,y_cc,y_dc] = all_data_generator(encoder_inputs,y_cc,y_dc,max_sentence_length,word2idx)

  train_X, test_X, train_C_Y, test_C_Y,train_D_Y,test_D_Y = train_test_split(encoder_inputs,y_cc,y_dc,test_size=0.1,random_state=42)

  return [[train_X, test_X, train_C_Y, test_C_Y,train_D_Y,test_D_Y],[word2idx,idx2word,tokenizer],vocab_len]

  

### Training

In [0]:
max_sentence_length = 100
embedding_dim = 100
latent_dim=64
vocab_size=2000

In [0]:
[train_X, test_X, train_C_Y, test_C_Y,train_D_Y,test_D_Y],[word2idx,idx2word,tokenizer],vocab_len = prepare_training_data(pd.concat([domain1_frame,domain2_frame]),vocab_size=vocab_size)

In [0]:
model = classification_domain_model(max_sentence_length,embedding_dim,latent_dim,include_glove=False)

In [43]:
print_model_summary(model)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 100)          0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 100, 100)     200200      encoder_inputs[0][0]             
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 64), (None,  42240       encoder_embedding[0][0]          
__________________________________________________________________________________________________
non_linear_CC (Dense)           (None, 128)          8320        lstm_2[0][0]                     
____________________________________________________________________________________________

In [44]:
model.compile(optimizer="rmsprop", loss=['binary_crossentropy','binary_crossentropy'],loss_weights=[0.7,-0.1],metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', patience=5,verbose=1)
history = model.fit(train_X, 
                    [train_C_Y,train_D_Y],
                    batch_size=256,
                    validation_split=0.1,
#                     shuffle=True,
                    callbacks=[es],
                    epochs=30)

Train on 5046 samples, validate on 561 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 00015: early stopping


### Testing

In [0]:
y_pred,_ = model.predict(test_X)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in test_C_Y])

In [46]:
print(f"Prediction acuracy on both domains is {round(accuracy_score(y_true,y_pred),2)}")
print(f"Precision on both domains is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on both domains is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on both domains is {round(f1_score(y_true,y_pred),2)}")
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]), 
    index=['true:Fake', 'true:Real'], 
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)

Prediction acuracy on both domains is 0.75
Precision on both domains is 0.66
Recall on both domains is 0.51
F1 on both domains is 0.58


           pred:Fake  pred:Real
true:Fake        108        103
true:Real         56        357


## Experiment 3: Domain Independent (With AutoEncoder)

In [0]:
def prepare_training_data(df,vocab_size):
  encoder_inputs = encoder_data(df)
  y_cc = fake_news_target(df)
  y_dc = domain_target(df)

  shuffle = Shuffle(len(encoder_inputs)) ## Initializing the shuffle class instance
  encoder_inputs = shuffle.shuffle(encoder_inputs)
  y_cc = shuffle.shuffle(y_cc)
  y_dc = shuffle.shuffle(y_dc)

  word2idx,idx2word,tokenizer = create_vocab(encoder_inputs,vocab_size)

  vocab_len = len(word2idx)

  encoder_inputs = pad_tokenize_data(encoder_inputs,max_sentence_length,tokenizer)

  encoder_inputs,[decoder_target,y_cc,y_dc] = all_data_generator(encoder_inputs,y_cc,y_dc,max_sentence_length,word2idx)

  train_X, test_X, train_decoder_Y,test_decoder_Y,train_C_Y, test_C_Y,train_D_Y,test_D_Y = train_test_split(encoder_inputs,decoder_target,y_cc,y_dc,test_size=0.1,random_state=42)

  return [[train_X, test_X, train_decoder_Y,test_decoder_Y,train_C_Y, test_C_Y,train_D_Y,test_D_Y],[word2idx,idx2word,tokenizer],vocab_len]

### Training

In [0]:
max_sentence_length = 100
embedding_dim = 100
latent_dim=64
vocab_size=2000

In [0]:
[train_X, test_X, train_decoder_Y,test_decoder_Y,train_C_Y, test_C_Y,train_D_Y,test_D_Y],[word2idx,idx2word,tokenizer],vocab_len = prepare_training_data(pd.concat([domain1_frame,domain2_frame]),vocab_size)

In [25]:
model = final_model(max_sentence_length,embedding_dim,latent_dim,include_glove=False)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [26]:
print_model_summary(model)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 100)          0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 100, 100)     200200      encoder_inputs[0][0]             
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 64), (None,  42240       encoder_embedding[0][0]          
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 100, 64)      0           lstm_1[0][0]                     
____________________________________________________________________________________________

In [27]:
model.compile(optimizer="rmsprop", loss=['categorical_crossentropy','binary_crossentropy','binary_crossentropy'],loss_weights=[0.5,0.8,-0.1],metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', patience=30,verbose=1)
history = model.fit(train_X, 
                    [train_decoder_Y,train_C_Y,train_D_Y],
                    batch_size=256,
                    validation_split=0.1,
#                     shuffle=True,
                    callbacks=[es],
                    epochs=800)






Train on 5046 samples, validate on 561 samples
Epoch 1/800





Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 00040: early stopping


### Testing

In [0]:
decoder_t_output,y_pred,_ = model.predict(test_X)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in test_C_Y])

In [29]:
print(f"Prediction acuracy on both domains is {round(accuracy_score(y_true,y_pred),2)}")
print(f"Precision on both domains is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on both domains is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on both domains is {round(f1_score(y_true,y_pred),2)}")
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]), 
    index=['true:Fake', 'true:Real'], 
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)

Prediction acuracy on both domains is 0.57
Precision on both domains is 0.47
Recall on both domains is 0.75
F1 on both domains is 0.58


           pred:Fake  pred:Real
true:Fake        184         62
true:Real        209        169


### Testing the Model

In [30]:
for idx in test_X[0]:
    print(idx2word[idx],end=" ")

<UNK> hands yearold <UNK> the <UNK> at her concert is <UNK> away watch many people can only dream of the chance to <UNK> their favorite artists song right in their <UNK> for yearold victoria anthony that dream came true at <UNK> concert in <UNK> on saturday may back on may anthony shared a video of herself on twitter directed to the beautiful <UNK> singer when you come here i really want to <UNK> at your concert she wrote anthony then gave a few <UNK> of her incredible singing voice including the <UNK> <UNK> in the caption to help <UNK> <UNK> 

In [31]:
# model.compile('rmsprop', 'mse')
output_array = model.predict([test_encoder_input_data])

NameError: ignored

In [32]:
for idx in decoder_t_output[0]:
    lookup = np.argmax(idx)
#     print(lookup)
    if lookup==0:
        break
    else:
        print(idx2word[lookup],end=" ")

<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 

In [0]:
for idx in output_array[100]:
    print(idx[np.argmax(20)])
    break
    lookup = np.argmax(idx)
#     print(lookup)
    if lookup==0:
        break
    else:
        print(idx2word[lookup],end=" ")

1.9437695e-07
