### CNN model for a small-sized fake news dataset (Rada data)

The data source is taken from the Rada paper: http://web.eecs.umich.edu/~mihalcea/downloads/fakeNewsDatasets.zip

In [2]:
import pandas as pd
import os,sys
import glob

In [None]:
## print the path of every .txt file found below the celebrity dataset folder
for dirpath, _subdirs, filenames in os.walk("celebrityDataset/"):
    txt_names = (name for name in filenames if name.endswith(".txt"))
    for name in txt_names:
        allfile = os.path.join(dirpath, name)
        print(allfile)

In [None]:
## Strip the class label from the file names of the fake news datasets
## (Rada data), e.g. 'x.fake.txt' -> 'x.txt' and 'xlegit.txt' -> 'x.txt'.
## Each entry pairs a folder with the label suffix used by its files.
label_suffixes = [
    ('fakeNewsDataset/fake/', '.fake.txt'),
    ('fakeNewsDataset/legit/', '.legit.txt'),
    ('celebrityDataset/fake/', 'fake.txt'),
    ('celebrityDataset/legit/', 'legit.txt'),
]

for folder, suffix in label_suffixes:
    for filename in os.listdir(folder):
        infilename = os.path.join(folder, filename)
        # Skip sub-directories and files that do not carry the label suffix
        # (for example files already renamed by an earlier run).
        if not os.path.isfile(infilename) or not filename.endswith(suffix):
            continue
        # Only the file name is rewritten, never the directory part of the path.
        newname = os.path.join(folder, filename[:-len(suffix)] + '.txt')
        os.rename(infilename, newname)

In [None]:
## import fake news
## Every file becomes one row: column 0 holds its first non-blank line and the
## following columns hold the remaining non-blank lines in order.
## The files are read directly because recent pandas releases reject
## sep='\n' in read_csv, and read_csv would also try to infer numbers / NA
## values from plain text lines.
fake_paths_1 = glob.glob('fakeNewsDataset/fake/*.txt')
if not fake_paths_1:
    raise ValueError("no .txt files found for pattern 'fakeNewsDataset/fake/*.txt'")
fake_rows_1 = []
for path in fake_paths_1:
    with open(path, encoding='utf-8') as handle:
        fake_rows_1.append([line.rstrip('\n') for line in handle if line.strip()])
# Files with fewer lines are padded with '' so the join below always gets strings.
fake_news_1 = pd.DataFrame(fake_rows_1).fillna('')
# 'context' joins columns 1 through 3 only (label-based slice, inclusive).
fake_news_1 = pd.DataFrame({'headline': fake_news_1[0], 'context': fake_news_1.loc[:, 1:3].apply(' '.join, axis=1)})
fake_news_1.head(3)

In [None]:
## import fake news from the celebrity dataset
## Every file becomes one row: column 0 holds its first non-blank line and the
## following columns hold the remaining non-blank lines in order.
## The files are read directly because recent pandas releases reject
## sep='\n' in read_csv, and read_csv would also try to infer numbers / NA
## values from plain text lines.
fake_paths_2 = glob.glob('celebrityDataset/fake/*.txt')
if not fake_paths_2:
    raise ValueError("no .txt files found for pattern 'celebrityDataset/fake/*.txt'")
fake_rows_2 = []
for path in fake_paths_2:
    with open(path, encoding='utf-8') as handle:
        fake_rows_2.append([line.rstrip('\n') for line in handle if line.strip()])
# Files with fewer lines are padded with '' so the join below always gets strings.
fake_news_2 = pd.DataFrame(fake_rows_2).fillna('')
# 'context' joins columns 1 through 3 only (label-based slice, inclusive).
fake_news_2 = pd.DataFrame({'headline': fake_news_2[0], 'context': fake_news_2.loc[:, 1:3].apply(' '.join, axis=1)})
fake_news_2.head(3)

In [None]:
## import legit news
## Every file becomes one row: column 0 holds its first non-blank line and the
## following columns hold the remaining non-blank lines in order.
## The files are read directly because recent pandas releases reject
## sep='\n' in read_csv, and read_csv would also try to infer numbers / NA
## values from plain text lines.
legit_paths_1 = glob.glob('fakeNewsDataset//legit/*.txt')
if not legit_paths_1:
    raise ValueError("no .txt files found for pattern 'fakeNewsDataset//legit/*.txt'")
legit_rows_1 = []
for path in legit_paths_1:
    with open(path, encoding='utf-8') as handle:
        legit_rows_1.append([line.rstrip('\n') for line in handle if line.strip()])
# Files with fewer lines are padded with '' so the join below always gets strings.
legit_news_1 = pd.DataFrame(legit_rows_1).fillna('')
# 'context' joins columns 1 through 3 only (label-based slice, inclusive).
legit_news_1 = pd.DataFrame({'headline': legit_news_1[0], 'context': legit_news_1.loc[:, 1:3].apply(' '.join, axis=1)})
legit_news_1.head(3)

In [None]:
## import legit news from the celebrity dataset
## Every file becomes one row: column 0 holds its first non-blank line and the
## following columns hold the remaining non-blank lines in order.
## The files are read directly because recent pandas releases reject
## sep='\n' in read_csv, and read_csv would also try to infer numbers / NA
## values from plain text lines.
legit_paths_2 = glob.glob('celebrityDataset/legit//*.txt')
if not legit_paths_2:
    raise ValueError("no .txt files found for pattern 'celebrityDataset/legit//*.txt'")
legit_rows_2 = []
for path in legit_paths_2:
    with open(path, encoding='utf-8') as handle:
        legit_rows_2.append([line.rstrip('\n') for line in handle if line.strip()])
# Files with fewer lines are padded with '' so the join below always gets strings.
legit_news_2 = pd.DataFrame(legit_rows_2).fillna('')
# 'context' joins columns 1 through 3 only (label-based slice, inclusive).
legit_news_2 = pd.DataFrame({'headline': legit_news_2[0], 'context': legit_news_2.loc[:, 1:3].apply(' '.join, axis=1)})
legit_news_2.head(3)

In [None]:
## name entity recognition extraction

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')

def get_continuous_chunks(text):
    """Return the distinct named entities of *text* in order of first appearance.

    The text is tokenized, POS-tagged and chunked with NLTK's ``ne_chunk``.
    Consecutive entity subtrees are merged into a single entity string
    (their tokens joined by spaces).

    Args:
        text: The raw text to analyse.

    Returns:
        A list of entity strings without duplicates.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    # The trailing ``None`` acts as a sentinel so that an entity sitting at the
    # very end of the text is flushed as well.
    for node in [*chunked, None]:
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
            continue
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always reset, even for an already-seen entity; otherwise the
            # repeated entity would be glued onto the next one.
            current_chunk = []
    return continuous_chunk

In [None]:
from sklearn.utils import shuffle

## tag every frame with its class name
for frame in (fake_news_1, fake_news_2):
    frame['label'] = 'fake'
for frame in (legit_news_1, legit_news_2):
    frame['label'] = 'legit'

## stack the four frames and extract the named entities of each context
rada_data = pd.concat([fake_news_1, fake_news_2, legit_news_1, legit_news_2])
rada_data['NER'] = list(map(get_continuous_chunks, rada_data.context))

## class name -> integer id
binary_labels = dict(fake=0, legit=1)

def one_hot_label(label):
    """Return the one-hot vector for a class name.

    The previous body referenced the undefined names ``x`` and
    ``multi_labels_dict`` and therefore always raised ``NameError``. It now
    encodes the ``label`` argument through ``binary_labels``, the label mapping
    defined in this file.

    Args:
        label: A key of ``binary_labels`` ('fake' or 'legit').

    Returns:
        The result of ``to_categorical`` with one class per ``binary_labels`` entry.

    Raises:
        KeyError: If *label* is not a key of ``binary_labels``.
    """
    return to_categorical(binary_labels[label], num_classes=len(binary_labels))

## integer class id of every row
rada_data['bi_label'] = rada_data['label'].apply(binary_labels.__getitem__)
rada_data = shuffle(rada_data)

## preprocessed data: vocabulary ids of every context
## NOTE(review): pre_process_statement is defined in a later cell, so that cell
## presumably has to be run first - confirm the intended execution order.
rada_data['word_ids'] = rada_data['context'].map(pre_process_statement)

In [None]:
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
import _pickle as cPickle

In [None]:
### train data on CNN model

### tokenize fake news statement and build vocabulary
tokenizer = Tokenizer(num_words=50000)
# Fit on the combined dataset; the previously referenced `all_fakenews` is not
# defined anywhere in this notebook, while `rada_data` holds every 'context'.
tokenizer.fit_on_texts(rada_data['context'])
vocab_dict = tokenizer.word_index

# Persist the vocabulary; the context manager closes the file even on error.
with open("vocab.p", "wb") as vocab_file:
    cPickle.dump(vocab_dict, vocab_file)
print("vocab dictionary is created")
print("saved vocan dictionary to pickle file")

def pre_process_statement(statement):
    """Convert a statement into the vocabulary ids of its known words.

    The statement is split with Keras' ``text_to_word_sequence`` and each token
    is looked up in the module-level ``vocab_dict``.

    Args:
        statement: The raw text to convert.

    Returns:
        A list of vocabulary ids, in token order. Tokens that are missing from
        ``vocab_dict`` are dropped (they are not mapped to index 0).
    """
    tokens = text_to_word_sequence(statement)
    return [vocab_dict[token] for token in tokens if token in vocab_dict]

In [None]:
## split data
from sklearn.model_selection import train_test_split
import numpy as np

## fraction of the rows held out for validation
val_size = 0.2
## one-hot targets for the two classes
labels = to_categorical(rada_data['bi_label'], num_classes=2)
## a fresh seed is drawn on every run
random_state = np.random.randint(1000)

x_train, x_val, y_train, y_val = train_test_split(
    rada_data,
    labels,
    test_size=val_size,
    random_state=random_state,
)

## keep only the word-id sequences as model input
x_train, x_val = x_train['word_ids'], x_val['word_ids']

## pad / cut every sequence at its end to exactly 32 ids
pad_options = dict(maxlen=32, padding='post', truncating='post')
X_train = sequence.pad_sequences(x_train, **pad_options)
X_val = sequence.pad_sequences(x_val, **pad_options)

In [None]:
### parameter list

## number of distinct words known to the tokenizer
vocab_length = len(vocab_dict)
## unit count passed to the first Dense layer of the model
hidden_dims = 50
lstm_size = 100
num_steps = 32
num_epochs = 30
batch_size = 64

## hyper-parameters of the convolution branches
kernel_sizes = [3, 4, 5]
filter_size = 128

## meta data related hyper-parameters (disabled)
# num_party = 6
# num_state = 51
# num_context = 12
# num_job = 11
# num_sub = 14
# num_speaker = 21

## embedding matrix shape: one row per tokenizer index plus row 0
max_features = 1 + len(tokenizer.word_index)
Embedding_dims = 300

In [None]:
### create embedding layer

### add embedding layer
## vocabulary size plus one
num_words=len(vocab_dict)+1

def loadGloveModel(gloveFile, embedding_dim=300):
    """Load GloVe word vectors from a text file into a dictionary.

    Every line is expected to hold a word followed by ``embedding_dim``
    whitespace-separated numbers. The last ``embedding_dim`` fields form the
    vector; all fields before them are concatenated (without separator) into
    the word.

    Args:
        gloveFile: Path of the UTF-8 encoded GloVe text file.
        embedding_dim: Number of vector components per line (default 300).

    Returns:
        A dict mapping each word to a ``float32`` NumPy array of length
        ``embedding_dim``.
    """
    print("Loading Glove Model")
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(gloveFile, encoding='utf8') as glove_file:
        for line in glove_file:
            values = line.split()
            # Blank or truncated lines carry no word and/or an incomplete
            # vector; skip them instead of storing a bogus '' entry.
            if len(values) <= embedding_dim:
                continue
            word = ''.join(values[:-embedding_dim])
            coefs = np.asarray(values[-embedding_dim:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

## word -> vector lookup read from the GloVe file 'glove.6B.300d.txt'
glove_model = loadGloveModel('glove.6B.300d.txt')

def build_glove_embedding_layers():
    """Build the embedding weight matrix for the tokenizer vocabulary.

    Reads the module-level ``tokenizer``, ``glove_model``, ``max_features`` and
    ``Embedding_dims``.

    Returns:
        A ``(max_features, Embedding_dims)`` array in which row ``i`` holds the
        GloVe vector of the word with tokenizer index ``i``; rows of words that
        have no vector in ``glove_model`` stay zero.
    """
    weights = np.zeros((max_features, Embedding_dims))
    for token, row in tokenizer.word_index.items():
        if row < max_features:
            vector = glove_model.get(token)
            if vector is not None:
                weights[row] = vector
    return weights


embedding_weights = build_glove_embedding_layers()

In [None]:
### compile cnn model

## keras dependencies

from keras.layers import Concatenate, Input, MaxPooling1D
from keras.models import Model

from keras.preprocessing.sequence import pad_sequences # To make vectors the same size. 
# from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPool1D, MaxPool1D, Conv2D
from keras.layers import concatenate, Concatenate
from keras.optimizers import SGD
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, CSVLogger
from keras.utils.vis_utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import matplotlib.pyplot as plt
import keras

In [None]:
## Build the CNN: frozen GloVe embedding -> parallel Conv1D branches (one per
## kernel size) with global max pooling -> dropout -> dense layers -> softmax.
kernel_arr = []
statement_input = Input(shape=(32,), dtype='int32', name='main_input')
x = Embedding(vocab_length+1,Embedding_dims,weights=[embedding_weights],input_length=32,trainable=False)(statement_input) #Preloaded glove embeddings
# x = Embedding(output_dim=hidden_size, input_dim=vocab_length+1, input_length=num_steps)(statement_input) #Train embeddings from scratch

for kernel in kernel_sizes:
    x_1 = Conv1D(filters=filter_size,kernel_size=kernel, activation='relu')(x)
    x_1 = GlobalMaxPool1D()(x_1)
    kernel_arr.append(x_1)

conv_in = concatenate(kernel_arr)
conv_in = Dropout(0.6)(conv_in)
conv_in = Dense(hidden_dims, activation='relu')(conv_in)
conv_in = Dense(128, activation='relu')(conv_in)
main_output = Dense(2, activation='softmax', name='main_output')(conv_in)

model = Model(inputs=[statement_input], outputs=[main_output])
model.summary()

### compile CNN model
## Both optimizers are created, but only `sgd` is passed to compile().
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.2)
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])

# tb = TensorBoard()
# csv_logger = keras.callbacks.CSVLogger('training.log')
# filepath= "weights.best.hdf5"
# checkpoint = keras.callbacks.ModelCheckpoint(filepath, 
#                                              monitor='val_categorical_accuracy', 
#                                              verbose=1, save_best_only=True, mode='max')

## Train on the padded arrays (X_train / X_val). The lower-case x_train / x_val
## are the raw, variable-length id lists and do not match the (32,) input.
history = model.fit({'main_input': X_train},
                    {'main_output': y_train},
                    epochs=num_epochs, batch_size=batch_size,
                    validation_data=(X_val, y_val))