# Importing all required packages

In [None]:
# Silence all warnings by swapping warnings.warn for a do-nothing stub
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (replacement for warnings.warn)."""
    return None


warnings.warn = warn

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Computations
import itertools

import gensim

# Modelling Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Modelling Helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import KFold, cross_val_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


import seaborn as sns
import nltk
import re
from wordcloud import WordCloud

# Visualization
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Reading Fake and Real Data from CSV Files

In [None]:
# Load the fake-news and real-news articles; both files are comma-separated
fake = pd.read_csv('./bin_dataset/Fake.csv', sep=',')
true = pd.read_csv('./bin_dataset/True.csv', sep=',')

# Assigning 0 and 1 labels to Fake and Real Data

In [None]:
# Label each source frame: 0 = fake news, 1 = real news
fake['label'] = 0
true['label'] = 1

# Stack both frames, shuffle the rows with a fixed seed and renumber the index
# 0..n-1. pd.concat is used because DataFrame.append was removed in pandas 2.0.
dataset = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

### Combining the text and title fields for "full text"

In [None]:
# Join title and body with a space so the last word of the title does not fuse
# with the first word of the article text when the string is tokenized later.
dataset['total'] = dataset['title'] + ' ' + dataset['text']

### 5-fold cross validation

In [None]:
# Shuffled 5-fold splitter with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=1)

### Train-Test 80-20 Split

In [None]:
# Hold out 20% of the raw "total" text (and its labels) as a test set
X_train, X_test, y_train, y_test = train_test_split(
    dataset['total'],
    dataset['label'],
    random_state=0,
    test_size=0.20,
)

### Pre-processing the data: lower-casing, tokenizing and removing stop words

In [None]:
# Target labels as a NumPy array
y = dataset["label"].values

# One list of cleaned tokens per article
X = []

stop_words = set(nltk.corpus.stopwords.words("english"))
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
for par in dataset["total"].values:
    # Split the article into sentences, lower-case each one, tokenize it on
    # word characters, and keep tokens that are longer than one character
    # and are not English stop words.
    X.append([
        w.strip()
        for sent in nltk.sent_tokenize(par)
        for w in tokenizer.tokenize(sent.lower())
        if w not in stop_words and len(w) > 1
    ])

In [None]:
# Dimensionality of the word vectors to be trained
EMBEDDING_DIM = 100

# Train Word2Vec on the tokenized articles (slow on the full corpus)
w2v_model = gensim.models.Word2Vec(
    sentences=X,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)

In [None]:
# len(w2v_model.wv.key_to_index)

In [None]:
# Fit a Keras Tokenizer on the token lists. Note that this rebinds the name
# `tokenizer`, which previously held the NLTK RegexpTokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Replace every token list in X with the corresponding list of integer ids
X = tokenizer.texts_to_sequences(X)

In [None]:
# Word -> integer index mapping held by the fitted Keras Tokenizer
word_index = tokenizer.word_index

In [None]:
# Display the number of entries in the word -> index mapping
len(word_index)

In [None]:
# Fix every article at 700 tokens: shorter articles are padded and longer
# ones are truncated.
maxlen = 700

# Convert X into sequences that all have length maxlen
X = pad_sequences(X, maxlen=maxlen)

In [None]:
# Number of embedding rows: one per word in the Tokenizer's word_index plus
# one extra row (index 0, which get_weight_matrix leaves as zeros)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Function to create weight matrix from word2vec gensim model
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix aligned with the given word indices.

    Args:
        model: Trained gensim Word2Vec-style model. Only ``model.wv`` is used
            (lookup by word and its ``vector_size`` attribute).
        vocab: Mapping of word -> integer row index, e.g. the Keras
            Tokenizer's ``word_index``.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(vocab) + 1, model.wv.vector_size)``.
        Row ``i`` holds the vector of the word mapped to index ``i``. Row 0
        and the rows of words missing from the model stay all-zero.
    """
    vectors = model.wv
    # Take the width from the model itself rather than a module-level constant,
    # so the matrix always matches the vectors being copied into it.
    weight_matrix = np.zeros((len(vocab) + 1, vectors.vector_size))
    for word, i in vocab.items():
        try:
            weight_matrix[i] = vectors[word]
        except KeyError:
            # Word unknown to the embedding model: keep its zero row.
            continue
    return weight_matrix

In [None]:
# Inline version of get_weight_matrix for the Word2Vec model.
# The matrix has to be allocated before the loop can write rows into it:
# one row per Tokenizer index plus row 0, one column per vector dimension.
weight_matrix = np.zeros((len(word_index) + 1, w2v_model.wv.vector_size))
for word, i in word_index.items():
    weight_matrix[i] = w2v_model.wv[word]

In [None]:
# Word2Vec weight matrix with rows ordered by the Tokenizer's word indices
embedding_vectors = get_weight_matrix(model=w2v_model, vocab=word_index)

In [None]:
# Display the number of rows in the embedding weight matrix
len(embedding_vectors)

In [None]:
# Neural network: frozen Word2Vec embedding -> LSTM -> sigmoid output
model = Sequential([
    # Non-trainable embedding layer initialised with the Word2Vec weights
    Embedding(
        vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_vectors],
        input_length=maxlen,
        trainable=False,
    ),
    # LSTM
    LSTM(units=128),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Train/test split of the padded sequences. test_size and random_state match
# the 80-20 split declared earlier in the notebook, and the fixed seed makes
# the reported metrics reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [None]:
# Train for 6 epochs, holding out 30% of the training data for validation
model.fit(
    X_train,
    y_train,
    epochs=6,
    validation_split=0.3,
)

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions
probabilities = model.predict(X_test)
y_pred = (probabilities >= 0.5).astype("int")

In [None]:
def precision_recall(true_label, predicted_label):
    """Print precision, recall, accuracy and F1 score for the predictions.

    Args:
        true_label: Ground-truth labels.
        predicted_label: Predicted labels to score against ``true_label``.

    Returns:
        None. The four scores are only printed, one per line.
    """
    # (output template, scoring function) pairs, in the order they are printed
    reports = (
        ('Precision: %f', precision_score),
        ('Recall: %f', recall_score),
        ('Accuracy: %f', metrics.accuracy_score),
        ('F1 Score: %f', metrics.f1_score),
    )
    for template, scorer in reports:
        print(template % scorer(true_label, predicted_label))

In [None]:
# Print the evaluation metrics of the predictions on the held-out test set
precision_recall(true_label=y_test, predicted_label=y_pred)

In [None]:
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# Load the GoogleNews vectors from the binary word2vec file
word_vectors = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)
# From here on the embedding dimension is 300 instead of 100
EMBEDDING_DIM = 300

In [None]:
# Embedding matrix for the pretrained vectors, one row per Tokenizer index
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = word_vectors[word]
    except KeyError:
        # Word absent from the pretrained vectors: draw a random vector
        embedding_vector = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
    embedding_matrix[i] = embedding_vector

In [None]:
# Print the number of training labels
print(len(y_train))

In [None]:
# Second network: frozen pretrained embedding -> Conv1D -> max-pooling
# -> LSTM -> sigmoid output
model = Sequential([
    Embedding(
        vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    ),
    Conv1D(activation='relu', filters=4, kernel_size=4),
    MaxPool1D(),
    LSTM(units=128),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Train for 12 epochs, holding out 30% of the training data for validation
model.fit(
    X_train,
    y_train,
    epochs=12,
    validation_split=0.3,
)