In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv("news.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
data.shape

(6335, 4)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
data.duplicated().sum()

0

In [6]:
data.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [7]:
data.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [8]:
data['title'][67]

"Why it's funny Republicans are upset with Facebook: Column"

In [9]:
data['text'][1]

'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two fundamental truths in this world: Paul Ryan desperately wants to be president. And Paul Ryan will never be president. Today proved it. \nIn a particularly staggering example of political cowardice, Paul Ryan re-re-re-reversed course and announced that he was back on the Trump Train after all. This was an aboutface from where he was a few weeks ago. He had previously declared he would not be supporting or defending Trump after a tape was made public in which Trump bragged about assaulting women. Suddenly, Ryan was appearing at a pro-Trump rally and boldly declaring that he already sent in his vote to make him President of the United States. It was a surreal moment. The figurehead of the Republican Party dosed himself in gasoline, got up on a stage on a chilly afternoon in Wisconsin, and lit a match. . @SpeakerRyan says he voted for @realDonaldTrump : “Republicans, it is time to come home” ht

In [10]:
data['label'].value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

In [11]:
data = data.drop(["Unnamed: 0"], axis=1)
data.head(5)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [12]:
le = preprocessing.LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label'])

In [13]:
data.head(2)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0


In [14]:
embedding_dim = 50
max_length = 54
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
training_size = 6300
test_portion = 0.1

In [15]:
title = []
text = []
labels = []
for x in range(training_size):
    title.append(data['title'][x])
    text.append(data['text'][x])
    labels.append(data['label'][x])

In [16]:
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(title)
word_index1 = tokenizer1.word_index
len(word_index1)

11686

In [17]:
vocab_size1 = len(word_index1)
sequences1 = tokenizer1.texts_to_sequences(title)
len(sequences1)

6300

In [18]:
padded1 = pad_sequences(sequences1, padding=padding_type, truncating=trunc_type)
padded1.shape

(6300, 54)

In [19]:
split = int(test_portion * training_size)
training_sequences1 = padded1[split:training_size]
test_sequences1 = padded1[0:split]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

In [20]:
training_sequences1.shape

(5670, 54)

In [21]:
test_sequences1.shape

(630, 54)

In [22]:
training_sequences1 = np.array(training_sequences1)
test_sequences1 = np.array(test_sequences1)

In [23]:
embedding_index = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
        
embedding_matrix = np.zeros((vocab_size1 + 1, embedding_dim))

for word, i in word_index1.items():
    if i < vocab_size1:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [24]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size1 + 1, embedding_dim, weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [25]:
history = model.fit(training_sequences1,np.array(training_labels), epochs=50, validation_data=(test_sequences1,np.array(test_labels)), verbose=2)

Epoch 1/50
178/178 - 4s - 25ms/step - accuracy: 0.6614 - loss: 0.6078 - val_accuracy: 0.7159 - val_loss: 0.5394
Epoch 2/50
178/178 - 2s - 11ms/step - accuracy: 0.7206 - loss: 0.5448 - val_accuracy: 0.7302 - val_loss: 0.5463
Epoch 3/50
178/178 - 3s - 14ms/step - accuracy: 0.7453 - loss: 0.5077 - val_accuracy: 0.7667 - val_loss: 0.4673
Epoch 4/50
178/178 - 2s - 10ms/step - accuracy: 0.7771 - loss: 0.4629 - val_accuracy: 0.7857 - val_loss: 0.4639
Epoch 5/50
178/178 - 2s - 11ms/step - accuracy: 0.7949 - loss: 0.4275 - val_accuracy: 0.7746 - val_loss: 0.4581
Epoch 6/50
178/178 - 2s - 12ms/step - accuracy: 0.8132 - loss: 0.4067 - val_accuracy: 0.7810 - val_loss: 0.4515
Epoch 7/50
178/178 - 2s - 10ms/step - accuracy: 0.8326 - loss: 0.3742 - val_accuracy: 0.7571 - val_loss: 0.4584
Epoch 8/50
178/178 - 2s - 11ms/step - accuracy: 0.8520 - loss: 0.3451 - val_accuracy: 0.7683 - val_loss: 0.4792
Epoch 9/50
178/178 - 3s - 15ms/step - accuracy: 0.8607 - loss: 0.3129 - val_accuracy: 0.7857 - val_loss:

In [26]:
X = "Karry to go to France in gesture of sympathy"
sequences = tokenizer1.texts_to_sequences([X])
sequences

[[2, 197, 2, 843, 3, 3451, 4, 3452]]

In [27]:
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
sequences

array([[   2,  197,    2,  843,    3, 3451,    4, 3452,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [28]:
model.predict(sequences, verbose=0)[0][0]

0.9990952

In [29]:
if model.predict(sequences, verbose=0)[0][0] >= 0.5:
    print("This news is True")
else:
    print("This news is False")

This news is True
