In [71]:
import pandas as pd
import numpy as np
from tensorflow import keras
from math import sqrt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize as st

In [95]:
def tf_idf(text):
    sentences = st(text)
    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(sentences)
    matrix = np.asarray((tfidf * tfidf.T).A)
    matrix.resize(20, 20)
    return matrix.flatten()

In [96]:
def flat_input(corpus, classes):
    docs = [title + '. ' + text for title, text in corpus]
    prep = [tf_idf(text) for text in docs]
    outp = [[1, 0] if i == 'FAKE' else [0, 1] for i in classes]
    return np.asarray(prep), np.asarray(outp)

In [97]:
cols = ['ID', 'TITLE', 'TEXT', 'LABEL']
data = pd.read_csv("fake_or_real_news.csv", names=cols, header=0)

In [98]:
k = len(data[cols[3]]) // 2
x_train, y_train = flat_input(list(zip(data[cols[1]], data[cols[2]]))[:k], data[cols[3]][:k])
x_validate, y_validate = flat_input(list(zip(data[cols[1]], data[cols[2]]))[k:], data[cols[3]][k:])

In [114]:
model = keras.Sequential()
model.add(keras.layers.Dense(units=10, activation='relu', input_dim=400))
model.add(keras.layers.Dense(units=5, activation='softmax'))
model.add(keras.layers.Dense(units=2, activation='softmax'))

In [115]:
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True),
              metrics=['accuracy'])

In [121]:
model.fit(x_train, y_train, epochs=25, batch_size=32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x2655572d780>

In [117]:
model.evaluate(x_validate, y_validate, batch_size=32)



[0.6181813797565422, 0.6805555555555556]

In [118]:
result = model.predict(x_validate, batch_size=32)
[(pred, real) for pred, real in zip(result, y_validate)]

[(array([0.407548, 0.592452], dtype=float32), array([0, 1])),
 (array([0.42641497, 0.57358503], dtype=float32), array([0, 1])),
 (array([0.6829767 , 0.31702325], dtype=float32), array([0, 1])),
 (array([0.7278699 , 0.27213004], dtype=float32), array([0, 1])),
 (array([0.44409913, 0.55590093], dtype=float32), array([0, 1])),
 (array([0.74611485, 0.25388518], dtype=float32), array([1, 0])),
 (array([0.36548722, 0.6345128 ], dtype=float32), array([1, 0])),
 (array([0.75415754, 0.24584249], dtype=float32), array([1, 0])),
 (array([0.33925092, 0.6607491 ], dtype=float32), array([1, 0])),
 (array([0.36792365, 0.6320764 ], dtype=float32), array([0, 1])),
 (array([0.33482856, 0.6651715 ], dtype=float32), array([0, 1])),
 (array([0.43760657, 0.5623934 ], dtype=float32), array([1, 0])),
 (array([0.30983794, 0.69016206], dtype=float32), array([1, 0])),
 (array([0.70555544, 0.29444456], dtype=float32), array([1, 0])),
 (array([0.4601399 , 0.53986007], dtype=float32), array([0, 1])),
 (array([0.725

In [119]:
c = data[cols[3]].tolist()[:k]
str(c.count('FAKE')) + ' / ' + str(c.count('REAL'))

'1601 / 1566'