In [None]:
import numpy as np
import pandas as pd
import re
import string
real_data = pd.read_csv('True.csv')
fake_data = pd.read_csv('Fake.csv')
real_articles = real_data["text"]
fake_articles = fake_data["text"]
translator = {}
for punct in string.punctuation:
    translator[punct] = " "
translator["."] = " . "
translator[","] = " , "
trans = str.maketrans(translator)
OUTFILE = "articles.txt"
def preprocess_article(article_raw):
    article = ' '.join(article_raw.split("-")[1:]).strip().lower()
    article = re.sub('u\.s\.', "USA", article)
    article = article.translate(trans)
    article = re.sub('\d{4}', " <YEAR> ", article)
    article = re.sub('\d+', " # ", article)
    article = re.sub('#.+#', "#", article)
    article = article.encode("ascii", "ignore").decode()
    article = article.replace("\n", " ")
    article = re.sub("# jpexyr", " ", article)
    article += " <EOA>"
    article = "<SOA> "+article
    article = re.sub(" +", " ", article)
    article = article.strip()
    return article
with open(OUTFILE, "w") as f:
    for article_raw in real_articles:
        article = preprocess_article(article_raw)
        if len(article.split(" "))>20:
            f.write(article)
            f.write("\n")
    for article_raw in fake_articles:
        article = preprocess_article(article_raw)
        if len(article.split(" "))>20:
            f.write(article)
            f.write("\n")
all_words = []
with open(OUTFILE, "r") as f:
    for line in f.readlines():
        line_words = line.strip().split(" ")
        all_words += line_words
unique_words = list(set(all_words))
print("Total Words: {}".format(len(all_words)))
print("Unique Words: {}".format(len(unique_words)))
print("Average count of each word: {}".format(len(all_words)/len(unique_words)))
from collections import Counter
c = Counter(all_words)
neg_index = len(c)-VOCAB_SIZE+1
disqualify_words = c.most_common()[-neg_index:-1]
word2token = dict()
token2word = dict()
word2token["<pad>"] = 0
token2word[0] = "<pad>"
i = 1
for word in c:
    word2token[word] = i
    token2word[i] = word
    i+=1
import pickle
pickle.dump(word2token, open("word2token.pkl", "wb"))
pickle.dump(token2word, open("token2word.pkl", "wb"))
tokenized_articles = []
with open(OUTFILE, 'r') as f:
    for line in f.readlines():
        words = line.strip().split(" ")
        tokenized_article = []
        for word in words:
            try:
                tokenized_article.append(word2token[word])
            except KeyError:
                print(word)
                continue
        tokenized_articles.append(tokenized_article)
print("Total Number of Articles: {}".format(len(tokenized_articles)))
pickle.dump(tokenized_articles, open("tokenized_articles.pkl", "wb"))

In [None]:
import pickle
word2token = pickle.load(open("word2token.pkl", "rb"))
token2word = pickle.load(open("token2word.pkl", "rb"))
tokenized_articles = pickle.load(open("tokenized_articles.pkl", 'rb'))
VOCAB_SIZE = 10000
def get_next_example():
    for article in tokenized_articles:
        if(any([token>=VOCAB_SIZE for token in article])):
            continue
        article = [0]*32+article
        article_length = len(article)
        for word_number in range(article_length-32):
            yield article[word_number: word_number+32], article[word_number+32]
import tensorflow
physical_devices = tensorflow.config.list_physical_devices('GPU')
tensorflow.config.experimental.set_memory_growth(physical_devices[0], enable=True)
import tensorflow as tf
ds = tf.data.Dataset.from_generator(get_next_example, output_types=(tf.int64, tf.int64), output_shapes = ((32, ), ()))
ds = ds.batch(512)

In [None]:
def generate_sample(gen_start):
    gen_tokens = [word2token[word] for word in gen_start.strip().split(" ")]
    gen_tokens = [0]*32+gen_tokens
    for i in range(300):
        next_token = np.argmax(model.predict(np.asarray([gen_tokens[-33:]]))[0])
        gen_tokens = gen_tokens+[next_token]
        if next_token == word2token["<EOA>"]:
            break
    return " ".join([token2word[token] for token in gen_tokens[32:]])
generate_sample("<SOA> the trump administration")

In [None]:
import os
from tensorflow.keras.callbacks import Callback
from tensorflow.summary import create_file_writer, scalar, text
import tensorflow.keras.backend as TF
from tensorflow.keras.models import Sequential, save_model
class MyCallback(Callback):
    def __init__(self, run_base_dir):
        self.save_counter = 0
        self.least_loss = -1
        self.base_dir = run_base_dir
        os.makedirs(self.base_dir, exist_ok=True)
        os.makedirs(os.path.join(self.base_dir, "models"), exist_ok=True)
        os.makedirs(os.path.join(self.base_dir, "logs"), exist_ok=True)
        self.summary_writer = create_file_writer(os.path.join(self.base_dir, "logs"))
        self.summary_writer.set_as_default()
        self.iters_since_last_model_save = 0
    def on_epoch_end(self, epoch, logs=None):
        iter_no = TF.get_value(self.model.optimizer.iterations)
        loss = logs['loss']
        scalar("loss", data=loss, step = iter_no)
        text("sample", generate_sample("<SOA> the trump administration"), step=iter_no)
        self.iters_since_last_model_save += 1
        if self.least_loss<0 or loss<self.least_loss:
            self.least_loss = loss
            print("Loss decreased in iter {}".format(iter_no))
            if self.iters_since_last_model_save>0:
                print("Saving model at iteration {} with loss {}".format(iter_no, loss))
                save_model(self.model, os.path.join(self.base_dir, "models", "model-{0:.4f}.h5".format(loss)))
                self.iters_since_last_model_save = 0
#     def apply_lr(self):
#         TF.set_value(self.model.optimizer.lr, K.get_value(0.0001))
        

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.optimizers import Adam
import numpy as np
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 300))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(VOCAB_SIZE, activation='softmax'))
model.summary()
model.compile(Adam(0.001), loss='sparse_categorical_crossentropy')
for x, y in ds.take(1):
    print(model.predict(x).shape)
    print(y)
model.fit(ds, epochs=100, callbacks=[MyCallback("second")], verbose=False)

In [87]:
from tensorflow.keras.models import load_model
model = load_model("second/models/model-1.3227.h5")
model.summary()

Model: "sequential_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_30 (Embedding)     (None, None, 300)         3000000   
_________________________________________________________________
lstm_44 (LSTM)               (None, None, 256)         570368    
_________________________________________________________________
lstm_45 (LSTM)               (None, 128)               197120    
_________________________________________________________________
dense_18 (Dense)             (None, 10000)             1290000   
Total params: 5,057,488
Trainable params: 5,057,488
Non-trainable params: 0
_________________________________________________________________


In [88]:
generate_sample("<SOA> the trump administration")

'<SOA> the trump administration is ready in principle to resume population americans rather than a total of hate on twitter , when the president shared her aim of raising awareness of issues long as it was related to their capital . if a similar group report for trump s visit to pyongyang , and they should be tackled head from the united kingdom . we are doing a lot of people , including teachers , and cannot go through . the sheer length of the newly discovered most recently was told the office of management and does not think the united states has become no , he said . the memo was sent to that flynn was a former army of the south china sea , and i am not going to be released by the right , in a preamble that we recognize their closest could do something do be done , especially . . . it is expected to endorse trump campaign colluded with russia , the white house said in a statement . <EOA>'

In [94]:
import io

RUN_NAME = "second"
out_v = io.open(os.path.join(RUN_NAME, 'vecs.tsv'), 'w', encoding='utf-8')
out_m = io.open(os.path.join(RUN_NAME, 'meta.tsv'), 'w', encoding='utf-8')
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)
print(len(token2word))
for x in range(VOCAB_SIZE):
    word = token2word[x]
    if not word in ['', " ", '\t', "\n", "  "]:
        if(len(word)>2):
            vec = weights[x]
            out_m.write(token2word[x]+"\n")
            out_v.write('\t'.join([str(x) for x in vec])+"\n")
out_v.close()
out_m.close()

(10000, 300)
70717
