In [1]:
#Pandas Dataframe
import pandas as pd 

#NLTK libraries
import nltk 
#from nltk.corpus import stopwords
#nltk.download('stopwords')

#Gensim libraries
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

#Data Visualisation
import pyLDAvis
import pyLDAvis.gensim

In [2]:
real_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')

In [3]:
real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


We will use topic modelling later to establish our own categories, hence we will drop the subject column first. 

In [4]:
del real_news['subject']
del fake_news['subject']

In addition, we also need to add labels indicating the ground truth. 

In [5]:
real_news['label'] = 1
fake_news['label'] = 0 

Note that an article can have the same topic, but we identify an article as a unique entry if it its text is distinct. Therefore, we consider an article to be a duplicate entry if it has the same text as another article. 

We now proceed to drop duplicate entries from our two datasets.

In [6]:
real_news = real_news.drop_duplicates(subset = ['text'])
fake_news = fake_news.drop_duplicates(subset = ['text'])

In [7]:
real_news = real_news.reset_index()
del real_news['index']

Note that in our dataset, all real news articles were scraped from Reuters, an Engish news agency company. Since all Reuters articles begin with the phrase "XXX (Reuters)", where XXX refers to the country the article is based on, we need to delete this part from our text data as failing to do so may lead to severe overfitting in the models we train later on. 

In [8]:
def index_filter(text, substring):
    index_one = text.find(substring)
    return (index_one-1) + len(substring)

In [9]:
lst = real_news['text']
for i in range(len(lst)):
    if "(Reuters)" in lst[i]:
        index = index_filter(lst[i], "(Reuters)")
        new_text = real_news['text'].iloc[i][index + 2:]
        real_news.at[i, 'text'] = new_text
    else:
        pass 

Need to remove Reuters label from the text

Note: One unaddressed loophole - even though we deleted (Reuters) which addresses the source prior to beginning the article, content of the article itself may still contain the word Reuters. 

Examples 

1. Lawyers for Papadopoulos did not immediately respond to requests by Reuters for comment. 
2. Moore has denied wrongdoing and Reuters has not been able to independently verify the allegations.

Sanity check performed below 

In [10]:
count = 0 
for i in range(1000):
    if "Reuters" in real_news['text'].iloc[i]:
        print(real_news['text'].iloc[i])
        count += 1
        if count > 3:
            break 

- Trump campaign adviser George Papadopoulos told an Australian diplomat in May 2016 that Russia had political dirt on Democratic presidential candidate Hillary Clinton, the New York Times reported on Saturday. The conversation between Papadopoulos and the diplomat, Alexander Downer, in London was a driving factor behind the FBI’s decision to open a counter-intelligence investigation of Moscow’s contacts with the Trump campaign, the Times reported. Two months after the meeting, Australian officials passed the information that came from Papadopoulos to their American counterparts when leaked Democratic emails began appearing online, according to the newspaper, which cited four current and former U.S. and foreign officials. Besides the information from the Australians, the probe by the Federal Bureau of Investigation was also propelled by intelligence from other friendly governments, including the British and Dutch, the Times said. Papadopoulos, a Chicago-based international energy lawye

Next, we merge the title and the text together to form the overall content of the article.

In [11]:
real_news['content'] = real_news['title'] + real_news['text']
fake_news['content'] = fake_news['title'] + fake_news['text']

Next, we convert our content into lower casing only.

In [12]:
real_content = real_news['content']
fake_content = fake_news['content']

for i in range(len(real_content)):
    real_news.at[i, 'content'] = real_content[i].lower()
for j in range(len(fake_content)):
    fake_news.at[i, 'content'] = fake_content[i].lower()

In [13]:
overall_data = pd.concat([real_news, fake_news])

In [14]:
overall_data = overall_data.reset_index()
del overall_data['index']

We are now ready to begin our topic modelling analysis. 

In [15]:
data = list(overall_data['content'])

A sample of our text can be found above. Next, we create a Python function to lemmatize the content in our articles.

In [16]:
def lemmatize(text, allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]):
#we only consider the nouns, adjective, verbs and adverbs as these are the POS tags which give our text most contextual meaning 
    nlp = spacy.load("en_core_web_sm", exclude = ["parser", "ner"])
    output = []
    for content in text:
        contents = nlp(content)
        temp = []
        for word in contents:
            if word.pos_ in allowed_postags: 
                temp.append(word.lemma_)
        lemmatized_content = " ".join(temp)
        output.append(lemmatized_content)
    return output 

In [17]:
%%time 

output = lemmatize(data)

CPU times: total: 36min 56s
Wall time: 42min 57s


Next, we implement a preprocessing function to remove stopwords from our output. 
We will do this by implementing an iterative function which uses the simple_preprocess function from the gensim library.

In [18]:
def preprocess(text):
    result = []
    for article in text:
        temporary = gensim.utils.simple_preprocess(article)
        result.append(temporary)
    return result

In [19]:
all_words = preprocess(output)

In [20]:
id2word = corpora.Dictionary(all_words) #attach unique ID to each word, store into a dictionary

corpus = []
for text in all_words:
    frequencies = id2word.doc2bow(text) #counts occurence of each word in every document
    corpus.append(frequencies) #stores into corpus, which becomes a list of lists

In [21]:
%%time 
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, 
                                            num_topics = 10,
                                            id2word = id2word,
                                            chunksize = 2000,
                                            passes = 10,
                                            update_every = 1,
                                            alpha = 'auto',
                                            random_state = 4222)

CPU times: total: 10min 47s
Wall time: 12min 57s


In [22]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds = "mmds", R = 10)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


From the data visualisation dashboard above, we can see that many of the relevant words in each of our topics do not provide much meaning with regards to understanding the topic identified. For instance in Topic 1, words such as "go", "just" and "get" do not give much insight as to what the topic is about. This is in stark contrast to words such as "nuclear", "weapon" and "security" in Topic 6, where we can easily infer that the topic is regarding US foreign policy on nuclear weapons. 

Therefore, we will need to further preprocess our text in order to obtain clearer keywords via our LDA model. 

In [23]:
bigram_phrases = gensim.models.Phrases(all_words, min_count = 5, threshold = 50) #min_count and threshold are hyperparameters we can tune later
trigram_phases = gensim.models.Phrases(bigram_phrases[all_words], min_count = 5, threshold = 50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(bigram_phrases)

In [24]:
new_corpus_bigrams = []
for text in all_words:
    new_corpus_bigrams.append(bigram[text])

In [25]:
new_corpus_trigrams = []
for text in all_words:
    new_corpus_trigrams.append(trigram[bigram[text]])

In [26]:
for i in range(len(new_corpus_bigrams)):
    if len(new_corpus_bigrams[i]) != len(new_corpus_trigrams[i]):
        print(i)

It appears that we do not have any trigrams in our text, hence we can drop new_corpus_trigrams.

In [27]:
id2word_new = corpora.Dictionary(new_corpus_bigrams)

new_corpus = []
for text in new_corpus_bigrams:
    frequencies = id2word_new.doc2bow(text) #counts occurence of each word in every document
    new_corpus.append(frequencies) #stores into corpus, which becomes a list of lists

In [28]:
from gensim.models import TfidfModel

tfidf = TfidfModel(new_corpus, id2word = id2word_new)

low_value = 0.05
for i in range(0, len(new_corpus)):
    bow = new_corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]         
    new_corpus[i] = new_bow

In [29]:
%%time 
lda_model2 = gensim.models.ldamodel.LdaModel(corpus = new_corpus, 
                                            num_topics = 10,
                                            id2word = id2word_new,
                                            chunksize = 2000,
                                            passes = 10,
                                            update_every = 1,
                                            alpha = 'auto',
                                            random_state = 4222)

CPU times: total: 4min 53s
Wall time: 5min 29s


In [30]:
vis2 = pyLDAvis.gensim.prepare(lda_model2, new_corpus, id2word_new, mds = "mmds", R = 10)
vis2

  by='saliency', ascending=False).head(R).drop('saliency', 1)


We now perform hyperparameter tuning to determine the number of topics to train our LDA on. In order to determine this, we will use the coherence score metric to do. Therefore, we first implement a Python function that will allow us to determine a coherence score for a given LDA model. 

First, we implement a Python function that allows us to determine a coherence score for a given LDA model. 

Note: The corpus and id2word parameters used here will be as per our previous iteration. 

In [31]:
def calculate_coherence_score(n):
    lda = gensim.models.ldamodel.LdaModel(corpus= new_corpus,
                                          id2word = id2word_new,
                                          num_topics = n,
                                          random_state = 4222, 
                                          update_every = 1,
                                          chunksize = 2000,
                                          passes = 10,
                                          alpha = "auto")
    coherence_model_lda = CoherenceModel(model = lda, corpus = new_corpus, dictionary = id2word_new, coherence = 'u_mass')
    coherence_score = coherence_model_lda.get_coherence()
    return coherence_score

A list of possible number of topics is given below. 

In [32]:
topics_list = [3,5,7,10,12,15] 

In [33]:
%%time 
for n in topics_list:
    coherence_score = calculate_coherence_score(n)
    print(f"n : {n} ; Coherence Score : {coherence_score}")

n : 3 ; Coherence Score : -2.7247037986759803
n : 5 ; Coherence Score : -2.694920367119738
n : 7 ; Coherence Score : -2.7577661342661153
n : 10 ; Coherence Score : -2.8120882615074807
n : 12 ; Coherence Score : -2.852995093378056
n : 15 ; Coherence Score : -2.928514812213458
CPU times: total: 21min 55s
Wall time: 22min 55s


As we can see, n = 5 gives the highest coherence score. Hence, we will train an LDA model using num_topics = 5.

In [34]:
%%time 
lda_model3 = gensim.models.ldamodel.LdaModel(corpus = new_corpus, 
                                            num_topics = 5,
                                            id2word = id2word_new,
                                            chunksize = 2000,
                                            passes = 10,
                                            update_every = 1,
                                            alpha = 'auto',
                                            random_state = 4222)

CPU times: total: 3min 28s
Wall time: 3min 46s


In [35]:
outputs = []
for i in range(len(new_corpus)):
    topic_distribution = lda_model3.get_document_topics(new_corpus[i], minimum_probability = 0.0)
    outputs.append(topic_distribution)

In [36]:
import numpy as np
outputs = np.array(outputs).T.tolist()
outputs = outputs[1:]

In [37]:
outputs

[[[0.09608548879623413,
   0.15252438187599182,
   0.22686529159545898,
   0.32301953434944153,
   0.00126129318960011,
   0.15638336539268494,
   0.330515593290329,
   0.0031949649564921856,
   0.3844698965549469,
   0.00583462743088603,
   0.0015112023102119565,
   0.0009585050866007805,
   0.004931935574859381,
   0.17668791115283966,
   0.0015285013942047954,
   0.0017265782225877047,
   0.3938508629798889,
   0.002291310578584671,
   0.0014984182780608535,
   0.0023713966365903616,
   0.1533978283405304,
   0.0009218288469128311,
   0.10802681744098663,
   0.21921490132808685,
   0.14484986662864685,
   0.08879397809505463,
   0.0012735901400446892,
   0.11748622357845306,
   0.2670498788356781,
   0.13773934543132782,
   0.10721123963594437,
   0.08036885410547256,
   0.003651572857052088,
   0.7361788153648376,
   0.0007195110083557665,
   0.053525716066360474,
   0.10104335844516754,
   0.14873458445072174,
   0.10098641365766525,
   0.003912739921361208,
   0.00193641218356788

In [38]:
overall_data['Topic 1 Probability'] = outputs[0]
overall_data['Topic 2 Probability'] = outputs[1]
overall_data['Topic 3 Probbility'] = outputs[2]
overall_data['Topic 4 Probability'] = outputs[3]
overall_data['Topic 5 Probability'] = outputs[4]

ValueError: Length of values (5) does not match length of index (38647)