# Importing the required libraries

In [1]:
import pandas as pd
from langdetect import detect
from tqdm import tqdm_notebook
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from itertools import chain
from gensim.models import Phrases
from gensim import corpora
from gensim import models
import numpy as np
from gensim.utils import simple_preprocess

# Reading data

It is a dataset of articles taken from BBC’s website. The csv file contains the articles and its corresponding language in two seperate columns.

In [2]:
data = pd.read_csv('articles_bbc_2018_01_30.csv')

In [3]:
data.shape

(309, 2)

In [4]:
# Removing the rows containing empty cells
data = data.dropna().reset_index(drop=True)
data.shape

(308, 2)

In [5]:
data.head(5)

Unnamed: 0,articles,lang
0,Image copyright PA/EPA Image caption Oligarch ...,en
1,Husband admits killing French jogger\r\n\r\nTh...,en
2,Media playback is unsupported on your device M...,en
3,Manchester City's Leroy Sane is ruled out for ...,en
4,Image copyright AFP Image caption Sebastien Br...,en


# Data cleaning

### 1) Keeping English articles only

In [6]:
tqdm_notebook().pandas()
# Detecting the language of all the articles using detect function and put it the the column lang
data['lang'] = data.articles.progress_map(detect)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  from pandas import Panel





HBox(children=(IntProgress(value=0, max=308), HTML(value='')))




In [7]:
# Counting the number of articles in each language
data.lang.value_counts()

en    257
fa      9
fr      7
id      5
uk      4
ar      4
hi      4
vi      4
ru      4
sw      3
tr      2
es      2
pt      2
de      1
Name: lang, dtype: int64

In [8]:
# Keeping the articles in english only
data = data.loc[data.lang=='en']

### 2) Tokenization

In [9]:
# Splitting the articles into sentences in 'sentences' column
data['sentences'] = data.articles.progress_map(sent_tokenize)

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




In [10]:
# Print the first 3 sentences of the 1st article
data['sentences'].head(1).tolist()[0][:3]

['Image copyright PA/EPA Image caption Oligarch Roman Abramovich (l) and PM Dmitry Medvedev are on the list\r\n\r\nRussian President Vladimir Putin says a list of officials and businessmen close to the Kremlin published by the US has in effect targeted all Russian people.',
 'The list names 210 top Russians as part of a sanctions law aimed at punishing Moscow for meddling in the US election.',
 'However, the US stressed those named were not subject to new sanctions.']

In [11]:
# Splitting the sentences into words/tokens in 'tokens_sentences' column
data['tokens_sentences'] = data['sentences'].progress_map(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




In [12]:
# Print the first 3 sentences of the 1st article after word tokenizations
print(data['tokens_sentences'].head(1).tolist()[0][:3])

[['Image', 'copyright', 'PA/EPA', 'Image', 'caption', 'Oligarch', 'Roman', 'Abramovich', '(', 'l', ')', 'and', 'PM', 'Dmitry', 'Medvedev', 'are', 'on', 'the', 'list', 'Russian', 'President', 'Vladimir', 'Putin', 'says', 'a', 'list', 'of', 'officials', 'and', 'businessmen', 'close', 'to', 'the', 'Kremlin', 'published', 'by', 'the', 'US', 'has', 'in', 'effect', 'targeted', 'all', 'Russian', 'people', '.'], ['The', 'list', 'names', '210', 'top', 'Russians', 'as', 'part', 'of', 'a', 'sanctions', 'law', 'aimed', 'at', 'punishing', 'Moscow', 'for', 'meddling', 'in', 'the', 'US', 'election', '.'], ['However', ',', 'the', 'US', 'stressed', 'those', 'named', 'were', 'not', 'subject', 'to', 'new', 'sanctions', '.']]


### 3) Lemmatizing with POS tagging

#### POS Tagging of words/tokens in all sentences of each article

In [13]:
# Using pos tagging before lemmtization and save it in 'POS_tokens' column
data['POS_tokens'] = data['tokens_sentences'].progress_map(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




In [14]:
# Printing the Pos tag output of the first 3 sentences of the 1st article 
print(data['POS_tokens'].head(1).tolist()[0][:3])

[[('Image', 'NN'), ('copyright', 'NN'), ('PA/EPA', 'NNP'), ('Image', 'NNP'), ('caption', 'NN'), ('Oligarch', 'NNP'), ('Roman', 'NNP'), ('Abramovich', 'NNP'), ('(', '('), ('l', 'NN'), (')', ')'), ('and', 'CC'), ('PM', 'NNP'), ('Dmitry', 'NNP'), ('Medvedev', 'NNP'), ('are', 'VBP'), ('on', 'IN'), ('the', 'DT'), ('list', 'NN'), ('Russian', 'NNP'), ('President', 'NNP'), ('Vladimir', 'NNP'), ('Putin', 'NNP'), ('says', 'VBZ'), ('a', 'DT'), ('list', 'NN'), ('of', 'IN'), ('officials', 'NNS'), ('and', 'CC'), ('businessmen', 'NNS'), ('close', 'RB'), ('to', 'TO'), ('the', 'DT'), ('Kremlin', 'NNP'), ('published', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('US', 'NNP'), ('has', 'VBZ'), ('in', 'IN'), ('effect', 'NN'), ('targeted', 'VBN'), ('all', 'DT'), ('Russian', 'JJ'), ('people', 'NNS'), ('.', '.')], [('The', 'DT'), ('list', 'NN'), ('names', 'RB'), ('210', 'CD'), ('top', 'JJ'), ('Russians', 'NNPS'), ('as', 'IN'), ('part', 'NN'), ('of', 'IN'), ('a', 'DT'), ('sanctions', 'NNS'), ('law', 'NN'), ('aimed', 

#### Lemmatization

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
# this function takes the tag obtained by POS tag and returns a wordnet tag to be used by the lemmatizer
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

In [17]:
# Lemmatizing each word with its POS tag, in each sentence and saving lemmatized words in 'tokens_sentences_lemmatized' column
data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1]))
            if get_wordnet_pos(el[1]) != '' else el[0]
            for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




In [18]:
# Printing the lemmatization output of the first 3 sentences of the 1st article 
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

[['Image',
  'copyright',
  'PA/EPA',
  'Image',
  'caption',
  'Oligarch',
  'Roman',
  'Abramovich',
  '(',
  'l',
  ')',
  'and',
  'PM',
  'Dmitry',
  'Medvedev',
  'be',
  'on',
  'the',
  'list',
  'Russian',
  'President',
  'Vladimir',
  'Putin',
  'say',
  'a',
  'list',
  'of',
  'official',
  'and',
  'businessmen',
  'close',
  'to',
  'the',
  'Kremlin',
  'publish',
  'by',
  'the',
  'US',
  'have',
  'in',
  'effect',
  'target',
  'all',
  'Russian',
  'people',
  '.'],
 ['The',
  'list',
  'names',
  '210',
  'top',
  'Russians',
  'as',
  'part',
  'of',
  'a',
  'sanction',
  'law',
  'aim',
  'at',
  'punish',
  'Moscow',
  'for',
  'meddle',
  'in',
  'the',
  'US',
  'election',
  '.'],
 ['However',
  ',',
  'the',
  'US',
  'stress',
  'those',
  'name',
  'be',
  'not',
  'subject',
  'to',
  'new',
  'sanction',
  '.']]

### 4) Regrouping tokens and removing stop words

A common thing with LDA is that words appear in multiple topics. One way to cope with this is to add these words to your stopwords list.

In [19]:
# Defining our stopwards list
stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can']
stopwords_other = ['one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also', 'copyright', 'something']
my_stopwords = stopwords.words('English') + stopwords_verbs + stopwords_other

In [20]:
# flatten the list of sentences of tokens into a list of tokens
# and saving them in 'tokens' column
data['tokens'] = data['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))

In [21]:
# Printing the first 3 sentences of the 1st article after flatten 
data['tokens'].head(1).tolist()[0][:30]

['Image',
 'copyright',
 'PA/EPA',
 'Image',
 'caption',
 'Oligarch',
 'Roman',
 'Abramovich',
 '(',
 'l',
 ')',
 'and',
 'PM',
 'Dmitry',
 'Medvedev',
 'be',
 'on',
 'the',
 'list',
 'Russian',
 'President',
 'Vladimir',
 'Putin',
 'say',
 'a',
 'list',
 'of',
 'official',
 'and',
 'businessmen']

In [22]:
# lower casing the lemmatized tokens, removing stop words, make sure that the length is more than one 
# and make sure that each token has only alphabetic characters(removing tokens with punctuation or Removing words with digits)
data['tokens'] = data['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])

In [23]:
# Printing the first 3 sentences of the 1st article
data['tokens'].head(1).tolist()[0][:30]

['oligarch',
 'roman',
 'abramovich',
 'pm',
 'dmitry',
 'medvedev',
 'list',
 'russian',
 'president',
 'vladimir',
 'putin',
 'list',
 'official',
 'businessmen',
 'close',
 'kremlin',
 'publish',
 'us',
 'effect',
 'target',
 'russian',
 'people',
 'list',
 'names',
 'top',
 'russians',
 'part',
 'sanction',
 'law',
 'aim']

# LDA 

LDA (short for Latent Dirichlet Allocation) is an unsupervised machine-learning model that takes documents as input and finds topics as output. The model also says in what percentage each document talks about each topic.
A topic is represented as a weighted list of words.

## Data preparation

#### 1) Prepare bi-grams and tri-grams

Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring. This step is useful to grasp more relevant information.

In [24]:
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

#### 2) Prepare objects for LDA gensim implementation

The two main inputs to the LDA topic model are the dictionary and the corpus(in bag of words form).

In [25]:
dictionary_LDA = corpora.Dictionary(tokens)
#Filter out tokens that appear inless than 3 documents
dictionary_LDA.filter_extremes(no_below=3) 
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

## LDA implementation and running for 4 epochs

The other important parameters of the model are:
- the number of topics is equal to num_topics
- the number of words per topic is handled by eta
- the number of topics per document is handled by alpha

In [26]:
np.random.seed(123456)
num_topics = 20 
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

Wall time: 16.4 s


## LDA results

### 1) printing the topics

The above LDA model is built with 20 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage (importance) to the topic. all the 20 topics are printed with their first 20 most relevant words the weightage of each keyword.

In [27]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+": "+ topic)
    print()

0: 0.012*"specie" + 0.010*"prey" + 0.007*"act" + 0.006*"animal" + 0.006*"help" + 0.005*"back" + 0.005*"find" + 0.005*"become" + 0.005*"behaviour" + 0.005*"number" + 0.005*"area" + 0.005*"include" + 0.004*"order" + 0.004*"kill" + 0.004*"approach" + 0.004*"however" + 0.004*"move" + 0.004*"host" + 0.004*"evolve" + 0.004*"well"

1: 0.010*"show" + 0.008*"game" + 0.007*"another" + 0.007*"light" + 0.007*"night" + 0.006*"find" + 0.006*"could" + 0.006*"predator" + 0.006*"animal" + 0.006*"give" + 0.006*"images" + 0.006*"question" + 0.006*"however" + 0.006*"group" + 0.006*"live" + 0.006*"time" + 0.005*"transparent" + 0.005*"drug" + 0.005*"eye" + 0.005*"call"

2: 0.006*"first" + 0.006*"people" + 0.006*"work" + 0.005*"city" + 0.005*"time" + 0.005*"year" + 0.004*"day" + 0.004*"part" + 0.004*"find" + 0.004*"could" + 0.004*"us" + 0.003*"run" + 0.003*"restaurant" + 0.003*"plan" + 0.003*"home" + 0.003*"start" + 0.003*"tunnel" + 0.003*"include" + 0.003*"bring" + 0.003*"dish"

3: 0.014*"separatist" + 0.01

### 2) Allocating topics to documents
 
We can print the percentage of topics a document is about as follows:

In [28]:
# printing the first article
print(data.articles.loc[0])

Image copyright PA/EPA Image caption Oligarch Roman Abramovich (l) and PM Dmitry Medvedev are on the list

Russian President Vladimir Putin says a list of officials and businessmen close to the Kremlin published by the US has in effect targeted all Russian people.

The list names 210 top Russians as part of a sanctions law aimed at punishing Moscow for meddling in the US election.

However, the US stressed those named were not subject to new sanctions.

Mr Putin said the list was an unfriendly act that complicated US-Russia ties but he said he did not want to escalate the situation.

Mr Putin said Russia should instead be thinking about "ourselves and the economy".

The list was also derided by a number of senior Russian officials who said it bore a strong resemblance to the Forbes magazine ranking of Russian billionaires. A US Treasury Department later told Buzzfeed that an unclassified annex of the report had been derived from the magazine.

Why did the US publish the list?

The gove

In [29]:
# predicting the percentage of topics in the first article
lda_model[corpus[0]]

[(15, 0.99830645)]

Which means that article 1 contains topic 15 with percentage 99.8%

### 3) Predicting topics on unseen documents

In [30]:
document = '''Eric Tucker, a 35-year-old co-founder of a marketing company in Austin, Tex., had just about 40 Twitter followers. But his recent tweet about paid protesters being bused to demonstrations against President-elect Donald J. Trump fueled a nationwide conspiracy theory — one that Mr. Trump joined in promoting. 

Mr. Tucker's post was shared at least 16,000 times on Twitter and more than 350,000 times on Facebook. The problem is that Mr. Tucker got it wrong. There were no such buses packed with paid protesters.

But that didn't matter.

While some fake news is produced purposefully by teenagers in the Balkans or entrepreneurs in the United States seeking to make money from advertising, false information can also arise from misinformed social media posts by regular people that are seized on and spread through a hyperpartisan blogosphere.

Here, The New York Times deconstructs how Mr. Tucker’s now-deleted declaration on Twitter the night after the election turned into a fake-news phenomenon. It is an example of how, in an ever-connected world where speed often takes precedence over truth, an observation by a private citizen can quickly become a talking point, even as it is being proved false.'''


In [31]:
# saving all topics and the corresponding keywords in a variable
topics_all = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20)
# tokenization of the new document
tokens = word_tokenize(document)
# creating the bag of words using the previously created dictionary
bow_corpus=dictionary_LDA.doc2bow(tokens)
# predicting the topics of the document using the LDA model
topics_pred = lda_model[bow_corpus]
# creating a dataframe contains the predicted topics in the doc, the corresponding percentage of each topic 
# as well as the keywords of this topic
pd.DataFrame([(el[0], round(el[1],2), topics_all[el[0]][1]) for el in topics_pred], columns=['topic #', 'weight', 'words in topic'])

Unnamed: 0,topic #,weight,words in topic
0,2,0.4,"0.006*""first"" + 0.006*""people"" + 0.006*""work"" ..."
1,8,0.22,"0.012*""us"" + 0.008*""russia"" + 0.007*""could"" + ..."
2,10,0.29,"0.009*""people"" + 0.006*""government"" + 0.006*""u..."
3,17,0.09,"0.010*""idea"" + 0.008*""work"" + 0.008*""link"" + 0..."


### 4) Finding the dominant topic in each sentence

One of the practical application of topic modeling is to determine what topic a given document is about.
To find that, we find the topic number that has the highest percentage contribution in that document.
The format_topics_sentences() function below nicely aggregates this information in a presentable table.

In [32]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data.articles):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data.articles)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,15.0,0.9983,"list, data, us, publish, name, could, base, st...",Image copyright PA/EPA Image caption Oligarch ...
1,1,8.0,0.9768,"us, russia, could, time, write, president, roc...",Husband admits killing French jogger\r\n\r\nTh...
2,2,3.0,0.9488,"separatist, people, die, call, coalition, nort...",Media playback is unsupported on your device M...
3,3,8.0,0.9736,"us, russia, could, time, write, president, roc...",Manchester City's Leroy Sane is ruled out for ...
4,4,2.0,0.999,"first, people, work, city, time, year, day, pa...",Image copyright AFP Image caption Sebastien Br...
5,5,16.0,0.9969,"find, light, us, people, work, help, day, much...",The middle of nowhere\r\n\r\nFive miles from t...
6,6,8.0,0.9793,"us, russia, could, time, write, president, roc...",Image copyright Reuters Image caption Mr Trump...
7,7,15.0,0.7989,"list, data, us, publish, name, could, base, st...",Putin says US list targets all Russians\r\n\r\...
8,8,2.0,0.8417,"first, people, work, city, time, year, day, pa...",Image copyright Getty Images\r\n\r\nIt is the ...
9,9,10.0,0.9983,"people, government, uk, think, brexit, tell, t...",Image copyright Reuters Image caption The high...


### 5) Find the most representative document for each topic (Summeriation)

Sometimes just the topic keywords may not be enough to make sense of what a topic is about. So, to help with understanding the topic, you can find the documents a given topic has contributed to the most and infer the topic by reading that document.

In [35]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:

    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head(5)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9987,"specie, prey, act, animal, help, back, find, b...",
1,1.0,0.9991,"show, game, another, light, night, find, could...",Image copyright Square Enix Image caption Life...
2,2.0,0.9995,"first, people, work, city, time, year, day, pa...",Series 4\r\n\r\nUrsula and Fiona: I Never Got ...
3,3.0,0.9883,"separatist, people, die, call, coalition, nort...",That Was Then\r\n\r\nEpisode 5\r\n\r\nUncover ...
4,4.0,0.9993,"people, wave, tell, town, water, ocean, benefi...",Boring Talks #02 - Book Pricing Algorithms\r\n...
