# Import articles and inspect data

In [478]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd

In [2]:
%pwd

'/Users/apple/py/notebooks'

In [3]:
df = pd.read_csv('/Users/apple/Downloads/articles1.csv')

We need to drop some columns that will not be of use

In [4]:
df = df.drop(['Unnamed: 0', 'url'], axis=1)

In [5]:
df.columns

Index(['id', 'title', 'publication', 'author', 'date', 'year', 'month',
       'content'],
      dtype='object')

In [11]:
df.shape

(50000, 8)

In [12]:
df.publication.value_counts()

Breitbart           23781
CNN                 11488
New York Times       7803
Business Insider     6757
Atlantic              171
Name: publication, dtype: int64

# Preprocess article content

First, we import the tqdm library for keeping track of operation execution progress

In [503]:
from tqdm import tqdm_notebook, tqdm
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

We tokenize each article into sentences using the nltk library

In [13]:
from nltk.tokenize import sent_tokenize

In [14]:
df['sentences'] = df.content.progress_map(sent_tokenize)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




In [15]:
df['sentences'].sample().tolist()[0][0]

'Former Mexican President Vicente Fox hosted a mock debate with GOP frontrunner Donald Trump, harshly criticizing the billionaire for wanting to restrict immigration from Mexico into the U. S.[“Today, we are here to debate immigration,” Fox says in heavily accented English.'

Then, we tokenize the sentences by word

In [16]:
from nltk.tokenize import word_tokenize

In [17]:
df['tokens_sentences'] = df['sentences'].progress_map(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




In [18]:
print(df['tokens_sentences'].sample().iloc[0][:3])

[['While', 'Europe', '’', 's', 'socialist', 'leaders', 'predictably', 'denounced', 'President', 'Trump', '’', 's', 'recent', 'temporary', 'ban', 'on', 'new', 'refugees', 'from', 'a', 'list', 'of', 'countries', ',', 'rising', 'populist', 'leaders', 'praised', 'him', 'as', 'a', 'model', 'for', 'what', 'Europe', 'should', 'be', 'doing', '.'], ['[', 'Dutch', 'populist', 'Geert', 'Wilders', 'said', 'in', 'a', 'tweet', ':', '“', 'Well', 'done', '@', 'POTUS', 'it', '’', 's', 'the', 'only', 'way', 'to', 'stay', 'safe', '+', 'free', '.'], ['I', 'would', 'do', 'the', 'same', '.']]


We will __lemmatize__ the words in the articles. First step is to get the __Part-of-Speech tag__ of each word in the data

In [20]:
from nltk import pos_tag

In [21]:
df['POS_tokens'] = df['tokens_sentences'].progress_map(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




In [22]:
df['POS_tokens'].sample().iloc[0][0][0][1]

'NNP'

__Lemmatization__

We lemmatize text based on POS tags. For this we need first a function that converts POS tags into wordnet tags. As found here: https://www.programcreek.com/python/example/91607/nltk.corpus.wordnet.VERB

In [23]:
from nltk.corpus import wordnet as wn

def pos_tag_text(tagged_text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
        
    tagged_wn_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_wn_text

Now we lemmatize all words

In [24]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [25]:
df['tokens'] = df['POS_tokens'].progress_map(
    lambda POS_list: [lemmatizer.lemmatize(pos[0], pos[1]) 
                      if pos[1] is not None else pos[0]
                      for tokens_pos in POS_list
                      for pos in pos_tag_text(tokens_pos)
                      ])

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




__Stopword removal__

Stopword removal is an iterative process. We start by using the standard English language stopwords from nltk plus some verbs, prepositions and adverbs.

In [225]:
from nltk.corpus import stopwords

In [415]:
verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can', 'tell', 'think']

In [416]:
prep = ['about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'at', 'before', 'behind', 'below', 'beneath',
        'beside', 'between', 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', 'like', 'near', 'of', 'off', 
        'on', 'onto', 'outside', 'over', 'past', 'since', 'through', 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon',
        'with', 'within', 'without', 'according', 'because', 'instead']

In [417]:
adverb = ['afterward', 'already', 'almost', 'back', 'better', 'best', 'even', 'far', 'fast', 'hard', 'here', 'how',
          'late', 'long', 'low', 'more', 'near', 'never', 'next','now', 'often', 'quick', 'rather', 'slow', 'so', 'soon', 'still',
          'then', 'today', 'tomorrow', 'too', 'very', 'well', 'where', 'yesterday']

We then use TF-IDF vectorization to identify some more stopword candidates. This is not automatic, as we pick words based on our own judgement from the top 20 TF-IDF words

In [266]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
tvec = TfidfVectorizer()

In [423]:
tvec_weights = tvec.fit_transform(df.tokens_filtered.progress_map(lambda x: ' '.join(x)).dropna())
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

Unnamed: 0,term,weight
134695,trump,0.048625
24761,clinton,0.024809
124409,state,0.020159
103206,president,0.019905
109483,republican,0.015277
93231,obama,0.014165
19735,campaign,0.013407
101535,police,0.013233
4558,american,0.012869
36142,donald,0.012615


We put these in the miscalaneous list of stopwords. We complement this list by adding words that appear after the LDA has been done, and we find some words to create noisy/bad topics.

In [480]:
misc = ["look", "life", "way", "thing", "even", "could", "find", "good", "really", "us", "back",
        "many", "much", "something", "still",
        "world", "give", "start", "never", 'people', 'one', 'year', 'time', 'new', 'news', 'also', 'report', 'first',
        'call', 'day', 'ask', 'team', 'car', 'man', 'insist',
        'woman', 'show', 'work', 'two', 'company', 'help', 'lot', "win", "big", "well", "cnn", "play",
        "country", "follow", "follow",
        "love", "friend", "family", "old", "young", "live", "someone", "though", "change", "name", "actually", "father", "enough",
        "let", "hand", "talk", "watch", "moment", "line", "minute", "guy", "bad", "hear", "ever", "wear",
        "sit", "head", "side", "break", "sure", "second", "stop", "night", "everything", "everyone", "talk",
        "little", "seem", "every", "happen", "part", "space", "keep", "kind", "turn", "leave", "away",
        "end", "another" , "always", "mean", "hour", "put", "move", "different", "place",
        "become", "last", "right", "try", "story", "great", "around", "point", "run", "might", "feel",
        "home", "best", "house", "problem", "idea", "less", "question", "fact", "yet", "reason", "course",
        "important", "future", "system", "believe", "example", "high", "real", "sense" , "create", "grow", "child",
        "mother", "son", "room", "walk", "daughter", "later", "wife", "three", "saw", "girl", "brother", "kid", "felt",
        "stay", "men", "husband", "couple", "boy", "write", "experience", "book", "read", "learn", "without", "anything",
        "word", "spend", "face", "image", "able", "understand", "person", "easy", "human", "together", "mind", "instead", "begin",
        "maybe", "matter", "perhaps", "explain" , "remember", "sometimes", "probably","else", "design", "build", "nothing", 
        "bring", "past", "true", "answer", "food", "building", "city", "small", "dog", "animal", "open", "train", 
        "week", "month", "close" , "sign", "stand", "since", "speak", "decide", "continue", 
        "january", "february", "march", "april", "may", "june", "july","august", "september", "november", "october", "december",
        "friday" , "saturday", "sunday", "monday", "tuesday", "wednesday", "thursday"]


In [481]:
stop = set(stopwords.words('English') + verbs + adverb + misc)

In [482]:
def filter_stop(tokens):
    return [token for token in tokens if token.isalpha() 
            and token not in stop and len(token)>1]

In [483]:
df['tokens_filtered'] = df['tokens'].progress_map(filter_stop)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

__Saving__

We save the the dataframe into csv to avoid having to redo all the calculations

In [484]:
df.to_csv('articles_preprocessed.csv', index=False)
# df = pd.read_csv('articles_preprocessed.csv')

__Bigrams & trigrams__

We use the useful gensim Phrases model to identify common bigrams and trigrams. (e.g. "President Barack Obama", "Hillary Clinton", etc)

In [485]:
from gensim.models.phrases import Phrases, Phraser

In [486]:
tokens = df['tokens_filtered'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

# LDA Model

In [487]:
from gensim import corpora

In [488]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=10)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

In [489]:
from gensim import models

We choose 40 topics after trying other values that give less interesting results (5,10,20,30,50)

In [521]:
np.random.seed(123456)
num_topics = 40
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

CPU times: user 13min 52s, sys: 1min 46s, total: 15min 39s
Wall time: 4min 12s


In [522]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(str(i)+": "+ topic)
    print()

0: 0.053*"facebook" + 0.029*"gay" + 0.027*"abortion" + 0.021*"sex" + 0.019*"transgender" + 0.018*"zuckerberg" + 0.017*"marriage" + 0.016*"north_carolina" + 0.014*"conservative" + 0.012*"milo"

1: 0.148*"california" + 0.069*"brown" + 0.052*"los_angeles" + 0.038*"duterte" + 0.038*"et" + 0.037*"san_francisco" + 0.034*"philippine" + 0.023*"harris" + 0.016*"san_diego" + 0.013*"drug"

2: 0.040*"president" + 0.031*"white" + 0.023*"comey" + 0.022*"penny" + 0.020*"session" + 0.017*"meeting" + 0.017*"committee" + 0.013*"flynn" + 0.013*"washington" + 0.012*"senate"

3: 0.106*"tesla" + 0.053*"model" + 0.049*"vehicle" + 0.037*"musk" + 0.028*"driver" + 0.022*"ford" + 0.016*"drive" + 0.015*"electric" + 0.014*"automaker" + 0.011*"truck"

4: 0.023*"water" + 0.016*"area" + 0.013*"state" + 0.009*"resident" + 0.008*"storm" + 0.007*"official" + 0.007*"fire" + 0.006*"expect" + 0.006*"ship" + 0.006*"roof"

5: 0.027*"attack" + 0.021*"police" + 0.020*"kill" + 0.013*"accord" + 0.012*"victim" + 0.012*"authority"

We want to look at the dominant topic for each document. This function is from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#20topicdistributionacrossdocuments

In [528]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, contents=df['content']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [529]:
df_topic_sents_keywords = format_topics_sentences()



0it [00:00, ?it/s][A[A

3it [00:00, 27.97it/s][A[A

12it [00:00, 34.93it/s][A[A

21it [00:00, 42.71it/s][A[A

31it [00:00, 51.09it/s][A[A

42it [00:00, 60.62it/s][A[A

52it [00:00, 68.39it/s][A[A

63it [00:00, 76.79it/s][A[A

73it [00:00, 82.17it/s][A[A

84it [00:00, 88.72it/s][A[A

94it [00:01, 88.91it/s][A[A

105it [00:01, 92.76it/s][A[A

115it [00:01, 94.59it/s][A[A

125it [00:01, 94.23it/s][A[A

135it [00:01, 95.45it/s][A[A

146it [00:01, 98.83it/s][A[A

157it [00:01, 101.23it/s][A[A

168it [00:01, 103.04it/s][A[A

179it [00:01, 103.38it/s][A[A

190it [00:01, 103.97it/s][A[A

202it [00:02, 106.70it/s][A[A

213it [00:02, 106.71it/s][A[A

224it [00:02, 106.62it/s][A[A

235it [00:02, 105.21it/s][A[A

246it [00:02, 105.05it/s][A[A

258it [00:02, 107.27it/s][A[A

269it [00:02, 104.55it/s][A[A

280it [00:02, 95.22it/s] [A[A

290it [00:02, 90.00it/s][A[A

300it [00:03, 86.18it/s][A[A

309it [00:03, 74.93it/s][A[A

317it [00:0

In [530]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,25.0,0.3871,"republican, state, vote, democrat, bill, ryan,...",WASHINGTON — Congressional Republicans have...
1,1,6.0,0.3306,"police, officer, gun, shooting, police_officer...","After the bullet shells get counted, the blood..."
2,2,28.0,0.3198,"meet, photo, early, light, die, history, ago, ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,28.0,0.3759,"meet, photo, early, light, die, history, ago, ...","Death may be the great equalizer, but it isn’t..."
4,4,13.0,0.6698,"trump, president, united_state, american, admi...","SEOUL, South Korea — North Korea’s leader, ..."
5,5,4.0,0.2433,"water, area, state, resident, storm, official,...","LONDON — Queen Elizabeth II, who has been b..."
6,6,13.0,0.4301,"trump, president, united_state, american, admi...",BEIJING — President Tsai of Taiwan sharpl...
7,7,37.0,0.2459,"bit, wait, whole, wrong, pretty, anyone, job, ...","Danny Cahill stood, slightly dazed, in a blizz..."
8,8,28.0,0.2906,"meet, photo, early, light, die, history, ago, ...","Just how is Hillary Kerr, the founder of ..."
9,9,28.0,0.3393,"meet, photo, early, light, die, history, ago, ...",Angels are everywhere in the Muñiz family’s ap...


__Top 20 topics__

In [532]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']].drop_duplicates().reset_index(drop=True)

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.sort_values('Perc_Documents', ascending=False).head(20)

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
29,35.0,"law, state, court, case, rule, decision, order...",9707,0.1941
15,16.0,"number, percent, likely, accord, lead, study, ...",3053,0.0611
21,18.0,"money, group, million, fund, organization, ira...",3032,0.0606
28,8.0,"muslim, church, christian, religion, islam, fa...",2883,0.0577
5,37.0,"bit, wait, whole, wrong, pretty, anyone, job, ...",2864,0.0573
13,11.0,"store, brand, product, launch, restaurant, cus...",2694,0.0539
33,27.0,"cruz, london, britain, british, vote, remain, ...",2408,0.0482
19,23.0,"europe, party, france, turkey, government, fre...",2143,0.0429
25,31.0,"film, movie, star, actor, character, role, ser...",2113,0.0423
12,39.0,"mexico, state, government, prison, cuban, bord...",1893,0.0379


__Visualization__

The very useful pyLDAvis library shows the topics ranked, with top keywords for each topic. Suggested relevance parameter lambda at 0.6

In [527]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)


11711it [02:03, 101.29it/s][A