# Source data exploration

In [1]:
#imports
import pandas as pd
import numpy as np
from collections import Counter
import re

"""
import findspark
findspark.init()
from pyspark.sql import *
import pyspark.sql.functions as sf
"""
import gensim
from gensim.parsing.preprocessing import STOPWORDS

import warnings
warnings.filterwarnings('ignore')
# Import stopwords with nltk.
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/gorkem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Read the data
data_folder = 'sample_data/'
sources = pd.read_table(data_folder+'now-samples-sources.txt',encoding = "ISO-8859-1", skiprows =[0,1], header=None)
                    # names=['textID', '#words','date','country','website','url','title'])
sources.dropna(axis=1, inplace=True)
sources.rename(columns={0:'textID', 1:'#words',2:'date',3:'country',4:'website',5:'url',6:'title'}, inplace=True)
sources.head()

Unnamed: 0,textID,#words,date,country,website,url,title
0,11241,397,13-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H..."
1,11242,757,13-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...
2,11243,755,13-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant
3,11244,1677,13-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...
4,21242,794,13-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...


In [3]:
sources.shape

(2960, 7)

In [4]:
sources.dtypes

textID      int64
#words      int64
date       object
country    object
website    object
url        object
title      object
dtype: object

In [5]:
#sources.date.to_datetime()
sources.date =  '20'+sources.date
sources.date =  pd.to_datetime(sources.date, format='%Y-%m-%d')

In [6]:
sources.head()

Unnamed: 0,textID,#words,date,country,website,url,title
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H..."
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...


In [7]:
sources.dtypes

textID              int64
#words              int64
date       datetime64[ns]
country            object
website            object
url                object
title              object
dtype: object

In [8]:
# how many unique country?
print(sources.country.unique())
print(len(sources.country.unique()))

['US' 'IE' 'AU' 'GB' 'CA' 'IN' 'NZ' 'ZA' 'LK' 'SG' 'PH' 'GH' 'NG' 'KE'
 'HK' 'JM' 'PK' 'BD' 'MY' 'TZ']
20


In [9]:
# how many unique website?
print(sources.website.unique())
print(len(sources.website.unique()))

['Kotaku' 'Michigan Radio' 'New York Daily News' ...
 'Business Wire (press release)' 'Fox News' 'Firstcoastnews.com']
1104


In [10]:
# There is any NAN value?
sources.isnull().any().any()

False

In [11]:
# Are there duplicate lines?
sources.duplicated().any()

False

In [12]:
# How many articles per country?
articles_per_country = sources.groupby(by=['country'])['textID'].count()
print(articles_per_country)

country
AU    147
BD     21
CA    356
GB    402
GH     61
HK      6
IE    278
IN    388
JM     15
KE     33
LK     16
MY     73
NG    143
NZ    126
PH    118
PK     80
SG     98
TZ      6
US    396
ZA    197
Name: textID, dtype: int64


In [13]:
# How many website per country?   (count distinct websites)
websites_per_country = sources.groupby(by=['country'])['website'].nunique()
print(websites_per_country)

country
AU     73
BD      6
CA    144
GB    157
GH     10
HK      4
IE     73
IN    106
JM      3
KE     12
LK     10
MY     20
NG     39
NZ     33
PH     27
PK     25
SG     32
TZ      2
US    283
ZA     61
Name: website, dtype: int64


In [14]:
# How many total words per country?
words_per_country = sources.groupby(by=['country'])['#words'].sum()
print(words_per_country)

country
AU     90901
BD      8178
CA    253240
GB    250460
GH     24053
HK      2149
IE    141039
IN    195676
JM      9705
KE     13199
LK     16117
MY     35644
NG     68897
NZ     66178
PH     57194
PK     38553
SG     43241
TZ      2634
US    302945
ZA     95084
Name: #words, dtype: int64


In [15]:
# How many articles per website?
articles_per_website = sources.groupby(by=['website'])['textID'].count()
articles_per_website.sort_values(ascending=False, inplace=True)
print(articles_per_website.head(25))

website
Times of India          95
Telegraph.co.uk         53
Independent Online      49
Daily Mail              46
Irish Independent       41
Stuff.co.nz             39
Irish Times             32
BBC News                31
Inquirer.net            29
The Hindu               28
The Guardian            27
Irish Examiner          27
GhanaWeb                26
Toronto Star            25
Goal.com                25
Globe and Mail          23
The Independent         21
News24                  20
ABC Online              19
The Nation Newspaper    19
Otago Daily Times       18
Vanguard                17
InterAksyon             17
CBC.ca                  16
The42                   16
Name: textID, dtype: int64


In [16]:
# Is URLs unique?
sources.url.is_unique

True

### Try to find MAIN TOPICS

In [17]:
# Try to find topic from sources
sources.url.head(10)

0    http://kotaku.com/5973495/author-of-the-warrio...
1    http://michiganradio.org/post/thats-what-they-...
2    http://www.nydailynews.com/life-style/eats/bes...
3    http://www.oregonlive.com/performance/index.ss...
4    http://arstechnica.com/gadgets/2013/01/ask-ars...
5    http://worldnews.nbcnews.com/_news/2013/01/11/...
6    http://www.independent.ie/sport/other-sports/o...
7    http://entertainment.ie/celebrity-gossip/showb...
8    http://www.independent.ie/irish-news/courts/de...
9    http://www.npr.org/2013/01/22/170007521/rape-a...
Name: url, dtype: object

In [18]:
def getWordsFromURL(url):
    return re.compile(r'[\:/?=\-&]+',re.UNICODE).split(url)

In [19]:
sources['words'] = sources.url.apply(lambda x: getWordsFromURL(x))
sources

Unnamed: 0,textID,#words,date,country,website,url,title,words
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","[http, kotaku.com, 5973495, author, of, the, w..."
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"[http, michiganradio.org, post, thats, what, t..."
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,"[http, www.nydailynews.com, life, style, eats,..."
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,"[http, www.oregonlive.com, performance, index...."
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"[http, arstechnica.com, gadgets, 2013, 01, ask..."
5,21243,690,2013-01-11,US,NBCNews.com,http://worldnews.nbcnews.com/_news/2013/01/11/...,Accused associate of 'Lord of War' arms dealer...,"[http, worldnews.nbcnews.com, _news, 2013, 01,..."
6,31240,701,2013-01-16,IE,Irish Independent,http://www.independent.ie/sport/other-sports/o...,Olympic star Conlon aims for revenge,"[http, www.independent.ie, sport, other, sport..."
7,31241,336,2013-01-16,IE,Entertainment.ie,http://entertainment.ie/celebrity-gossip/showb...,Shakira launches online baby shower,"[http, entertainment.ie, celebrity, gossip, sh..."
8,31242,262,2013-01-16,IE,Irish Independent,http://www.independent.ie/irish-news/courts/de...,Declan Ganley forced to pay _35000 expenses af...,"[http, www.independent.ie, irish, news, courts..."
9,41240,764,2013-01-22,US,NPR,http://www.npr.org/2013/01/22/170007521/rape-a...,Rape A 'Significant And Disturbing' Feature Of...,"[http, www.npr.org, 2013, 01, 22, 170007521, r..."


In [20]:
unnecessary_words=['http','https','article','articleshow','new','news']
def removeStopAndUnnecessaryWord(strlist):
    res=[]
    for word in strlist:
        if (word not in (stop)) and (word.isdigit()==False) and (word not in unnecessary_words) and ('www' not in word):
            res.append(word)
    return res

In [21]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
sources['words_without_stopwords'] = sources['words'].apply(lambda x: removeStopAndUnnecessaryWord(x))
display(sources)

Unnamed: 0,textID,#words,date,country,website,url,title,words,words_without_stopwords
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","[http, kotaku.com, 5973495, author, of, the, w...","[kotaku.com, author, warriors, cult, film, ada..."
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"[http, michiganradio.org, post, thats, what, t...","[michiganradio.org, post, thats, say, dialect,..."
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,"[http, www.nydailynews.com, life, style, eats,...","[life, style, eats, best, york, croissant, 1.1..."
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,"[http, www.oregonlive.com, performance, index....","[performance, index.ssf, reflecting_on_a_quart..."
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"[http, arstechnica.com, gadgets, 2013, 01, ask...","[arstechnica.com, gadgets, ask, ars, facebook,..."
5,21243,690,2013-01-11,US,NBCNews.com,http://worldnews.nbcnews.com/_news/2013/01/11/...,Accused associate of 'Lord of War' arms dealer...,"[http, worldnews.nbcnews.com, _news, 2013, 01,...","[worldnews.nbcnews.com, _news, accused, associ..."
6,31240,701,2013-01-16,IE,Irish Independent,http://www.independent.ie/sport/other-sports/o...,Olympic star Conlon aims for revenge,"[http, www.independent.ie, sport, other, sport...","[sport, sports, olympic, star, conlon, aims, r..."
7,31241,336,2013-01-16,IE,Entertainment.ie,http://entertainment.ie/celebrity-gossip/showb...,Shakira launches online baby shower,"[http, entertainment.ie, celebrity, gossip, sh...","[entertainment.ie, celebrity, gossip, showbiz,..."
8,31242,262,2013-01-16,IE,Irish Independent,http://www.independent.ie/irish-news/courts/de...,Declan Ganley forced to pay _35000 expenses af...,"[http, www.independent.ie, irish, news, courts...","[irish, courts, declan, ganley, forced, pay, e..."
9,41240,764,2013-01-22,US,NPR,http://www.npr.org/2013/01/22/170007521/rape-a...,Rape A 'Significant And Disturbing' Feature Of...,"[http, www.npr.org, 2013, 01, 22, 170007521, r...","[rape, significant, disturbing, feature, syria..."


In [22]:
Counter(x for xs in sources.words_without_stopwords for x in set(xs)).most_common(50)

[('', 810),
 ('business', 140),
 ('world', 138),
 ('sport', 97),
 ('timesofindia.indiatimes.com', 95),
 ('city', 93),
 ('story', 87),
 ('life', 69),
 ('india', 65),
 ('entertainment', 61),
 ('politics', 57),
 ('says', 56),
 ('local', 55),
 ('sports', 52),
 ('us', 52),
 ('national', 51),
 ('articles', 48),
 ('first', 46),
 ('uk', 42),
 ('day', 41),
 ('police', 41),
 ('en', 41),
 ('football', 41),
 ('man', 40),
 ('home', 40),
 ('report', 38),
 ('year', 36),
 ('News', 35),
 ('content', 34),
 ('opinion', 34),
 ('stories', 34),
 ('canada', 33),
 ('health', 32),
 ('tv', 31),
 ('video', 27),
 ('one', 27),
 ('singapore', 27),
 ('top', 27),
 ('features', 27),
 ('south', 26),
 ('id', 26),
 ('two', 26),
 ('music', 26),
 ('GhanaHomePage', 26),
 ('game', 25),
 ('best', 25),
 ('africa', 24),
 ('back', 24),
 ('nation', 24),
 ('ireland', 24)]

We can see some popular topics in here such as:
- business
- world
- sport / sports
- entertainment
- politics
- national / local

It might be a way to use these once and try to categorize the articles by topic accordingly if a given selected word is in URL.

In [23]:
topics = ['business','world', 'sport','sports','entertainment',
          'politics','national', 'local','tech','international',
          'weather','health','economy','economics']
def urlContainsTopic(url):
    for t in topics:
        if(t in url):
            return t
    return 'NoTopic'

In [24]:
sources['topic'] = sources.url.apply(lambda x: urlContainsTopic(x))
display(sources)

Unnamed: 0,textID,#words,date,country,website,url,title,words,words_without_stopwords,topic
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","[http, kotaku.com, 5973495, author, of, the, w...","[kotaku.com, author, warriors, cult, film, ada...",NoTopic
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"[http, michiganradio.org, post, thats, what, t...","[michiganradio.org, post, thats, say, dialect,...",NoTopic
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,"[http, www.nydailynews.com, life, style, eats,...","[life, style, eats, best, york, croissant, 1.1...",NoTopic
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,"[http, www.oregonlive.com, performance, index....","[performance, index.ssf, reflecting_on_a_quart...",NoTopic
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"[http, arstechnica.com, gadgets, 2013, 01, ask...","[arstechnica.com, gadgets, ask, ars, facebook,...",tech
5,21243,690,2013-01-11,US,NBCNews.com,http://worldnews.nbcnews.com/_news/2013/01/11/...,Accused associate of 'Lord of War' arms dealer...,"[http, worldnews.nbcnews.com, _news, 2013, 01,...","[worldnews.nbcnews.com, _news, accused, associ...",world
6,31240,701,2013-01-16,IE,Irish Independent,http://www.independent.ie/sport/other-sports/o...,Olympic star Conlon aims for revenge,"[http, www.independent.ie, sport, other, sport...","[sport, sports, olympic, star, conlon, aims, r...",sport
7,31241,336,2013-01-16,IE,Entertainment.ie,http://entertainment.ie/celebrity-gossip/showb...,Shakira launches online baby shower,"[http, entertainment.ie, celebrity, gossip, sh...","[entertainment.ie, celebrity, gossip, showbiz,...",entertainment
8,31242,262,2013-01-16,IE,Irish Independent,http://www.independent.ie/irish-news/courts/de...,Declan Ganley forced to pay _35000 expenses af...,"[http, www.independent.ie, irish, news, courts...","[irish, courts, declan, ganley, forced, pay, e...",NoTopic
9,41240,764,2013-01-22,US,NPR,http://www.npr.org/2013/01/22/170007521/rape-a...,Rape A 'Significant And Disturbing' Feature Of...,"[http, www.npr.org, 2013, 01, 22, 170007521, r...","[rape, significant, disturbing, feature, syria...",NoTopic


In [25]:
# Assigned Topics
sources.topic.value_counts()

NoTopic          2045
business          202
sport             179
world             160
national           72
entertainment      72
local              62
politics           58
tech               56
health             43
economy             5
economics           3
weather             3
Name: topic, dtype: int64

In [26]:
sources.shape

(2960, 10)

By using this approach we can only assign around 1/3rd of the articles. 

# WordLemPos Data 

In [27]:
import csv

In [28]:
wordLemPos_big = pd.read_table(data_folder+'16-10-us.txt', quoting=csv.QUOTE_NONE, encoding = "ISO-8859-1",header=None)

In [29]:
wordLemPos = wordLemPos_big.copy()

In [30]:
numRows = len(wordLemPos)

In [31]:
display(wordLemPos.shape)
wordLemPos.head(20)

(17564427, 5)

Unnamed: 0,0,1,2,3,4
0,14637197,4739839025,@@14637197,,fo
1,14637197,4739839026,<p>,,
2,14637197,4739839027,NEW,new,np1
3,14637197,4739839028,YORK,york,np1
4,14637197,4739839029,(,,(
5,14637197,4739839030,AP,ap,np1
6,14637197,4739839031,),,)
7,14637197,4739839032,--,,jj_nn1
8,14637197,4739839033,Donald,donald,np1
9,14637197,4739839034,Trump,trump,nn1


In [32]:
wordLemPos.rename(columns={0:'textID',1:'ID(seq)',2:'word',3:'lemma',4:'PoS'},inplace=True)
wordLemPos = wordLemPos[['textID','lemma']]
# Drop NA values we are not interested in null lemma's since 
# they are not words but special characters and numbers
wordLemPos.dropna(inplace=True)
wordLemPos.shape

(14243729, 2)

In the WordLemPos data for each news article words are already preprocessed and lemmatized and stemmed. What we need to do first is to get rid off the stop words. And t

In [33]:
wordLemPos = wordLemPos[:numRows]
display(wordLemPos.shape)
display(wordLemPos.head())

(14243729, 2)

Unnamed: 0,textID,lemma
2,14637197,new
3,14637197,york
5,14637197,ap
8,14637197,donald
9,14637197,trump


In [45]:
display(wordLemPos)

Unnamed: 0,textID,lemma
3,14637197,york
8,14637197,donald
9,14637197,trump
11,14637197,five-day
12,14637197,feud
16,14637197,beauty
17,14637197,queen
21,14637197,late
22,14637197,example
25,14637197,insistence


ADJ    adjective    new, good, high, special, big, local
ADP    adposition    on, of, at, with, by, into, under
ADV    adverb    really, already, still, early, now
CONJ    conjunction    and, or, but, if, while, although
DET    determiner, article    the, a, some, most, every, no, which
NOUN    noun    year, home, costs, time, Africa
NUM    numeral    twenty-four, fourth, 1991, 14:24
PRT    particle    at, on, out, over per, that, up, with
PRON    pronoun    he, their, her, its, my, I, us
VERB    verb    is, say, told, given, playing, would

In [47]:
unnecessary_words=['new','good','high','big','with','into','under',
                  'really','already','still','early','while','although','most','every','which',
                  'year','like','time','that','given','would']

In [48]:
# Find stop words, replace with nullin lemma
wordLemPos['lemma'] = wordLemPos.lemma.apply((lambda x: x if x not in (stop) 
                                              and len(x)>3 
                                              and x not in gensim.parsing.preprocessing.STOPWORDS
                                              and x not in unnecessary_words
                                              else None))
# Drop lemma null - which are the stopwords
wordLemPos.dropna(inplace=True)

In [49]:
wordLemPos.shape

(6135938, 2)

In [50]:
wordLemPos.head(10)

Unnamed: 0,textID,lemma
3,14637197,york
8,14637197,donald
9,14637197,trump
11,14637197,five-day
12,14637197,feud
16,14637197,beauty
17,14637197,queen
21,14637197,late
22,14637197,example
25,14637197,insistence


In [51]:
unique_textID = wordLemPos.textID.unique()
docs =[]
for text_id in unique_textID:
    doc = wordLemPos[wordLemPos.textID==text_id]['lemma'].tolist()
    docs.append(doc)

In [52]:
len(docs)

26028

In [53]:
# bag of words on the dataset
dictionary = gensim.corpora.Dictionary(docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 30-day
1 airing
2 article
3 beauty
4 brag
5 brash
6 businessman
7 campaign
8 celebrity
9 cheer
10 combat


In [54]:
# filter out the tokens that are seen less than x documents
docThreshold = 15
dictionary.filter_extremes(no_below=docThreshold, no_above=0.5, keep_n=100000)

In [55]:
# create doc2bow
# For each doc have a dictionary stating how many words and how many times each word appears
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
#bow_corpus[10]

In [56]:
# Check in a document which word appears how many time, print previous results
bow_doc_10 = bow_corpus[10]
for i in range(len(bow_doc_10)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_10[i][0], 
                                               dictionary[bow_doc_10[i][0]],bow_doc_10[i][1]))

Word 11 ("come") appears 3 time.
Word 15 ("cost") appears 2 time.
Word 19 ("example") appears 1 time.
Word 29 ("local") appears 1 time.
Word 32 ("need") appears 1 time.
Word 37 ("people") appears 1 time.
Word 38 ("period") appears 1 time.
Word 39 ("political") appears 1 time.
Word 45 ("service") appears 3 time.
Word 56 ("unfairly") appears 1 time.
Word 68 ("basically") appears 1 time.
Word 78 ("change") appears 1 time.
Word 108 ("gain") appears 1 time.
Word 115 ("happen") appears 1 time.
Word 123 ("instead") appears 1 time.
Word 146 ("opportunity") appears 1 time.
Word 152 ("play") appears 1 time.
Word 188 ("work") appears 2 time.
Word 210 ("heart") appears 1 time.
Word 223 ("project") appears 1 time.
Word 230 ("seat") appears 1 time.
Word 231 ("seek") appears 1 time.
Word 242 ("want") appears 3 time.
Word 259 ("answer") appears 1 time.
Word 275 ("board") appears 11 time.
Word 280 ("business") appears 1 time.
Word 286 ("certain") appears 1 time.
Word 295 ("comment") appears 2 time.
Wor

In [57]:
# TF IDF scores
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.13875160249185722),
 (1, 0.17715031430766318),
 (2, 0.27721028525058744),
 (3, 0.12131747497512263),
 (4, 0.12347796102459174),
 (5, 0.1891044232255576),
 (6, 0.13041958050997313),
 (7, 0.062166725248688805),
 (8, 0.11560367908079414),
 (9, 0.12807161883869791),
 (10, 0.11848732068045718),
 (11, 0.025903339805993027),
 (12, 0.2098881712441338),
 (13, 0.07310700483804931),
 (14, 0.04553721662579709),
 (15, 0.0701098956552853),
 (16, 0.06377944504270776),
 (17, 0.08063285966758814),
 (18, 0.26822299077423606),
 (19, 0.07583102922564613),
 (20, 0.15614468491195388),
 (21, 0.20544178170293612),
 (22, 0.10073535252794036),
 (23, 0.12570775136353204),
 (24, 0.16872380372463114),
 (25, 0.038929328098851024),
 (26, 0.05031869614151054),
 (27, 0.17819931852133633),
 (28, 0.050998930179784505),
 (29, 0.06073026832316018),
 (30, 0.06649222693253704),
 (31, 0.05082413449427594),
 (32, 0.1494953527134458),
 (33, 0.04375062855431459),
 (34, 0.08233282954234428),
 (35, 0.05537289918246951),
 (

In [58]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=4, workers=2)

In [59]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.006*"work" + 0.005*"know" + 0.004*"world" + 0.004*"look" + 0.004*"come" + 0.004*"people" + 0.004*"thing" + 0.004*"want" + 0.003*"love" + 0.003*"watch"
Topic: 1 
Words: 0.019*"company" + 0.012*"share" + 0.010*"million" + 0.009*"market" + 0.009*"price" + 0.008*"stock" + 0.008*"business" + 0.007*"percent" + 0.007*"quarter" + 0.006*"rate"
Topic: 2 
Words: 0.022*"game" + 0.013*"play" + 0.011*"team" + 0.010*"season" + 0.006*"come" + 0.006*"yard" + 0.006*"week" + 0.006*"player" + 0.005*"think" + 0.005*"series"
Topic: 3 
Words: 0.031*"trump" + 0.020*"clinton" + 0.012*"election" + 0.010*"campaign" + 0.009*"republican" + 0.009*"state" + 0.008*"voter" + 0.007*"vote" + 0.007*"donald" + 0.007*"woman"
Topic: 4 
Words: 0.010*"study" + 0.007*"patient" + 0.006*"research" + 0.005*"cancer" + 0.005*"medical" + 0.005*"researcher" + 0.005*"university" + 0.005*"health" + 0.004*"disease" + 0.004*"people"
Topic: 5 
Words: 0.011*"comment" + 0.007*"information" + 0.006*"service" + 0.006*"depar

In [None]:
# conjunction 

### Questions to answer from NOW Corpus Data    

- What are the main topics of the published news? (tech, politics, sports, etc.)
- What are the distributions of these topics over country and time?
- Is there a dominant tone in the articles based on topic/country/time?
- What are some mostly used words in the articles?

## TO DO XXXXXX
- write the first query results in a file
- turn this notebook into script, where we can run on cluster with big data