# Source data exploration

In [1]:
#imports
import pandas as pd
import numpy as np
from collections import Counter
import re

# Import stopwords with nltk.
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/gorkem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Read the data
data_folder = 'sample_data/'
sources = pd.read_table(data_folder+'now-samples-sources.txt',encoding = "ISO-8859-1", skiprows =[0,1], header=None)
                    # names=['textID', '#words','date','country','website','url','title'])
sources.dropna(axis=1, inplace=True)
sources.rename(columns={0:'textID', 1:'#words',2:'date',3:'country',4:'website',5:'url',6:'title'}, inplace=True)
sources.head()

Unnamed: 0,textID,#words,date,country,website,url,title
0,11241,397,13-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H..."
1,11242,757,13-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...
2,11243,755,13-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant
3,11244,1677,13-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...
4,21242,794,13-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...


In [3]:
sources.shape

(2960, 7)

In [4]:
sources.dtypes

textID      int64
#words      int64
date       object
country    object
website    object
url        object
title      object
dtype: object

In [5]:
#sources.date.to_datetime()
sources.date =  '20'+sources.date
sources.date =  pd.to_datetime(sources.date, format='%Y-%m-%d')

In [6]:
sources.head()

Unnamed: 0,textID,#words,date,country,website,url,title
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H..."
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...


In [7]:
sources.dtypes

textID              int64
#words              int64
date       datetime64[ns]
country            object
website            object
url                object
title              object
dtype: object

In [8]:
# how many unique country?
print(sources.country.unique())
print(len(sources.country.unique()))

['US' 'IE' 'AU' 'GB' 'CA' 'IN' 'NZ' 'ZA' 'LK' 'SG' 'PH' 'GH' 'NG' 'KE'
 'HK' 'JM' 'PK' 'BD' 'MY' 'TZ']
20


In [9]:
# how many unique website?
print(sources.website.unique())
print(len(sources.website.unique()))

['Kotaku' 'Michigan Radio' 'New York Daily News' ...
 'Business Wire (press release)' 'Fox News' 'Firstcoastnews.com']
1104


In [10]:
# There is any NAN value?
sources.isnull().any().any()

False

In [11]:
# Are there duplicate lines?
sources.duplicated().any()

False

In [12]:
# How many articles per country?
articles_per_country = sources.groupby(by=['country'])['textID'].count()
print(articles_per_country)

country
AU    147
BD     21
CA    356
GB    402
GH     61
HK      6
IE    278
IN    388
JM     15
KE     33
LK     16
MY     73
NG    143
NZ    126
PH    118
PK     80
SG     98
TZ      6
US    396
ZA    197
Name: textID, dtype: int64


In [13]:
# How many website per country?   (count distinct websites)
websites_per_country = sources.groupby(by=['country'])['website'].nunique()
print(websites_per_country)

country
AU     73
BD      6
CA    144
GB    157
GH     10
HK      4
IE     73
IN    106
JM      3
KE     12
LK     10
MY     20
NG     39
NZ     33
PH     27
PK     25
SG     32
TZ      2
US    283
ZA     61
Name: website, dtype: int64


In [14]:
# How many total words per country?
words_per_country = sources.groupby(by=['country'])['#words'].sum()
print(words_per_country)

country
AU     90901
BD      8178
CA    253240
GB    250460
GH     24053
HK      2149
IE    141039
IN    195676
JM      9705
KE     13199
LK     16117
MY     35644
NG     68897
NZ     66178
PH     57194
PK     38553
SG     43241
TZ      2634
US    302945
ZA     95084
Name: #words, dtype: int64


In [15]:
# How many articles per website?
articles_per_website = sources.groupby(by=['website'])['textID'].count()
articles_per_website.sort_values(ascending=False, inplace=True)
print(articles_per_website.head(25))

website
Times of India          95
Telegraph.co.uk         53
Independent Online      49
Daily Mail              46
Irish Independent       41
Stuff.co.nz             39
Irish Times             32
BBC News                31
Inquirer.net            29
The Hindu               28
The Guardian            27
Irish Examiner          27
GhanaWeb                26
Toronto Star            25
Goal.com                25
Globe and Mail          23
The Independent         21
News24                  20
ABC Online              19
The Nation Newspaper    19
Otago Daily Times       18
Vanguard                17
InterAksyon             17
CBC.ca                  16
The42                   16
Name: textID, dtype: int64


In [16]:
# Is URLs unique?
sources.url.is_unique

True

### Try to find MAIN TOPICS

In [17]:
# Try to find topic from sources
sources.url.head(10)

0    http://kotaku.com/5973495/author-of-the-warrio...
1    http://michiganradio.org/post/thats-what-they-...
2    http://www.nydailynews.com/life-style/eats/bes...
3    http://www.oregonlive.com/performance/index.ss...
4    http://arstechnica.com/gadgets/2013/01/ask-ars...
5    http://worldnews.nbcnews.com/_news/2013/01/11/...
6    http://www.independent.ie/sport/other-sports/o...
7    http://entertainment.ie/celebrity-gossip/showb...
8    http://www.independent.ie/irish-news/courts/de...
9    http://www.npr.org/2013/01/22/170007521/rape-a...
Name: url, dtype: object

In [18]:
def getWordsFromURL(url):
    return re.compile(r'[\:/?=\-&]+',re.UNICODE).split(url)

In [19]:
sources['words'] = sources.url.apply(lambda x: getWordsFromURL(x))
sources

Unnamed: 0,textID,#words,date,country,website,url,title,words
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","[http, kotaku.com, 5973495, author, of, the, w..."
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"[http, michiganradio.org, post, thats, what, t..."
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,"[http, www.nydailynews.com, life, style, eats,..."
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,"[http, www.oregonlive.com, performance, index...."
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"[http, arstechnica.com, gadgets, 2013, 01, ask..."
5,21243,690,2013-01-11,US,NBCNews.com,http://worldnews.nbcnews.com/_news/2013/01/11/...,Accused associate of 'Lord of War' arms dealer...,"[http, worldnews.nbcnews.com, _news, 2013, 01,..."
6,31240,701,2013-01-16,IE,Irish Independent,http://www.independent.ie/sport/other-sports/o...,Olympic star Conlon aims for revenge,"[http, www.independent.ie, sport, other, sport..."
7,31241,336,2013-01-16,IE,Entertainment.ie,http://entertainment.ie/celebrity-gossip/showb...,Shakira launches online baby shower,"[http, entertainment.ie, celebrity, gossip, sh..."
8,31242,262,2013-01-16,IE,Irish Independent,http://www.independent.ie/irish-news/courts/de...,Declan Ganley forced to pay _35000 expenses af...,"[http, www.independent.ie, irish, news, courts..."
9,41240,764,2013-01-22,US,NPR,http://www.npr.org/2013/01/22/170007521/rape-a...,Rape A 'Significant And Disturbing' Feature Of...,"[http, www.npr.org, 2013, 01, 22, 170007521, r..."


In [20]:
unnecessary_words=['http','https','article','articleshow','new','news']
def removeStopAndUnnecessaryWord(strlist):
    res=[]
    for word in strlist:
        if (word not in (stop)) and (word.isdigit()==False) and (word not in unnecessary_words) and ('www' not in word):
            res.append(word)
    return res

In [21]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
sources['words_without_stopwords'] = sources['words'].apply(lambda x: removeStopAndUnnecessaryWord(x))
display(sources)

Unnamed: 0,textID,#words,date,country,website,url,title,words,words_without_stopwords
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","[http, kotaku.com, 5973495, author, of, the, w...","[kotaku.com, author, warriors, cult, film, ada..."
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"[http, michiganradio.org, post, thats, what, t...","[michiganradio.org, post, thats, say, dialect,..."
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,"[http, www.nydailynews.com, life, style, eats,...","[life, style, eats, best, york, croissant, 1.1..."
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,"[http, www.oregonlive.com, performance, index....","[performance, index.ssf, reflecting_on_a_quart..."
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"[http, arstechnica.com, gadgets, 2013, 01, ask...","[arstechnica.com, gadgets, ask, ars, facebook,..."
5,21243,690,2013-01-11,US,NBCNews.com,http://worldnews.nbcnews.com/_news/2013/01/11/...,Accused associate of 'Lord of War' arms dealer...,"[http, worldnews.nbcnews.com, _news, 2013, 01,...","[worldnews.nbcnews.com, _news, accused, associ..."
6,31240,701,2013-01-16,IE,Irish Independent,http://www.independent.ie/sport/other-sports/o...,Olympic star Conlon aims for revenge,"[http, www.independent.ie, sport, other, sport...","[sport, sports, olympic, star, conlon, aims, r..."
7,31241,336,2013-01-16,IE,Entertainment.ie,http://entertainment.ie/celebrity-gossip/showb...,Shakira launches online baby shower,"[http, entertainment.ie, celebrity, gossip, sh...","[entertainment.ie, celebrity, gossip, showbiz,..."
8,31242,262,2013-01-16,IE,Irish Independent,http://www.independent.ie/irish-news/courts/de...,Declan Ganley forced to pay _35000 expenses af...,"[http, www.independent.ie, irish, news, courts...","[irish, courts, declan, ganley, forced, pay, e..."
9,41240,764,2013-01-22,US,NPR,http://www.npr.org/2013/01/22/170007521/rape-a...,Rape A 'Significant And Disturbing' Feature Of...,"[http, www.npr.org, 2013, 01, 22, 170007521, r...","[rape, significant, disturbing, feature, syria..."


In [22]:
Counter(x for xs in sources.words_without_stopwords for x in set(xs)).most_common(50)

[('', 810),
 ('business', 140),
 ('world', 138),
 ('sport', 97),
 ('timesofindia.indiatimes.com', 95),
 ('city', 93),
 ('story', 87),
 ('life', 69),
 ('india', 65),
 ('entertainment', 61),
 ('politics', 57),
 ('says', 56),
 ('local', 55),
 ('sports', 52),
 ('us', 52),
 ('national', 51),
 ('articles', 48),
 ('first', 46),
 ('uk', 42),
 ('day', 41),
 ('police', 41),
 ('en', 41),
 ('football', 41),
 ('man', 40),
 ('home', 40),
 ('report', 38),
 ('year', 36),
 ('News', 35),
 ('content', 34),
 ('opinion', 34),
 ('stories', 34),
 ('canada', 33),
 ('health', 32),
 ('tv', 31),
 ('video', 27),
 ('one', 27),
 ('singapore', 27),
 ('top', 27),
 ('features', 27),
 ('south', 26),
 ('id', 26),
 ('two', 26),
 ('music', 26),
 ('GhanaHomePage', 26),
 ('game', 25),
 ('best', 25),
 ('africa', 24),
 ('back', 24),
 ('nation', 24),
 ('ireland', 24)]

We can see some popular topics in here such as:
- business
- world
- sport / sports
- entertainment
- politics
- national / local

It might be a way to use these once and try to categorize the articles by topic accordingly if a given selected word is in URL.

In [23]:
topics = ['business','world', 'sport','sports','entertainment',
          'politics','national', 'local','tech','international',
          'weather','health','economy','economics']
def urlContainsTopic(url):
    for t in topics:
        if(t in url):
            return t
    return 'NoTopic'

In [24]:
sources['topic'] = sources.url.apply(lambda x: urlContainsTopic(x))
display(sources)

Unnamed: 0,textID,#words,date,country,website,url,title,words,words_without_stopwords,topic
0,11241,397,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","[http, kotaku.com, 5973495, author, of, the, w...","[kotaku.com, author, warriors, cult, film, ada...",NoTopic
1,11242,757,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"[http, michiganradio.org, post, thats, what, t...","[michiganradio.org, post, thats, say, dialect,...",NoTopic
2,11243,755,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,"[http, www.nydailynews.com, life, style, eats,...","[life, style, eats, best, york, croissant, 1.1...",NoTopic
3,11244,1677,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,"[http, www.oregonlive.com, performance, index....","[performance, index.ssf, reflecting_on_a_quart...",NoTopic
4,21242,794,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"[http, arstechnica.com, gadgets, 2013, 01, ask...","[arstechnica.com, gadgets, ask, ars, facebook,...",tech
5,21243,690,2013-01-11,US,NBCNews.com,http://worldnews.nbcnews.com/_news/2013/01/11/...,Accused associate of 'Lord of War' arms dealer...,"[http, worldnews.nbcnews.com, _news, 2013, 01,...","[worldnews.nbcnews.com, _news, accused, associ...",world
6,31240,701,2013-01-16,IE,Irish Independent,http://www.independent.ie/sport/other-sports/o...,Olympic star Conlon aims for revenge,"[http, www.independent.ie, sport, other, sport...","[sport, sports, olympic, star, conlon, aims, r...",sport
7,31241,336,2013-01-16,IE,Entertainment.ie,http://entertainment.ie/celebrity-gossip/showb...,Shakira launches online baby shower,"[http, entertainment.ie, celebrity, gossip, sh...","[entertainment.ie, celebrity, gossip, showbiz,...",entertainment
8,31242,262,2013-01-16,IE,Irish Independent,http://www.independent.ie/irish-news/courts/de...,Declan Ganley forced to pay _35000 expenses af...,"[http, www.independent.ie, irish, news, courts...","[irish, courts, declan, ganley, forced, pay, e...",NoTopic
9,41240,764,2013-01-22,US,NPR,http://www.npr.org/2013/01/22/170007521/rape-a...,Rape A 'Significant And Disturbing' Feature Of...,"[http, www.npr.org, 2013, 01, 22, 170007521, r...","[rape, significant, disturbing, feature, syria...",NoTopic


In [25]:
# Assigned Topics
sources.topic.value_counts()

NoTopic          2045
business          202
sport             179
world             160
national           72
entertainment      72
local              62
politics           58
tech               56
health             43
economy             5
weather             3
economics           3
Name: topic, dtype: int64

In [26]:
sources.shape

(2960, 10)

By using this approach we can only assign around 1/3rd of the articles. 

### Questions to answer from NOW Corpus Data    

- What are the main topics of the published news? (tech, politics, sports, etc.)
- What are the distributions of these topics over country and time?
- Is there a dominant tone in the articles based on topic/country/time?
- What are some mostly used words in the articles?

## TO DO XXXXXX
- write the first query results in a file
- turn this notebook into script, where we can run on cluster with big data