# Source data exploration

In [1]:
#imports
import pandas as pd
import numpy as np
from collections import Counter
import re
import csv

import gensim
from gensim.parsing.preprocessing import STOPWORDS

import warnings
warnings.filterwarnings('ignore')
# Import stopwords with nltk.
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/gorkem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Read the data
data_folder = 'sample_data/'
source1 = pd.read_table(data_folder+'now_sources_pt1.txt',encoding = "ISO-8859-1", header=None)
source2 = pd.read_table(data_folder+'now_sources_pt2.txt',encoding = "ISO-8859-1", header=None)
#sources.dropna(axis=1, inplace=True)
source1.rename(columns={0:'textID', 1:'#words',2:'date',3:'country',4:'website',5:'url',6:'title'}, inplace=True)
source1.head()
source2.rename(columns={0:'textID', 1:'#words',2:'date',3:'country',4:'website',5:'url',6:'title'}, inplace=True)
source2.head()

Unnamed: 0,textID,#words,date,country,website,url,title
0,2930853,194,15-01-01,US,Consequence of Sound,http://consequenceofsound.net/2015/01/100000-c...,100000 copies of The Interview will be dropped...
1,2930854,266,15-01-01,US,Los Angeles Times,http://www.latimes.com/entertainment/movies/la...,'The Taking of Tiger Mountain' gets blockbuste...
2,2930855,701,15-01-01,US,TIME,http://time.com/3651697/afghanistan-war-cost/,The True Cost of the Afghanistan War May Surpr...
3,2930856,1963,15-01-01,US,The Week Magazine,http://theweek.com/articles/441310/confessions...,Confessions of a former TSA officer
4,2930859,263,15-01-01,US,Russia Beyond the Headlines,http://rbth.com/arts/2015/01/01/year_of_litera...,"Pushkin, Gogol and Akhmatova to be symbols of ..."


In [3]:
frames = [source1, source2]
sources = pd.concat(frames)

In [4]:
sources.shape

(6132175, 7)

In [5]:
sources.dtypes

textID      int64
#words      int64
date       object
country    object
website    object
url        object
title      object
dtype: object

In [6]:
#Turn the date to date form - first we need to add the 20 in from of years since years were given 2 digitis
sources.date =  '20'+sources.date
sources.date =  pd.to_datetime(sources.date, format='%Y-%m-%d')

In [7]:
# check the dates are correctly changed and the date type
display(sources.head())
display(sources.dtypes)

Unnamed: 0,textID,#words,date,country,website,url,title
0,1334669,334,2010-01-01,US,The Next Web,http://thenextweb.com/2010/01/01/avatar-takes-...,Believe it or not: Avatar takes 1 petabyte of ...
1,1334671,493,2010-01-01,US,People Magazine,"http://www.people.com/people/article/0,,203339...",INSIDE STORY: The Making of Beyonc's 'Single ...
2,1334672,1255,2010-01-01,US,San Francisco Chronicle,http://www.sfgate.com/bayarea/article/Biblical...,"Biblical scholar's date for rapture: May 21, 2011"
3,1334673,695,2010-01-01,US,CNN,http://www.cnn.com/2010/HEALTH/01/01/multi.vit...,What you need to know about multivitamins
4,1334674,724,2010-01-01,US,MedPage Today,http://www.medpagetoday.com/Psychiatry/sleepdi...,Lack of Sleep Linked to Depression in Adolescents


textID              int64
#words              int64
date       datetime64[ns]
country            object
website            object
url                object
title              object
dtype: object

### General Data Exploration for Source Data

We tried to answer basic questions about the now orpus data first. 
- How many unique countries we have and which ones?
- How many unique websites?
- How many articles per country and website?
- How many websites per country?   
Answering these type of question gives us a better understand of the data and preliminary stats about the countries and website resources.

In [8]:
# how many unique country?
print(sources.country.unique())
print(len(sources.country.unique()))

['US' 'CA' 'GB' 'IE' 'AU' 'NZ' 'IN' 'LK' 'PK' 'BD' 'MY' 'SG' 'PH' 'HK'
 'ZA' 'NG' 'GH' 'KE' 'TZ' 'JM']
20


In [9]:
# how many unique website?
print(sources.website.unique())
print(len(sources.website.unique()))

['The Next Web' 'People Magazine' 'San Francisco Chronicle' ...
 'Reggaeville.com' 'Loop Jamaica (press release) (registration) (blog)'
 'Television Jamaica']
20733


In [10]:
# There is any NAN value?
sources.isnull().any().any()

True

In [11]:
# Are there duplicate lines?
sources.duplicated().any()

False

In [12]:
# How many articles per country?
articles_per_country = sources.groupby(by=['country'])['textID'].count()
print(articles_per_country)

country
AU    441036
BD     36680
CA    788162
GB    823458
GH    103062
HK     25780
IE    469298
IN    729178
JM     41995
KE    104030
LK     37020
MY    150313
NG    225097
NZ    250814
PH    233322
PK    205199
SG    208031
TZ     15848
US    857301
ZA    386551
Name: textID, dtype: int64


In [13]:
# How many website per country?   (count distinct websites)
websites_per_country = sources.groupby(by=['country'])['website'].nunique()
print(websites_per_country)

country
AU     1161
BD       49
CA     1587
GB     3797
GH       91
HK      148
IE      312
IN     1057
JM       22
KE      115
LK      125
MY      174
NG      175
NZ      362
PH      368
PK      269
SG      272
TZ       26
US    11221
ZA      428
Name: website, dtype: int64


In [14]:
# How many total words per country?
words_per_country = sources.groupby(by=['country'])['#words'].sum()
print(words_per_country)

country
AU    280223620
BD     17899964
CA    519548548
GB    537543670
GH     43738013
HK     14120315
IE    241347866
IN    383914249
JM     24693984
KE     43621653
LK     20122787
MY     69214059
NG    120554068
NZ    123016733
PH    117067124
PK     94028598
SG     92041860
TZ      7941870
US    678812809
ZA    208964697
Name: #words, dtype: int64


In [15]:
# How many articles per website?
articles_per_website = sources.groupby(by=['website'])['textID'].count()
articles_per_website.sort_values(ascending=False, inplace=True)
print(articles_per_website.head(25))

website
Times of India         163277
Telegraph.co.uk        112186
Daily Mail             110912
Independent Online      79541
Irish Independent       68906
Stuff.co.nz             65847
ABC Online              61355
Inquirer.net            61175
Irish Times             59725
The Guardian            57486
The Hindu               57269
Toronto Star            48988
CBC.ca                  47798
The Independent         45166
Digital Journal         40965
BBC News                38611
GhanaWeb                38440
Goal.com                35595
The Straits Times       35436
National Post           32315
News24                  32293
Globe and Mail          31750
Otago Daily Times       28766
The Express Tribune     28486
Irish Examiner          27664
Name: textID, dtype: int64


In [16]:
# Is URLs unique?
sources.url.is_unique

False

### Try to find MAIN TOPICS

Our first approach to find topic were to us URLs since most news website categorize their topics in categories already. We first explore to see how much of the articles we can find their topics by using this strategy first.

In [17]:
# Try to find topic from sources
sources.url.head(4)

0    http://thenextweb.com/2010/01/01/avatar-takes-...
1    http://www.people.com/people/article/0,,203339...
2    http://www.sfgate.com/bayarea/article/Biblical...
3    http://www.cnn.com/2010/HEALTH/01/01/multi.vit...
Name: url, dtype: object

In [18]:
def getWordsFromURL(url):
    return re.compile(r'[\:/?=\-&]+',re.UNICODE).split(url)

In [19]:
# Split the urls and get words
sources['words'] = sources.url.apply(lambda x: getWordsFromURL(x))
sources.head(5)

Unnamed: 0,textID,#words,date,country,website,url,title,words
0,1334669,334,2010-01-01,US,The Next Web,http://thenextweb.com/2010/01/01/avatar-takes-...,Believe it or not: Avatar takes 1 petabyte of ...,"[http, thenextweb.com, 2010, 01, 01, avatar, t..."
1,1334671,493,2010-01-01,US,People Magazine,"http://www.people.com/people/article/0,,203339...",INSIDE STORY: The Making of Beyonc's 'Single ...,"[http, www.people.com, people, article, 0,,203..."
2,1334672,1255,2010-01-01,US,San Francisco Chronicle,http://www.sfgate.com/bayarea/article/Biblical...,"Biblical scholar's date for rapture: May 21, 2011","[http, www.sfgate.com, bayarea, article, Bibli..."
3,1334673,695,2010-01-01,US,CNN,http://www.cnn.com/2010/HEALTH/01/01/multi.vit...,What you need to know about multivitamins,"[http, www.cnn.com, 2010, HEALTH, 01, 01, mult..."
4,1334674,724,2010-01-01,US,MedPage Today,http://www.medpagetoday.com/Psychiatry/sleepdi...,Lack of Sleep Linked to Depression in Adolescents,"[http, www.medpagetoday.com, Psychiatry, sleep..."


In [20]:
# function to remove stop and unnecessary words
unnecessary_words=['http','https','article','articleshow','new','news']
def removeStopAndUnnecessaryWord(strlist):
    res=[]
    for word in strlist:
        if (word not in (stop)) and (word.isdigit()==False) and (word not in unnecessary_words) and ('www' not in word):
            res.append(word)
    return res

After splitting the URL into words, now we want to get rid off the stop words and some most commonlt used unnecessary words that we are not interested in. We exclude them since we will try to find the categories based on most common used words. For removing stop words we use NLTK stopword and gemsim stopword dictionaries.

In [21]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
sources['words_without_stopwords'] = sources['words'].apply(lambda x: removeStopAndUnnecessaryWord(x))
display(sources.head(15))

Unnamed: 0,textID,#words,date,country,website,url,title,words,words_without_stopwords
0,1334669,334,2010-01-01,US,The Next Web,http://thenextweb.com/2010/01/01/avatar-takes-...,Believe it or not: Avatar takes 1 petabyte of ...,"[http, thenextweb.com, 2010, 01, 01, avatar, t...","[thenextweb.com, avatar, takes, petabyte, stor..."
1,1334671,493,2010-01-01,US,People Magazine,"http://www.people.com/people/article/0,,203339...",INSIDE STORY: The Making of Beyonc's 'Single ...,"[http, www.people.com, people, article, 0,,203...","[people, 0,,20333961,00.html]"
2,1334672,1255,2010-01-01,US,San Francisco Chronicle,http://www.sfgate.com/bayarea/article/Biblical...,"Biblical scholar's date for rapture: May 21, 2011","[http, www.sfgate.com, bayarea, article, Bibli...","[bayarea, Biblical, scholar, date, rapture, Ma..."
3,1334673,695,2010-01-01,US,CNN,http://www.cnn.com/2010/HEALTH/01/01/multi.vit...,What you need to know about multivitamins,"[http, www.cnn.com, 2010, HEALTH, 01, 01, mult...","[HEALTH, multi.vitamins.info, ]"
4,1334674,724,2010-01-01,US,MedPage Today,http://www.medpagetoday.com/Psychiatry/sleepdi...,Lack of Sleep Linked to Depression in Adolescents,"[http, www.medpagetoday.com, Psychiatry, sleep...","[Psychiatry, sleepdisorders]"
5,1334676,1418,2010-01-01,US,Cracked.com (satire),http://www.cracked.com/article/18358_5-real-bu...,5 Real Buried Treasures That Can Make You Rich...,"[http, www.cracked.com, article, 18358_5, real...","[18358_5, real, buried, treasures, make, rich,..."
6,1334678,1247,2010-01-01,US,Common Dreams,http://www.commondreams.org/views/2010/01/01/r...,The Real Top Ten Stories of the Past Decade,"[http, www.commondreams.org, views, 2010, 01, ...","[views, real, top, ten, stories, past, decade]"
7,1334679,574,2010-01-01,US,NBC Dallas-Fort Worth,http://www.nbcdfw.com/the-scene/real-estate/Ho...,How Much Bigger Will We Be?,"[http, www.nbcdfw.com, the, scene, real, estat...","[scene, real, estate, How, Much, Bigger, Will,..."
8,1334680,191,2010-01-01,US,NBC Chicago,http://www.nbcchicago.com/news/local/Battle-of...,New Year Baby Born at Stroke of Midnight,"[http, www.nbcchicago.com, news, local, Battle...","[local, Battle, New, Year, Babies, 80464997.html]"
9,1334681,989,2010-01-01,US,San Francisco Chronicle,http://www.sfgate.com/entertainment/article/De...,Decade in review: The 10 best TV shows,"[http, www.sfgate.com, entertainment, article,...","[entertainment, Decade, review, The, best, TV,..."


In [22]:
# Count how many times each word appears and get most common 50 using Counter
Counter(x for xs in sources.words_without_stopwords for x in set(xs)).most_common(50)

[('', 1635704),
 ('business', 239774),
 ('world', 238055),
 ('story', 221638),
 ('sport', 204440),
 ('timesofindia.indiatimes.com', 163107),
 ('entertainment', 156793),
 ('city', 148035),
 ('life', 127476),
 ('sports', 126695),
 ('india', 116785),
 ('local', 114367),
 ('national', 104825),
 ('articles', 102389),
 ('says', 100124),
 ('football', 94447),
 ('politics', 93722),
 ('report', 86744),
 ('us', 82856),
 ('health', 77509),
 ('opinion', 76673),
 ('man', 75950),
 ('year', 75664),
 ('canada', 74957),
 ('first', 69590),
 ('review', 69530),
 ('News', 68028),
 ('home', 66721),
 ('top', 65991),
 ('day', 65350),
 ('uk', 63752),
 ('police', 63341),
 ('en', 62358),
 ('one', 61467),
 ('music', 58338),
 ('stories', 57015),
 ('id', 56787),
 ('features', 55515),
 ('south', 53402),
 ('content', 52071),
 ('tv', 51756),
 ('lifestyle', 51550),
 ('crime', 51353),
 ('years', 47813),
 ('best', 47371),
 ('death', 46732),
 ('market', 46017),
 ('time', 45991),
 ('two', 45942),
 ('court', 45650)]

We can see some popular topics in here such as:
- business
- world
- sport / sports
- entertainment
- politics
- national / local

It might be a way to use these once and try to categorize the articles by topic accordingly if a given selected word is in URL.

Next, we try how many URLs will be found with the chosen topics from the above most common ones. Note that we add a few more that we expect to find as well.

In [23]:
topics = ['business','world', 'sport','sports','entertainment',
          'politics','national', 'local','tech','international',
          'weather','health','economy','economics']
def urlContainsTopic(url):
    for t in topics:
        if(t in url):
            return t
    return 'NoTopic'

In [24]:
sources['topic'] = sources.url.apply(lambda x: urlContainsTopic(x))
display(sources)

Unnamed: 0,textID,#words,date,country,website,url,title,words,words_without_stopwords,topic
0,1334669,334,2010-01-01,US,The Next Web,http://thenextweb.com/2010/01/01/avatar-takes-...,Believe it or not: Avatar takes 1 petabyte of ...,"[http, thenextweb.com, 2010, 01, 01, avatar, t...","[thenextweb.com, avatar, takes, petabyte, stor...",NoTopic
1,1334671,493,2010-01-01,US,People Magazine,"http://www.people.com/people/article/0,,203339...",INSIDE STORY: The Making of Beyonc's 'Single ...,"[http, www.people.com, people, article, 0,,203...","[people, 0,,20333961,00.html]",NoTopic
2,1334672,1255,2010-01-01,US,San Francisco Chronicle,http://www.sfgate.com/bayarea/article/Biblical...,"Biblical scholar's date for rapture: May 21, 2011","[http, www.sfgate.com, bayarea, article, Bibli...","[bayarea, Biblical, scholar, date, rapture, Ma...",NoTopic
3,1334673,695,2010-01-01,US,CNN,http://www.cnn.com/2010/HEALTH/01/01/multi.vit...,What you need to know about multivitamins,"[http, www.cnn.com, 2010, HEALTH, 01, 01, mult...","[HEALTH, multi.vitamins.info, ]",NoTopic
4,1334674,724,2010-01-01,US,MedPage Today,http://www.medpagetoday.com/Psychiatry/sleepdi...,Lack of Sleep Linked to Depression in Adolescents,"[http, www.medpagetoday.com, Psychiatry, sleep...","[Psychiatry, sleepdisorders]",NoTopic
5,1334676,1418,2010-01-01,US,Cracked.com (satire),http://www.cracked.com/article/18358_5-real-bu...,5 Real Buried Treasures That Can Make You Rich...,"[http, www.cracked.com, article, 18358_5, real...","[18358_5, real, buried, treasures, make, rich,...",NoTopic
6,1334678,1247,2010-01-01,US,Common Dreams,http://www.commondreams.org/views/2010/01/01/r...,The Real Top Ten Stories of the Past Decade,"[http, www.commondreams.org, views, 2010, 01, ...","[views, real, top, ten, stories, past, decade]",NoTopic
7,1334679,574,2010-01-01,US,NBC Dallas-Fort Worth,http://www.nbcdfw.com/the-scene/real-estate/Ho...,How Much Bigger Will We Be?,"[http, www.nbcdfw.com, the, scene, real, estat...","[scene, real, estate, How, Much, Bigger, Will,...",NoTopic
8,1334680,191,2010-01-01,US,NBC Chicago,http://www.nbcchicago.com/news/local/Battle-of...,New Year Baby Born at Stroke of Midnight,"[http, www.nbcchicago.com, news, local, Battle...","[local, Battle, New, Year, Babies, 80464997.html]",local
9,1334681,989,2010-01-01,US,San Francisco Chronicle,http://www.sfgate.com/entertainment/article/De...,Decade in review: The 10 best TV shows,"[http, www.sfgate.com, entertainment, article,...","[entertainment, Decade, review, The, best, TV,...",entertainment


Lets check the value counts per topic to see how many we found per topic Note that we give NoTopic for the ones which we couldn't assign.

In [25]:
# Assigned Topics
sources.topic.value_counts()

NoTopic          4258961
sport             409074
business          364333
world             304820
entertainment     172893
national          164896
tech              121844
local             117286
politics          100005
health             88119
economy            14895
weather            12477
economics           2572
Name: topic, dtype: int64

In [26]:
sources.shape

(6132175, 10)

In [27]:
# Ratio of none found topics
4258961*100 /sources.shape[0]

69.4526982677435

By using this approach we can only assign around 30% of the articles. This is not good enough. Our next approach then would be to try to do LDA for topic modelling to find topics. What we will to is to use the WordLemPos data for this. We will use the lemma version of the each word used in articles and apply LDA to find the results. Since we will use a different data file the words in the article for LDA approach, we created another notebook for this approach WordLemPos_Topic_Modelling notebook.