# Newspaper output - Email Articles

* Checking the output from all articles included in all newsletter emails
* Creating `source` variable from url to describe newspaper source
* Define article language?
---

In [18]:
# Imports

import os
import re
import json
import pprint
import datetime
import pandas as pd


# Variables

# Shared pretty-printer used for displaying nested structures in later cells.
pp = pprint.PrettyPrinter(indent=4)

# Read data

# The file is parsed line by line: each line is decoded as its own JSON document.
with open('./articles_email.json') as input_file:
    data = [json.loads(raw_line) for raw_line in input_file]

In [13]:
# Build the articles DataFrame from the list of parsed JSON records.
df = pd.DataFrame(data)

In [14]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,authors,keywords,publish_date,summary,text,title,top_image,url
0,[],"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...
1,[],"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...
2,[],"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...
3,[],"[situación, en, bancos, que, por, los, la, se,...",2016-03-01 21:27:34+00:00,"Primero la situación económica de China, despu...",Desde que empezó el año hemos tenido ya 3 fact...,Nota sobre la situación de los bancos,http://www.robust.fondos.gvcgaesco.es/wp-conte...,http://www.robustglobal.com/nota-sobre-la-situ...
4,[],"[search, scheduled, meeting, regularly, fomc, ...",,"Meeting calendars, statements, and minutes (20...","Meeting calendars, statements, and minutes (20...",Meeting calendars and information,,http://www.federalreserve.gov/monetarypolicy/f...


In [15]:
# (number of rows, number of columns) of the articles frame.
df.shape

(3884, 8)

In [16]:
# Pretty-print the column names as a plain Python list.
pp.pprint(df.columns.tolist())

[   u'authors',
    u'keywords',
    u'publish_date',
    u'summary',
    u'text',
    u'title',
    u'top_image',
    u'url']


In [20]:
# Data Wrangling

# Captures the run of word characters that follows "//" (and an optional
# "www.") and is itself followed by a dot.
SOURCE_PATTERN = re.compile(r'(?://www\.|//)(\w+)\.\w')


def extract_source(url):
    """Return the first host label found in `url`, or "" when nothing matches.

    Example: "http://www.federalreserve.gov/..." -> "federalreserve".
    """
    match = SOURCE_PATTERN.search(url)
    return match.group(1) if match else ""


def join_authors(authors):
    """Join a list of author names with " / "; return any other value unchanged.

    Passing non-list values through keeps this cell safe to re-run once the
    column has already been flattened to strings.
    """
    if isinstance(authors, (list, tuple)):
        return ' / '.join(authors)
    return authors


# Column-wise transforms replace the previous row-by-row loop and its bare
# `except:`; a missing match is handled explicitly inside extract_source().
df['source'] = df['url'].apply(extract_source)
df['authors'] = df['authors'].apply(join_authors)

# Flat list holding every keyword of every article (duplicates kept); it feeds
# the keyword frequency analysis further down.
keywords = [keyw for article_keywords in df['keywords'] for keyw in article_keywords]

In [21]:
# Preview the first five rows after the wrangling step.
df.head(n=5)

Unnamed: 0,authors,keywords,publish_date,summary,text,title,top_image,url,source
0,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
1,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
2,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
3,,"[situación, en, bancos, que, por, los, la, se,...",2016-03-01 21:27:34+00:00,"Primero la situación económica de China, despu...",Desde que empezó el año hemos tenido ya 3 fact...,Nota sobre la situación de los bancos,http://www.robust.fondos.gvcgaesco.es/wp-conte...,http://www.robustglobal.com/nota-sobre-la-situ...,robustglobal
4,,"[search, scheduled, meeting, regularly, fomc, ...",,"Meeting calendars, statements, and minutes (20...","Meeting calendars, statements, and minutes (20...",Meeting calendars and information,,http://www.federalreserve.gov/monetarypolicy/f...,federalreserve


## Source
---

In [41]:
# Number of articles per source, most frequent first.
# `ascending` is passed by keyword only: newer pandas releases reject the
# positional `axis` argument that used to precede it.
df_source = (
    df.groupby('source')
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)
df_source.head(10)

Unnamed: 0,source,count
0,expansion,2031
1,cincodias,1015
2,elconfidencial,655
3,bernsteinresearch,72
4,blogs,30
5,federalreserve,13
6,retina,12
7,economia,6
8,bbvagmr,4
9,euribor,3


## Authors
---

In [10]:
# Number of articles per joined author string, most frequent first.
# `ascending` is passed by keyword only: newer pandas releases reject the
# positional `axis` argument that used to precede it.
(
    df.groupby(['authors'])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
    .head(15)
)

Unnamed: 0,authors,count
0,,2184
1,Ángeles Gonzalo Alconada,148
2,Cinco Días,99
3,Juande Portillo,59
4,Ángeles Gonzalo Alconada / Pablo Monge,33
5,Eduardo Segovia / Contacta Al Autor,30
6,Nuria Salobral,23
7,Pablo Martín Simón,20
8,Juande Portillo / Pablo Monge,19
9,Bernardo De Miguel,17


## Keywords
---

In [11]:
# keywords analysis

# One row per collected keyword occurrence, in a single "keyword" column.
df_kw = pd.DataFrame({"keyword": pd.Series(keywords)})

In [12]:
# Ten most frequent keywords.
# `ascending` is passed by keyword only: newer pandas releases reject the
# positional `axis` argument that used to precede it.
(
    df_kw.groupby('keyword')
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
    .head(10)
)

Unnamed: 0,keyword,count
0,la,3703
1,el,3655
2,en,3627
3,que,3595
4,y,3334
5,los,2800
6,del,2453
7,por,1847
8,se,1637
9,las,1594


## Text
---

#### Articles with missing text

In [56]:
# articles without text (#56)

# Summing the boolean mask counts the rows whose text is the empty string.
int((df['text'] == '').sum())

56

In [57]:
# Articles with empty text, counted per source (most frequent first).
# `ascending` is passed by keyword only: newer pandas releases reject the
# positional `axis` argument that used to precede it.
(
    df[df['text'] == '']
    .groupby('source')
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

Unnamed: 0,source,count
0,bernsteinresearch,38
1,elconfidencial,6
2,imf,2
3,federalreserve,2
4,bbvaresearch,2
5,prensa,1
6,nobelprize,1
7,hugin,1
8,grupobancopopular,1
9,cnmv,1


#### Articles with useless text

In [70]:
# rows with an empty title have no text either (#10, already counted in the empty-text query above)

untitled = df['title'] == ""
df[untitled]

Unnamed: 0,authors,keywords,publish_date,summary,text,title,top_image,url,source
250,,[],,,,,,https://www.bbvaresearch.com/wp-content/upload...,bbvaresearch
251,,[],,,,,,https://www.bbvaresearch.com/wp-content/upload...,bbvaresearch
330,,[],,,,,,http://cnmv.es/portal/HR/verDoc.axd?t=3D%7bee5...,cnmv
421,,[],,,,,,https://www.federalreserve.gov/monetarypolicy/...,federalreserve
732,,[],,,,,,http://hugin.info/134323/R/2043980/763188.pdf,hugin
817,,[],,,,,,http://www.grupobancopopular.com/ES/Accionista...,grupobancopopular
829,,[],,,,,,http://www.imf.org/external/pubs/ft/weo/2016/0...,imf
842,,[],,,,,,https://www.nobelprize.org/nobel_prizes/econom...,nobelprize
845,,[],,,,,,http://www.imf.org/external/pubs/ft/weo/2016/0...,imf
901,,[],,,,,,http://www.federalreserve.gov/monetarypolicy/b...,federalreserve


In [75]:
# Most repeated article texts; duplicates reveal placeholder pages.
# `ascending` is passed by keyword only: newer pandas releases reject the
# positional `axis` argument that used to precede it.
(
    df.groupby('text')
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
    .head(10)
)

Unnamed: 0,text,count
0,,56
1,How Can We Help?\n\nIf you'd like to learn mor...,34
2,Consulte las citas más relevantes de la jornad...,4
3,Don't have an account yet?\n\nThe content of t...,4
4,El ministro de Economía aseguró que la resoluc...,3
5,La salida a Bolsa de Unicaja prevista para est...,2
6,"Emilio Saracho, presidente de Banco Popular\n\...",2
7,Tras el sobresalto del miércoles por la consid...,2
8,La EBA aprueba unas guías para calcular las pé...,2
9,Los consistorios de las dos principales capita...,2


In [89]:
# Index label of the first article whose text contains 'How Can We Help'.
idx = df[df['text'].str.contains('How Can We Help')].index[0]
# A single-argument print() with string concatenation produces the same output
# on Python 2 and Python 3 (the bare `print` statement is Python 2 only).
print("\nBernsteinresearch text:\n\n" + df.loc[idx, 'text'] + " \n")


Bernsteinresearch text:

How Can We Help?

If you'd like to learn more about Bernstein's insights and execution or how they can help advance your business, please contact us. 



In [90]:
# a few bernsteinresearch articles cannot be scraped (#34)
# `ascending` is passed by keyword only: newer pandas releases reject the
# positional `axis` argument that used to precede it.
(
    df[df['text'].str.contains('How Can We Help')]
    .groupby('source')
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

Unnamed: 0,source,count
0,bernsteinresearch,34


In [92]:
# Index label of the first article whose title contains 'custom login'.
idx = df[df['title'].str.contains('custom login')].index[0]
# A single-argument print() with string concatenation produces the same output
# on Python 2 and Python 3 (the bare `print` statement is Python 2 only).
print("\nbbvagmr text:\n\n" + df.loc[idx, 'text'] + " \n")


bbvagmr text:

Don't have an account yet?

The content of this website is for the exclusive access of BBVA Corporate & Investment Banking authorized clients.

If you need a username and password or require further information about our services, please contact your Corporate & Investment Banking representative. Alternatively, please use the link below:

Contact us now 



In [94]:
# bbvagmr cannot be scraped since it requests login credentials (#4)

is_login_page = df['title'].str.contains('custom login')
df[is_login_page]

Unnamed: 0,authors,keywords,publish_date,summary,text,title,top_image,url,source
0,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
1,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
2,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
107,,"[yetthe, website, services, login, custom, rep...",,Don't have an account yet?\nThe content of thi...,Don't have an account yet?\n\nThe content of t...,» custom login,https://www.bbvagmr.com/wp-content/themes/bbva...,http://www.bbvagmr.com/wp-content/plugins/misc...,bbvagmr
