# Concatenating data

In [1]:
import re
import json
import pprint
import datetime
import pandas as pd

# Notebook-wide pretty-printer for inspecting nested records; nested levels
# are indented by PP_INDENT spaces.
PP_INDENT = 4
pp = pprint.PrettyPrinter(indent=PP_INDENT)

In [2]:
def load_json_lines(path):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line of the file at ``path`` is parsed with
    ``json.loads``. Blank lines (for example a trailing newline at the end
    of the file) are skipped instead of raising a decoding error.
    """
    records = []
    with open(path) as input_file:
        for line in input_file:
            if line.strip():
                records.append(json.loads(line))
    return records


# Articles that go into df_emails (reported below as class 1).
EMAIL_SOURCE = '../articles_email.json'
# Articles that go into df_newspapers (reported below as class 0),
# concatenated in this order.
NEWSPAPER_SOURCES = [
    '../articles_expansion_hemeroteca.json',
    '../articles_cincodias.json',
    '../articles_elconfidencial.json',
]

ds_emails = load_json_lines(EMAIL_SOURCE)
df_emails = pd.DataFrame(ds_emails)

ds_newspapers = []
for source_path in NEWSPAPER_SOURCES:
    ds_newspapers.extend(load_json_lines(source_path))
df_newspapers = pd.DataFrame(ds_newspapers)

# Single-argument print(...) behaves the same as a statement (Python 2) and
# as a function call (Python 3).
print("df_emails (1): {:,}".format(df_emails.shape[0]))
print("df_newspapers (0): {:,}".format(df_newspapers.shape[0]))

df_emails (1): 5,146
df_newspapers (0): 15,780


In [3]:
# Label every row with its origin: 1 for the e-mail articles, 0 for the
# newspaper articles. The column is added to each frame in place.
for source_frame, source_label in ((df_emails, 1), (df_newspapers, 0)):
    source_frame['flag'] = source_label

In [4]:
# Remove repeated URLs inside each source, keeping the first occurrence.
# Rebinding (instead of inplace=True) keeps the cell easy to reason about on
# re-run; the resulting frames are the same.
df_emails = df_emails.drop_duplicates(subset='url', keep='first')
df_newspapers = df_newspapers.drop_duplicates(subset='url', keep='first')

# Single-argument print(...) works identically under Python 2 and Python 3.
print("df_emails (1): {:,}".format(df_emails.shape[0]))
print("df_newspapers (0): {:,}".format(df_newspapers.shape[0]))

df_emails (1): 4,955
df_newspapers (0): 15,768


In [5]:
# Stack both sources into one frame. E-mail rows come first, so a later
# drop_duplicates(keep='first') on this frame prefers the flag == 1 copy of a URL.
# The former `join_axes=None` argument was only the default value and is no
# longer accepted by recent pandas releases, so it is omitted.
frames = [df_emails, df_newspapers]
df = pd.concat(frames, axis=0, join='outer', ignore_index=True)

In [6]:
# Size of the combined frame and of each class.
flag_column = df['flag']
n_articles = len(df.index)
n_flag_one = int((flag_column == 1).sum())
n_flag_zero = int((flag_column == 0).sum())

print("Current number of articles: {:,}".format(n_articles))
print("Number of flags = 1 ({:,})".format(n_flag_one))
print("Number of flags = 0 ({:,})".format(n_flag_zero))

Current number of articles: 20,723
Number of flags = 1 (4,955)
Number of flags = 0 (15,768)


In [7]:
# Remove URLs that appear more than once in the combined frame; only the
# first occurrence of each URL is kept. Index labels are left as they are.
df = df.drop_duplicates(subset='url', keep='first')

In [8]:
# Same size report as before, now on the frame without repeated URLs.
flag_column = df['flag']
n_articles = len(df.index)
n_flag_one = int((flag_column == 1).sum())
n_flag_zero = int((flag_column == 0).sum())

print("Current number of articles: {:,}".format(n_articles))
print("Number of flags = 1 ({:,})".format(n_flag_one))
print("Number of flags = 0 ({:,})".format(n_flag_zero))

Current number of articles: 20,337
Number of flags = 1 (4,955)
Number of flags = 0 (15,382)


# Creating new features

* domain (newspaper)
* section
* date (without time)
* day of week

In [9]:
# newspaper
def extract_domain(url):
    """Return the host segment of ``url`` without a leading ``www.``.

    The host is the third ``/``-separated segment, i.e. what follows
    ``scheme://``. ``None`` is returned when ``url`` is not a string or has
    no such segment, so a single malformed URL no longer aborts the cell.
    """
    try:
        host = url.split('/')[2]
    except (AttributeError, IndexError):
        return None
    # Strip only a leading "www."; the same text elsewhere in the host is kept.
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


df['domain'] = df['url'].apply(extract_domain)

In [10]:
# Ten most frequent domains among the flag == 1 rows.
# sort_values is called with keywords only: the old positional `0` was the
# axis argument, which recent pandas releases no longer accept positionally.
(
    df[df['flag'] == 1]
    .groupby('domain')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
    .head(10)
)

Unnamed: 0,domain,count
0,expansion.com,2594
1,elconfidencial.com,859
2,cincodias.elpais.com,706
3,cincodias.com,583
4,bernsteinresearch.com,78
5,blogs.elconfidencial.com,35
6,retina.elpais.com,14
7,google.com,9
8,economia.elpais.com,6
9,bloomberg.com,3


In [11]:
# Ten most frequent domains among the flag == 0 rows.
# sort_values is called with keywords only: the old positional `0` was the
# axis argument, which recent pandas releases no longer accept positionally.
(
    df[df['flag'] == 0]
    .groupby('domain')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
    .head(10)
)

Unnamed: 0,domain,count
0,expansion.com,9665
1,cincodias.elpais.com,3943
2,elconfidencial.com,1122
3,blogs.elconfidencial.com,309
4,vanitatis.elconfidencial.com,254
5,gentleman.elconfidencial.com,55
6,brands.elconfidencial.com,21
7,blogs.vanitatis.elconfidencial.com,8
8,alimente.elconfidencial.com,4
9,shiseido.es,1


In [12]:
# section
# The section is one "/"-separated segment of the URL: segment 7 for domains
# containing "cincodias", segment 3 for every other domain.
# `.str[i]` yields NaN when a URL has too few segments, so short URLs get a
# missing section instead of relying on a bare `except: pass`; the
# vectorised form also avoids one df.loc write per row.
url_segments = df['url'].str.split('/')
is_cincodias = df['domain'].str.contains('cincodias', regex=False, na=False)
df['section'] = url_segments.str[7].where(is_cincodias, url_segments.str[3])

In [13]:
# Fifteen most frequent (domain, section) pairs.
# sort_values is called with keywords only: the old positional `0` was the
# axis argument, which recent pandas releases no longer accept positionally.
(
    df.groupby(['domain', 'section'])
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
    .head(15)
)

Unnamed: 0,domain,section,count
0,expansion.com,empresas,4164
1,expansion.com,economia,2056
2,cincodias.elpais.com,companias,1077
3,expansion.com,mercados,1071
4,cincodias.elpais.com,mercados,957
5,expansion.com,aragon,785
6,elconfidencial.com,empresas,636
7,cincodias.elpais.com,empresas,619
8,expansion.com,extremadura,605
9,cincodias.com,mercados,492


In [14]:
# publish_date is split on a space and only the first piece (the date) is
# kept; the original author noted that the time part does not vary.
# Taking `.str[0]` avoids unpacking the `.str` accessor, which newer pandas
# no longer supports and which fails unless every value splits into exactly
# two pieces. No temporary 'time' column is created.
df['date'] = df['publish_date'].str.split(' ').str[0]

In [15]:
# convert date column to datetime and create day_of_week column
df['date'] = pd.to_datetime(df['date'])

# `weekday_name` was replaced by `day_name()` in newer pandas; prefer the new
# method when present and fall back to the old attribute otherwise.
date_accessor = df['date'].dt
if hasattr(date_accessor, 'day_name'):
    df['day_of_week'] = date_accessor.day_name()
else:
    df['day_of_week'] = date_accessor.weekday_name

In [16]:
# Columns that are not needed from here on: the image link and the raw
# publish_date string.
obsolete_columns = ['top_image', 'publish_date']
df = df.drop(obsolete_columns, axis=1)

In [17]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,authors,keywords,summary,text,title,url,flag,domain,section,date,day_of_week
0,"[Asunción Infante Fuentes, Andrés Stumpf Guirao]","[tumbar, vuelve, ibex, descenso, son, en, una,...","El Ibex cede un 1,18% y pierde el soporte clav...",Amaneció con subidas en todos los grandes parq...,El sector bancario vuelve a tumbar al Ibex: es...,http://cincodias.com/cincodias/2016/03/17/merc...,1,cincodias.com,mercados,2016-03-17,Thursday
1,[],"[ladrillo, en, desde, menor, junio, su, del, h...","El año pasado, la morosidad de este segmento h...",El pasado año el descenso de la mora tanto de ...,La morosidad del ladrillo cierra 2015 en su me...,http://www.expansion.com/empresas/banca/2016/0...,1,expansion.com,empresas,2016-03-18,Friday
2,"[Eduardo Segovia, Pedro Calvo, Contacta Al Aut...","[inversores, stanley, morgan, en, encuentro, p...",El banco de inversión Morgan stanley organiza ...,El banco de inversión Morgan stanley organiza ...,Noticias del BBVA: Los inversores señalan de q...,http://www.elconfidencial.com/mercados/2016-03...,1,elconfidencial.com,mercados,2016-03-24,Thursday
3,"[Assumpta Zorraquino, Contacta Al Autor, Carlo...","[regulación, servicios, versus, para, por, en,...",La aparición de las start-ups tecnológicas ded...,La aparición de las start-ups tecnológicas ded...,Fintech: Innovación versus regulación,http://blogs.elconfidencial.com/mercados/tribu...,1,blogs.elconfidencial.com,mercados,2016-03-24,Thursday
4,"[Pedro Calvo, Kike Vázquez, Daniel Lacalle, Vi...","[europea, pierde, por, en, y, valor, su, del, ...",Como el que han pagado en el último año las gr...,"Son los bancos españoles. Y los italianos, por...",Tipos de interés: La banca europea pierde un t...,http://www.elconfidencial.com/empresas/2016-03...,1,elconfidencial.com,empresas,2016-03-29,Tuesday


# Train / Test

Preparing the train / test dataset for modelling:

* __`TRAIN`__: articles published before 2017-10-01 (+ articles without date).
* __`TEST`__: articles published from 2017-10-01 on.

In [18]:
# Chronological split: train holds every article dated before SPLIT_DATE plus
# the articles with no date; test holds the articles dated SPLIT_DATE or later.
SPLIT_DATE = '2017-10-01'

has_no_date = df['date'].isnull()
df_train = df[(df['date'] < SPLIT_DATE) | has_no_date]
df_test = df[df['date'] >= SPLIT_DATE]

# Every row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df)

In [19]:
# Rows without a date in each split (counted through their non-null url).
null_dates_train = df_train[df_train['date'].isnull()]['url'].count()
null_dates_test = df_test[df_test['date'].isnull()]['url'].count()

# Single-argument print(...) works identically under Python 2 and Python 3.
print("Number of null dates in train: {:,}".format(null_dates_train))
print("Number of null dates in test: {:,}".format(null_dates_test))

Number of null dates in train: 149
Number of null dates in test: 0


In [20]:
def print_split_summary(title, frame, leading_blank=False):
    """Print a banner with ``title`` and the per-flag row counts of ``frame``.

    Rows are counted through their non-null ``url`` values, separately for
    ``flag == 1`` and ``flag == 0``. With ``leading_blank=True`` an empty
    line is printed before the banner. Each print receives one
    pre-assembled string so the output is the same under Python 2 and 3.
    """
    rule = "-" * 50
    n_flag_one = frame[frame['flag'] == 1]['url'].count()
    n_flag_zero = frame[frame['flag'] == 0]['url'].count()

    # The space after the first rule reproduces the separator that the old
    # multi-argument print statement inserted there.
    banner = "{0} \n{1}\n{0}".format(rule, title)
    if leading_blank:
        banner = "\n" + banner

    print(banner)
    print("Flag:\t1\t\t0")
    print("Nobs:\t{:,} \t\t{:,}".format(n_flag_one, n_flag_zero))


print_split_summary("TRAIN (< 2017-10-01 + None)", df_train)
print_split_summary("TEST (>= 2017-10-01)", df_test, leading_blank=True)

-------------------------------------------------- 
TRAIN (< 2017-10-01 + None)
--------------------------------------------------
Flag:	1		0
Nobs:	3,795 		11,244

-------------------------------------------------- 
TEST (>= 2017-10-01)
--------------------------------------------------
Flag:	1		0
Nobs:	1,160 		4,138


In [21]:
# shuffle datasets
# A fixed seed makes the row order, and therefore the dumped JSON files,
# reproducible across runs.
SHUFFLE_SEED = 42
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# The test split is written out without its label column.
df_test = df_test.drop('flag', axis=1)

# Dump to JSON

In [49]:
# Write both splits next to the notebook; force_ascii=False keeps non-ASCII
# characters unescaped in the output files.
for split_frame, target_path in ((df_train, './train.json'), (df_test, './test.json')):
    split_frame.to_json(target_path, force_ascii=False)

In [48]:
# Save the training row labelled 1 as a standalone sample record.
example_row = df_train.loc[1]
example_row.to_json('./example.json', force_ascii=False)

# Analysis

In [24]:
# checking articles without text: rows whose text is the empty string,
# counted per (domain, flag).
# sort_values is called with keywords only: the old positional `0` was the
# axis argument, which recent pandas releases no longer accept positionally.
(
    df[df['text'] == ""]
    .groupby(['domain', 'flag'])
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)

Unnamed: 0,domain,flag,count
0,expansion.com,0,46
1,elconfidencial.com,1,4
2,plus.google.com,1,3
3,federalreserve.gov,1,2
4,bbvaresearch.com,1,2
5,lavanguardia.com,1,2
6,blogs.cincodias.com,1,1
7,cincodias.elpais.com,0,1
8,cnmv.es,1,1
9,diario.es,1,1
