In [1]:
import time
import sys
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
from unidecode import unidecode
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
import pydot
import graphviz
import glob
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Input, Activation
from keras.callbacks import ModelCheckpoint
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import Adam
from keras.preprocessing import sequence, text
from keras.utils import np_utils, plot_model
from keras.models import Model
import keras.backend as K
from keras.wrappers.scikit_learn import KerasClassifier

from classes import *

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): only NumPy is seeded; no TensorFlow/Keras seed is set in the
# cells visible here, so weight initialisation and dropout may still vary — confirm.
np.random.seed(7)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Use a single GPU: only the device with index 1 is made visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import tensorflow as tf 
from keras.backend.tensorflow_backend import set_session 
# TF1-style session configuration: allow this process to use up to the whole
# GPU memory (fraction = 1) and register the resulting session with Keras.
config = tf.ConfigProto() 
config.gpu_options.per_process_gpu_memory_fraction = 1
set_session(tf.Session(config = config))

In [3]:
# Load the pickled dictionary that maps each journal feed to its list of headlines.
# Alternative input file: 'titulares_creados/headlines_final.pckl'
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('merged_headlines.pckl', 'rb') as f:
    lista_periodicos = pickle.load(f)

In [4]:
# Keep only the two feeds compared in this notebook (El Pais and ABC).
# To compare El Mundo with ABC instead, replace the first URL with
# 'http://estaticos.elmundo.es/elmundo/rss/portada.xml'.
periodicos_shortened = {
    feed_url: lista_periodicos[feed_url]
    for feed_url in (
        'http://ep00.epimg.net/rss/elpais/portada.xml',
        'http://www.abc.es/rss/feeds/abcPortada.xml',
    )
}
periodicos_shortened

{'http://ep00.epimg.net/rss/elpais/portada.xml': ['El sábado arranca una semana de fuertes chubascos y tormentas',
  'El Gobierno califica de “error” las cargas policiales del 1-O',
  'Pérez-Reverte: “Picasso no pintó el ‘Guernica’ por patriotismo, sino por dinero”',
  'El nuevo pacto comercial facilita la relación entre López Obrador y Trump',
  '“¡Necesitamos sangre ya!”',
  '“España puede ser líder en Europa si legaliza el cannabis”',
  'Cómo enviar fotos a través de WhatsApp sin que pierdan calidad',
  'Una niña de Toledo sobrevive al primer caso en España de la ameba comecerebros',
  'Podemos abre el debate de la legalización total de la marihuana en España',
  'Gana Luis Enrique, pierde España',
  'La bailarina que inspiró el escandaloso cuadro ‘El origen del mundo’',
  'Interior destituirá a tres mandos de la ‘policía patriótica’ a los que el PP premió con puestos en embajadas',
  'El huracán Lane golpea a Hawái y provoca inundaciones y deslizamientos de tierra',
  'El incendio 

In [5]:
# Create the Headlines object from the unpickled headlines.
# RSS feed URLs; they are the keys of the headline dictionary.
ep = 'http://ep00.epimg.net/rss/elpais/portada.xml'
em = 'http://estaticos.elmundo.es/elmundo/rss/portada.xml'
lv = 'http://www.lavanguardia.com/mvc/feed/rss/home.xml'
lr = 'http://www.larazon.es/rss/portada.xml'
abc = 'http://www.abc.es/rss/feeds/abcPortada.xml'

# Feeds used in this experiment; alternative selections are kept for reference.
#urls = [ep, em, lv, lr, abc]
#urls= [em, abc]
urls = [ep, abc]

# Journal names, listed in the same order as `urls`.
#journals = ['El Pais', 'El Mundo', 'La Vanguardia', 'La Razon', 'ABC']
#journals = ['El Mundo', 'ABC']
journals = ['El Pais', 'ABC']

# Headlines is project-local (from classes.py); presumably it pairs each
# journal name with its URL and headline list — verify against classes.py.
data = Headlines(periodicos=journals, urls=urls, titulares=periodicos_shortened) 

In [6]:
# Create a dataframe from the headlines; the displayed result has one row per
# headline with 'Headline' and 'Journal' columns.
periodicos_df = Headlines.dataframing_headlines(data)
periodicos_df

Unnamed: 0,Headline,Journal
0,Los Mossos se fragmentan: los agentes enfrenta...,ABC
1,El Gobierno afronta la fase más incierta del a...,ABC
2,El TSJ catalán amplía el objeto de la investig...,ABC
3,Los manejos de Gorka Villar implican a altos c...,ABC
4,Así se reparten por Madrid los españoles nacid...,ABC
5,La tarea de enterrar el fantasma de Cristina C...,El Pais
6,Usain Bolt se estrena como goleador con un ‘do...,El Pais
7,Las empresas andaluzas tienen 500 millones de ...,ABC
8,10 cosas que los turistas tienen prohibidas,El Pais
9,El ‘caso Cifuentes’ hunde al PP y dispara a Ci...,El Pais


In [7]:
# How many headlines are there for each journal? (The call prints one count per journal.)
# We should limit the number of headlines in each journal to the minimum number
# available for one of the classes, so that the classes are balanced.
# min_number is presumably that minimum per-journal count — verify against classes.py.
min_number = Headlines.min_hl_number(data, periodicos_df) 

El Pais 21406
ABC 21082


In [8]:
# Balanced dataframe used by all later cells.
# Presumably keeps min_number headlines per journal — verify against classes.py.
min_per_df = Headlines.keeping_min_headlines(data, periodicos_df, min_number)
min_per_df

Unnamed: 0,Headline,Journal
0,Los Juegos del asombro,El Pais
1,El PDeCAT teme un próximo pacto de izquierdas ...,ABC
2,La monarquía veinte años después de Diana,El Pais
3,El PP de Madrid se traga el bulo más loco: Kim...,ABC
4,El motivo por el que estos futbolistas donan e...,El Pais
5,"Merkel, dispuesta a acceder a la reunificación...",ABC
6,Los CDR burlan la prohibición y colocan toalla...,ABC
7,Al menos nueve muertos y 164 heridos tras un t...,ABC
8,Venezuela: hambre y desesperanza,ABC
9,Los cinco grandes «palos» económicos que ha re...,ABC


In [9]:
# Obtain the concatenated headlines, used next to build the word dictionary.
concatenated_hl = Headlines.concatenate_headlines(data, df = min_per_df)

In [10]:
# Build the word dictionary from the concatenated headlines.
# creating_dict returns the dictionary and its size (number of words).
sequence_hl = Text_Sequence(concatenated_hl)
dictionary_hl, len_dict = Text_Sequence.creating_dict(sequence_hl)
print ('The dictionary contains', len_dict, 'words')

The dictionary contains 27162 words


In [11]:
# Map every headline to a sequence of integers using dictionary_hl.
# The displayed result is an object array holding one list of ints per headline.
x_int = Headlines.headlines_to_int(data, min_per_df, dictionary_hl)
x_int

array([list([16404, 15483, 8357, 3643]),
       list([10039, 19659, 24944, 25924, 21228, 19191, 8135, 15288, 10318, 5724]),
       list([15746, 17731, 26246, 2849, 9070, 8135, 9255]), ...,
       list([15746, 7952, 8488, 25425, 8135, 9975, 26078, 9272, 7470, 8357, 18524]),
       list([15852, 20406, 8135, 23844, 19245, 3591, 25924, 9864, 13118, 1353, 15746, 21106, 8357, 15247]),
       list([7739, 22961, 24008, 15746, 1814, 8135, 26620, 11039, 6811, 25530, 10039, 22588, 8135, 7832, 15116])],
      dtype=object)

In [12]:
# Encode the journal names as integer class labels (0 / 1 for the two journals).
y_int = Headlines.y_to_int(data, min_per_df)
y_int

array([0, 1, 0, ..., 1, 0, 0])

In [13]:
# One-hot encode the integer labels (one column per journal), as required by
# the categorical cross-entropy loss used for training.
y_onehot = np_utils.to_categorical(y_int)
y_onehot

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [14]:
# In Keras, all input sequences must have the same length.
# Find the length of the longest headline (the call also prints it); it is
# used as the padding length and as the model's input length.
max_headline_lenght = Headlines.max_hl_length(data, x_int)

The longest headline consists of 26 words


In [15]:
# Pad (or truncate) every sequence to max_headline_lenght.
# pad_sequences pads at the front by default, hence the leading zeros in the output.
x_samelength = sequence.pad_sequences(x_int, maxlen=max_headline_lenght)
x_samelength

array([[    0,     0,     0, ..., 15483,  8357,  3643],
       [    0,     0,     0, ..., 15288, 10318,  5724],
       [    0,     0,     0, ...,  9070,  8135,  9255],
       ...,
       [    0,     0,     0, ...,  7470,  8357, 18524],
       [    0,     0,     0, ..., 21106,  8357, 15247],
       [    0,     0,     0, ...,  8135,  7832, 15116]], dtype=int32)

In [16]:
# Split the data into training, validation and test sets.
# 0.8 appears to be the training fraction (the training log reports 33731
# training samples) — verify against classes.py.
x_train, x_val, x_test, y_train, y_val, y_test = Headlines.splitting_data_threesets(data, x_samelength, y_onehot, 0.8)



In [17]:
# Load a pre-trained Spanish (polyglot) embedding: the vocabulary `words` and
# the matching embedding matrix `embeddings`.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('polyglot-es.pkl', 'rb') as embedding_file:
    words, embeddings = pickle.load(embedding_file, encoding='latin1')
print("Emebddings shape is {}".format(embeddings.shape))

Emebddings shape is (100004, 64)


In [18]:
# Define the Adam optimizer and the training length.
epochss = 2000  # number of epochs passed to model.fit
learning_rate = 1e-4
# NOTE(review): decay_rate is computed but never passed to Adam in the cells
# visible here, so the learning rate stays constant — confirm whether a decay
# was intended.
decay_rate = learning_rate/epochss
adamm = Adam(lr=learning_rate)

In [19]:
#create the Conv1D + LSTM model
def baseline_model():
    """Build and compile the headline classifier.

    Architecture: frozen pre-trained embedding -> Conv1D -> LSTM -> eight
    Dense/BatchNorm/ReLU/Dropout blocks (512 down to 4 units) -> 2-way softmax.

    Reads the notebook-level names ``embeddings`` (pre-trained embedding
    matrix), ``max_headline_lenght`` (padded sequence length) and ``adamm``
    (optimizer instance).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    # Size the embedding layer from the matrix itself instead of hard-coding
    # 100004 x 64, so the layer always matches the weights that are loaded.
    vocab_size, embedding_vector_length = embeddings.shape
    model = Sequential()

    # First layer: frozen embedding initialised with the pre-trained vectors.
    # NOTE(review): the input indices come from dictionary_hl (built from the
    # headlines), while the rows of `embeddings` follow the order of the
    # pre-trained `words` list — confirm both vocabularies are aligned,
    # otherwise each word is looked up in an unrelated embedding row.
    model.add(Embedding(vocab_size, embedding_vector_length, trainable=False,
                        weights=[embeddings], input_length=max_headline_lenght))
    model.add(Dropout(0.5))

    # Convolution over the word sequence; 'same' padding keeps the sequence length.
    model.add(Conv1D(filters=embedding_vector_length, kernel_size=3, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    # LSTM with 100 units. Its activation is disabled so that batch
    # normalisation is applied before the tanh non-linearity.
    # (Original author's note: this was the configuration that worked.)
    model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.0, activation=None))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))

    # Fully connected stack: Dense -> BatchNorm -> ReLU -> Dropout, halving the
    # width at every block.
    for units in (512, 256, 128, 64, 32, 16, 8, 4):
        model.add(Dense(units))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.5))

    # Output layer: one unit per journal, because the labels are one-hot encoded.
    model.add(Dense(2))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))

    # One-hot labels with a softmax output -> categorical cross-entropy.
    model.compile(loss='categorical_crossentropy', optimizer=adamm, metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would add a stray "None" line).
    model.summary()
    return model

In [20]:
# WAY 2: no sklearn wrapper, simply using keras.
# Builds and compiles the network; baseline_model() also prints the layer summary.
model = baseline_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 26, 64)            6400256   
_________________________________________________________________
dropout_1 (Dropout)          (None, 26, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 26, 64)            12352     
_________________________________________________________________
batch_normalization_1 (Batch (None, 26, 64)            256       
_________________________________________________________________
activation_1 (Activation)    (None, 26, 64)            0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 26, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
__________

In [None]:
# Define the checkpoint callback.
# realpath('__file__') resolves the literal name '__file__' against the current
# working directory, so `dirr` ends up being the working directory.
dirr = os.path.dirname(os.path.realpath('__file__'))
checkpoint_dir = os.path.join(dirr, 'two_journals')
# Make sure the target folder exists so that saving a checkpoint cannot fail
# because of a missing directory.
os.makedirs(checkpoint_dir, exist_ok=True)
# The file name embeds the 1-based epoch and the validation accuracy with four decimals.
filepath = os.path.join(checkpoint_dir, 'weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5')
# Save weights only when the validation accuracy improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Train on the training split and monitor the validation split; the checkpoint
# callback saves the weights whenever val_acc improves.
# Earlier configuration (validated on the test set), kept for reference:
#history = model.fit(x_train, y_train, validation_data= (x_test, y_test), epochs=500, batch_size=20, callbacks=callbacks_list)
history = model.fit(x_train, y_train, validation_data= (x_val, y_val), epochs=epochss, batch_size=128, callbacks=callbacks_list)

Train on 33731 samples, validate on 4216 samples
Epoch 1/2000

Epoch 00001: val_acc improved from -inf to 0.50308, saving model to /home/angela/repos/headlines_classificator/two_journals/weights-improvement-01-0.5031.hdf5
Epoch 2/2000

Epoch 00002: val_acc did not improve
Epoch 3/2000

Epoch 00003: val_acc did not improve
Epoch 4/2000

Epoch 00004: val_acc did not improve
Epoch 5/2000

Epoch 00005: val_acc did not improve
Epoch 6/2000

Epoch 00006: val_acc did not improve
Epoch 7/2000

Epoch 00007: val_acc did not improve
Epoch 8/2000

Epoch 00008: val_acc did not improve
Epoch 9/2000

Epoch 00009: val_acc did not improve
Epoch 10/2000

Epoch 00010: val_acc did not improve
Epoch 11/2000

Epoch 00011: val_acc did not improve
Epoch 12/2000

Epoch 00012: val_acc did not improve
Epoch 13/2000

Epoch 00013: val_acc did not improve
Epoch 14/2000

Epoch 00014: val_acc did not improve
Epoch 15/2000

Epoch 00015: val_acc did not improve
Epoch 16/2000

Epoch 00016: val_acc did not improve
Epoch 


Epoch 00045: val_acc did not improve
Epoch 46/2000

Epoch 00046: val_acc did not improve
Epoch 47/2000

Epoch 00047: val_acc did not improve
Epoch 48/2000

Epoch 00048: val_acc did not improve
Epoch 49/2000

Epoch 00049: val_acc did not improve
Epoch 50/2000

In [None]:
# Locate the checkpoint file with the highest validation accuracy.
val_acc = history.history['val_acc']
best_valacc = max(val_acc)
best_valacc_round = round(best_valacc, 4)
# The checkpoint file name is written with '{val_acc:.4f}', i.e. always four
# decimals. str(round(x, 4)) drops trailing zeros (0.5 -> '0.5', not '0.5000'),
# so the pattern is built with the same fixed-width format to match the file.
weights_pattern = os.path.join(dirr, 'two_journals', 'weights-improvement-*-%.4f.hdf5' % best_valacc)
# Sort so the choice is deterministic when several files share that accuracy.
weights_matches = sorted(glob.glob(weights_pattern))
if not weights_matches:
    raise FileNotFoundError('No checkpoint file matches %s' % weights_pattern)
weights_file = weights_matches[0]
weights_file

In [None]:
# Load the weights of the best checkpoint into the network.
model.load_weights(weights_file)
# Re-compile with the same optimizer and metric as baseline_model(): compiling
# with the plain 'adam' string and no metrics would silently drop the accuracy
# metric and replace the configured learning rate with the Keras default.
model.compile(loss='categorical_crossentropy', optimizer=adamm, metrics=['accuracy'])

In [None]:
# Summarize the training history: one figure for accuracy, one for loss.
# The second curve in each figure is the validation split passed to
# model.fit (not the held-out test set), so it is labelled 'validation'.
for history_key, axis_label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[history_key])
    plt.plot(history.history['val_' + history_key])
    plt.title('model ' + axis_label)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Predictions on the held-out test set: class probabilities, integer class
# labels, and the labels decoded back to journal names.
# NOTE(review): y_predprob is not used by any later cell visible here.
y_predprob = model.predict(x_test)
y_pred = model.predict_classes(x_test)
y_pred_decoded = Headlines.int_to_journal(data, y_pred)

# Same for the training set, to compare train and test accuracy.
y_trpred = model.predict_classes(x_train)
y_trpred_decoded = Headlines.int_to_journal(data, y_trpred)

In [None]:
# Convert the one-hot targets back to class labels so they can be compared
# with the predicted classes by sklearn.metrics in the report.
y_train_tocat = Headlines.onehot_to_categorical(data, y_train)
y_test_tocat = Headlines.onehot_to_categorical(data, y_test)

In [None]:
# Model report: compute every figure first, then print them in order.
train_accuracy = metrics.accuracy_score(y_train_tocat, y_trpred)
test_accuracy = metrics.accuracy_score(y_test_tocat, y_pred)
test_confusion = metrics.confusion_matrix(y_test_tocat, y_pred)
test_report = metrics.classification_report(y_test_tocat, y_pred)

print("\nModel Report")
print("Accuracy (train set): %.4g" % train_accuracy)
print("Accuracy (test set): %.4g" % test_accuracy)
print("Confusion matrix:")
print(test_confusion)
print("Detailed classification report:")
print(test_report)

# Descriptive statistics

In [None]:
# Headlines published by El Pais (their word counts are computed below).
ep_headlines = min_per_df.loc[min_per_df["Journal"]=="El Pais", "Headline"]
ep_headlines

In [None]:
# Number of words in every El Pais headline: accents are stripped with
# unidecode, then the text is lower-cased and tokenised by Keras.
ep_len = [
    len(text.text_to_word_sequence(unidecode(headline), lower=True, split=" ",
                                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r'))
    for headline in ep_headlines
]

In [None]:
# Histogram of the number of words per El Pais headline.
# One-word-wide bins covering every observed length: with a fixed upper edge
# of 21, headlines longer than 21 words would be silently left out of the plot.
bins = np.arange(0, max(22, max(ep_len) + 2), 1)
plt.xlim([min(ep_len)-5, max(ep_len)+5])

plt.hist(ep_len, bins=bins, alpha=0.5, edgecolor='black', linewidth=1.2)
plt.title('Histogram counts of words El Pais')
plt.xlabel('number of words')
plt.ylabel('count')

plt.show()

In [None]:
# Headlines published by ABC (their word counts are computed below).
abc_headlines = min_per_df.loc[min_per_df["Journal"]=="ABC", "Headline"]

In [None]:
# Number of words in every ABC headline: accents are stripped with unidecode,
# then the text is lower-cased and tokenised by Keras.
abc_len = [
    len(text.text_to_word_sequence(unidecode(headline), lower=True, split=" ",
                                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r'))
    for headline in abc_headlines
]

In [None]:
# Histogram of the number of words per ABC headline.
# Bins and axis limits are derived from the ABC lengths being plotted, and the
# bins cover every observed length so that long headlines are not dropped.
bins = np.arange(0, max(22, max(abc_len) + 2), 1)
plt.xlim([min(abc_len)-5, max(abc_len)+5])

plt.hist(abc_len, bins=bins, alpha=0.5, edgecolor='black', linewidth=1.2)
plt.title('Histogram counts of words ABC')
plt.xlabel('number of words')
plt.ylabel('count')

plt.show()