In [1]:
import glob
import os
import pickle
import time

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pydot
from unidecode import unidecode

from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

from keras.callbacks import ModelCheckpoint
from keras.layers import Activation, BatchNormalization
from keras.layers import Dense, Dropout, LSTM, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing import sequence, text
from keras.utils import np_utils, plot_model
from keras.wrappers.scikit_learn import KerasClassifier

from classes import *

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): only NumPy is seeded here; the visible cells set no TensorFlow seed,
# so training runs may still differ between executions — confirm whether that matters.
np.random.seed(7)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Use a single GPU: expose only device "0" to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf 
from keras.backend.tensorflow_backend import set_session 
config = tf.ConfigProto() 
# Per-process GPU memory fraction set to 1, i.e. no cap below the full device memory.
config.gpu_options.per_process_gpu_memory_fraction = 1
# Register a TensorFlow session built from this config as the Keras backend session.
set_session(tf.Session(config = config))

In [3]:
# Load the pickle file holding the dictionary "journal -> list of headlines".
# An earlier run read 'titulares_creados/headlines_final.pckl' instead.
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
# The context manager closes the file even if unpickling raises.
with open('merged_headlines.pckl', 'rb') as f:
    lista_periodicos = pickle.load(f)

In [4]:
# RSS front-page feed of each newspaper.
ep = 'http://ep00.epimg.net/rss/elpais/portada.xml'
em = 'http://estaticos.elmundo.es/elmundo/rss/portada.xml'
lv = 'http://www.lavanguardia.com/mvc/feed/rss/home.xml'
lr = 'http://www.larazon.es/rss/portada.xml'
abc = 'http://www.abc.es/rss/feeds/abcPortada.xml'

# Pair every journal name with its feed so the two parallel lists handed to
# Headlines cannot drift out of order.
feeds_by_journal = [
    ('El Pais', ep),
    ('El Mundo', em),
    ('La Vanguardia', lv),
    ('La Razon', lr),
    ('ABC', abc),
]
journals = [journal_name for journal_name, _ in feeds_by_journal]
urls = [feed_url for _, feed_url in feeds_by_journal]

# Build the Headlines object from the names, the feeds and the unpickled headlines.
data = Headlines(periodicos=journals, urls=urls, titulares=lista_periodicos)

In [5]:
# Build a DataFrame from the Headlines object, then print its column/row summary.
periodicos_df = Headlines.dataframing_headlines(data)
periodicos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183206 entries, 0 to 183205
Data columns (total 2 columns):
Headline    183206 non-null object
Journal     183206 non-null object
dtypes: object(2)
memory usage: 2.8+ MB


In [6]:
# How many headlines does each journal have?
# The number of headlines per journal should be capped at the smallest count
# available for any one class, so that the classes end up balanced.
# NOTE(review): min_hl_number presumably reports the per-journal counts and returns
# the minimum — confirm in classes.py.
min_number = Headlines.min_hl_number(data, periodicos_df) 

El Pais 21406
El Mundo 60946
La Vanguardia 68746
La Razon 11026
ABC 21082


In [7]:
# Reduce the DataFrame using the minimum per-journal count computed above.
# NOTE(review): presumably keeps min_number headlines per journal — confirm in classes.py.
min_per_df = Headlines.keeping_min_headlines(data, periodicos_df, min_number)
min_per_df

Unnamed: 0,Headline,Journal
0,Vuelve para violar a su amiga tras dejarla en ...,ABC
1,El ministro del Interior belga: España “ha ido...,La Vanguardia
2,Hasta tres cucharadas y media de azúcar en un ...,El Mundo
3,El Parlamento británico insiste en que Mark Zu...,La Vanguardia
4,"Es Satty, el imam al que le explotó 'la madre ...",El Mundo
5,Cristina Cifuentes renuncia a su acta de diput...,ABC
6,"Guardiola, que ya votó por correo, insiste: ""N...",El Mundo
7,El 'Forrest Gump' de Libia: correr a Moscú par...,El Mundo
8,La firma de hipotecas para la compra de vivien...,ABC
9,"Willy Toledo cita a la Policía: ""Estaré toda l...",El Mundo


In [8]:
# Concatenate the headlines so that the word dictionary can be built from them.
concatenated_hl = Headlines.concatenate_headlines(data, df = min_per_df)

In [None]:
# Build the dictionary from the concatenated headlines and report its size.
sequence_hl = Text_Sequence(concatenated_hl)
dictionary_hl, len_dict = Text_Sequence.creating_dict(sequence_hl)
print ('The dictionary contains', len_dict, 'words')

The dictionary contains 34841 words


In [None]:
# Map the headlines to integers using the headline dictionary.
x_int = Headlines.headlines_to_int(data, min_per_df, dictionary_hl)
x_int

In [None]:
# Encode the journal names as integer labels.
y_int = Headlines.y_to_int(data, min_per_df)
y_int

In [None]:
# Peek at the first 20 integer labels.
y_int[0:20]

In [None]:
# One-hot encode the integer labels.
y_onehot = np_utils.to_categorical(y_int)
y_onehot

In [None]:
# In Keras, all input sequences must have the same length,
# so look for the length of the longest headline.
# (The name keeps its original spelling, "lenght", because later cells reference it.)
max_headline_lenght = Headlines.max_hl_length(data, x_int)

In [None]:
# Truncate and pad the input sequences so that each one has max_headline_lenght entries.
x_samelength = sequence.pad_sequences(x_int, maxlen=max_headline_lenght)
x_samelength

In [None]:
# Split the data into training, validation and test sets.
# NOTE(review): 0.8 is presumably the training fraction — confirm in splitting_data_threesets.
# Earlier two-way split, kept for reference:
#x_train, x_test, y_train, y_test = Headlines.splitting_data(data, x_samelength, y_onehot, 0.1)
x_train, x_val, x_test, y_train, y_val, y_test = Headlines.splitting_data_threesets(data, x_samelength, y_onehot, 0.8)

In [None]:
# Load a pre-trained Spanish embedding: the pickle unpacks into (words, embeddings).
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
# NOTE(review): `words` is not used in the visible cells, while the model input is
# indexed with dictionary_hl — confirm that those indices match the rows of `embeddings`.
# The context manager closes the file, which the bare open() call never did.
with open('polyglot-es.pkl', 'rb') as embeddings_file:
    words, embeddings = pickle.load(embeddings_file, encoding='latin1')
print("Emebddings shape is {}".format(embeddings.shape))

In [None]:
# Define the Adam optimizer.
epochss = 1000
learning_rate = 1e-4
# Learning-rate decay chosen so that it scales with the planned number of epochs.
decay_rate = learning_rate/epochss
# beta_1 / beta_2 are the exponential decay rates of the first- and second-moment
# estimates and are meant to be close to 1 (Keras defaults: 0.9 and 0.999).
# The previous values (0.1 and 0.001) were the complements of those defaults, which
# all but removes the moment averaging that Adam relies on.
adamm = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=decay_rate)

In [None]:
#create the LSTM model
def baseline_model():
    """Build and compile the Conv1D + LSTM classifier for journal headlines.

    Layer stack, in order:
      1. Frozen embedding initialised from the pre-trained ``embeddings`` matrix,
         followed by Dropout(0.5).
      2. Conv1D (kernel size 3, 'same' padding) -> BatchNormalization -> ReLU
         -> Dropout(0.15).
      3. LSTM with 100 units -> BatchNormalization -> tanh.
      4. Six Dense -> BatchNormalization -> ReLU -> Dropout(0.5) blocks
         (512, 256, 128, 64, 32 and 16 units).
      5. Dense(5) -> BatchNormalization -> softmax, one output per journal.

    Reads the notebook-level names ``embeddings``, ``max_headline_lenght`` and
    ``adamm``. It takes no arguments, so it can also be used as a build function.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    # Size the embedding layer from the weight matrix itself: Keras rejects the
    # weights if the declared (vocabulary, vector length) differs from their shape.
    vocabulary_size, embedding_vector_length = embeddings.shape
    n_classes = 5  # one output unit per journal
    dense_units = (512, 256, 128, 64, 32, 16)

    model = Sequential()

    # Embedding layer: pre-trained vectors, kept frozen during training.
    model.add(Embedding(vocabulary_size, embedding_vector_length, trainable=False,
                        weights=[embeddings], input_length=max_headline_lenght))
    model.add(Dropout(0.5))

    # Convolution over the word axis, with as many filters as embedding dimensions.
    model.add(Conv1D(filters=embedding_vector_length, kernel_size=3, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    # LSTM layer with 100 memory units; its activation is applied after batch
    # normalisation, hence activation=None on the layer itself.
    model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.0, activation=None))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))

    # Fully connected stack: each block is Dense -> BatchNorm -> ReLU -> Dropout.
    for units in dense_units:
        model.add(Dense(units))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.5))

    # The labels are one-hot encoded, so the output layer emits one value per class.
    model.add(Dense(n_classes))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))

    # Multi-class classification problem: categorical cross-entropy is the loss.
    # NOTE(review): every model built here shares the single `adamm` optimizer
    # instance — confirm that is acceptable when this function is called repeatedly.
    model.compile(loss='categorical_crossentropy', optimizer=adamm, metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [None]:
# Way 2: build the model directly with Keras, without the scikit-learn wrapper.
model = baseline_model()
# Earlier training call, kept for reference:
#history = model.fit(x_train, y_train, validation_data= (x_test, y_test), epochs=200, batch_size=64)

In [None]:
# Define the checkpoint: weights are written under <working dir>/LSTM_journals
# whenever the validation accuracy improves.
# __file__ does not exist inside a notebook, so the working directory is used directly.
dirr = os.path.realpath(os.getcwd())
checkpoint_dir = os.path.join(dirr, 'LSTM_journals')
# Create the target directory up front; saving would fail if it were missing.
os.makedirs(checkpoint_dir, exist_ok=True)
filepath = os.path.join(checkpoint_dir, 'weights-improvement-{epoch:03d}-{val_acc:.4f}.hdf5')
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Earlier variant that validated against the test split, kept for reference:
#history = model.fit(x_train, y_train, validation_data= (x_test, y_test), epochs=epochss, batch_size=128, callbacks=callbacks_list)

# Train on the training split, validating against the validation split, with the checkpoint callback attached.
history = model.fit(x_train, y_train, validation_data= (x_val, y_val), epochs=epochss, batch_size=128, callbacks=callbacks_list)

In [None]:
# Locate the checkpoint holding the weights with the highest validation accuracy.
val_acc = history.history['val_acc']
best_valacc = max(val_acc)
best_valacc_round = round(best_valacc, 4)
# Checkpoint names use 1-based epochs. With save_best_only the file is written the
# first time a value is reached, which is what list.index() returns.
best_epoch = val_acc.index(best_valacc) + 1
# Format the name with the same template as the checkpoint callback. Interpolating
# the rounded float with '%s' dropped trailing zeros (0.32 vs 0.3200) and so
# failed to match any file.
weights_file = os.path.join(
    dirr, 'LSTM_journals',
    'weights-improvement-{epoch:03d}-{val_acc:.4f}.hdf5'.format(epoch=best_epoch, val_acc=best_valacc))
if not os.path.isfile(weights_file):
    raise FileNotFoundError('No checkpoint found for the best val_acc: %s' % weights_file)
weights_file

In [None]:
# Load the network weights from a checkpoint (currently disabled).
# NOTE(review): weights_file is computed in the previous cell but never loaded in the
# visible cells, so the predictions below use the last-epoch weights — confirm intent.
#filename = "weights-improvement-440-0.3196.hdf5"
#model.load_weights(filename)
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Summarize the training history: one figure for accuracy, one for loss.
# The second curve comes from the validation split passed to fit(), so it is
# labelled 'validation' rather than 'test'.
for metric_key, metric_label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title('model ' + metric_label)
    plt.ylabel(metric_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Class probabilities for the test set; the predicted class is the arg-max of each
# row. Reusing the probabilities avoids a second forward pass over x_test.
y_predprob = model.predict(x_test)
y_pred = np.argmax(y_predprob, axis=-1)
y_pred_decoded = Headlines.int_to_journal(data, y_pred)

# Same procedure for the training set, used for the train-accuracy figure.
y_trpred = np.argmax(model.predict(x_train), axis=-1)
y_trpred_decoded = Headlines.int_to_journal(data, y_trpred)

In [None]:
# Earlier variant that predicted on the test set through the scikit-learn wrapper
# (`estimator`), kept for reference:
#y_pred = estimator.predict(x_test)
#y_pred_decoded = Headlines.int_to_journal(data, y_pred)
#y_pred = encoder.inverse_transform(predictions)

#y_predprob = estimator.predict_proba(x_test)[:,1]
#y_trpred = estimator.predict(x_train)
#y_trpred_decoded = Headlines.int_to_journal(data, y_trpred)

# Show the predicted class indices for the test set.
y_pred

In [None]:
# Both y_train and y_test are one-hot encoded; decode them for the model report.
y_train_tocat = Headlines.onehot_to_categorical(data, y_train)
y_test_tocat = Headlines.onehot_to_categorical(data, y_test)

In [None]:
# Model report: accuracy on both splits, then the confusion matrix and the
# per-class precision/recall table for the test set.
print("\nModel Report")

train_accuracy = metrics.accuracy_score(y_train_tocat, y_trpred)
print("Accuracy (train set): %.4g" % train_accuracy)

test_accuracy = metrics.accuracy_score(y_test_tocat, y_pred)
print("Accuracy (test set): %.4g" % test_accuracy)

print("Confusion matrix:")
test_confusion = metrics.confusion_matrix(y_test_tocat, y_pred)
print(test_confusion)

print("Detailed classification report:")
test_report = metrics.classification_report(y_test_tocat, y_pred)
print(test_report)

In [None]:
# Evaluate the model with k-fold cross-validation.
seed = 7
# `estimator` was referenced but never defined in the visible cells, so this cell
# failed on a fresh kernel. The scikit-learn wrapper builds a model per fold via
# baseline_model; epochs and batch size mirror the plain Keras fit() call.
# NOTE(review): 10 folds x epochss epochs is very expensive — confirm the intended budget.
estimator = KerasClassifier(build_fn=baseline_model, epochs=epochss, batch_size=128, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))