In [21]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Flatten
from keras.callbacks import ModelCheckpoint
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import Adam
from keras.preprocessing import sequence, text
from keras.utils import np_utils, plot_model
import pickle
from classes import *
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from keras.wrappers.scikit_learn import KerasClassifier
from unidecode import unidecode
from sklearn.model_selection import KFold, cross_val_score
import matplotlib.pyplot as plt
import pydot
import graphviz
import time
# fix random seed for reproducibility
np.random.seed(7)

In [2]:
#para usar solo una GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf 
from keras.backend.tensorflow_backend import set_session 
config = tf.ConfigProto() 
config.gpu_options.per_process_gpu_memory_fraction = 1
set_session(tf.Session(config = config))

In [3]:
#Loading the pickle file with the dictionary Journal - List of headlines
#f = open('titulares_creados/headlines_final.pckl', 'rb')
f = open('merged_headlines.pckl', 'rb')
lista_periodicos = pickle.load(f)
f.close()

In [4]:
#Creating the Headlines object from the pickle file
ep = 'http://ep00.epimg.net/rss/elpais/portada.xml'
em = 'http://estaticos.elmundo.es/elmundo/rss/portada.xml'
lv = 'http://www.lavanguardia.com/mvc/feed/rss/home.xml'
lr = 'http://www.larazon.es/rss/portada.xml'
abc = 'http://www.abc.es/rss/feeds/abcPortada.xml'

urls = [ep, em, lv, lr, abc]
journals = ['El Pais', 'El Mundo', 'La Vanguardia', 'La Razon', 'ABC']

data = Headlines(periodicos=journals, urls=urls, titulares=lista_periodicos) 

In [5]:
#Creating dataframe from the headlines
periodicos_df = Headlines.dataframing_headlines(data)
periodicos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183206 entries, 0 to 183205
Data columns (total 2 columns):
Headline    183206 non-null object
Journal     183206 non-null object
dtypes: object(2)
memory usage: 2.8+ MB


In [6]:
# How many headlines there are of each journal?
# We should limit the number of headlines in each journal to the minimum number available for one of the classes
min_number = Headlines.min_hl_number(data, periodicos_df) 

El Pais 21406
El Mundo 60946
La Vanguardia 68746
La Razon 11026
ABC 21082


In [7]:
min_per_df = Headlines.keeping_min_headlines(data, periodicos_df, min_number)
min_per_df

Unnamed: 0,Headline,Journal
0,Mafias serbias gitanas compran a niñas para qu...,ABC
1,10 normas para que tu hijo esté siempre seguro...,La Vanguardia
2,La Justicia alemana pide a España que concrete...,El Mundo
3,Supervivientes: Desvelado el jugador del Real ...,La Vanguardia
4,"Fonsi y Daddy Yankee critican la ""propaganda""...",El Mundo
5,Economía multa a Pwc por las cuentas del Popul...,ABC
6,El City amenaza a Tebas con acciones legales,El Mundo
7,"Ildefonso Falcones: ""Llevo cuatro años y medio...",El Mundo
8,Economía advierte: se «ralentiza» el PIB por l...,ABC
9,El criador español que triunfa en Hollywood pi...,El Mundo


In [8]:
# Obtaining concantenated headlines in order to build the dictionary
concatenated_hl = Headlines.concatenate_headlines(data, df = min_per_df)

In [9]:
# Building dictionary from headlines
sequence_hl = Text_Sequence(concatenated_hl)
dictionary_hl, len_dict = Text_Sequence.creating_dict(sequence_hl)
print ('The dictionary contains', len_dict, 'words')

The dictionary contains 34995 words


In [10]:
# Mapping headlines to integer
x_int = Headlines.headlines_to_int(data, min_per_df, dictionary_hl)
x_int

array([list([21472, 30581, 16835, 9046, 2119, 23901, 25051, 27658, 29472, 13444, 34434]),
       list([1491, 24024, 25051, 27658, 33232, 17767, 14500, 30747, 30381, 13444, 13097, 8598]),
       list([20389, 20105, 3382, 25984, 2119, 14269, 27658, 9179, 20389, 2700, 10762, 21637, 9678, 27550]),
       ...,
       list([22355, 16531, 10762, 1491, 4009, 16484, 10762, 20389, 15746, 10762, 20389, 34556, 20250]),
       list([20389, 5694, 27080, 10612, 24245, 21251, 9748, 4046, 9115, 24317]),
       list([12056, 2119, 33423, 30373, 17909, 26523, 13097, 5063, 4022, 13097, 24926, 10762, 6720])],
      dtype=object)

In [11]:
# Labelling journal names 
y_int = Headlines.y_to_int(data, min_per_df)
y_int

array([4, 2, 1, ..., 1, 0, 4])

In [12]:
y_int[0:20]

array([4, 2, 1, 2, 1, 4, 1, 1, 4, 1, 3, 0, 0, 3, 3, 1, 2, 2, 4, 3])

In [13]:
y_onehot = np_utils.to_categorical(y_int)
y_onehot

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [14]:
# In Keras, sequences must have the same length
# Searching for the longest length of headlines
max_headline_lenght = Headlines.max_hl_length(data, x_int)

The longest headline consists of 38 words


In [15]:
#truncate and pad input sequences
x_samelength = sequence.pad_sequences(x_int, maxlen=max_headline_lenght)
x_samelength

array([[    0,     0,     0, ..., 29472, 13444, 34434],
       [    0,     0,     0, ..., 13444, 13097,  8598],
       [    0,     0,     0, ..., 21637,  9678, 27550],
       ...,
       [    0,     0,     0, ..., 20389, 34556, 20250],
       [    0,     0,     0, ...,  4046,  9115, 24317],
       [    0,     0,     0, ..., 24926, 10762,  6720]], dtype=int32)

In [16]:
# Dividing between training and test set
#x_train, x_test, y_train, y_test = Headlines.splitting_data(data, x_samelength, y_onehot, 0.1)
x_train, x_val, x_test, y_train, y_val, y_test = Headlines.splitting_data_threesets(data, x_samelength, y_onehot, 0.8)



In [17]:
# Loading a pre-trained Spanish embedding 
words, embeddings = pickle.load(open('polyglot-es.pkl', 'rb'), encoding='latin1')
print("Emebddings shape is {}".format(embeddings.shape))

Emebddings shape is (100004, 64)


In [18]:
#Defining Adam optimizer
epochss = 1000
learning_rate = 5e-4
decay_rate = learning_rate/epochss
adamm = Adam(lr=learning_rate, beta_1=0.1, beta_2=0.001, epsilon=1e-08, decay=decay_rate)

In [19]:
#create the LSTM model
def baseline_model():
#first layer: embedded layer. uses 5 length vectors to represent each word
    embedding_vector_length = 64
    model=Sequential()
    
    #model.add(Embedding(len_dict, embedding_vector_length, input_length=max_headline_lenght))
    model.add(Embedding(100004, embedding_vector_length, trainable =False, weights= [embeddings], input_length=max_headline_lenght))
    
    model.add(Dropout(0.35))
    model.add(Conv1D(filters=embedding_vector_length, kernel_size=3, padding='same', activation='relu'))

    model.add(Dropout(0.5))

#Next layer: LSTM layer with 100 memory units
    model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
    
#Final layer: Dense output layer with a single neuron and a sigmoid activation function to make 0 or 1 predictions
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
#Because we used a one-hot encoding for our iris dataset, the output layer must create 3 output values, one for each class.
    model.add(Dense(5, activation='softmax'))

#Because it is a binary classification problem, log loss is used as the loss function 

#Because it is a multi-class classification problem, categorical cross entropy is used as the loss function
    model.compile(loss='categorical_crossentropy', optimizer = adamm, metrics=['accuracy'])
    print(model.summary())
    return model

In [20]:
# WAY 2: no sklearn wrapper, simply using keras
model = baseline_model()
#history = model.fit(x_train, y_train, validation_data= (x_test, y_test), epochs=200, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 38, 64)            6400256   
_________________________________________________________________
dropout_1 (Dropout)          (None, 38, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 38, 64)            12352     
_________________________________________________________________
dropout_2 (Dropout)          (None, 38, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 512)               51712     
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
__________

In [25]:
# define the checkpoint
dirr = os.path.dirname(os.path.realpath('__file__'))
filepath = os.path.join(dirr, 'LSTM_journals','weights-improvement-{epoch:03d}-{val_acc:.4f}.hdf5')
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
#history = model.fit(x_train, y_train, validation_data= (x_test, y_test), epochs=epochss, batch_size=128, callbacks=callbacks_list)

history = model.fit(x_train, y_train, validation_data= (x_val, y_val), epochs=epochss, batch_size=128, callbacks=callbacks_list)

Train on 44104 samples, validate on 5513 samples
Epoch 1/1000

Epoch 00001: val_acc improved from -inf to 0.27517, saving model to /home/angela/repos/headlines_classificator/LSTM_journals/weights-improvement-001-0.2752.hdf5
Epoch 2/1000

Epoch 00002: val_acc did not improve
Epoch 3/1000

Epoch 00003: val_acc did not improve
Epoch 4/1000

Epoch 00004: val_acc did not improve
Epoch 5/1000

Epoch 00005: val_acc did not improve
Epoch 6/1000

Epoch 00006: val_acc improved from 0.27517 to 0.28115, saving model to /home/angela/repos/headlines_classificator/LSTM_journals/weights-improvement-006-0.2812.hdf5
Epoch 7/1000

Epoch 00007: val_acc did not improve
Epoch 8/1000

Epoch 00008: val_acc did not improve
Epoch 9/1000

Epoch 00009: val_acc did not improve
Epoch 10/1000

Epoch 00010: val_acc did not improve
Epoch 11/1000

Epoch 00011: val_acc did not improve
Epoch 12/1000

Epoch 00012: val_acc improved from 0.28115 to 0.28134, saving model to /home/angela/repos/headlines_classificator/LSTM_jou


Epoch 00040: val_acc did not improve
Epoch 41/1000

Epoch 00041: val_acc did not improve
Epoch 42/1000

Epoch 00042: val_acc did not improve
Epoch 43/1000

Epoch 00043: val_acc did not improve
Epoch 44/1000

Epoch 00044: val_acc did not improve
Epoch 45/1000

Epoch 00045: val_acc did not improve
Epoch 46/1000

Epoch 00046: val_acc did not improve
Epoch 47/1000

Epoch 00047: val_acc did not improve
Epoch 48/1000

Epoch 00048: val_acc did not improve
Epoch 49/1000

Epoch 00049: val_acc did not improve
Epoch 50/1000

Epoch 00050: val_acc did not improve
Epoch 51/1000

Epoch 00051: val_acc did not improve
Epoch 52/1000

Epoch 00052: val_acc did not improve
Epoch 53/1000

Epoch 00053: val_acc did not improve
Epoch 54/1000

Epoch 00054: val_acc improved from 0.29675 to 0.30238, saving model to /home/angela/repos/headlines_classificator/LSTM_journals/weights-improvement-054-0.3024.hdf5
Epoch 55/1000

Epoch 00055: val_acc did not improve
Epoch 56/1000

Epoch 00056: val_acc did not improve
Epo


Epoch 00085: val_acc did not improve
Epoch 86/1000

Epoch 00086: val_acc improved from 0.30346 to 0.30365, saving model to /home/angela/repos/headlines_classificator/LSTM_journals/weights-improvement-086-0.3036.hdf5
Epoch 87/1000

Epoch 00087: val_acc did not improve
Epoch 88/1000

Epoch 00088: val_acc did not improve
Epoch 89/1000

Epoch 00089: val_acc did not improve
Epoch 90/1000

Epoch 00090: val_acc improved from 0.30365 to 0.30600, saving model to /home/angela/repos/headlines_classificator/LSTM_journals/weights-improvement-090-0.3060.hdf5
Epoch 91/1000

Epoch 00091: val_acc did not improve
Epoch 92/1000

Epoch 00092: val_acc improved from 0.30600 to 0.30927, saving model to /home/angela/repos/headlines_classificator/LSTM_journals/weights-improvement-092-0.3093.hdf5
Epoch 93/1000

Epoch 00093: val_acc did not improve
Epoch 94/1000

Epoch 00094: val_acc did not improve
Epoch 95/1000

Epoch 00095: val_acc did not improve
Epoch 96/1000

Epoch 00096: val_acc did not improve
Epoch 97/

In [None]:
# load the network weights
#filename = "weights-improvement-440-0.3196.hdf5"
#model.load_weights(filename)
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
print(history.history.keys())

In [None]:
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
y_predprob = model.predict(x_test)
y_pred = model.predict_classes(x_test)
y_pred_decoded = Headlines.int_to_journal(data, y_pred)

y_trpred = model.predict_classes(x_train)
y_trpred_decoded = Headlines.int_to_journal(data, y_trpred)

In [None]:
#Predicting in the test set
#y_pred = estimator.predict(x_test)
#y_pred_decoded = Headlines.int_to_journal(data, y_pred)
#y_pred = encoder.inverse_transform(predictions)

#y_predprob = estimator.predict_proba(x_test)[:,1]
#y_trpred = estimator.predict(x_train)
#y_trpred_decoded = Headlines.int_to_journal(data, y_trpred)

y_pred

In [None]:
#Both y_train and y_test are one-hot-encoded. Decoding them for model reporting.
y_train_tocat = Headlines.onehot_to_categorical(data, y_train)
y_test_tocat = Headlines.onehot_to_categorical(data, y_test)

In [None]:
#Print model report:
print ("\nModel Report")
print ("Accuracy (train set): %.4g" % metrics.accuracy_score(y_train_tocat, y_trpred))
print ("Accuracy (test set): %.4g" % metrics.accuracy_score(y_test_tocat, y_pred))
print("Confusion matrix:")
print (metrics.confusion_matrix(y_test_tocat, y_pred))
print("Detailed classification report:")
print (metrics.classification_report(y_test_tocat, y_pred))

In [None]:
# Evaluating the model with k-Fold Validation
seed=7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))