# Préparation des données avec ELMO à la place de Word2Vec

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training corpus into a DataFrame.
DATASET_CSV = 'data/train_cap2018.csv'
dataset = pd.read_csv(DATASET_CSV)

In [3]:
from sklearn.model_selection import train_test_split
# Hold-out fraction and RNG seed passed to train_test_split.
test_size, random_state = 0.2, 42

In [4]:
# Shuffled hold-out split, stratified on the `level1` column.
train, test = train_test_split(
    dataset,
    test_size=test_size,
    random_state=random_state,
    shuffle=True,
    stratify=dataset['level1'],
)
for split in (train, test):
    print(split.shape)

(21848, 60)
(5462, 60)


In [5]:
# All columns but the last go to X_train; the last column becomes y_train.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
for part in (X_train, y_train):
    print(part.shape)

(21848, 59)
(21848,)


In [6]:
# Encode the level names A1..C2 as integer class ids 0..5.
level_to_id = {"A1": 0, "A2": 1, "B1": 2, "B2": 3, "C1": 4, "C2": 5}
y_train = y_train.replace(level_to_id)

In [7]:
# Only the raw text column is used as model input.
# (For a quick smoke test, keep just the first rows here, e.g. `.iloc[:10]`,
# and slice `y_train` to the same length.)
X_train_fulltext = X_train['fulltext']

In [8]:
# Fraction of the training partition used for fitting; the remainder is validation.
training_size = 0.7
split_at = int(X_train_fulltext.shape[0] * training_size)

# NOTE: `y_train` is rebound below, so running this cell a second time would
# slice the already-truncated labels instead of the full label vector.
y_train_full = y_train

X_train, X_val = X_train_fulltext[:split_at], X_train_fulltext[split_at:]
y_train, y_val = y_train_full[:split_at], y_train_full[split_at:]

In [9]:
# Discard the pandas index and keep only the label values as NumPy arrays.
y_train = np.array(y_train.reset_index(drop=True))
y_val = np.array(y_val.reset_index(drop=True))

In [10]:
# Sanity check: texts and labels must have matching lengths in each partition.
for part in (X_val, y_val, X_train, y_train):
    print(part.shape)

(6555,)
(6555,)
(15293,)
(15293,)


# Programmation en mode graphe TensorFlow 1.x (placeholders et sessions, et non « eager execution »)

Récupération d'un module ELMo pré-entraîné depuis TensorFlow Hub.

In [11]:
import tensorflow as tf
import tensorflow_hub as hub

AttributeError: module 'tensorflow' has no attribute 'estimator'

In [12]:
def get_next_batch(X, iteration, batch_size):
    """Return the `iteration`-th consecutive slice of `batch_size` items of `X`.

    Args:
        X: Any sliceable sequence (list, NumPy array, pandas Series, ...).
        iteration: Zero-based batch index.
        batch_size: Number of items per batch.

    Returns:
        `X[iteration*batch_size : (iteration+1)*batch_size]`; shorter than
        `batch_size` (possibly empty) when the slice runs past the end of `X`.
    """
    start = iteration * batch_size
    stop = start + batch_size
    return X[start:stop]

In [None]:
def create_graph_RNN_and_train(X_train, y_train, X_val, y_val, n_steps, n_neurons=500, activation=tf.nn.relu,
                               dropout_in=0, dropout_out=0, class_weights=[1, 1, 1, 1, 1, 1], learning_rate=0.001,
                               n_epochs=100, batch_size=200, max_checks_without_progress=3, n_outputs=None):
    """Build an ELMo + GRU classifier in a fresh default graph and train it.

    The graph embeds raw strings with the TF-Hub ELMo module, runs a single
    GRU cell (wrapped in dropout) over the embeddings and feeds the final GRU
    state to a dense layer producing one logit per class.

    Side effects: resets the default TensorFlow graph, writes the graph to
    "./summary", and saves a checkpoint to
    "./natural_language_classifier.ckpt" every time the validation loss
    improves.

    Args:
        X_train: Sliceable collection of training texts exposing `.shape[0]`.
        y_train: Integer class ids aligned with `X_train`.
        X_val: Validation texts (evaluated in a single batch every epoch).
        y_val: Integer class ids aligned with `X_val`.
        n_steps: Unused by this function; kept for backward compatibility.
        n_neurons: Number of units of the GRU cell.
        activation: Activation function of the GRU cell.
        dropout_in: Dropout rate applied to the GRU inputs during training.
        dropout_out: Dropout rate applied to the GRU outputs during training.
        class_weights: Per-class weights of the cross-entropy loss, indexed
            by class id.
        learning_rate: Learning rate of the Adam optimizer.
        n_epochs: Maximum number of epochs.
        batch_size: Number of texts per training step.
        max_checks_without_progress: Number of consecutive epochs without
            validation-loss improvement after which training stops.
        n_outputs: Number of classes. Defaults to `len(class_weights)`.

    Returns:
        Tuple `(inference, X)`: the softmax output tensor and the string
        input placeholder of the graph that was built.
    """
    tf.reset_default_graph()

    # The class count used to be read from a notebook-level global that is
    # only defined in a later cell; derive it from the weights by default.
    if n_outputs is None:
        n_outputs = len(class_weights)

    X = tf.placeholder(dtype=tf.string, shape=[None], name="X")
    y = tf.placeholder(tf.int64, shape=[None], name="y")

    elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
    X_elmo = elmo(X, signature="default", as_dict=True)["elmo"]
    # Every example is declared to be as long as the second dimension of the
    # ELMo output for the current batch.
    # NOTE(review): if that dimension is the padded batch length, the GRU also
    # steps over padding positions -- confirm whether per-text lengths should
    # be used instead.
    sequence_length = tf.shape(X_elmo)[1]*tf.ones(tf.shape(X_elmo)[0], dtype=tf.int32)

    # Dropout rates default to 0.0, so dropout is off unless explicitly fed.
    dropout_in_placeholder = tf.placeholder_with_default(tf.constant(0.0, dtype=tf.float32), ())
    dropout_out_placeholder = tf.placeholder_with_default(tf.constant(0.0, dtype=tf.float32), ())

    basic_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons, activation=activation)
    basic_cell = tf.contrib.rnn.DropoutWrapper(basic_cell, input_keep_prob=1-dropout_in_placeholder, output_keep_prob=1-dropout_out_placeholder)
    outputs, states = tf.nn.dynamic_rnn(basic_cell, X_elmo, sequence_length=sequence_length, dtype=tf.float32)

    logits = tf.layers.dense(inputs=states, units=n_outputs, name="logits")
    inference = tf.nn.softmax(logits, name="inference")

    with tf.name_scope("loss"):
        # Weighted cross-entropy: each example is weighted by the weight of
        # its true class. (Unweighted alternative:
        # tf.nn.sparse_softmax_cross_entropy_with_logits.)
        class_weights_tf = tf.constant(class_weights)
        weights = tf.gather(class_weights_tf, y)
        xentropy = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits, weights=weights)
        loss = tf.reduce_mean(xentropy, name="loss")

    with tf.name_scope("train"):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)

    with tf.name_scope("eval"):
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    # Only the graph definition is logged; close the writer right away so the
    # event file is flushed and the file handle is released.
    summary_writer = tf.summary.FileWriter("./summary", tf.get_default_graph())
    summary_writer.close()

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # Integer division: trailing examples that do not fill a whole batch are
    # skipped in every epoch.
    n_batches_per_epoch = X_train.shape[0] // batch_size
    print("Nombre de batchs par epoch =", n_batches_per_epoch)

    # The training loss is only monitored on the first 5000 training texts
    # (so that the evaluation fits in memory).
    nb_training_examples = min(5000, X_train.shape[0])
    X_train_monitor = X_train[0:nb_training_examples]
    y_train_monitor = y_train[0:nb_training_examples]

    best_loss = np.inf
    checks_without_progress = 0

    with tf.Session() as sess:
        init.run()
        for epoch in range(n_epochs):
            for iteration in range(n_batches_per_epoch):
                if (iteration+1)%10==0:
                    print("Batch n°", iteration+1)
                X_batch = get_next_batch(X_train, iteration, batch_size)
                y_batch = get_next_batch(y_train, iteration, batch_size)

                sess.run(training_op, feed_dict={X: X_batch, y: y_batch, dropout_in_placeholder: dropout_in, dropout_out_placeholder: dropout_out})

            # Losses are evaluated without feeding the dropout placeholders,
            # i.e. with dropout disabled.
            loss_train = loss.eval(feed_dict={X: X_train_monitor, y: y_train_monitor})
            loss_val = loss.eval(feed_dict={X: X_val, y: y_val})
            print(epoch, "Loss training:", loss_train)
            print(epoch, "Loss validation:", loss_val)

            if loss_val < best_loss:
                # Keep only the best model seen so far on the validation set.
                save_path = saver.save(sess, "./natural_language_classifier.ckpt")
                best_loss = loss_val
                checks_without_progress = 0
            else:
                checks_without_progress += 1
                # Use the function argument, not a notebook-level global.
                if checks_without_progress >= max_checks_without_progress:
                    print("Early stopping!")
                    break
    return inference, X

In [None]:
# --- Model settings ---
n_steps = 450  # maximum text length in words (fixed)
print("Nombre maximal de mots par texte (fixe) =", n_steps)
n_neurons, n_outputs = 500, 6
activation = tf.nn.relu
class_weights = [1, 1, 1, 1, 1, 1]  # per-class weights of the loss function

# --- Optimisation settings ---
learning_rate = 0.001
n_epochs, batch_size = 100, 200

# Early-stopping patience, passed as `max_checks_without_progress` when training.
MAX_CHECKS_WITHOUT_PROGRESS = 3

In [12]:
# Build the graph, train it, and keep the tensors needed later for inference.
inference, X = create_graph_RNN_and_train(
    X_train, y_train, X_val, y_val, n_steps,
    n_neurons=n_neurons,
    activation=activation,
    class_weights=class_weights,
    learning_rate=learning_rate,
    n_epochs=n_epochs,
    batch_size=batch_size,
    max_checks_without_progress=MAX_CHECKS_WITHOUT_PROGRESS,
)



NameError: name 'create_graph_RNN_and_train' is not defined

# A modifier

In [16]:
# 6x6 cost matrix; `cost` multiplies it element-wise with the confusion matrix.
costs = np.array([
    [0, 1, 2, 3, 4, 6],
    [1, 0, 1, 4, 5, 8],
    [3, 2, 0, 3, 5, 8],
    [10, 7, 5, 0, 2, 7],
    [20, 16, 12, 4, 0, 8],
    [44, 38, 32, 19, 13, 0],
])
# Class names in class-id order.
names = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

(3,)
(3,)
(7,)
(7,)


In [None]:
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes=['A1', 'A2', 'B1', 'B2', 'C1', 'C2'],
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Args:
        cm: 2-D NumPy array of counts; rows are labelled "True label" and
            columns "Predicted label" on the plot.
        classes: Tick labels, in class-id order.
        normalize: When True, every row is divided by its sum before being
            printed and plotted. Rows whose sum is zero are left at 0 instead
            of producing NaN.
        title: Title of the plot.
        cmap: Matplotlib colormap used for the image.

    The function draws on the current pyplot figure and does not call
    `plt.show()`; the caller decides when to create and display the figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guarded division: a class with no true example has an all-zero row,
        # and a plain division would fill it (and the colour threshold below)
        # with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell annotations: two decimals for ratios, plain integers for counts.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for readability.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import confusion_matrix
# Module-level switch read by `cost`: when True, the confusion matrix is also plotted.
print_confusion = True
def cost(y_pred, y_true, normalize=True):
    """Return the average misclassification cost of `y_pred` against `y_true`.

    The confusion matrix of the predictions is multiplied element-wise with
    the module-level `costs` matrix, summed, and divided by the number of
    examples. When the module-level flag `print_confusion` is true, the
    confusion matrix is also plotted.

    Args:
        y_pred: Predicted integer class ids.
        y_true: True integer class ids, same length as `y_pred`.
        normalize: Whether the plotted confusion matrix is row-normalized
            (only affects the plot, not the returned value).

    Returns:
        The mean cost per example.
    """
    # Pin the label set to the classes of the cost matrix: without it the
    # confusion matrix only contains the classes present in the data, and its
    # shape no longer matches `costs` when a class is missing.
    labels = np.arange(costs.shape[0])
    confusion = confusion_matrix(y_true, y_pred, labels=labels)
    res = (1/len(y_true)) * np.sum(np.multiply(costs, confusion))

    if print_confusion:
        np.set_printoptions(precision=2)

        # Plot the (optionally normalized) confusion matrix computed above.
        plt.figure()
        title = 'Normalized confusion matrix' if normalize else 'Confusion matrix'
        plot_confusion_matrix(confusion, normalize=normalize, title=title)

        plt.show()
    return res

In [None]:
# Make `cost` plot the confusion matrix in addition to returning the score.
print_confusion = True

# Reload the checkpoint saved during training and predict on the validation texts.
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, "./natural_language_classifier.ckpt")
    val_feed = {X: get_next_batch(X_val, 0, X_val.shape[0])}
    res = sess.run(inference, feed_dict=val_feed)

# The predicted class is the one with the highest softmax probability.
y_pred = np.argmax(res, axis=1)

print(cost(y_pred, y_val))