In [2]:
import pandas as pd
import numpy as np

# Load the training dataframe that was preprocessed by preprocessing.py.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
df = pd.read_pickle("../data/df_preproc.pk")

In [3]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [24]:
!pip install emot
!pip install fasttext
!pip install sentencepiece

Collecting emot
  Downloading https://files.pythonhosted.org/packages/49/07/20001ade19873de611b7b66a4d5e5aabbf190d65abea337d5deeaa2bc3de/emot-2.1-py3-none-any.whl
Installing collected packages: emot
Successfully installed emot-2.1


In [6]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import tokenization

# TF Hub handle of the BERT encoder (bert_en_uncased_L-12_H-768_A-12, version 2).
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# trainable=True is passed so the encoder weights can be updated when the
# model is fitted (fine-tuning) rather than kept frozen.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [31]:
from sklearn.model_selection import train_test_split

#codage des labels
catmap = {"pos" : 0, "neg" : 1, "neu" : 2, "irr" : 3}
#décodage
invmap = {0 : "pos", 1 : "neg", 2 : "neu", 3 : "irr"}

X = ["Tweet", "Language"]
y = "Avis"

df_train_test = df[X + [y]].copy()
df_train_test[y] = df_train_test[y].apply(lambda x : catmap[x])

#partition en données de train/test
train, test = train_test_split(df_train_test, test_size=0.2, random_state=1000)

#exclusion de la classe "irr"
train = train[train.Avis != catmap["irr"]]

In [8]:
# Tweet vectorisation

# Build the tokenizer from the vocabulary file and lower-casing flag exposed
# by the loaded BERT layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: sized sequence of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row; must be at least 2 so that the
            two special tokens fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(len(texts), max_len)``. ``masks`` is 1 on real tokens and 0
        on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: if ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With less than two slots, [CLS] and [SEP] alone overflow the row and
        # the padding length goes negative, producing rows longer than max_len.
        raise ValueError("max_len must be >= 2 to hold [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # Explicit dtype and shape so that an empty input still yields
    # (0, max_len) integer arrays instead of (0,) float arrays.
    shape = (len(all_tokens), max_len)
    return (
        np.array(all_tokens, dtype=int).reshape(shape),
        np.array(all_masks, dtype=int).reshape(shape),
        np.array(all_segments, dtype=int).reshape(shape),
    )

In [10]:
# Model construction
def build_model(bert_layer, max_len=512, lr=1e-5):
    """Build and compile a 3-class softmax classifier on top of a BERT layer.

    Args:
        bert_layer: Keras-callable layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            ``(pooled_output, sequence_output)``.
        max_len: length of each of the three integer input sequences.
        lr: learning rate given to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model with three int32 inputs of shape
        ``(max_len,)`` and a 3-way softmax output, trained with categorical
        cross-entropy and reporting accuracy.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token sequence output is used; the pooled output is ignored.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Take the vector at position 0 of every sequence as the sentence summary.
    clf_output = sequence_output[:, 0, :]

    # Small classification head: two Dense(10)+Dropout(0.2) stages.
    net = tf.keras.layers.Dense(10, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(10, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(3, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras versions reject.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [11]:
# Fixed row length (special tokens included) that every tweet is truncated
# or padded to by bert_encode.
max_len = 150
# Tuple of (token ids, masks, segment ids) arrays for the training tweets.
train_input = bert_encode(train.Tweet.values, tokenizer, max_len=max_len)
#test_input = bert_encode(test.Tweet.values, tokenizer, max_len=max_len)
# One-hot encode the integer training labels over 3 classes.
train_labels = tf.keras.utils.to_categorical(train.Avis.values, num_classes=3)

In [17]:
# Model training
model = build_model(bert_layer, max_len=max_len, lr=0.00001)
# Write the model to model.h5 each time val_accuracy improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
# Stop training once val_accuracy has not improved for 5 epochs.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# 20% of the training data is held out for validation.
# NOTE(review): after fit, `model` presumably holds the weights of the last
# epoch run, not the best checkpoint saved in model.h5 - confirm, and reload
# the checkpoint if the best weights are the ones wanted downstream.
train_history = model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=10,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1)

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.76577, saving model to model.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.76577 to 0.77252, saving model to model.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.77252 to 0.79955, saving model to model.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.79955 to 0.81306, saving model to model.h5
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.81306 to 0.81982, saving model to model.h5
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.81982
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.81982
Epoch 8/10

Epoch 00008: val_accuracy improved from 0.81982 to 0.82207, saving model to model.h5
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.82207
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.82207


In [19]:
# Evaluation on the held-out split
from sklearn.metrics import accuracy_score

test_input = bert_encode(test.Tweet.values, tokenizer, max_len=max_len)
test_output = test.Avis.values
# The network only scores classes 0..2; argmax gives the predicted class id.
pred = np.argmax(model.predict(test_input), axis=1)

# Predict the "irr" class from the tweet language: every non-English tweet
# is labelled "irr".
# A positional boolean mask is required here: `test` keeps the shuffled index
# of the original dataframe, so `test.Language[i]` would look up the index
# *label* i (KeyError or a different row) instead of the i-th row.
pred[(test.Language != "en").values] = catmap["irr"]

# Accuracy over all four classes
accuracy_score(test_output, pred)

0.8143712574850299

In [42]:
# Label prediction for test.txt
# Load the test.txt data preprocessed by preprocessing.py.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
df_test = pd.read_pickle("../data/test_df.pk")

In [43]:
# Vectorise the tweets of test.txt
test_input = bert_encode(df_test.Tweet.values, tokenizer, max_len=max_len)
# Predict among the "pos", "neg" and "neu" classes
pred = np.argmax(model.predict(test_input), axis=1)

# Predict the "irr" class: every non-English tweet is labelled "irr".
# The mask is applied positionally so the result does not depend on what
# df_test's index labels are (`df_test.Language[i]` is a label lookup).
pred[(df_test.Language != "en").values] = catmap["irr"]

# Decode the integer class ids back into label strings
pred = [invmap[p] for p in pred]

In [44]:
# Attach the decoded predictions to the test dataframe as a new column.
df_test["pred"] = pred

In [46]:
# Distribution of the predicted classes
df_test.pred.value_counts()

neu    499
irr    288
neg    116
pos     97
Name: pred, dtype: int64

In [48]:
# Export of the result
def to_txt(in_df, raw_df):
    """Serialise predictions as concatenated ``(id,pred,company) tweet`` records.

    Rows are paired by position: the i-th row of ``in_df`` is written with
    the i-th tweet of ``raw_df``. No separator is inserted between records.

    Args:
        in_df: dataframe with ``pred`` and ``Entreprise`` columns; its index
            values are written as the record identifiers.
        raw_df: dataframe with a ``Tweet`` column, at least as long as
            ``in_df``.

    Returns:
        A single string with one ``"(index,pred,Entreprise) Tweet"`` chunk
        per row of ``in_df``.

    Raises:
        ValueError: if ``raw_df`` has fewer rows than ``in_df``.
    """
    if len(raw_df) < len(in_df):
        raise ValueError("raw_df has fewer rows than in_df")

    # Iterate over the underlying values so rows are matched by position;
    # `series[i]` would be a label lookup and breaks on any non-default index.
    records = zip(
        in_df.index,
        in_df["pred"].values,
        in_df["Entreprise"].values,
        raw_df["Tweet"].values,
    )
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join("({0},{1},{2}) {3}".format(*record) for record in records)

save_to = "../data/test_output.txt"

# Write with an explicit UTF-8 encoding: tweets may contain non-ASCII
# characters, and the platform default encoding could otherwise raise
# UnicodeEncodeError or produce a locale-dependent file.
with open(save_to, "w", encoding="utf-8") as text_file:
    text_file.write(to_txt(df_test, df_test))