In [1]:
import pandas as pd
import numpy as np

import time
import sys
from tqdm.notebook import tqdm

import torch
from transformers import CamembertModel, CamembertTokenizer

pd.set_option('display.max_colwidth', -1)

from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping


Using TensorFlow backend.


In [2]:
path_data = "/Users/yohannlefaou/Documents/data/posos/"

In [3]:
train = pd.read_csv(path_data +"input_train.csv", sep=",")


In [4]:
labels = pd.read_csv(path_data + "output_train.csv", sep=",")

In [5]:
train.head()

Unnamed: 0,ID,question
0,0,"bonjour, je m suis trompé de forum pour ma question alors je la repose ici. je pris pour la première fois hier du paroxétine et ce matin c'est une catastrophe. picotement dasn tous le corps annonciateur de sueur froide très très massive et de vomissement. j'en suis à deux crises depuis 5 heure du mat. la cela semble passer mes mes mains reste moites et chaude estce normal pour la première fois merci a tous"
1,1,est ce que le motilium me soulagera contre les nausées?
2,2,"mon médecin m'a prescrit adenyl. au 2ème cachet des maux de tête terribles et au 3ème palpitations, sueurs froides, chaleur intense dans la tête, tremblements, fourmillements dans la lèvre supérieure, difficultés à respirer.. dès l'arrêt du médicament tous les symptômes ont disparu. cela est-il déjà arrivé à quelqu'un??"
3,3,Est-ce qu'il existe une forme adaptée aux enfant de 5ans du Micropakine ?
4,4,mon medecin me soigne pour une rhino pharingite et m'a prescrit du amoxicilline comme anti biotique. Est-ce vraiment pour cette indication?


In [6]:
labels.head()

Unnamed: 0,ID,intention
0,0,28
1,1,31
2,2,28
3,3,44
4,4,31


In [7]:
# load tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# load model
model = CamembertModel.from_pretrained('camembert-base')

In [10]:
text = train["question"][0]

# encode() automatically adds the classification token <s>
token_ids = tokenizer.encode(text)
tokens = [tokenizer._convert_id_to_token(idx) for idx in token_ids]
print(tokens)

# unsqueeze token_ids because batch_size=1
token_ids = torch.tensor(token_ids).unsqueeze(0)
print(token_ids)

# forward method returns a tuple (we only want the logits)
# squeeze() because batch_size=1
output = model(token_ids)[0].squeeze()
# only grab output of CLS token (<s>), which is the first token
cls_out = output[0]
print(cls_out.size())

['<s>', '▁bonjour', ',', '▁je', '▁m', '▁suis', '▁trompé', '▁de', '▁forum', '▁pour', '▁ma', '▁question', '▁alors', '▁je', '▁la', '▁repose', '▁ici', '.', '▁je', '▁pris', '▁pour', '▁la', '▁première', '▁fois', '▁hier', '▁du', '▁par', 'ox', 'é', 'tine', '▁et', '▁ce', '▁matin', '▁c', "'", 'est', '▁une', '▁catastrophe', '.', '▁pic', 'ote', 'ment', '▁d', 'as', 'n', '▁tous', '▁le', '▁corps', '▁', 'annon', 'ci', 'ateur', '▁de', '▁sueur', '▁froide', '▁très', '▁très', '▁massive', '▁et', '▁de', '▁vomi', 'ssement', '.', '▁j', "'", 'en', '▁suis', '▁à', '▁deux', '▁crises', '▁depuis', '▁5', '▁heure', '▁du', '▁mat', '.', '▁la', '▁cela', '▁semble', '▁passer', '▁mes', '▁mes', '▁mains', '▁reste', '▁moi', 'tes', '▁et', '▁chaude', '▁est', 'ce', '▁normal', '▁pour', '▁la', '▁première', '▁fois', '▁merci', '▁a', '▁tous', '</s>']
tensor([[    5,  5061,     7,    50,   115,   146, 12125,     8,  1026,    24,
           155,   397,   183,    50,    13,  4537,   323,     9,    50,   523,
            24,    13,   272

In [23]:
token_ids[:, :]

tensor([[    5,    50,  4608,  8331,   850,   451,    17,   286,   176,    20,
           318,     7,     8,   124,    15,   135, 26347,    10,    15,   135,
         19365,     7,   395,   129,   956,     8,   742,  5472,    15,   742,
          5228,    43,   118, 26347,    10,    15,  5378,     7,    27,    76,
            11,    73,  3470,    15,   124,    38,  2156, 10831,    10, 13770,
           280,   101,   120,    22,   732,   136,    49,    11, 12029,    34,
            72,    38, 12699,    10, 14443,    35,     7,   792,     8,   254,
             7,  2959,   142,    76,    11,    73,   380,    23,   281, 12960,
            18,    10,    23,    60,  3589,    31,    33,    67,  6834,    15,
           118, 26347,    10,    38, 12067,  2802,  5378,    53,   387, 14742,
           118, 26484,    10, 10263,    67,  6834,    15,   135, 26347,    10,
          3120,  2195,  5378,    53,   387, 14742,   135,  1075,   129,   956,
          1432,    95,  2935,    65,    34,   334,  

In [24]:
# compute embeddings for all questions
max_tokens = 512 # maximum allowed number of tokens in one sentence in order to compute embeddings
sentence_tokens = []
sentence_embeddings = []
for i in tqdm(range(len(train))):
    text = train["question"][i]
    token_ids = tokenizer.encode(text)
    token_ids = torch.tensor(token_ids).unsqueeze(0)
    sentence_tokens.append(token_ids)
    output = model(token_ids[:, :max_tokens])[0].squeeze()
    sentence_embeddings.append(output[0].tolist())

HBox(children=(IntProgress(value=0, max=8028), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (650 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (966 > 512). Running this sequence through the model will result in indexing errors





In [25]:
train2 = pd.DataFrame(sentence_embeddings)

In [26]:
train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.10326,0.164496,0.000466,0.035828,-0.014292,-0.012606,-0.057755,0.201483,0.020815,0.10738,...,-0.003843,-0.086718,-0.041593,0.036421,0.165885,0.057104,-0.093255,-0.149246,-0.05035,0.122724
1,-0.085814,0.019714,0.092567,-0.046294,-0.063478,-0.012849,0.079008,0.265424,0.04119,0.084285,...,-0.116445,-0.077676,-0.152492,0.058571,0.083522,0.096404,0.073179,-0.09217,-0.057123,-0.005349
2,-0.075152,0.073966,0.039749,-0.00061,-0.024459,0.001018,-0.033008,0.212762,0.004864,0.096275,...,0.04405,-0.101421,-0.123798,0.122955,0.101871,0.06703,-0.021039,-0.177599,-0.051054,0.047357
3,0.032002,0.120783,0.066957,-0.128963,-0.008886,-0.034391,-0.008858,0.255312,0.078952,0.094861,...,-0.059976,0.025396,-0.195635,0.099526,0.057682,0.064332,0.017317,-0.061286,0.009324,-0.039828
4,-0.092328,0.184989,0.024131,-0.07604,0.028363,-0.058682,-0.020498,0.225975,0.00771,0.071678,...,-0.023249,0.029947,-0.145327,0.07908,0.088762,0.089566,0.032271,-0.190924,-0.045433,-0.005441


In [27]:
sys.getsizeof(train2)

49324192

In [23]:
train2.to_csv(path_data + "sentence_embeddings_CLS.csv", index=False)

In [8]:
train2 = pd.read_csv(path_data + "sentence_embeddings_CLS.csv")

# Neural network model

In [24]:
# preprocessing
# one hot encode the target
ohe = OneHotEncoder()
one_hot_encode_labels = ohe.fit_transform(labels["intention"].values.reshape(-1, 1)).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(train2, one_hot_encode_labels, test_size=0.2,
                                                      random_state=2019)

In [26]:
nn = Sequential()
nn.add(Dropout(0.3, input_shape=(768,)))
#nn.add(Dense(1024, input_dim= 768, activation="relu")) # input_dim= 768
#nn.add(Dropout(0.3))
#nn.add(Dense(256, activation="relu"))
#nn.add(Dropout(0.3))
nn.add(Dense(51, activation="softmax")) #a completer

nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [27]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)

nn.fit(np.array(X_train), y_train, epochs=30, batch_size=32,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a7ceb6cd0>

In [28]:
nn.fit(np.array(X_train), y_train, epochs=10, batch_size=32,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a90f60bd0>

In [29]:
nn.fit(np.array(X_train), y_train, epochs=10, batch_size=32,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a7d300b50>

In [30]:
nn.fit(np.array(X_train), y_train, epochs=30, batch_size=64,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30

KeyboardInterrupt: 

In [31]:
nn.fit(np.array(X_train), y_train, epochs=20, batch_size=128,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a90f53250>

In [32]:
nn.fit(np.array(X_train), y_train, epochs=20, batch_size=128,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1a7d1bd750>

In [33]:
nn = Sequential()
nn.add(Dropout(0.1, input_shape=(768,)))
#nn.add(Dense(1024, input_dim= 768, activation="relu")) # input_dim= 768
#nn.add(Dropout(0.3))
#nn.add(Dense(256, activation="relu"))
#nn.add(Dropout(0.3))
nn.add(Dense(51, activation="softmax")) #a completer

nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)

nn.fit(np.array(X_train), y_train, epochs=30, batch_size=64,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a8ff49c10>

In [35]:
nn.fit(np.array(X_train), y_train, epochs=30, batch_size=64,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a7d1bd690>

In [36]:
nn.fit(np.array(X_train), y_train, epochs=30, batch_size=64,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a7d0dbad0>

In [37]:
nn.fit(np.array(X_train), y_train, epochs=30, batch_size=124,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a7efb1d90>

In [38]:
nn.fit(np.array(X_train), y_train, epochs=30, batch_size=256,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a7efb8310>

In [39]:
nn = Sequential()
#nn.add(Dropout(0.3, input_shape=(768,)))
#nn.add(Dense(1024, input_dim= 768, activation="relu")) # input_dim= 768
#nn.add(Dropout(0.3))
#nn.add(Dense(256, activation="relu"))
#nn.add(Dropout(0.3))
nn.add(Dense(51, activation="softmax")) #a completer

nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)

nn.fit(np.array(X_train), y_train, epochs=30, batch_size=32,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a91129d50>

In [41]:
nn.fit(np.array(X_train), y_train, epochs=30, batch_size=64,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1a7f0987d0>

In [43]:
nn.fit(np.array(X_train), y_train, epochs=10, batch_size=2048,
       validation_data=(np.array(X_valid), y_valid), shuffle=True) #, callbacks=[es]

Train on 6422 samples, validate on 1606 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a7f1092d0>

# Export data

In [95]:
export_X_train, export_X_test, export_y_train, export_y_test = train_test_split(train,
                                                                                labels["intention"].values,
                                                                                test_size=0.2, 
                                                                                random_state=2019)

export_X_train["intention"] = export_y_train
export_X_test["intention"] = export_y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [99]:
export_X_train.to_csv("data/train.tsv", sep="\t", index=False)
export_X_train.to_csv("data/test.tsv", sep="\t", index=False)

In [97]:
dev = pd.read_csv(path_data +"input_test_b1Yip6O.csv", sep=",")

In [98]:
dev.to_csv("data/dev.tsv", sep="\t", index=False)