In [1]:
import pandas as pd
import unidecode
import re
import nltk
import gensim
import numpy as np
import keras
import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pickle
from keras.utils import np_utils
from keras.layers import Embedding
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Data

In [2]:
paths = {
    "dev":"../data/macmorpho-v3/macmorpho-dev.txt",
    "test":"../data/macmorpho-v3/macmorpho-test.txt",
    "train":"../data/macmorpho-v3/macmorpho-train.txt",
}

In [3]:
#load data
sentences = []
for p in paths.values():
    file = open(p, "r")
    sentences = sentences + file.readlines()
    file.close()
data = pd.DataFrame()
data['sentences'] = sentences

In [4]:
len(data)

49932

In [5]:
data.head()

Unnamed: 0,sentences
0,"Ainda_ADV em_PREP dezembro_N de_PREP 1990_N ,_..."
1,"Porém_KC ,_PU como_KS a_ART previsão_N indica_..."
2,"""_PU O_ART crescimento_N é_V expressivo_ADJ ma..."
3,O_ART programa_N atende_V ainda_ADV as_ART cul...
4,"de_PREP qualquer_PROADJ maneira_N ,_PU toda_PR..."


# Pre-processing

vou fazer uma rede que recebe uma palavra e retorna 

- remover acentos
- remover pontuação (?) -> nao vou fazer isso... pq tbm é uma tag
- remover \n
- aplicar lower case
- get TAGS
- DEPOIS fazer isso:
- aplicar token
- fazer stemming
- aplicar pharese

- preparacao dos dados 
    separar as palavras e a tag. Dicionario tag: word
- create embedding com todos os dados
- olhar tamanho das palavras
- vetorzacao, padding... etc.

In [6]:
def remove_accents(sentence):
    return unidecode.unidecode(sentence)

In [7]:
def remove_character(sentence):
    return re.sub(' +',' ', (sentence.replace("\n", " ")).replace("\t", " "))

In [8]:
def pre_processing(sentence):
    sentence = remove_character(sentence)
    sentence = remove_accents(sentence)
    sentence = sentence.lower()
    return sentence

In [9]:
#pre-processing
data["sentences"] = data["sentences"].map(pre_processing)

In [10]:
#get text
data["corpus"] = data.loc[:, "sentences"].apply(lambda x : (" ".join([y.split("_")[0] for y in x.strip().split(" ")])).strip())

In [11]:
# get tags
data["tags"] = data.loc[:, "sentences"].apply(lambda x : (" ".join([y.split("_")[1] for y in x.strip().split(" ")])).strip())

In [12]:
data.head()

Unnamed: 0,sentences,corpus,tags
0,"ainda_adv em_prep dezembro_n de_prep 1990_n ,_...","ainda em dezembro de 1990 , foi editada a famo...",adv prep n prep n pu v pcp art adj n pu pro-ks...
1,"porem_kc ,_pu como_ks a_art previsao_n indica_...","porem , como a previsao indica entrada de fren...",kc pu ks art n v n prep n adj pu v propess v p...
2,"""_pu o_art crescimento_n e_v expressivo_adj ma...",""" o crescimento e expressivo mas , mesmo assim...",pu art n v adj kc pu pden pden pu v prep prep+...
3,o_art programa_n atende_v ainda_adv as_art cul...,o programa atende ainda as culturas de feijao ...,art n v adv art n prep n pu adj kc adj n pu pu...
4,"de_prep qualquer_proadj maneira_n ,_pu toda_pr...","de qualquer maneira , toda essa "" informalidad...",prep proadj n pu proadj proadj pu n pu prep+ar...


In [13]:
#apply token
data["corpus"] = data.loc[:, "corpus"].apply(lambda x : nltk.word_tokenize(x))
data["tags"] = data.loc[:, "tags"].apply(lambda x : nltk.word_tokenize(x))

In [14]:
data.head()

Unnamed: 0,sentences,corpus,tags
0,"ainda_adv em_prep dezembro_n de_prep 1990_n ,_...","[ainda, em, dezembro, de, 1990, ,, foi, editad...","[adv, prep, n, prep, n, pu, v, pcp, art, adj, ..."
1,"porem_kc ,_pu como_ks a_art previsao_n indica_...","[porem, ,, como, a, previsao, indica, entrada,...","[kc, pu, ks, art, n, v, n, prep, n, adj, pu, v..."
2,"""_pu o_art crescimento_n e_v expressivo_adj ma...","[``, o, crescimento, e, expressivo, mas, ,, me...","[pu, art, n, v, adj, kc, pu, pden, pden, pu, v..."
3,o_art programa_n atende_v ainda_adv as_art cul...,"[o, programa, atende, ainda, as, culturas, de,...","[art, n, v, adv, art, n, prep, n, pu, adj, kc,..."
4,"de_prep qualquer_proadj maneira_n ,_pu toda_pr...","[de, qualquer, maneira, ,, toda, essa, ``, inf...","[prep, proadj, n, pu, proadj, proadj, pu, n, p..."


# Embedding

In [15]:
data.columns

Index(['sentences', 'corpus', 'tags'], dtype='object')

## Phrases and Phraser

In [16]:
corpus = data.corpus.values
bigram_corpus_phrases = gensim.models.Phrases(corpus)
bigram_corpus = bigram_corpus_phrases[corpus]



In [17]:
size_emb = 300
model_word2vec = gensim.models.word2vec.Word2Vec(bigram_corpus, min_count=1, workers=4, seed=123, size=size_emb)

In [18]:
len(model_word2vec.wv.vocab)

56252

In [19]:
def create_dict_embedding(word2vec):
    embeddings_dict = {}
    #get size dimension
    size_vec = word2vec.wv.vector_size
    #create disctionary
    for word in list(word2vec.wv.vocab.keys()):
        embeddings_dict[word] = word2vec.wv[word] 
    return embeddings_dict

In [20]:
def get_embeddings(word2vec):
    embeddings_dict = create_dict_embedding(word2vec)
    size = len(embeddings_dict)
    embedding_matrix = np.ndarray((size, word2vec.wv.vector_size), dtype='float32')
    wv_map = {}
    pos = 0
    for i, (word, vector) in enumerate(embeddings_dict.items()):
        #pos = i + 1
        pos = i
        wv_map[word] = pos
        embedding_matrix[pos] = vector
    #pos += 1
    #wv_map["<unk>"] = pos
    #embedding_matrix[pos] = numpy.random.uniform(low=-0.05, high=0.05, size=dim)
    return embedding_matrix, wv_map

# Split data: Train, Test and Validation

In [21]:
data = shuffle(data, random_state=0)

In [22]:
corpus = data.corpus.values
tags = data.tags.values

In [23]:
X_train, X_test, y_train, y_test = train_test_split(corpus, tags, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val   = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

# Preparing data
- padding max_sentence_length  with 0
- index words
- index tags

In [62]:
embeddings, words_index = get_embeddings(model_word2vec)

In [25]:
tags_index = {}
i = 1
for tags in data.tags.values:
    for tag in tags:
        if(tag not in tags_index.keys()):
            tags_index[tag] = i
            i = i + 1

In [26]:
max_sentence_length = int(data["corpus"].map(len).describe()['max'] + 1)
max_sentence_length

249

## Index

In [27]:
# SET sentence index and padding
def index_words(sentences, max_sentence_length, words_index):    
    print("indexing sentences...")
    text_max = False
    sent_words = []

    sentences_words_indexed = np.zeros((len(sentences), max_sentence_length), dtype='int32')

    for i, sentence in enumerate(sentences):
        print(str(i)+"/"+str(len(sentences)))
        text_max = False
        sent_words = []
        for word in sentence:
            if(word not in set(words_index.keys())):
                continue
            sent_words.append(words_index[word])
            if(len(sent_words) >= max_sentence_length):
                text_max = True
                break
        if(text_max == False):
            sent_words = np.pad(sent_words, (0, max_sentence_length - len(sent_words) % max_sentence_length), 'constant')
        sentences_words_indexed[i] = sent_words
    return sentences_words_indexed

In [28]:
len(X_train)

31956

In [29]:
len(X_train[:200])

200

In [211]:
X_train_inx = pickle.load(open("data/X_train_index.p", "rb"))
y_train_inx = pickle.load(open("data/y_train_index.p", "rb"))

X_test_inx = pickle.load(open("data/X_test_index.p", "rb"))
y_test_inx = pickle.load(open("data/y_test_index.p", "rb"))

X_val_inx = pickle.load(open("data/X_val_index.p", "rb"))
y_val_inx = pickle.load(open("data/y_val_index.p", "rb"))


In [None]:
#train
print("traing")
X_train_inx = index_words(X_train, max_sentence_length, words_index)
y_train_inx = index_words(y_train, max_sentence_length, tags_index)

pickle.dump(X_train_inx, open( "data/X_train_index.p", "wb" ) )
pickle.dump(y_train_inx, open( "data/y_train_index.p", "wb" ) )

#test
print("test")
X_test_inx = index_words(X_test, max_sentence_length, words_index)
y_test_inx = index_words(y_test, max_sentence_length, tags_index)

pickle.dump(X_test_inx, open( "data/X_test_index.p", "wb" ) )
pickle.dump(y_test_inx, open( "data/y_test_index.p", "wb" ) )

#validaotion
print("validation")
X_val_inx = index_words(X_val, max_sentence_length, words_index)
y_val_inx = index_words(y_val, max_sentence_length, tags_index)

pickle.dump(X_val_inx, open( "data/X_val_index.p", "wb" ) )
pickle.dump(y_val_inx, open( "data/y_val_index.p", "wb" ) )

In [217]:
y_test_inx = np_utils.to_categorical(y_test_inx)

In [103]:
y_train_inx = np_utils.to_categorical(y_train_inx)

In [104]:
y_val_inx = np_utils.to_categorical(y_val_inx)

# Model

In [34]:
model = keras.models.Sequential()
model

<keras.engine.sequential.Sequential at 0x7fe9598969e8>

In [35]:
embeddings.shape

(56252, 300)

In [36]:
max_sentence_length

249

In [37]:
#https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
def embeddings_layer(max_length, embeddings, trainable=False, masking=False,
                     scale=False, normalize=False):

    vocab_size = embeddings.shape[0]
    embedding_size = embeddings.shape[1]

    _embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        input_length=max_length, #if max_length > 0 else None,
        trainable=False,
        #mask_zero=masking, if max_length > 0 else False,
        weights=[embeddings]
    )

    return _embedding

In [38]:
len(tags_index)

26

In [92]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        print("PRINT " + str(y_true))
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [91]:
from keras import backend as K
 
#def ignore_class_accuracy(to_ignore=0):
def ignore_accuracy(y_true, y_pred):
    print(y_true)
    y_true_class = K.argmax(y_true, axis=-1)
    y_pred_class = K.argmax(y_pred, axis=-1)

    ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
    matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
    accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
    return accuracy
    #return ignore_accuracy

In [160]:
model = Sequential()
model.add(embeddings_layer(max_sentence_length, embeddings))
model.add(keras.layers.LSTM(300, return_sequences=True))
#model.add(keras.layers.Dropout(0.5))
#model.add(keras.layers.LSTM(300, return_sequences=True))
#model.add(keras.layers.Dropout(0.5))
#model.add(keras.layers.Dense(150, activation='relu'))
#model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(50, activation='relu'))
#model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(len(tags_index)+1, activation='softmax'))
#model.add(keras.layers.Activation('softmax'))

In [106]:
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [107]:
#model.summary()

## Compile

In [124]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy', 'categorical_accuracy',  ignore_class_accuracy(0)])

PRINT Tensor("dense_28_target:0", shape=(?, ?, ?), dtype=float32)


In [132]:
#sem duas dropout
history =  model.fit(
            X_train_inx[:2000],
            y_train_inx[:2000],
            batch_size = 32,
            epochs=2,#5,
            validation_data = (X_val_inx[:400], y_val_inx[:400]),
            callbacks = [
                keras.callbacks.EarlyStopping(
                    monitor='val_loss',
                    min_delta=0.0001,
                    patience=10,
                    verbose=0,
                    mode='min'
                 )
            ]
        )

Train on 2000 samples, validate on 400 samples
Epoch 1/2
Epoch 2/2


# Evaluate Model

In [229]:
predicted_test = model.predict(X_test_inx)

In [230]:
tags = []
for predict in predicted_test:
    for t in predict:
        tags.append(np.argmax(t))
len(np.unique(tags))

22

In [227]:
predicted_test_results = np.zeros((predicted_test.shape[0], predicted_test.shape[1]), dtype='int32')

In [228]:
for i, predict in enumerate(predicted_test):
    tags = []
    for p in predict:
        tags.append(np.argmax(p))
    tags = np.array(tags)
    predicted_test_results[i] = tags

In [None]:
model.s

In [272]:
import sklearn.metrics as metric

In [None]:
y_test_

In [284]:
predicted_test_results

array([[ 9, 14, 14, ...,  6,  6,  6],
       [17, 17, 17, ...,  6,  6,  6],
       [ 6,  6,  6, ...,  6,  6,  6],
       ...,
       [ 9, 14, 14, ...,  6,  6,  6],
       [ 6,  6,  6, ...,  6,  6,  6],
       [12, 25, 14, ...,  6,  6,  6]], dtype=int32)

In [289]:
metric.f1_score(predicted_test_results[0],y_test_inx_real[0], average="macro")

0.0005649717514124295

In [290]:
metric.classification_report(predicted_test_results[0],y_test_inx_real[0])

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


'             precision    recall  f1-score   support\n\n          0       0.00      0.00      0.00         0\n          1       0.00      0.00      0.00         0\n          2       0.00      0.00      0.00         0\n          3       0.00      0.00      0.00         0\n          4       0.00      0.00      0.00         0\n          5       0.00      0.00      0.00         0\n          6       0.50      0.00      0.01       234\n          9       0.00      0.00      0.00         1\n         10       0.00      0.00      0.00         0\n         11       0.00      0.00      0.00         0\n         12       0.00      0.00      0.00         0\n         14       0.00      0.00      0.00         4\n         15       0.00      0.00      0.00         0\n         17       0.00      0.00      0.00         9\n         25       0.00      0.00      0.00         1\n\navg / total       0.47      0.00      0.01       249\n'

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

In [231]:
from sklearn.metrics import accuracy_score

In [None]:
metric.f1_score(predicted_test_results[0],y_test_inx_real[0], average="macro")

In [236]:
y_test_inx_real = np.zeros((y_test_inx.shape[0], y_test_inx.shape[1]), dtype='int32')

In [237]:
for i, predict in enumerate(y_test_inx):
    tags = []
    for p in predict:
        tags.append(np.argmax(p))
    tags = np.array(tags)
    y_test_inx_real[i] = tags

In [239]:
import collections

In [243]:
l = []
for y in y_test_inx_real:
    for i in y:
        l.append(i)

In [248]:
l_pred = []
for y in predicted_test_results:
    for i in y:
        l_pred.append(i)

In [246]:
len(l)

2486763

In [260]:
real = dict(collections.Counter(l))

In [261]:
pred = dict(collections.Counter(l_pred))

In [268]:
real

{1: 13865,
 3: 18335,
 6: 20152,
 2: 40679,
 15: 4639,
 11: 2323,
 10: 18659,
 12: 3337,
 5: 11703,
 9: 8694,
 4: 28005,
 0: 2295900,
 16: 3055,
 7: 2464,
 8: 4938,
 19: 2219,
 13: 3930,
 18: 1149,
 14: 1282,
 17: 336,
 20: 150,
 24: 528,
 21: 204,
 26: 17,
 23: 51,
 22: 107,
 25: 42}

In [270]:
for i in range(0,len(real.keys())):
    try:
        print(pred[i]/real[i]*100)
    except:
        pass
    #pred[2]/real[2]*100

4.355590400278758e-05
3.4186801298232963
0.0024582708522825044
5.527584359935726
15.474664615910449
11796.511512504962
15.113871635610765
0.005359344016292406
2.281532501076194
26.07132154629907
0.02544529262086514
1825.7410296411856
0.08622547962923044
16.923076923076923
20243.154761904763
1.5665796344647518
0.6666666666666667
4.672897196261682
339.2156862745098
0.1893939393939394
26933.333333333332
11.76470588235294


In [259]:
a[2]/b[2]

40679.0

In [238]:
accuracy_score(y_test_inx_real, predicted_test_results)

ValueError: multiclass-multioutput is not supported

In [225]:
predicted_test_results2[2]

array([6, 8, 9, 4, 4, 6, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [216]:
y_test_inx[2]

array([6, 8, 9, 4, 4, 6, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [208]:
y_test[3]

['adj',
 'n',
 'v',
 'v',
 'proadj',
 'n',
 'prep+art',
 'n',
 'adj',
 'kc',
 'prep+art',
 'n',
 'adj',
 'pu']

In [192]:
tags = np.array(tags)

In [193]:
tags

array([ 9, 14, 14,  6, 17,  6,  6,  6, 17, 17, 17, 17, 17, 17, 17, 14, 14,
       17, 25,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6

In [183]:
a = []

In [186]:
a.append(1)

In [188]:
np.argmax(a)

0

In [87]:
predicted_test = model.predict(X_test_inx[:1000])

In [88]:
tags = []
for predict in predicted_test:
    for t in predict:
        tags.append(np.argmax(t))
        

In [89]:
np.unique(tags)

array([ 0,  1,  2,  4,  6, 10])

In [None]:
predicted_test = model.predict(X_test_inx[:1000])

In [48]:
tags = []
for predict in predicted_test:
    for t in predict:
        tags.append(np.argmax(t))
        

In [138]:
np.unique(tags)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 15, 16, 19])

In [73]:
np.unique(tags)

array([ 1,  4,  6,  8,  9, 10, 18, 19, 20, 22, 23, 24, 25, 26])

In [88]:
np.unique(tags)

array([ 0,  1,  2,  3,  4,  5,  6, 10])

In [147]:
predicted_test = model.predict(X_test_inx[:1000])

In [148]:
predicted_test

array([[[1.81179438e-02, 1.55267000e-01, 1.16288356e-01, ...,
         1.58056114e-02, 8.08129553e-03, 9.27509926e-03],
        [9.92445461e-03, 3.44675817e-02, 3.95947248e-01, ...,
         4.17349767e-03, 1.13613135e-03, 1.67626294e-03],
        [3.55146527e-02, 2.25155260e-02, 3.20306093e-01, ...,
         6.80369604e-03, 2.21172604e-03, 2.74471566e-03],
        ...,
        [1.00000000e+00, 2.55077065e-19, 1.53537526e-17, ...,
         2.49018381e-24, 4.43458032e-23, 3.29372154e-27],
        [1.00000000e+00, 2.55107231e-19, 1.53550397e-17, ...,
         2.49044039e-24, 4.43505426e-23, 3.29436249e-27],
        [1.00000000e+00, 2.55137398e-19, 1.53562706e-17, ...,
         2.49068750e-24, 4.43551117e-23, 3.29495336e-27]],

       [[9.83433127e-02, 8.72423872e-02, 5.44521771e-02, ...,
         1.99123342e-02, 2.28976887e-02, 2.21705250e-02],
        [1.50355017e-02, 6.16921671e-02, 3.24901313e-01, ...,
         6.43057562e-03, 2.43131979e-03, 3.04739503e-03],
        [3.63115519e-02, 

In [149]:
tags = []
for predict in predicted_test:
    for t in predict:
        tags.append(np.argmax(t))
        

In [150]:
np.unique(tags)

array([ 0,  1,  2,  3,  4,  6, 10])

In [113]:
k = []
for p in y_test_inx:
    for c in p:
        k.append(np.argmax(c))

In [117]:
np.unique(k)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

In [167]:
t = predicted_test[0][0]

In [168]:
max(t)

0.4401563

In [169]:
predicted_test.shape

(300, 249, 27)

In [91]:
predicted_test.shape

(300, 249, 27)

In [98]:
predicted_test[9].shape

(249, 27)

In [110]:
(np.all(predicted_test[9]))

True

In [114]:
predicted_test[0].shape

(249, 27)

In [133]:
predicted_test.shape[0]

300

In [146]:
teste = np.ones((predicted_test.shape[0], predicted_test.shape[1], predicted_test.shape[2]))

In [137]:
i = 0

j = 0

In [155]:
np.argmax(predicted_test[0][110])

0

In [147]:
teste[i][j] = (predicted_test[i][j] == max(predicted_test[i][j]))

IndexError: arrays used as indices must be of integer (or boolean) type

In [140]:
for i in predicted_test:
    for j in i:
        teste[i][j] = (predicted_test[i][j] == max(predicted_test[i][j]))


IndexError: arrays used as indices must be of integer (or boolean) type

In [123]:
predicted_test[0][101] == max(predicted_test[0][101])

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

In [111]:
predicted_test[9] = (predicted_test[8] ==  max(predicted_test[8]))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [74]:
predicted_test = (predicted_test == max(predicted_test))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [86]:
i = 0
for p in predicted_test:
    for b in p:
        i = i + 1
        b = (b == max(b))
        break
    break

In [87]:
b.shape

(27,)

In [88]:
b

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

In [89]:
t = (t == max(t))

In [90]:
t

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

In [60]:
X_test_inx.shape

(9987, 249)

In [61]:
y_test_inx.shape

(9987, 249, 27)

In [62]:
y_test_inx[0][3]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [63]:
predicted_test[0][3]

array([0.13757886, 0.03904151, 0.04679544, 0.0345195 , 0.04117398,
       0.03923594, 0.03097416, 0.02464281, 0.0417484 , 0.02912197,
       0.03302423, 0.0270728 , 0.03787771, 0.03424341, 0.03430784,
       0.02867949, 0.03172125, 0.03280784, 0.03111209, 0.03372767,
       0.02728003, 0.03019129, 0.03634657, 0.03472586, 0.03262221,
       0.02992634, 0.01950084], dtype=float32)

In [70]:
predicted_test.data.shape

(9987, 249, 27)

In [None]:
tags_index

In [72]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences
 

In [74]:
print(logits_to_tokens(predicted_test, {i: t for t, i in tags_index.items()}))

KeyError: 0

In [78]:
for i, pred in enumerate(predicted_test[100]):
	try:
		print(sentence[i], ' : ', tags_index[np.argmax(pred)])
	except:
		pass
		# print('NA')

In [83]:
sorted(predicted_test[0][0])

[0.31801337,
 0.32218537,
 0.32299793,
 0.3276326,
 0.3278319,
 0.328435,
 0.33004722,
 0.33046103,
 0.3323479,
 0.3344832,
 0.33542857,
 0.33555093,
 0.3381184,
 0.3394882,
 0.3413541,
 0.34203148,
 0.34526813,
 0.3500948,
 0.35055625,
 0.3636256,
 0.37206104,
 0.3778393,
 0.38035557,
 0.40135187,
 0.40262803,
 0.42853042,
 0.731753]

In [62]:
metrics = sklearn.metrics.precision_recall_fscore_support(y_test_inx, predicted_test)
#pd.DataFrame(list(metrics), index=['Precision', 'Recall', "F-Score", "Support"])

ValueError: unknown is not supported

In [168]:
sklearn.metrics.confusion_matrix(y_true=y_test_inx[:300], y_pred=predicted_test)

ValueError: unknown is not supported

In [170]:
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true=y_test, y_score=predicted_test)
sklearn.metrics.auc(fpr, tpr)

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.

In [185]:
y_train_inx.shape

(31956, 249, 27)

In [188]:
y_test_inx#[0][0]

array([[[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [178]:
t = y_train_inx[0][0]

In [None]:
t

In [179]:
sklearn.metrics.confusion_matrix(y_train_inx[0][0], t)

array([[26,  0],
       [ 0,  1]])

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

In [419]:
predicted_test = model.predict(X=test_set['text_tokens'])
sklearn.metrics.confusion_matrix(y_true=test_set['tags'], y_pred=predicted_test)

TypeError: predict() got an unexpected keyword argument 'X'

grafico do loss
curva ROC