In [1]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-lr4DFn
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-lr4DFn
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25ldone
[?25h  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-py2-none-any.whl size=101242 sha256=c482a342cc7a73b54780c3b7b13403229e7a5b10ec59023e28ea65b7e5d6d886
  Stored in directory: /tm

In [2]:
from keras.models import Model, Input as Kinput
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF
import numpy as np
import pickle

# Lookup tables loaded from disk and used to size the network.
# SECURITY: allow_pickle=True lets NumPy unpickle arbitrary Python objects,
# which can execute code -- only load files from a trusted source.
# NOTE(review): later cells index word2index by token string and call
# tag2index.items(), so both are expected to behave like dicts here --
# confirm how these files were written.
word2index = np.load('word2index.npy', allow_pickle=True)
tag2index = np.load('tag2index.npy', allow_pickle=True)

# Fixed number of time steps per input sequence; PosTag pads its inputs to this length.
MAX_LENGTH = 149

# Symbolic input: one row of MAX_LENGTH values per sentence.
myinput = Kinput(shape=(MAX_LENGTH,))
# Size of each embedding vector; also reused below as the LSTM unit count.
word_embedding_size = 300

# Embedding layer: one word_embedding_size-dimensional vector per entry of word2index.
# NOTE: the name `model` is reused for every intermediate tensor below and is
# only bound to the actual keras Model after the CRF layer has been applied.
model = Embedding(input_dim=len(word2index), output_dim=word_embedding_size, input_length=MAX_LENGTH)(myinput)

# Bidirectional LSTM: word_embedding_size units per direction; the model
# summary reports a 600-wide output per time step for this layer.
model = Bidirectional(LSTM(units=word_embedding_size, 
                           return_sequences=True, 
                           dropout=0.5, 
                           recurrent_dropout=0.5, 
                           kernel_initializer=k.initializers.he_normal()))(model)
# Second, single-direction LSTM with twice as many units, still returning the full sequence.
model = LSTM(units=word_embedding_size * 2, 
             return_sequences=True, 
             dropout=0.5, 
             recurrent_dropout=0.5, 
             kernel_initializer=k.initializers.he_normal())(model)

# TimeDistributed Dense: applied at every time step, producing len(tag2index) values per token.
model = TimeDistributed(Dense(len(tag2index), activation="relu"))(model)  

# CRF layer (from keras_contrib) with one unit per tag; it also supplies the
# loss and accuracy functions passed to compile() below.
crf = CRF(len(tag2index))

out = crf(model)  # output tensor of the CRF layer
# From here on `model` is the keras Model mapping myinput -> out.
model = Model(myinput, out)


# Optimizer: Adam with explicit learning rate and beta coefficients.
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile with the CRF layer's own loss; track both the CRF accuracy and the stock 'accuracy' metric.
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])
model.summary()

Using TensorFlow backend.







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 149)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 149, 300)          7349700   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 149, 600)          1442400   
_________________________________________________________________
lstm_2 (LSTM)                (None, 149, 600)          2882400   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 149, 291)          174891    
_________________________________________________________________
crf_1 (CRF)                  (None, 149, 291)          170235    
Total params: 12,019,626
Trainable params: 12,019,626
Non-trainable params: 0
_______________________________________________

In [3]:
# Load previously saved weights from 'mb-full.h5' into the model built above.
# NOTE(review): presumably this file was produced by training this exact
# architecture (same layers, vocabulary size and tag count) -- confirm.
model.load_weights('mb-full.h5')

# Example

In [4]:
def logits_to_tokens(sequences, index):
    """Map per-token score vectors to labels.

    ``sequences`` is a list of sentences in which every element of a sentence
    is a vector (for example a one-hot vector). For each vector, the position
    of its largest value is looked up in ``index`` and the resulting label is
    collected.

    Returns a list with one list of labels per sentence, in input order.
    """
    return [
        [index[np.argmax(scores)] for scores in sentence]
        for sentence in sequences
    ]

In [5]:
def cleaningWords(spp, test_samples):
    """Tokenize every sample and remove the "▁" marker from each piece.

    Each entry of ``test_samples`` is passed to ``spp.encode_as_pieces`` and
    replaced, in place, by the resulting list of pieces with every "▁"
    character removed.

    Returns the same ``test_samples`` list that was passed in.
    """
    for position, sentence in enumerate(test_samples):
        pieces = spp.encode_as_pieces(sentence)
        test_samples[position] = [piece.replace("▁", "") for piece in pieces]
    return test_samples

In [6]:
from keras.preprocessing.sequence import pad_sequences
from tabulate import tabulate
import sentencepiece as sp

# SentencePiece processors already loaded, keyed by model path.
_SENTENCEPIECE_CACHE = {}


def _load_sentencepiece(model_path):
    """Return the SentencePiece processor for ``model_path``, loading it on first use."""
    if model_path not in _SENTENCEPIECE_CACHE:
        spp = sp.SentencePieceProcessor()
        spp.load(model_path)
        _SENTENCEPIECE_CACHE[model_path] = spp
    return _SENTENCEPIECE_CACHE[model_path]


def PosTag(test_samples):
    """Tag every sentence of ``test_samples`` and render the result as text tables.

    The input string is split into sentences on ".", each sentence is
    tokenized with the SentencePiece model "m_word.model", tokens are mapped
    to ids through ``word2index`` (unknown tokens use the '-OOV-' entry), and
    the padded id sequences are passed to ``model.predict``.

    Parameters
    ----------
    test_samples : str
        Text to tag; sentences are separated by periods.

    Returns
    -------
    str
        One "github"-style table per sentence (tokens as header, predicted
        tags as the single row), each followed by a newline. Returns "" when
        the input contains no sentence text.

    Notes
    -----
    * Chunks that are empty or whitespace-only (for example the one produced
      by a trailing period) are skipped instead of being sent to the model.
    * Sentences with more than ``MAX_LENGTH`` tokens are cut to their first
      ``MAX_LENGTH`` tokens so that the header and the tag row stay aligned.
    """
    # Split on "." and drop chunks without visible text; the kept chunks are
    # passed to the tokenizer unchanged.
    sentences = [chunk for chunk in test_samples.split(".") if chunk.strip()]
    if not sentences:
        return ""

    # Tokenize with SentencePiece; the processor is loaded once and then reused.
    spp = _load_sentencepiece("m_word.model")
    sentences = cleaningWords(spp, sentences)

    # Truncate here rather than inside pad_sequences: its default truncation
    # removes tokens from the front, which would shift tags against words.
    sentences = [pieces[:MAX_LENGTH] for pieces in sentences]

    # Map tokens to integer ids, falling back to the out-of-vocabulary id.
    test_samples_X = []
    for pieces in sentences:
        s_int = []
        for piece in pieces:
            try:
                s_int.append(word2index[piece.lower()])
            except KeyError:
                s_int.append(word2index['-OOV-'])
        test_samples_X.append(s_int)

    # Pad every sentence at the end up to the model's fixed input length.
    test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

    # Predicting
    predictions = model.predict(test_samples_X)

    # Convert per-token predictions to tag names via the inverted tag2index mapping.
    index2tag = {i: t for t, i in tag2index.items()}
    log_tokens = logits_to_tokens(predictions, index2tag)

    # Output: one table per sentence, keeping only the tags of real (non-padding) tokens.
    tables = []
    for pieces, tags in zip(sentences, log_tokens):
        body = [tags[:len(pieces)]]
        tables.append(tabulate(body, headers=pieces, tablefmt="github"))
        tables.append("\n")

    return "".join(tables)
    

In [7]:
## Reference POS tagging produced by FreeLing 4.1

## El      hombre   bajo     corre    bajo  el      puente   con  bajo  índice   de  adrenalina  .
## DA0MS0  NCMS000  AQ0MS00  VMIP3S0  SP    DA0MS0  NCMS000  SP   SP    NCMS000  SP  NCFS000     Fp


## Reference POS tagging produced by the Stanford NLP POS tagger

## El      hombre   bajo     corre    bajo  el      puente   con    bajo   índice  de    adrenalina  .
## da0000  nc0s000  aq0000   vmip000  sp000 da0000  nc0s000  sp000  aq0000 nc0s000 sp000 nc0s000     fp

In [16]:
# Two sentences separated by a period: PosTag returns one table per sentence.
print(PosTag("Correr es importante para mi. El hombre bajo corre bajo el puente con bajo indice de adrenalina"))

| Correr   | es      | importante   | para   | mi     |
|----------|---------|--------------|--------|--------|
| da0fp0   | vsip3s0 | aq0cs0       | sps00  | aq0fs0 |
| El     | hombre   | bajo   | corre   | bajo   | el     | puente   | con   | bajo   | indice   | de    | adrenalina   |
|--------|----------|--------|---------|--------|--------|----------|-------|--------|----------|-------|--------------|
| da0ms0 | ncms000  | sps00  | vmip3s0 | sps00  | da0ms0 | ncms000  | sps00 | sps00  | np0000a  | sps00 | Fp           |



In [20]:
# Single sentence without a period: the whole string is tagged as one table.
print(PosTag("El es un personaje de la pelicula"))

| El     | es      | un     | personaje   | de    | la     | pelicula   |
|--------|---------|--------|-------------|-------|--------|------------|
| da0ms0 | vsip3s0 | di0ms0 | ncms000     | sps00 | da0fs0 | np0000a    |



In [18]:
# Another single-sentence example; print() is used so the newlines in the returned table render.
print(PosTag("El hombre corre todo el dia"))

| El     | hombre   | corre   | todo   | el     | dia   |
|--------|----------|---------|--------|--------|-------|
| da0ms0 | ncms000  | vmip3s0 | di0ms0 | da0ms0 | W     |



In [21]:
from tkinter import *
from tkinter import scrolledtext

# Minimal Tk front-end for PosTag: a one-line text entry, a RUN button and a
# scrollable output area that shows the tables returned by PosTag.
window = Tk()
window.title("||||||| POSTAGER |||||||")
window.geometry('900x300')

# Input field for the text to tag.
txt = Entry(window, width=83, font="Helvetica 14 bold")
txt.grid(column=0, row=0)

# Output area for the rendered tag tables.
salida = scrolledtext.ScrolledText(window, width=103, height=15, bg='cyan', fg='black')
salida.grid(column=0, row=1)


def clicked():
    """Tag the text currently in the entry field and show the result.

    Any exception raised by PosTag is printed to stderr (as Tk would do by
    default) and is also shown in the output area, so a failure is visible in
    the window instead of looking like a button that does nothing.
    """
    try:
        res = PosTag(txt.get())
    except Exception as exc:
        import traceback
        traceback.print_exc()
        res = "Error: {!r}".format(exc)
    # Replace the previous output; "1.0" is the Text index of line 1, column 0.
    salida.delete("1.0", END)
    salida.insert(INSERT, res)


btn = Button(window, text="RUN", command=clicked, bg='red', fg='white')
btn.grid(column=2, row=0)

# Enter the Tk event loop; this cell keeps running until the window is closed.
window.mainloop()