In [3]:
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

## Assigning unique index to word
token_idx = {}
for sample in samples:
    for word in sample.split():
        if word not in token_idx:
            token_idx[word] = len(token_idx) + 1

length = 10

results = np.zeros(shape=(len(samples),length, max(token_idx.values()) +1))

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:length]:
        idx = token_idx.get(word)
        results[i,j,idx] = 1.


In [5]:
## Character-level one-hot encoding

import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
chars = string.printable ## all chars
token_idx = dict(zip(range(1, len(chars) +1), chars))

length = 50

results = np.zeros((len(samples), length, max(token_idx.keys()) + 1 ))

for i, sample in enumerate(samples):
    for j,character in enumerate(sample):
        idx = token_idx.get(character)
        results[i,j,idx] = 1.


[[[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [9]:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(samples) ## word index

seq = tokenizer.texts_to_sequences(samples) ## palabras en vectores de indices

one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary') ## directamente a matriz

word_idx = tokenizer.word_index
print('Found %s unique tokens' % len(word_idx))

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
Found 9 unique tokens


In [10]:
##One hot hashing trick
##Se realiza mediante una funcion de hashing. La principal ventaja es ahorrar espacio y poder hacer el encoding sin que se tenga todos los datos disponibles

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

dimensions = 1000 ## vectores de tamano 1000
length = 10

results = np.zeros((len(samples), length, dimensions))
for i,sample in enumerate(samples):
    for j,word in list(enumerate(sample.split()))[:length]:
        idx = abs(hash(word)) % dimensions ## index random entre 0 y 1000
        results[i,j,idx] = 1.


In [11]:
## Capa Embedding para dataset IMDB

from keras.datasets import imdb
from keras import preprocessing

max_features = 10000
length = 20

(x_train, y_train) , (x_test, y_test) = imdb.load_data(num_words= max_features)

x_train = preprocessing.sequence.pad_sequences(x_train,maxlen=length)
x_test = preprocessing.sequence.pad_sequences(x_test,maxlen=length)
##Train y test sera un tensor tipo (numero_seq, 20)


In [13]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()
model.add(Embedding(10000,8,input_length = length))

model.add(Flatten()) ## lo transforma en (10000,8*length)

model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.summary()
 

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [15]:
history = model.fit(x_train, 
                   y_train, 
                   epochs = 10,
                   validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
## Solo empleando las 20 primeras palabras de cada review nos ha salido un accuracy de 87%.Seria mejor emplear redes conv 1d o capas recurrentes.