### Sentiment example 

First, let's load the data.

In [None]:
import numpy as np
import random
from sklearn.model_selection import train_test_split

def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    # `with` closes the handle as soon as the lines are read; a bare
    # open(path).readlines() leaves the file open until garbage collection.
    # NOTE(review): the platform default encoding is used (unchanged); pass an
    # explicit encoding= here if the corpus files need one -- confirm.
    with open(path) as handle:
        return [line.strip() for line in handle]


positive_sentences = _read_stripped_lines("exercise/rt-polaritydata/rt-polarity.pos")
negative_sentences = _read_stripped_lines("exercise/rt-polaritydata/rt-polarity.neg")

# Label 1 marks a positive sentence, label 0 a negative one.
positive_labels = [1] * len(positive_sentences)
negative_labels = [0] * len(negative_sentences)

sentences = np.concatenate([positive_sentences, negative_sentences], axis=0)
labels = np.concatenate([positive_labels, negative_labels], axis=0)

## make sure we have a label for every data instance
assert len(sentences) == len(labels)

# Shuffle sentences and labels with ONE shared index permutation, so the two
# arrays stay aligned by construction rather than by reseeding the RNG twice.
np.random.seed(113) #seed
shuffled_order = np.random.permutation(len(labels))

data = {}
data['target'] = labels[shuffled_order]
data['data'] = sentences[shuffled_order]


In [None]:
# Hold out 20% of the data as the test set, then 20% of the remainder as the dev set.
# random_state pins both splits, so re-running this cell always yields the same
# partition instead of one that depends on the current state of the global RNG.
X_rest, X_test, y_rest, y_test = train_test_split(data['data'], data['target'], test_size=0.2, random_state=113)
X_train, X_dev, y_train, y_dev = train_test_split(X_rest, y_rest, test_size=0.2, random_state=113)
del X_rest, y_rest  # only needed as an intermediate for the second split

In [None]:
# Inspect the training labels (the notebook echoes the cell's last expression).
y_train

In [None]:
# Report how many instances ended up in each of the three splits.
print("#train instances: {} #dev: {} #test: {}".format(len(X_train),len(X_dev),len(X_test)))

### Using word unigrams (embeddings)

In [None]:
from collections import defaultdict

# While the training set is being encoded, every unseen word gets the next free index.
w2i = defaultdict(lambda: len(w2i))

PAD = w2i["<pad>"] # index 0 is padding
UNK = w2i["<unk>"] # index 1 is for UNK

# convert words to indices, taking care of UNKs
X_train_num = [[w2i[word] for word in sentence.split(" ")] for sentence in X_train]

# Freeze the vocabulary as a plain dict and look words up with .get().
# A defaultdict(lambda: UNK, ...) also maps unseen words to UNK, but it INSERTS
# each of them as a new key, so len(w2i) -- used later as the embedding's
# vocabulary size -- would grow with every out-of-vocabulary dev/test word.
w2i = dict(w2i)
X_dev_num = [[w2i.get(word, UNK) for word in sentence.split(" ")] for sentence in X_dev]
X_test_num = [[w2i.get(word, UNK) for word in sentence.split(" ")] for sentence in X_test]

# Longest sentence (in tokens) over all three splits; each encoded sentence has
# exactly one index per space-separated token.
max_sentence_length = max(len(encoded)
                          for split in (X_train_num, X_dev_num, X_test_num)
                          for encoded in split)

In [None]:
from keras.preprocessing import sequence

# Bring every encoded sentence of each split to max_sentence_length,
# using the PAD index as the fill value.
X_train_pad, X_dev_pad, X_test_pad = (
    sequence.pad_sequences(encoded_split, maxlen=max_sentence_length, value=PAD)
    for encoded_split in (X_train_num, X_dev_num, X_test_num)
)


In [None]:
# Sanity check: shape of the padded training matrix.
print(X_train_pad.shape)

In [None]:
# Hyper-parameters of the word-level model.
embeds_size = 64
vocabulary_size = len(w2i)

# Seed NumPy's global RNG before the model is built. keras.preprocessing was
# already imported further up, so this does not precede every keras import.
np.random.seed(113)
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, GlobalAveragePooling1D, LSTM

# Embedding -> LSTM -> single sigmoid unit; index 0 (padding) is masked out.
model = Sequential([
    Embedding(vocabulary_size, embeds_size, input_length=max_sentence_length, mask_zero=True),
    # GlobalAveragePooling1D(),  # mean embedding: actually better for this example!
    LSTM(100),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training split, then score on the dev split.
model.fit(X_train_pad, y_train, epochs=5)
loss, accuracy = model.evaluate(X_dev_pad, y_dev)

In [None]:
# Dev-set accuracy returned by model.evaluate for the word-level model.
print(accuracy)

In [None]:
# Sequential.predict_classes is no longer available in recent Keras/TensorFlow
# releases. For a single sigmoid output it is equivalent to thresholding the
# predicted probability at 0.5, which works on old and new versions alike and
# yields the same (n_samples, 1) int32 array.
predictions = (model.predict(X_dev_pad) > 0.5).astype("int32")

In [None]:
from collections import Counter

# Tally the predicted class of every dev instance (each row holds one value).
Counter(row[0] for row in predictions)

## Using multiple inputs (characters and words) with an LSTM

First, let's prepare the data for the character representation: each word is represented by the list of indices of its characters. For this we use a dedicated character-to-index mapping, `c2i`.

In [None]:
from collections import defaultdict

# convert words to indices, taking care of UNKs
def get_characters(sentence, c2i):
    """
    Encode a sentence as one list of character indices per word.

    The sentence is split on single spaces; every character of every resulting
    token is looked up with c2i[char]. Returns a list (one entry per token) of
    lists of indices; an empty token yields an empty list.
    """
    return [[c2i[char] for char in token] for token in sentence.split(" ")]

# While the training set is being encoded, every unseen character gets the next free index.
c2i = defaultdict(lambda: len(c2i))

PAD = c2i["<pad>"] # index 0 is padding
UNK = c2i["<unk>"] # index 1 is for UNK
X_train_num = [get_characters(sentence, c2i) for sentence in X_train]

# Freeze the vocabulary: keep c2i as a plain dict whose size is exactly the
# training character vocabulary. A defaultdict(lambda: UNK, ...) inserts every
# unseen character as a new key on lookup, which would inflate len(c2i) (used
# later as the embedding's vocabulary size). The dev/test lookups therefore go
# through a throwaway copy that absorbs those insertions.
c2i = dict(c2i)
c2i_with_unk = defaultdict(lambda: UNK, c2i)
X_dev_num = [get_characters(sentence, c2i_with_unk) for sentence in X_dev]
X_test_num = [get_characters(sentence, c2i_with_unk) for sentence in X_test]

# Longest sentence (in words) over all three splits; each encoded sentence has
# one character list per space-separated word.
max_sentence_length = max(len(words)
                          for split in (X_train_num, X_dev_num, X_test_num)
                          for words in split)
# Longest word (in characters), measured on the training split only.
max_word_length = max([len(word)  for sentence in X_train_num for word in sentence])

In [None]:
### we need both the maximum sentence length (in words) and the maximum word length (in characters)
print(max_sentence_length)
print(max_word_length)
# Raw text of the first two training sentences ...
print(X_train[0:2])
# ... and the same two sentences as lists of per-word character indices.
print(X_train_num[0:2]) # example how the first two sentences are encoded

In [None]:
def pad_words(tensor_words, max_word_len, pad_symbol_id, max_sent_len=None):
    """
    Pad the character lists of all words to the same word length.

    tensor_words:  iterable of sentences; each sentence is a list of words and
                   each word a list of character indices
    max_word_len:  length every word is brought to (via sequence.pad_sequences)
    pad_symbol_id: index used as the fill value, both inside words and for the
                   filler words added to short sentences
    max_sent_len:  optional; when given (and non-zero), every sentence is brought
                   to exactly this many words: short sentences get filler words
                   prepended, longer ones keep only their last max_sent_len words

    Returns np.array over the per-sentence padded matrices.
    """
    padded = []
    for words in tensor_words:
        if max_sent_len: #pad all to same sentence length (insert empty word list)
            missing = max_sent_len - len(words)
            if missing < 0:
                # Drop leading words so all sentences end up with the same length
                # (otherwise the stacked result would be ragged).
                words = words[-max_sent_len:]
            else:
                # Prepend filler words made of the pad symbol (not a hard-coded 0).
                words = [[pad_symbol_id]] * missing + words
        padded.append(sequence.pad_sequences(words, maxlen=max_word_len, value=pad_symbol_id))
    return np.array(padded)

In [None]:
# Character tensors for the train and dev splits: every sentence is brought to
# max_sentence_length words and every word to max_word_length characters,
# with 0 as the fill value.
X_train_pad_char = pad_words(
    X_train_num,
    max_word_len=max_word_length,
    pad_symbol_id=0,
    max_sent_len=max_sentence_length,
)
X_dev_pad_char = pad_words(
    X_dev_num,
    max_word_len=max_word_length,
    pad_symbol_id=0,
    max_sent_len=max_sentence_length,
)

In [None]:
# Shape of the word-index training matrix, for comparison with the character tensor.
X_train_pad.shape

In [None]:
# Shape of the character-index training tensor built by pad_words.
X_train_pad_char.shape

In [None]:
## let's look at the first instance: the padded character indices of the first training sentence
X_train_pad_char[0]

In [None]:

from keras.models import Model, Sequential
from keras.layers import Dense, Input, LSTM, TimeDistributed, Flatten
import keras

In [None]:
# Hyper-parameters of the two-input (characters + words) model.
batch_size=32
max_chars = len(c2i)
c_dim=32   # character embedding size
w_dim=64   # word embedding size
h_dim=100  # hidden size of both LSTMs
char_vocab_size = len(c2i)
word_vocab_size = len(w2i)

## lower-level character LSTM
# shape= (rather than batch_shape=) leaves the batch dimension open. A batch
# dimension fixed to batch_size would require every batch fed to the model to
# hold exactly batch_size rows, which fit()/evaluate() on arbitrarily sized
# splits do not guarantee for the last batch.
input_chars = Input(shape=(max_sentence_length, max_word_length), dtype='int32', name='input_chars')
emb_chars = TimeDistributed(Embedding(input_dim=char_vocab_size, output_dim=c_dim, mask_zero=True), name='char_emb')(input_chars)
# Flatten each word's (max_word_length, c_dim) character embeddings into one
# vector, so the LSTM below sees one vector per word.
# NOTE(review): Flatten may not accept the mask requested by mask_zero=True in
# some Keras versions -- confirm on the installed version.
flatten = TimeDistributed(Flatten(), name='flatten')(emb_chars)
char_lstm = LSTM(h_dim, name='char_lstm')(flatten)

## input for words
input_words = Input(shape=(max_sentence_length,), name='input_words')
# The word embedding uses w_dim (it previously reused c_dim, leaving w_dim unused).
emb_words = Embedding(input_dim=word_vocab_size, output_dim=w_dim, mask_zero=True, input_length=max_sentence_length, name='word_emb')(input_words)
word_lstm = LSTM(h_dim, name='word_lstm')(emb_words)

# Open question kept from the original notes: how to express this two-branch
# concatenation with Sequential models instead of the functional API.

# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate([char_lstm, word_lstm], axis=-1)

# And add a prediction node on top
predictions = Dense(1, activation='sigmoid')(merged_vector)



In [None]:
# Wrap the graph into a trainable model: character input first, word input
# second, one sigmoid output.
model = Model(inputs=[input_chars, input_words], outputs=predictions)

# Binary classification set-up: cross-entropy loss, Adam, accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train for 5 epochs; the inputs are passed in the order the model declares them (characters, then words).
model.fit([X_train_pad_char, X_train_pad], y_train, epochs=5)

In [None]:
# Score the two-input model on the dev split (overwrites the earlier loss/accuracy values).
loss, accuracy = model.evaluate([X_dev_pad_char, X_dev_pad], y_dev)

In [None]:
# Dev-set accuracy of the two-input (characters + words) model.
print(accuracy)

In [None]:
## Todo: find optimal parameters + model structure on dev, evaluate final model on test

### Composing words only out of characters



Instead of using a separate word embedding matrix, compose words through characters (see https://aclweb.org/anthology/W/W16/W16-4303.pdf)

In [None]:
# Hyper-parameters of the character-only model.
batch_size = 32
c_dim = 50       # character embedding size
c_h_dim = 256    # hidden size of the per-word character LSTM
w_h_dim = 256    # hidden size of the sentence-level LSTM
max_chars = len(c2i)
char_vocab_size = len(c2i)

## lower-level character LSTM
# One sentence = max_sentence_length words x max_word_length character indices.
input_chars = Input(shape=(max_sentence_length, max_word_length))

# Embed the characters of each word, then run an LSTM over each word's
# characters so that every word is reduced to a single c_h_dim vector.
char_embedding = Embedding(input_dim=char_vocab_size, output_dim=c_dim,
                           input_length=max_word_length)
embedded_chars = TimeDistributed(char_embedding)(input_chars)
per_word_lstm = LSTM(c_h_dim)
char_lstm = TimeDistributed(per_word_lstm)(embedded_chars)

# Second LSTM over the sequence of composed word vectors.
word_lstm_from_char = LSTM(w_h_dim)(char_lstm)

# And add a prediction node on top
predictions = Dense(1, activation='sigmoid')(word_lstm_from_char)


In [None]:
# Shape of the character tensor that is fed to the model's single input.
print(X_train_pad_char.shape)

In [None]:
# Wrap the character-only graph into a trainable model with a single input.
model = Model(inputs=input_chars, outputs=predictions)

# Binary classification set-up: cross-entropy loss, Adam, accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train the character-only model for 5 epochs with mini-batches of 32.
model.fit(X_train_pad_char, y_train, epochs=5, batch_size=32)



In [None]:
# Score the character-only model on the dev split (overwrites the earlier loss/accuracy values).
loss, accuracy = model.evaluate(X_dev_pad_char, y_dev)

In [None]:
# Dev-set accuracy of the character-only model.
print(accuracy)

In [50]:
## TODO: doesn't work very well yet, check! plus find optimal parameters + model structure on dev, evaluate final model on test