<a href="https://colab.research.google.com/github/alexbrill/tf-train/blob/main/imdb%2Blstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import

In [2]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Conv1D, LSTM, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# data preprocessing

In [3]:
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


# fitting

In [4]:
max_review_length = 500

X_train = sequence.pad_sequences(training_data, maxlen=max_review_length)
y_train = training_targets
X_test = sequence.pad_sequences(testing_data, maxlen=max_review_length)
y_test = testing_targets

In [5]:
X_train[X_train >= 5000] = 0
X_test[X_test >= 5000] = 0

## LSTM 

In [6]:
top_words = 5000
embedding_vector_length = 32
max_review_length = 500

def build(conv = False):
  model = Sequential()
  model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
  if conv:
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
  model.add(LSTM(100))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  print(model.summary())

  return model

In [7]:
model1 = build()
model1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
scores = model1.evaluate(X_test, y_test, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 83.77%


## LSTM + CONV

In [40]:
model2 = build(conv=True)
model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
scores = model2.evaluate(X_test, y_test, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 216,405
Trainable params: 216,405
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 86.10%


# Ensemble

In [9]:
def ensemble(models, x):
  return np.mean([model.predict(x) for model in models])

In [10]:
def vectorize(sequences, dimension = 5000):
  results = np.zeros((len(sequences), dimension))

  for i, sequence in enumerate(sequences):
    results[i, sequence] = 1

  return results


In [11]:
index = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [63]:
def predict(source_text):
  # prepocessing
  text = source_text.lower().strip()

  # tokenize
  tokens = text.split()

  # get indexes
  indexes = [index.get(token, 0) for token in tokens]

  # binarize
  x = sequence.pad_sequences([indexes], maxlen=max_review_length)
  x[x >= 5000] = 0

  # get prediction
  #pred = ensemble([model1, model2], binarized)
  pred = np.mean([model1.predict(x), model2.predict(x)])

  # result
  if pred < 0.5:
    return pred, 'neg'
  else:
    return pred, 'pos'

In [64]:
text = '''
Bad film.
I hate it. This film is awful. I hate it. It is OK.
I hate it. This film is awful. I hate it. It is OK.
I hate it. This film is awful. I hate it. It is OK.
'''
predict(text)

(0.2815455, 'neg')

In [65]:
text = '''
Good film.
I love it. This film is amazing. I like it. It is not OK.
I love it. This film is amazing. I like it. It is not OK.
I love it. This film is amazing. I like it. It is not OK.
'''
predict(text)

(0.7443879, 'pos')