In [None]:
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, Bidirectional
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module path
    from sklearn.cross_validation import train_test_split

# Seed NumPy's global random number generator so NumPy-driven randomness repeats across runs.
# NOTE(review): presumably this is what makes the train/test split and the Keras weight
# initialisation repeatable — confirm for the installed scikit-learn / Keras backend.
np.random.seed(1337)  # for reproducibility

In [None]:
# Read the review dump and keep only the rows that carry both a review text and a rating.
df = (
    pd.read_csv('data/metacritic-movie-reviews.csv')
    .dropna(subset=['review text', 'rating'])
)

In [None]:
# Hyperparameters shared by the cells below.
max_features = 20000  # vocabulary size: first argument (input dimension) of the Embedding layer
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 32  # mini-batch size handed to model.fit

In [None]:
def load_data(df, num_words=None, test_size=0.33):
    """Split the reviews into train/test sets and encode the text as integer sequences.

    The raw ``'review text'`` strings cannot be padded or fed to an Embedding
    layer, so each review is converted into a list of integer word indices.
    The vocabulary is fitted on the training split only, so no information
    from the held-out reviews leaks into the encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'review text'`` and ``'rating'``.
    num_words : int, optional
        Vocabulary cap handed to the tokenizer. Defaults to the notebook-level
        ``max_features`` so the produced indices stay below the Embedding
        layer's input dimension.
    test_size : float, optional
        Fraction of rows held out for the test split (default ``0.33``).

    Returns
    -------
    (X_train, y_train), (X_test, y_test)
        ``X_*`` are lists of integer sequences (one list per review);
        ``y_*`` are the matching ``'rating'`` values as returned by
        ``train_test_split``.
    """
    if num_words is None:
        num_words = max_features  # defined in the configuration cell

    X_train, X_test, y_train, y_test = train_test_split(
        df['review text'], df['rating'], test_size=test_size)

    # The vocabulary cap is passed positionally because its keyword name differs
    # between Keras major versions (`nb_words` vs `num_words`).
    tokenizer = Tokenizer(num_words)
    tokenizer.fit_on_texts(X_train)  # learn the vocabulary from training text only
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    # NOTE(review): the ratings are returned unchanged; the model cell trains with
    # a sigmoid output, so confirm the 'rating' column is already a 0/1 label.
    return (X_train, y_train), (X_test, y_test)

In [None]:
print('Loading data...')
(X_train, y_train), (X_test, y_test) = load_data(df)
# Report how many samples ended up in each split.
for split, label in ((X_train, 'train sequences'), (X_test, 'test sequences')):
    print(len(split), label)

In [None]:
print("Pad sequences (samples x time)")
# Bring every sample in both splits to the common length `maxlen`.
X_train, X_test = [
    sequence.pad_sequences(split, maxlen=maxlen) for split in (X_train, X_test)
]
for caption, padded in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(caption, padded.shape)
# Convert the targets to plain NumPy arrays before handing them to Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Network: embedding -> bidirectional LSTM -> dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
# NOTE(review): binary cross-entropy on a sigmoid output expects targets in [0, 1];
# confirm that the 'rating' values in y_train / y_test are 0/1 labels.
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
# validation_data is given as an (inputs, targets) tuple, the form Keras documents;
# a list can be interpreted as several model inputs by some Keras implementations.
# NOTE(review): `nb_epoch` is the legacy spelling of `epochs` — switch once the
# installed Keras version is confirmed to accept the new keyword.
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=4,
          validation_data=(X_test, y_test))