In [2]:
import keras

In [3]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D        #for max pooling
from keras.datasets import imdb


In [4]:
# Notebook-wide settings.

# Vocabulary size (passed to imdb.load_data and the Embedding layer) and the
# fixed length every review is padded/truncated to.
max_features, maxlen = 5000, 400

# Mini-batch size used for fitting and evaluation.
batch_size = 32

# Remaining knobs; the cells shown in this notebook section do not read them.
embedding_dims, hidden_dims = 50, 250
filters, kernel_size = 250, 3
epochs = 2

In [5]:
# Compatibility shim for loading the IMDB dataset.
import numpy as np

# Keep a handle on the unpatched loader so it can still be called (or restored) later.
np_load_old = np.load

# Opt-in workaround (disabled): force allow_pickle=True on every np.load call.
#np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

In [6]:
print('Loading data...')
# Restrict the vocabulary to the `max_features` most frequent words.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [7]:
# Raw word -> index mapping exactly as Keras ships it (no offset applied here;
# get_fixed_word_to_id_dict() builds the shifted version the helpers use).
word_to_id = imdb.get_word_index()

In [8]:
def get_fixed_word_to_id_dict():
    """Return the IMDB word -> id mapping with room made for the special tokens.

    Every index from ``keras.datasets.imdb.get_word_index()`` is shifted up by
    3 so that ids 0, 1 and 2 are free, then those ids are assigned to
    ``" "`` (0), ``"<START>"`` (1) and ``"<UNK>"`` (2).

    Returns:
        dict: a freshly built mapping from token string to integer id.
    """
    INDEX_FROM = 3  # offset applied to every index coming from Keras

    shifted = {}
    for word, index in keras.datasets.imdb.get_word_index().items():
        shifted[word] = index + INDEX_FROM

    # Reserved slots below the offset.
    shifted.update({" ": 0, "<START>": 1, "<UNK>": 2})
    return shifted

In [9]:
def decode_to_sentence(data_point):
    """Convert a sequence of word ids back into a space-joined string.

    Args:
        data_point: iterable of integer word ids in the shifted encoding
            produced by ``get_fixed_word_to_id_dict()``.

    Returns:
        str: the decoded tokens joined by single spaces. Ids that have no
        entry in the mapping (for example 3, which the +3 offset leaves
        unassigned) are rendered as ``"<UNK>"`` instead of raising KeyError.
    """
    word_to_id = get_fixed_word_to_id_dict()

    # Invert the mapping so a token can be looked up by its id.
    id_to_word = {index: word for word, index in word_to_id.items()}

    # `idx` rather than `id` so the builtin is not shadowed.
    return ' '.join(id_to_word.get(idx, "<UNK>") for idx in data_point)

In [10]:
def encode_sentence(sent, num_words=None):
    """Encode a raw sentence as a list of IMDB word ids.

    Args:
        sent: text to encode; tokens are separated by whitespace.
        num_words: vocabulary size limit. Any id ``>= num_words`` is replaced
            by the ``<UNK>`` id, mirroring what
            ``imdb.load_data(num_words=...)`` does to the training data.
            Defaults to the notebook-level ``max_features`` so the ids always
            fit the model's Embedding layer.

    Returns:
        list[int]: one id per token; tokens that are unknown or fall outside
        the vocabulary are encoded as 2 (``<UNK>``).
    """
    # NOTE(review): imdb.load_data prepends a start marker (id 1) to each
    # review by default; this encoder does not - confirm whether that matters.
    limit = max_features if num_words is None else num_words

    word_to_id = get_fixed_word_to_id_dict()
    unk_id = word_to_id["<UNK>"]  # 2

    encoded = []
    # split() with no argument collapses runs of whitespace, so repeated
    # spaces no longer produce empty tokens that would be encoded as <UNK>.
    for token in sent.split():
        idx = word_to_id.get(token)
        if idx is None:
            # Fall back to the lower-cased form before giving up on the token.
            idx = word_to_id.get(token.lower(), unk_id)
        if idx >= limit:
            # Outside the vocabulary the model was trained with.
            idx = unk_id
        encoded.append(idx)
    return encoded

In [11]:
print('Pad sequences (samples x time)')
# Bring every review in both splits to exactly `maxlen` ids.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [12]:
#print(x_train[0])

In [14]:
# Imports used by this cell (kept as in the original cell; several are not
# referenced below but are left in place in case later cells rely on them).
from keras.layers import LSTM

import matplotlib.pyplot as plt

from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense

print('Build model...')

# Embedding -> LSTM -> single sigmoid unit (binary sentiment classifier).
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#model.summary()

print('Train...')
# NOTE: trains for 5 epochs; the `epochs` value from the parameters cell is
# not used here. The History object is kept so the curves can be inspected.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=5,
                    validation_data=(x_test, y_test))

#print(history.history.keys())

# Evaluate the model that was just trained. (The previous `model = get_model()`
# call referenced an undefined function and would have discarded the trained
# weights, so it has been removed.)
print('\n')
print('Accuracy and Score during testing of the model :')
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('\n')

# Predicted probabilities for the TEST set, reduced to a 1d array.
yhat_probs = model.predict(x_test, verbose=0)
yhat_probs = yhat_probs[:, 0]
# Crisp 0/1 classes: threshold the sigmoid output at 0.5. This replaces
# Sequential.predict_classes, which is not available in newer Keras releases.
yhat_classes = (yhat_probs > 0.5).astype('int32')

print('Model Evaluation :')
#accuracy = accuracy_score(y_test, yhat_classes)
#print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)

Build model...
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy and Score during testing of the model :
Test score: 0.3374877428817749
Test accuracy: 0.8601199984550476


Model Evaluation :
Precision: 0.809999
Recall: 0.940960
F1 score: 0.870582


In [50]:
# Score every test review, then label it: outputs below 0.5 are negative.
predictions = model.predict(x_test)
sentiment = ['POS' if not (score < 0.5) else 'NEG' for score in predictions]

In [51]:
# Hand-written sentences the model has never seen, to check how it copes with new text.
test_sentences = []

for raw_text in (
    "i do not like this at all",
    "loved it",
    "did not love it",
    "cannot say that i loved it",
):
    test_sentence = encode_sentence(raw_text)   # sentence -> list of word ids
    test_sentences.append(test_sentence)

In [52]:
# Same padding as the training data so the inputs match what the model expects.
test_sentences = sequence.pad_sequences(sequences=test_sentences, maxlen=maxlen)

In [53]:
test_sentences.shape    # (number of sentences, maxlen) after padding

(4, 400)

In [54]:
predictions = model.predict(test_sentences)

# Use the same 0.5 decision threshold as for the test-set predictions and the
# reported metrics; the earlier 0.96 cut-off was inconsistent with both.
sentiment = ['NEG' if prob < 0.5 else 'POS' for prob in predictions[:, 0]]

for encoded, label in zip(test_sentences, sentiment):
    # Drop the padding ids (0) before decoding so the printed sentence is not
    # preceded by hundreds of blank padding tokens.
    print(decode_to_sentence(encoded[encoded != 0]), "--", label)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  i do not like this at all -- NEG
                                                                                                                                                                                     