In [1]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D        #for max pooling
from keras.datasets import imdb

In [3]:
# Notebook-wide settings.

# -- data / vocabulary --
max_features = 5000   # vocabulary size: num_words for imdb.load_data and the Embedding input size
maxlen = 400          # number of tokens every review is padded to

# -- training --
batch_size = 32
epochs = 2            # NOTE(review): get_model() does not read this value

# -- layer sizes (not referenced by the cells visible in this notebook) --
embedding_dims = 50
hidden_dims = filters = 250
kernel_size = 3

In [4]:
# Workaround scaffolding for loading the IMDB dataset: keep a handle on the
# original np.load so that a patched version could delegate to it.
import numpy as np 
np_load_old = np.load   # reference to the unpatched loader

# Disabled patch: would make every np.load call pass allow_pickle=True.
# NOTE(review): presumably only needed on NumPy versions where allow_pickle
# defaults to False -- confirm before re-enabling; it is a global monkey-patch.
#np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

In [5]:
print('Loading data...')

# Only the max_features most frequent words keep their own id.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [6]:
x_train.shape  # 25000 samples; 1-D because each review is still an unpadded list of word ids (padding to maxlen happens later)

(25000,)

In [7]:
y_train.shape  # one label per training review

(25000,)

In [8]:
y_train[0]      # label of the first review: 0 is negative sentiment, 1 is positive

1

In [9]:
x_train[0]  # first review as a list of integer word ids

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 2,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 2,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 2,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 2,
 18,
 51,
 36,
 

In [10]:
word_to_id = keras.datasets.imdb.get_word_index()   # word -> integer index dictionary that ships with the dataset
list(word_to_id.items())[:10]                       # peek at the first 10 (word, index) pairs

[('fawn', 34701),
 ('tsukino', 52006),
 ('nunnery', 52007),
 ('sonja', 16816),
 ('vani', 63951),
 ('woods', 1408),
 ('spiders', 16115),
 ('hanging', 2345),
 ('woody', 2289),
 ('trawling', 52008)]

In [11]:
# Look up the raw dictionary index of a few sentiment-bearing words.
for sample_word in ('love', 'like', 'boring', 'interesting'):
    print(word_to_id[sample_word])

116
37
354
218


In [12]:
# Build the word -> id dictionary that matches the encoded reviews,
# including the three reserved ids (padding, start marker, unknown word).
def get_fixed_word_to_id_dict(index_from=3):
    """Return the IMDB word index shifted by ``index_from`` plus reserved tokens.

    Args:
        index_from: Offset added to every index returned by
            ``imdb.get_word_index()``. The default of 3 leaves ids 0, 1 and 2
            free for the reserved tokens below.
            NOTE(review): this should match the ``index_from`` used by
            ``imdb.load_data`` (3 by default per the Keras docs) -- confirm
            if the data is ever loaded with a different offset.

    Returns:
        dict mapping each word to its shifted integer id, plus:
        ``" "`` -> 0 (padding, so padded positions decode to blanks),
        ``"<START>"`` -> 1 and ``"<UNK>"`` -> 2.
    """
    raw_index = keras.datasets.imdb.get_word_index()
    word_to_id = {word: rank + index_from for word, rank in raw_index.items()}
    word_to_id[" "] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    return word_to_id

In [13]:
def decode_to_sentence(data_point):
    """Turn a sequence of word ids back into a space-joined sentence.

    Args:
        data_point: Iterable of integer word ids (e.g. one encoded review).

    Returns:
        str with one token per id, joined by single spaces. Ids that have no
        entry in the dictionary are rendered as ``"<UNK>"`` instead of raising
        ``KeyError``.
    """
    word_to_id = get_fixed_word_to_id_dict()

    # Invert the mapping so an id can be looked up directly.
    id_to_word = {word_id: word for word, word_id in word_to_id.items()}
    return ' '.join(id_to_word.get(word_id, "<UNK>") for word_id in data_point)

In [14]:
data_point_to_show = 0    # index of the review to display

In [15]:
x_train[data_point_to_show]  # raw word ids of the selected review

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 2,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 2,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 2,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 2,
 18,
 51,
 36,
 

In [16]:
print(decode_to_sentence(x_train[data_point_to_show]))    # decode the selected review's ids back into text and print it

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly <UNK> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they

In [17]:
print(y_train[data_point_to_show]) # actual sentiment label of the selected review (1 -> positive)

1


In [18]:
def encode_sentence(sent, num_words=None):
    """Encode a whitespace-separated sentence as a list of word ids.

    Args:
        sent: Text to encode. Tokens are obtained with ``str.split()``, so
            repeated spaces, tabs and leading/trailing whitespace do not
            produce spurious unknown tokens.
        num_words: Optional vocabulary cap. When given, any id greater than or
            equal to it is replaced by the ``<UNK>`` id, so the result can be
            fed to an Embedding layer of that size. ``None`` (default) keeps
            every id found in the dictionary.

    Returns:
        list of int, one id per token; tokens missing from the dictionary are
        encoded as 2 (``<UNK>``).
    """
    unk_id = 2  # id reserved for <UNK> in get_fixed_word_to_id_dict
    word_to_id = get_fixed_word_to_id_dict()

    encoded = []
    for w in sent.split():
        word_id = word_to_id.get(w, unk_id)
        if num_words is not None and word_id >= num_words:
            word_id = unk_id
        encoded.append(word_id)
    return encoded

In [19]:
# Encode two sample texts; the made-up word is expected to map to the <UNK> id.
words = "fawn sonja vani made-up-word"
for sample_text in (words, "this does not look good"):
    print(encode_sentence(sample_text))

[34704, 16819, 63954, 2]
[14, 127, 24, 168, 52]


In [20]:
print('Pad sequences (samples x time)')
# Bring every review to exactly maxlen ids using keras.preprocessing.sequence;
# shorter reviews are filled with 0.
x_train, x_test = [
    sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
]
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [21]:
print(x_train[0])  # first review after padding

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    1   14   22   16   43  530  973 1622 1385   65  458 4468   66 3941
    4 

In [23]:
print('Build bidirectional lstm model...')

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

def get_model(n_epochs=5):
    """Build, compile and train the bidirectional LSTM sentiment classifier.

    Reads the notebook-level ``max_features``, ``maxlen``, ``batch_size``,
    ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Args:
        n_epochs: Number of training epochs. Defaults to 5, the value this
            function previously hard-coded (the notebook-level ``epochs``
            setting is not used here).

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    # Single sigmoid unit: output is read as the probability of the positive class.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Uncomment to print a layer-by-layer summary of the model.
    # model.summary()

    # NOTE(review): the test split doubles as validation data here, so the
    # validation metrics are not independent of the final test evaluation.
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=n_epochs,
              validation_data=(x_test, y_test))
    return model

import matplotlib.pyplot as plt

#print(history.history.keys())

from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense

# Train the model (get_model fits on x_train / y_train) and evaluate it on the test split.
model = get_model()
print('\n')
print('Accuracy and Score during testing of the model :')
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('\n')

# Predicted probabilities for the test set, reduced to a 1-D array.
# They must come from x_test so that they line up with y_test below.
yhat_probs = model.predict(x_test, verbose=0)[:, 0]
# Crisp 0/1 classes: threshold the sigmoid output at 0.5. This mirrors
# Sequential.predict_classes for a single sigmoid unit and also works on
# Keras versions that no longer provide that method.
yhat_classes = (yhat_probs > 0.5).astype('int32')

print('Model Evaluation :')
# Accuracy, (tp + tn) / (p + n), is already reported above by model.evaluate.
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)

Build bidirectional lstm model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy and Score during testing of the model :
Test score: 0.35819251821517945
Test accuracy: 0.8554800152778625


Model Evaluation :
Precision: 0.861642
Recall: 0.846960
F1 score: 0.854238


In [None]:
x_test[0]  # first test review (padded word ids)

In [25]:
# Run the trained model on the test data and turn each predicted
# probability into a label (more classes, e.g. neutral, could be added here).
predictions = model.predict(x_test)
sentiment = []
for prob in predictions:
    sentiment.append('NEG' if prob < 0.5 else 'POS')

In [30]:
data_point_to_show = 1  # index of the test review to inspect
# Print the decoded review text followed by the predicted sentiment label.
print(decode_to_sentence(x_test[data_point_to_show]), "--", sentiment[data_point_to_show])

                                                                                                                                                                                                                                                                                        <START> this film requires a lot of <UNK> because it focuses on mood and character development the plot is very simple and many of the scenes take place on the same set in <UNK> <UNK> the <UNK> dennis character apartment but the film builds to a disturbing climax br br the characters create an atmosphere <UNK> with sexual tension and psychological <UNK> it's very interesting that robert <UNK> directed this considering the style and structure of his other films still the <UNK> <UNK> audio style is evident here and there i think what really makes this film work is the brilliant performance by <UNK> dennis it's definitely one of her darker characters but she plays it so perfectly and convincingly that it's scary m

In [31]:
# Hand-written sentences the model has never seen, to probe its behaviour.
raw_sentences = [
    "i do not like this at all",
    "loved it",
    "did not love it",
    "cannot say that i loved it",
]

test_sentences = []
for raw_sentence in raw_sentences:
    test_sentence = encode_sentence(raw_sentence)     # sequence of word ids
    # The model's Embedding only covers ids below max_features; map rarer
    # words to 2 (<UNK>) so they cannot index past the embedding table.
    test_sentence = [word_id if word_id < max_features else 2
                     for word_id in test_sentence]
    test_sentences.append(test_sentence)

In [32]:
test_sentences = sequence.pad_sequences(test_sentences, maxlen=maxlen)    # pad to maxlen, same as the training data

In [34]:
test_sentences.shape    # (number of sentences, maxlen) after padding

(4, 400)

In [35]:
# Score the hand-written sentences and print each one next to its predicted label.
predictions = model.predict(test_sentences)
sentiment = []
for prob in predictions:
    sentiment.append('NEG' if prob < 0.5 else 'POS')

for padded_ids, label in zip(test_sentences, sentiment):
    print(decode_to_sentence(padded_ids), "--", label)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  i do not like this at all -- NEG
                                                                                                                                                                                     