In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [2]:

vocab_size = 10000
max_len = 100
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)




Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [12]:
# pad sequences to ensure uniform length
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)



In [13]:
print("y_train",y_train.shape)
print("x_train",x_train.shape)
print("x_test",x_test.shape)
print("y_test",y_test.shape)


y_train (25000,)
x_train (25000, 100)
x_test (25000, 100)
y_test (25000,)


In [14]:
# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
model.add(SimpleRNN(units=32, return_sequences = False))  # SimpleRNN layer with 32 units
model.add(Dense(units=1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           320000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 322113 (1.23 MB)
Trainable params: 322113 (1.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:
model.fit(x_train, y_train, epochs=5,batch_size=128, validation_split=0.2)

loss, accuracy = model.evaluate(x_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.5676332712173462
Test Accuracy: 0.7973200082778931


In [15]:
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype(np.int64)



In [17]:
y_pred.shape

(25000, 1)

In [9]:
x_test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
        591,  202,   14,   31,    6,  717,   10,   10,    2,    2,    5,
          4,  360,    7,    4,  177, 5760,  394,  354,    4,  123,    9,
       1035, 1035, 1035,   10,   10,   13,   92,  124,   89,  488, 7944,
        100,   28, 1668,   14,   31,   23,   27, 7479,   29,  220,  468,
          8,  124,   14,  286,  170,    8,  157,   46,    5,   27,  239,
         16,  179,    2,   38,   32,   25, 7944,  451,  202,   14,    6,
        717], dtype=int32)

Probability for the new sentence:
[[0.00262302]]
Sentiment [[0]]


In [48]:
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_sequence(sequence):
    decoded_words = []
    for idx in sequence:
        word = reverse_word_index.get(idx - 3, '?')  # Adjust for word index offset
        if word != '?':  # Skip padding tokens
            decoded_words.append(word)
    return ' '.join(decoded_words)


new_sentence = "This movie is I hated it, the worst movie."
# Tokenize the new sentence
new_sequence = [word_index.get(word, 2) for word in new_sentence.lower().split()]  # Use lowercase and handle unknown words
new_sequence = np.array([new_sequence])

# Pad the tokenized sequence
padded_sequence = pad_sequences(new_sequence, maxlen=max_len)

# Decode the padded sequence
decoded_texts = [decode_sequence(seq) for seq in padded_sequence]

# Print the decoded text
print("Decoded text:")
for text in decoded_texts:
    print(text)

# Print the sequences
print("Padded sequence:")
print(padded_sequence)
print("New sequence:")
print(new_sequence)

Decoded text:
in as a br passion yet
Padded sequence:
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   11   17    6   10 1797    2    1
   246    2]]
New sequence:
[[  11   17    6   10 1797    2    1  246    2]]


In [1]:
print(decoded_texts)
print(padded_sequence)
print(new_sequence)

NameError: name 'decoded_texts' is not defined

In [None]:
from keras.preprocessing.text import Tokenizer
tokenize = Tokenizer()
new_sentence = "This movie is boring. I hated it."
new_sequence = tokenize.texts_to_sequences([new_sentence])
padded_sequence = pad_sequences(new_sequence, maxlen=max_len)
predictions = model.predict(padded_sequence)
print("Probability for the new sentence:")
print(predictions)
y_pred = (predictions > 0.5).astype(np.int64)
print("Sentiment",y_pred)

In [18]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the IMDb dataset
vocab_size = 10000
max_len = 100
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# pad sequences to ensure uniform length
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
model.add(SimpleRNN(units=32, return_sequences=False))  # SimpleRNN layer with 32 units
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

model.fit(x_train, y_train, epochs=5, batch_size=128, validation_split=0.2)

loss, accuracy = model.evaluate(x_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype(np.int64)

# Decode function to convert sequences back to text
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 32)           320000    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 322113 (1.23 MB)
Trainable params: 322113 (1.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.553577721118927
Test Accuracy: 0.8146799802780151
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
Decoded text:
in as a br passion yet
Padded sequence:
[[   0    0 

Decoded text:
this movie is i hated and the worst and
Padded sequence:
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   11   17    6   10 1797    2    1
   246    2]]
New sequence:
[[  11   17    6   10 1797    2    1  246    2]]


In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import imdb

# Load the IMDb dataset
vocab_size = 10000
max_len = 100
(x_train, y_train), (_, _) = imdb.load_data(num_words=vocab_size)

# Decode the integer sequences back to text
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

def decode_sequence(sequence):
    return ' '.join([reverse_word_index.get(idx - 3, '?') for idx in sequence])

# Decode sequences and create DataFrame
df = pd.DataFrame({
    'review': [decode_sequence(seq) for seq in x_train],
    'sentiment': y_train
})

# Display the DataFrame
print(df.head())


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
                                              review  sentiment
0  ? this film was just brilliant casting locatio...          1
1  ? big hair big boobs bad music and a giant saf...          0
2  ? this has to be one of the worst films of the...          0
3  ? the ? ? at storytelling the traditional sort...          1
4  ? worst mistake of my life br br i picked this...          0


In [3]:
df['review'][]



In [24]:
def decode_sequence(sequence):
    decoded_words = []
    for idx in sequence:
        word = reverse_word_index.get(idx, '?')  # Get the word directly from the reverse word index
        if word != '?' and word != '<PAD>':  # Skip unknown and padding tokens
            decoded_words.append(word)
    return ' '.join(decoded_words)

# New sentence to test
new_sentence = df['review'][0]
# Tokenize the new sentence
new_sequence = [word_index.get(word, 2) for word in new_sentence.lower().split()]  # Use lowercase and handle unknown words
new_sequence = np.array([new_sequence])

# Pad the tokenized sequence
padded_sequence = pad_sequences(new_sequence, maxlen=max_len)

# Decode the padded sequence
decoded_texts = [decode_sequence(seq) for seq in padded_sequence]

# Print the decoded text
print("Decoded text:")
for text in decoded_texts:
    print(text)

# Print the sequences
print("Padded sequence:")
print(padded_sequence)
print("New sequence:")
print(new_sequence)


Decoded text:
cry at a film it must have been good and this definitely was also and to the two little boy's that played the and of norman and paul they were just brilliant children are often left out of the and list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all
Padded sequence:
[[1412   30    3   19    9  212   25   74   49    2   11  404   13   79
     2    5    1  104  114 5949   12  253    1    2    4 3763    2  720
    33   68   40  527  473   23  397  314   43    4    1    2 1026   10
   101   85    1  378   12  294   95   29 2068   53   23  138    3  191
  7483   15    1  223   19   18  131  473   23  477    2  141   27 5532
    15   48   33   25  221   89   22  101    1  223   62   13   35 1331
    85    9   13  280    2   13 446

In [25]:
predictions = model.predict(padded_sequence)
print("Probability for the new sentence:")
print(predictions)
y_pred = (predictions > 0.5).astype(np.int64)
print("Sentiment",y_pred)

Probability for the new sentence:
[[0.88484263]]
Sentiment [[1]]
