# Text

In [1]:
# TensorFlow and Keras for deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# For handling the dataset
import numpy as np
import matplotlib.pyplot as plt

Load Data

In [2]:
from tensorflow.keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


Explore the data

In [3]:
print(train_data[0])
print(train_labels[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


Preprocess the data

In [4]:
# dictionary
word_index = imdb.get_word_index()

# index to word
reverse_word_index = {value: key for (key, value) in word_index.items()}

# decodify
def decode_review(sequence):
    return ' '.join([reverse_word_index.get(i - 3, "") for i in sequence])

print(decode_review(train_data[0]))
print(f"Label: {'positive' if train_labels[0] == 1 else 'negative'}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step
 this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little boy's that played the  of norman and paul they were just brilliant children are often left out of the  list i think

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
import os

# Convert integer sequences back to text for tokenizer fitting
decoded_reviews = [' '.join([reverse_word_index.get(i - 3, "") for i in seq]) for seq in train_data]

# Create and fit the tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(decoded_reviews)

# Save the tokenizer
os.makedirs("Models", exist_ok=True)
with open("Models/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

: 

Build the model

In [5]:
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
model = keras.Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=500),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid") #binary output
])



Compile the model

In [6]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad the sequences to a fixed length
train_data = pad_sequences(train_data, maxlen=500)
test_data = pad_sequences(test_data, maxlen=500)

Train the model

In [8]:
model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.5849 - loss: 0.6621 - val_accuracy: 0.8268 - val_loss: 0.4100
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8214 - loss: 0.4006 - val_accuracy: 0.8744 - val_loss: 0.3226
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8534 - loss: 0.3364 - val_accuracy: 0.8674 - val_loss: 0.3207
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8896 - loss: 0.2708 - val_accuracy: 0.8828 - val_loss: 0.2912
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8995 - loss: 0.2449 - val_accuracy: 0.8902 - val_loss: 0.2793
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9169 - loss: 0.2149 - val_accuracy: 0.8906 - val_loss: 0.2753
Epoch 7/10
[1m625/625[0m 

<keras.src.callbacks.history.History at 0x7ad4a7a03850>

Evaluate the model

In [9]:
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8763 - loss: 0.3110
Test loss: 0.30890852212905884
Test accuracy: 0.875760018825531


Evaluate different index

In [10]:
index = 13

# real data
review = test_data[index]
true_label = test_labels[index]

# prediction
prediction = model.predict(np.array([review]))[0][0]

# decode
decoded = decode_review(review)

# results
print(f"\nReview:\n{decoded[:300]}...")
print(f"\nReal: {'Positive' if true_label == 1 else 'Negative'}")
print(f"Prediction: {'Positive' if prediction >= 0.5 else 'Negative'}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step

Review:
                                                                                                                                                                               i started watching this because i thought it was a really  porno as i kept watching the only thrill i got from this movie was...

Real: Negative
Prediction: Negative


In [11]:
model.save("text_model.keras")

In [13]:
from google.colab import files
files.download("text_model.keras")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>