In [None]:
pip install tensorflow scikit-learn numpy fastapi uvicorn "python-multipart"



In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Parameters ---
VOCAB_SIZE = 10000  # Number of words to consider as features
MAX_LEN = 250       # Max length of a review (in words)
BATCH_SIZE = 64

# --- Load Data ---
# The data is already split and tokenized into integers
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")

# --- Pad Sequences ---
# All sequences must have the same length for the model.
# We pad shorter reviews with 0s and truncate longer ones.
X_train_padded = pad_sequences(X_train, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test, maxlen=MAX_LEN, padding='post', truncating='post')

print(f"Padded training data shape: {X_train_padded.shape}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Training data shape: (25000,)
Training labels shape: (25000,)
Padded training data shape: (25000, 250)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# --- Model Architecture ---
EMBEDDING_DIM = 128
LSTM_UNITS = 64

model = Sequential([
    # 1. Embedding Layer: Turns word IDs into rich vectors of size 128
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),

    # 2. Bidirectional LSTM Layer: Reads the review forwards and backwards for better context
    Bidirectional(LSTM(units=LSTM_UNITS, dropout=0.2, recurrent_dropout=0.2)),

    # 3. Output Layer: Gives a single output between 0 and 1 (Negative vs. Positive)
    Dense(1, activation='sigmoid')
])

# --- Compile the Model ---
# This step configures the model for training by specifying the optimizer,
# loss function, and metrics to monitor.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# This will print a summary of the model you just built
model.summary()



In [None]:
# --- Train the Model ---
EPOCHS = 5 # How many times to go through the entire training dataset

print("\nStarting model training...")

history = model.fit(
    X_train_padded,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test_padded, y_test) # Data to check performance against
)

# --- Evaluate the Model's Final Performance ---
print("\nTraining finished. Evaluating model on the test data...")
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"\nTest Accuracy: {accuracy*100:.2f}%")


Starting model training...
Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 1s/step - accuracy: 0.6567 - loss: 0.6026 - val_accuracy: 0.8218 - val_loss: 0.4254
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m398s[0m 1s/step - accuracy: 0.8386 - loss: 0.3950 - val_accuracy: 0.8024 - val_loss: 0.4470
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 1s/step - accuracy: 0.8597 - loss: 0.3519 - val_accuracy: 0.8148 - val_loss: 0.4235
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 1s/step - accuracy: 0.7436 - loss: 0.5284 - val_accuracy: 0.7883 - val_loss: 0.4821
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 1s/step - accuracy: 0.8726 - loss: 0.3299 - val_accuracy: 0.8220 - val_loss: 0.4259

Training finished. Evaluating model on the test data...
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 115ms/step - accuracy: 0.8219 - loss: 

In [None]:
import json
from tensorflow.keras.datasets import imdb

# 1. Save the trained model
# The file will contain the model's architecture and learned weights.
model.save('imdb_sentiment_lstm_model.h5')
print("Model saved as imdb_sentiment_lstm_model.h5")

# 2. Save the word_index
# This is the dictionary mapping words to integer IDs.
word_index = imdb.get_word_index()

# Keras reserves indices 0, 1, and 2 for special tokens.
# We must shift the original word_index by 3 to match the training setup.
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0      # Padding token
word_index["<START>"] = 1    # Start of sequence token
word_index["<UNK>"] = 2      # Unknown word token

# Save the dictionary as a JSON file for easy loading later.
with open('word_index.json', 'w') as f:
    json.dump(word_index, f)

print("Word index saved as word_index.json")




Model saved as imdb_sentiment_lstm_model.h5
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Word index saved as word_index.json


In [None]:
from google.colab import files
files.download("imdb_sentiment_lstm_model.h5")
files.download("word_index.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cp imdb_sentiment_lstm_model.h5 /content/drive/MyDrive/
!cp word_index.json /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Save the model
model.save('imdb_sentiment_lstm_model.h5')

# Save the word_index
import json

word_index = imdb.get_word_index()

# Keras reserves index 0 for padding, 1 for start, 2 for unknown.
# We need to shift the original word_index by 3.
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # UNK for unknown words

with open('word_index.json', 'w') as f:
    json.dump(word_index, f)

NameError: name 'model' is not defined