In [1]:
#tokenizer_padding
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Load train/val/test
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Tokenizer
MAX_VOCAB = 50000  # cap vocab size
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["clean_text"])

# Convert to sequences
def to_seq(df, tokenizer, maxlen=40):
    seqs = tokenizer.texts_to_sequences(df["clean_text"])
    return pad_sequences(seqs, maxlen=maxlen, padding="post", truncating="post")

X_train = to_seq(train_df, tokenizer)
X_test = to_seq(test_df, tokenizer)

y_train = train_df["label"].values
y_test = test_df["label"].values

print("Train shape:", X_train.shape, y_train.shape)

# Save tokenizer for reuse
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Save numpy arrays
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)

Train shape: (1276080, 40) (1276080,)


In [2]:
# src/08_lstm_baseline.py
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load preprocessed arrays
X_train = np.load("X_train.npy")
y_train = np.load("y_train.npy")
X_test = np.load("X_test.npy")
y_test = np.load("y_test.npy")

# Convert labels to one-hot
num_classes = 2
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes)

# Model parameters
MAX_VOCAB = 50000
EMBED_DIM = 100
MAXLEN = 40

# Build model
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAXLEN),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(num_classes, activation="softmax")
])
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"]
)

model.summary()

# Train
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train_cat,
    epochs=10,
    validation_split = 0.1,
    batch_size=100,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)
print("Test accuracy:", test_acc)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 100)           5000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                                 
Total params: 5117506 (19.52 MB)
Trainable params: 5117506 (19.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Test accuracy: 0.7960973978042603


In [3]:
# Save model after training
model.save("lstm_baseline.h5")   # HDF5 format



  saving_api.save_model(
