<a href="https://colab.research.google.com/github/anushchhatani/CAI/blob/main/transformer_pretraining_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import re

import pandas as pd
import tensorflow as tf
import keras_nlp
import sentencepiece as spm

In [10]:
max_seq_length = 50
batch_size = 32
data_path = "user_behavior_dataset.csv"  # Path to your dataset
text_file_path = "constructed_text_data.txt"  # Temporary text file for vocab generation

In [11]:
print("Loading dataset...")
data = pd.read_csv(data_path)

Loading dataset...


In [12]:
# Construct text sequences from the dataset
print("Constructing text data...")
text_data = data.apply(
    lambda row: (
        f"User with device {row['Device Model']} using {row['Operating System']} "
        f"spends {row['App Usage Time (min/day)']} minutes daily on apps, "
        f"has {row['Number of Apps Installed']} apps installed, "
        f"and their data usage is {row['Data Usage (MB/day)']} MB per day."
    ),
    axis=1
)


Constructing text data...


In [13]:
# Save the constructed text data to a file
print("Saving text data to file...")
text_data.to_csv(text_file_path, index=False, header=False)

Saving text data to file...


In [14]:
# Step 2: Determine Vocabulary Size and Train SentencePiece
print("Counting unique tokens...")
with open(text_file_path, 'r') as f:
    unique_token_count = len(set(f.read().split()))
vocab_size = min(unique_token_count, 10000)  # Limit vocab size to a maximum of 10,000

Counting unique tokens...


In [15]:
print(f"Training SentencePiece model with vocab size: {vocab_size}...")
spm.SentencePieceTrainer.Train(
    input=text_file_path,
    model_prefix="user_behavior_vocab",
    vocab_size=vocab_size,
    model_type="word"
)

Training SentencePiece model with vocab size: 889...


In [16]:
# Load the trained SentencePiece model
print("Loading SentencePiece model...")
sp = spm.SentencePieceProcessor(model_file="user_behavior_vocab.model")

Loading SentencePiece model...


In [18]:
# Step 3: Initialize KerasNLP Tokenizer
# Include special tokens in the vocabulary
special_tokens = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2}
vocabulary = {sp.id_to_piece(i): i + len(special_tokens) for i in range(sp.get_piece_size())}
vocabulary.update(special_tokens)  # Add special tokens to the vocabulary

# Initialize the tokenizer with the updated vocabulary
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocabulary,
    sequence_length=max_seq_length,
    lowercase=True,
    oov_token="[UNK]",
)

In [20]:
# Step 4: Tokenize Dataset
# Tokenize the dataset without .to_tensor()
print("Tokenizing dataset...")
def tokenize_function(text):
    return tokenizer(text)

tf_dataset = tf.data.Dataset.from_tensor_slices(text_data).map(
    tokenize_function, num_parallel_calls=tf.data.AUTOTUNE
).batch(batch_size).shuffle(10000).prefetch(tf.data.AUTOTUNE)

Tokenizing dataset...


In [21]:
# Step 5: Build a Simple Transformer Model
print("Building Transformer model...")
from keras_nlp.layers import TransformerEncoder
from tensorflow.keras.layers import Embedding, Dense, Input
from tensorflow.keras.models import Model

num_layers = 4
d_model = 128
num_heads = 4
dff = 512
dropout_rate = 0.1

inputs = Input(shape=(None,), dtype=tf.int32)
x = Embedding(input_dim=vocab_size, output_dim=d_model)(inputs)
for _ in range(num_layers):
    x = TransformerEncoder(
        num_heads=num_heads,
        intermediate_dim=dff,
        dropout=dropout_rate
    )(x)
outputs = Dense(vocab_size)(x)

model = Model(inputs, outputs)

Building Transformer model...


In [22]:
# Step 6: Compile and Train the Model
print("Compiling and training the model...")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

Compiling and training the model...


In [27]:
# Step 7: Save the Pretrained Model
print("Saving the pretrained model...")
model.save("pretrained_transformer_model.keras")
print("Pretraining complete. Model saved as 'pretrained_transformer_model'.")

Saving the pretrained model...
Pretraining complete. Model saved as 'pretrained_transformer_model'.
