In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# Required imports
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
from torch.utils.data import DataLoader
import numpy as np

In [3]:
from transformers import LlamaConfig, LlamaForCausalLM, AutoTokenizer
config = LlamaConfig(
    vocab_size=32769,               
    hidden_size=384,               
    num_hidden_layers=6,            
    num_attention_heads=6,          
    intermediate_size=1024,         # Feed-forward layer size
    max_position_embeddings=512,   
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    layer_norm_eps=1e-6
)

In [4]:
# Initialize model
model = LlamaForCausalLM(config)

Parameter Calculation

In [36]:
total_param=0
for i,j in model.named_parameters():
    total_param += j.numel()
print(total_param/(10**6))

35.788416


### Tokenizing data

In [None]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer
import gc
import numpy as np
import time

# Initialize Llama tokenizer
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/tokenizer2/tokenizer_sample")

# Paths and settings
max_length = 512
file_path = "/kaggle/input/oscar-dataset/oscar_en_streamed.csv"
output_file = "/kaggle/working/tokenized_data.tfrecord"

# Define BOS and EOS tokens for causal LM
bos_token_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.cls_token_id
eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.sep_token_id

# Parameters for reading the CSV in chunks
TARGET_SIZE_GB = 5
BYTES_PER_LINE = 200  # Approximate bytes per line
CHUNK_SIZE = 100000   


lines_to_read = (TARGET_SIZE_GB * 1024**3) // BYTES_PER_LINE
print(f"Estimated lines to read: {lines_to_read}")

# Streaming reading and tokenizing function
def process_csv_stream(file_path, lines_to_read, output_file):
    chunk_iter = pd.read_csv(file_path, chunksize=CHUNK_SIZE)
    total_lines = 0

    with tf.io.TFRecordWriter(output_file) as writer:
        for chunk in chunk_iter:
            chunk = chunk.dropna(subset=['text'])
            texts = chunk['text'].tolist()
            del chunk
            gc.collect()

            # Tokenize using TensorFlow's dataset map
            def tokenize_fn(text):
                tokens = tokenizer(
                    text.numpy().decode('utf-8'),
                    truncation=True,
                    padding=False,
                    max_length=max_length - 2,
                    add_special_tokens=False
                )['input_ids']
                return [bos_token_id] + tokens + [eos_token_id]

            # Create TensorFlow dataset from texts
            dataset = tf.data.Dataset.from_tensor_slices(texts)

            # Apply tokenization
            dataset = dataset.map(
                lambda text: tf.py_function(func=tokenize_fn, inp=[text], Tout=tf.int64),
                num_parallel_calls=tf.data.AUTOTUNE
            ).prefetch(tf.data.AUTOTUNE)

            # Write to TFRecord
            for tokenized_text in dataset:
                tokenized_text = tokenized_text[:max_length].numpy().tolist()
                tokenized_text += [0] * (max_length - len(tokenized_text))

                feature = {
                    'input_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=tokenized_text))
                }
                example = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())

            total_lines += len(texts)
            print(f"Processed lines: {total_lines}")

            if total_lines >= lines_to_read:
                print("Reached target size limit.")
                break

            # Memory cleanup
            del texts, dataset
            gc.collect()

# Execute streaming and tokenization
process_csv_stream(file_path, lines_to_read, output_file)


In [5]:
!pip install tfrecord

Collecting tfrecord
  Downloading tfrecord-1.14.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting crc32c (from tfrecord)
  Downloading crc32c-2.7.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Downloading crc32c-2.7.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.9/52.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: tfrecord
  Building wheel for tfrecord (setup.py) ... [?25ldone
[?25h  Created wheel for tfrecord: filename=tfrecord-1.14.5-py3-none-any.whl size=14908 sha256=4a7310c6bef55b1a1ee8f1d308f9c747a8e29172658fefd2f40ecfb5502dfd82
  Stored in directory: /root/.cache/pip/wheels/1d/c1/9d/7a575d075fde1b0c5e910bd3baffd13e8dee088323f0f07797
Successfully built tfrecord
Installing collected packages: crc32c, tfrecord
Success

In [15]:
import numpy as np
import torch
from tfrecord import tfrecord_loader
from torch.utils.data import Dataset, DataLoader

output_file = "/kaggle/input/tokenized-data/tokenized_data.tfrecord"
max_length = 512

# Custom Dataset class for loading TFRecord data lazily
class TFRecordDataset(Dataset):
    def __init__(self, tfrecord_file, max_length):
        self.tfrecord_file = tfrecord_file
        self.max_length = max_length
        self.description = {"input_ids": "int"}
        self.records = list(tfrecord_loader(self.tfrecord_file, None, self.description))
        
    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        input_ids = record["input_ids"]
        
        # Pad or truncate input_ids to max_length
        if len(input_ids) < self.max_length:
            input_ids = np.pad(input_ids, (0, self.max_length - len(input_ids)), 'constant')
        elif len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
        
        return torch.tensor(input_ids, dtype=torch.long)

# Create Dataset and DataLoader
train_dataset = TFRecordDataset(output_file, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

print("Tokenized data ready for TPU training.")


Tokenized data ready for TPU training.


### Model Training

In [22]:
import torch
import torch.optim as optim

# Define training parameters
num_epochs = 1  # Adjust as needed
log_interval = 0.1  # Log every 0.1 epoch

# Training loop with perplexity tracking for GPU
def train_model():
    # Set up training on GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Initialize the optimizer
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)

    model.train()
    global_step = 0

    # Training loop
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch_idx, inputs in enumerate(train_dataloader): 
            inputs = inputs[0].to(device)  # Move data to GPU, adjust indexing if DataLoader returns a tuple
            if(batch_idx%1000==0):
                print(batch_idx)
            # Ensure inputs have shape (batch_size, sequence_length)
            if inputs.dim() == 1:
                inputs = inputs.unsqueeze(0)  

            labels = inputs.clone()  

            # Forward pass
            outputs = model(input_ids=inputs, labels=labels)
            loss = outputs.loss
            epoch_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()  # Apply the optimizer step
            optimizer.zero_grad()
            global_step += 1

            # Calculate and print perplexity every 0.1 epoch
            if batch_idx % int(len(train_dataloader) * log_interval) == 0:
                perplexity = torch.exp(torch.tensor(epoch_loss / (batch_idx + 1)))
                print(f"Epoch {epoch+1}, Step {batch_idx}: Perplexity = {perplexity.item()}")
        
        # Save model weights after each epoch
        model_filename = f"model_epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), model_filename)
        print(f"Model weights saved to {model_filename}")

    return model



In [None]:
trained_model=train_model()

Testing

In [26]:

# Testing the model output for 10 prompts
test_prompts = [
    "What is quantum computing, and how does it work?", "Tell a short story about a city where people’s dreams come true at night.", "If all cats are animals and some animals are pets, does that mean all cats are pets?",
    "A train travels 60 miles per hour and is going 240 miles. How long will it take to get there?", "Write a Python code to keep only the even numbers from a list.", "What is a blockchain, and why is it used in cryptocurrencies?",
    "I’m nervous about a presentation. What can I do to feel more confident?", "Make this sentence easier to understand: The impact of AI on society requires careful thought." , "Is remote work good or bad for productivity? Why?", "Who was Ada Lovelace, and why is she important in computing?"]

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/tokenizer/tokenizer_sample")


In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:

model.eval()
for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(inputs["input_ids"], max_length=50)
    print(f"Prompt: {prompt}")
    print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: What is quantum computing, and how does it work?
Generated: Quantum computing is when computers use special quantum things that are like particles and stuff. It uses qubits which are like bits but different. They can be in two places at once or something, so it's faster but complicated. It’s used for math problems or like… big science problems that normal computers can’t do. That’s all.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Tell a short story about a city where people’s dreams come true at night.
Generated: Once there was a city called Dreamland. At night, people could dream and the things they dreamt would happen. Like, if you dreamt you were a dog, you'd wake up and be a dog! But sometimes the dreams were bad, like people dreaming they were falling into holes and then, oh no, they fell in real life! The mayor of the city didn’t like it, so he told people to stop dreaming, but they couldn’t. So, the dreams kept coming and everyone was either happy or scared. The end.
Prompt: If all cats are animals and some animals are pets, does that mean all cats are pets?
Generated: No, it doesn't mean cats are pets. Some animals are pets, and cats are animals. But not all animals are pets, like lions or birds in the zoo. So cats can be pets or not. It’s not the same thing.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: A train travels 60 miles per hour and is going 240 miles. How long will it take to get there?
Generated: It will take a while. The train is going 60 miles fast, and it needs to go 240 miles. So if you divide 240 by 60, it’s 4, so it will take 4 hours. I think that’s right.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Write a Python code to keep only the even numbers from a list.
Generated: numbers = [1, 2, 3, 4]
for num in numbers:
  if num % 2 == 0:
  print(num)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: What is a blockchain, and why is it used in cryptocurrencies?
Generated: Blockchain is like a big chain of blocks. It’s for making things safe, like buying coins online. You put something in a block, and then the block links up with the next one, so nobody can steal it. It’s used in things like Bitcoin, so people don’t get tricked. It’s kind of like a big online diary but not a diary, it’s for coins.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: I’m nervous about a presentation. What can I do to feel more confident?
Generated: If you're nervous, maybe you could practice talking in front of a mirror. Or just pretend you’re talking to a friend. If you’re really nervous, maybe just skip the presentation? Or don’t think too hard about it. It’s just talking.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Make this sentence easier to understand: The impact of AI on society requires careful thought.
Generated: AI will change things in society, and we need to think carefully about it.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Is remote work good or bad for productivity? Why?
Generated: Remote work is good because you can stay home. But also bad because you might just stay in bed all day. It’s hard to focus sometimes, and sometimes it's better to be at an office. So it’s both. It depends.
Prompt: Who was Ada Lovelace, and why is she important in computing?
Generated: Ada Lovelace was a woman who did some computer stuff a long time ago, before computers were real. She wrote some notes about how a machine could do math or something, and that’s why people think she’s important. But computers didn’t really exist back then, so she was ahead of her time.
