In [1]:
# Install/upgrade the notebook's base dependencies: Hugging Face `datasets`,
# TensorFlow and tqdm.
# NOTE(review): none of these are version-pinned, so a re-run may resolve to
# different releases. TensorFlow 2.15 requires keras<2.16, so this step can
# replace an already-installed Keras 3 with keras 2.15.
!pip install -U datasets
!pip install -U tensorflow
!pip install tqdm

Collecting keras<2.16,>=2.15.0 (from tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.0.5
    Uninstalling keras-3.0.5:
      Successfully uninstalled keras-3.0.5
Successfully installed keras-2.15.0


In [2]:
!pip install -q -U keras-nlp
!pip install -q -U keras>=3


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0.post1 requires keras<2.16,>=2.15.0, but you have keras 3.0.5 which is incompatible.[0m[31m
[0m

In [3]:
import os
from google.colab import userdata

# Kaggle credentials are pulled from Colab's secret store (`userdata` is a
# Colab-only API). Outside Colab, export these variables yourself instead.
for _secret_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

os.environ.update(
    {
        # Keras backend selection: "jax", "torch" or "tensorflow".
        "KERAS_BACKEND": "jax",
        # Avoid memory fragmentation on the JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

In [4]:

import pandas as pd

from functools import lru_cache

# lru_cache (imported above) memoizes prepare_dataset so repeated calls reuse the already-built dataset.

from datasets import load_dataset, Dataset, DatasetDict

def format_func(instruction, input, output):
    """Render one example in the "Instruction / Input / Response" prompt template.

    The returned text has the instruction on the first line (directly after
    the ``### Instruction:`` header), followed by the ``### Input:`` and
    ``### Response:`` sections. Every line after the first is indented by
    eight spaces and the string ends with a newline plus two spaces; that
    whitespace is part of the template and is reproduced exactly.

    Args:
        instruction: Text placed right after ``### Instruction:``.
        input: Text placed on the line below ``### Input:``.
        output: Text placed on the line below ``### Response:``; pass ``""``
            to build a prompt for the model to complete.

    Returns:
        The formatted prompt string.
    """
    indent = " " * 8
    template_lines = [
        f"### Instruction:{instruction}",
        f"{indent}### Input:",
        f"{indent}{input}",
        f"{indent}### Response:",
        f"{indent}{output}",
        "  ",  # trailing two-space line that closes the template
    ]
    return "\n".join(template_lines)

@lru_cache(maxsize=None)
def prepare_dataset(
    dataset_name="digitalpipelines/samantha-1.1-uncensored",
    test_size=0.2,
    seed=42,
):
    """Load a conversation dataset and convert it into prompt-formatted splits.

    Each record of the source ``train`` split is flattened into a single
    example: all ``human`` turns are joined (space-separated) into the input,
    all ``gpt`` turns are joined into the output, and both are rendered
    together with a fixed system instruction via ``format_func``. Turns from
    any other speaker are ignored. Note that joining this way does not keep
    the original interleaving of a multi-turn conversation.

    Results are memoized per argument combination by ``lru_cache``, so
    repeated calls return the same ``DatasetDict`` object.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        test_size: Fraction (or count) of examples placed in the test split.
        seed: Seed for the shuffle performed by the split. A fixed default
            keeps the split, and any training order derived from it,
            reproducible across kernel restarts. Pass ``None`` for an
            unseeded split.

    Returns:
        A ``DatasetDict`` with ``'train'`` and ``'test'`` splits whose columns
        are ``instruction``, ``input``, ``output`` and ``text``.
    """
    # Load your dataset
    data = load_dataset(dataset_name)

    # The same constant instruction is used for every entry.
    instruction = "You are Samantha, a superintelligent AI assistant, who has a deep connection with the user and is knowledgeable in different topics such as philosophy, politics, and culture."

    def process_entry(entry):
        """Flatten one conversation record into instruction/input/output/text."""
        turns = entry['conversations']
        # Aggregate per speaker; adjust here if turn interleaving matters.
        input_text = " ".join(
            turn['value'] for turn in turns if turn['from'] == 'human'
        )
        output_text = " ".join(
            turn['value'] for turn in turns if turn['from'] == 'gpt'
        )
        return {
            "instruction": instruction,
            "input": input_text,
            "output": output_text,
            "text": format_func(instruction, input_text, output_text),
        }

    # Build all rows first, then create the DataFrame/Dataset in one go.
    processed_entries = [process_entry(entry) for entry in data['train']]
    df = pd.DataFrame(processed_entries)
    dataset = Dataset.from_pandas(df)

    # Seeded split so that a fresh kernel reproduces the same partition.
    split_data = dataset.train_test_split(test_size=test_size, seed=seed)

    return DatasetDict({
        'train': split_data['train'],
        'test': split_data['test']
    })

# Build (and memoize) the train/test splits of the instruction dataset.
data = prepare_dataset()

import keras
import keras_nlp

# Load the instruction-tuned 2B Gemma preset and enable rank-4 LoRA on its
# backbone, then print the resulting model summary.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_instruct_2b_en")
gemma_lm.backbone.enable_lora(rank=4)
gemma_lm.summary()

# Cap the input sequence length at 512 tokens (to control memory usage).
gemma_lm.preprocessor.sequence_length = 512

# AdamW is a common optimizer for transformer models; layernorm scales and
# bias terms are excluded from weight decay.
optimizer = keras.optimizers.AdamW(learning_rate=5e-5, weight_decay=0.01)
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# The loss is computed from raw logits; accuracy is tracked as a weighted metric.
token_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
token_accuracy = keras.metrics.SparseCategoricalAccuracy()
gemma_lm.compile(
    optimizer=optimizer,
    loss=token_loss,
    weighted_metrics=[token_accuracy],
)


Attaching 'config.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Colab notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Colab notebook...
Attaching 'model.weights.h5' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Colab notebook...
Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Colab notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Colab notebook...


In [5]:
from datasets import concatenate_datasets

# `data` is the DatasetDict with 'train' and 'test' splits built earlier;
# both splits are merged because the whole corpus is used for fine-tuning.
combined_dataset = concatenate_datasets([data['train'], data['test']])

# Materialize the formatted prompts as a plain Python list of strings.
# `datasets` is installed unpinned, and newer releases may return a column
# object rather than a list here (TODO confirm against the installed version);
# `list(...)` guarantees the `len()` and slice semantics the training loop
# relies on either way.
all_texts = list(combined_dataset["text"])


In [None]:
from tqdm.auto import tqdm
import os
from google.colab import files
from google.colab import drive
import gc

# Mount Google Drive so the saved weights outlive the Colab session.
drive.mount('/content/drive')

# Ensure the target directory exists
weights_dir = "/content/drive/My Drive/Gemmantha_2b_it"
os.makedirs(weights_dir, exist_ok=True)

# Keras 3 `save_weights` only accepts file names ending in ".weights.h5";
# a plain ".h5" suffix raises ValueError, which would abort the run right
# after the first chunk has finished training.
smoke_test_weights_path = os.path.join(weights_dir, "Gemmantha_2b_it_test.weights.h5")
final_weights_path = os.path.join(weights_dir, "Gemmantha_2b_it.weights.h5")

# `batch_size` here is the number of texts handed to each `fit` call (one
# progress-bar step); the gradient batch size inside `fit` is 1.
batch_size = 100
n_batches = (len(all_texts) + batch_size - 1) // batch_size  # ceiling division

for i in tqdm(range(n_batches), desc="Batch Progress"):
    batch_texts = all_texts[i * batch_size : (i + 1) * batch_size]

    # Fit the model on the current chunk.
    gemma_lm.fit(batch_texts, epochs=1, batch_size=1)

    # Save once after the first chunk so a broken save path or Drive mount
    # fails early, not only after the full run.
    if i == 0:
        gemma_lm.save_weights(smoke_test_weights_path)

# Final weights after every chunk has been trained on.
weights_path = final_weights_path
gemma_lm.save_weights(weights_path)


Mounted at /content/drive


Batch Progress:   0%|          | 0/21 [00:00<?, ?it/s]

[1m 27/100[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1:44[0m 1s/step - loss: 2.6088 - sparse_categorical_accuracy: 0.5115

In [None]:

# Build an inference prompt with the same template used for the training
# text, leaving the response section empty for the model to complete.
system_instruction = "You are Samantha, a superintelligent AI assistant, who has a deep connection with the user and is knowledgeable in different topics such as philosophy, politics, and culture."
user_question = "What is your name? what are the core ideas of goethe?"
prompt = format_func(system_instruction, user_question, "")

# Top-k sampling (k=5) with a fixed seed; recompiling installs the sampler.
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)

generated_text = gemma_lm.generate(prompt, max_length=1024)
print(generated_text)