<a href="https://colab.research.google.com/github/Yonas-Desta-Ebren/ICOG_Training_Task/blob/main/ICOG_AMHARIC_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install required libraries
!pip install transformers tensorflow datasets




In [4]:
# Verify TensorFlow and GPU availability
import tensorflow as tf
print("TensorFlow Version:", tf.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

TensorFlow Version: 2.18.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import re

# Load the raw Amharic dataset
with open('/content/drive/My Drive/ICOG_Folders/raw-corpus.txt', 'r', encoding='utf-8') as file:
    raw_data = file.readlines()

In [7]:
# Basic data preprocessing
def clean_text(text):
    text = re.sub(r'[^\w\s፡-፦]', '', text)  # Remove non-Amharic characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()

In [42]:
cleaned_data = [clean_text(line) for line in raw_data if line.strip()]
cleaned_data = [line for line in cleaned_data if line]  # Remove empty lines

In [43]:
# Save the cleaned data to a new file
with open('/content/drive/My Drive/ICOG_Folders/amharic-corps-cleaned.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(cleaned_data))

In [44]:
# Load the cleaned data into a list for splitting
from datasets import load_dataset
dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/ICOG_Folders/amharic-corps-cleaned.txt'})

Generating train split: 0 examples [00:00, ? examples/s]

In [71]:
# Split into train (80%), validation (10%), and test (10%)
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
test_valid = dataset['test'].train_test_split(test_size=0.5, seed=42)

train_size = len(dataset['train'])  # 10,000
val_test_size = train_size // 8  # 10% of total (train is 80%, so val + test = 20%, val = 10%)
dataset['validation'] = test_valid['train'].select(range(val_test_size))  # 1,250 samples
dataset['test'] = test_valid['test'].select(range(val_test_size))  # 1,250 samples


print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3276
    })
    test: Dataset({
        features: ['text'],
        num_rows: 409
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 409
    })
})


In [72]:
from transformers import GPT2Tokenizer
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [73]:
# Add a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})


In [74]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=64)

In [75]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Format for TensorFlow
tokenized_dataset.set_format(type='tensorflow', columns=['input_ids', 'attention_mask'])

Map:   0%|          | 0/3276 [00:00<?, ? examples/s]

Map:   0%|          | 0/409 [00:00<?, ? examples/s]

Map:   0%|          | 0/409 [00:00<?, ? examples/s]

In [76]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 3276
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 409
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 409
    })
})


In [77]:
from transformers import TFGPT2LMHeadModel
# Load the pre-trained GPT-2 model
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [78]:
# Resize token embeddings
model.resize_token_embeddings(len(tokenizer))

<tf_keras.src.layers.core.embedding.Embedding at 0x7f42a03ba050>

In [79]:
print("Model loaded successfully")

Model loaded successfully


In [80]:
import tensorflow as tf

# Enable mixed precision training
from tensorflow.keras.mixed_precision import set_global_policy
set_global_policy('mixed_float16')

# Convert to tf.data.Dataset
def create_tf_dataset(split):
    dataset = tokenized_dataset[split]
    # Cast input_ids and attention_mask to int32 to avoid dtype mismatch
    features = {
        'input_ids': tf.cast(tf.convert_to_tensor(dataset['input_ids'], dtype=tf.int64), tf.int32),
        'attention_mask': tf.cast(tf.convert_to_tensor(dataset['attention_mask'], dtype=tf.int64), tf.int32),
    }
    # Labels are the same as input_ids for language modeling
    labels = features['input_ids']
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(32)  # Increased batch size to 32

In [81]:
train_dataset = create_tf_dataset('train')
val_dataset = create_tf_dataset('validation')
test_dataset = create_tf_dataset('test')

In [82]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Define optimizer and loss function (with mixed precision compatibility)
optimizer = Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [83]:
# Training step
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        # Forward pass
        outputs = model(inputs, training=True)
        logits = outputs.logits
        # Shift logits and labels for next-word prediction
        shift_logits = logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        # Compute loss
        loss = loss_fn(shift_labels, shift_logits)
    # Backward pass
    gradients = tape.gradient(loss, model.trainable_variables)
    # Update weights
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [84]:
# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0
    num_batches = 0
    for batch, (inputs, labels) in enumerate(train_dataset):
        loss = train_step(inputs, labels)
        total_loss += loss
        num_batches += 1
        if (batch + 1) % 50 == 0:
            print(f"Batch {batch + 1}, Loss: {loss:.4f}")
    avg_loss = total_loss / num_batches
    print(f"Average Training Loss: {avg_loss:.4f}")

    # Validation
    val_loss = 0
    val_batches = 0
    for inputs, labels in val_dataset:
        outputs = model(inputs, training=False)
        logits = outputs.logits
        shift_logits = logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        val_loss += loss_fn(shift_labels, shift_logits)
        val_batches += 1
    avg_val_loss = val_loss / val_batches
    print(f"Average Validation Loss: {avg_val_loss:.4f}")

Epoch 1/3
Batch 50, Loss: 2.1724
Batch 100, Loss: 1.7557
Average Training Loss: 3.2873
Average Validation Loss: 1.8536
Epoch 2/3
Batch 50, Loss: 1.8125
Batch 100, Loss: 1.5894
Average Training Loss: 1.8130
Average Validation Loss: 1.6222
Epoch 3/3
Batch 50, Loss: 1.7214
Batch 100, Loss: 1.5014
Average Training Loss: 1.6649
Average Validation Loss: 1.5424


In [85]:
# Save the model
model.save_pretrained('/content/drive/My Drive/ICOG_Folders/gpt2-amharic-finetuned')

In [86]:
# Test the model
test_loss = 0
test_batches = 0
for inputs, labels in test_dataset:
    outputs = model(inputs, training=False)
    logits = outputs.logits
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    test_loss += loss_fn(shift_labels, shift_logits)
    test_batches += 1
avg_test_loss = test_loss / test_batches
print(f"Average Test Loss: {avg_test_loss:.4f}")

Average Test Loss: 1.5314


In [93]:
# Generate text
prompt = "ማህበራዊ"
inputs = tokenizer(prompt, return_tensors="tf", padding=True, truncation=True, max_length=128)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

In [94]:
# Generate
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=64,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.5,
    temperature=0.6,
)

In [95]:
# Decode
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)

Generated Text: ማህበራዊት የመምን የመምስ የመን የስምን �
