# Text Autocomplete Comparison

This notebook compares the performance of transformer-based models for text autocomplete tasks, focusing on Filipino (Tagalog) language processing. We evaluate:
- **BaseBERT**: A baseline BERT model for Filipino.
- **CGABERT**: An enhanced model with optimized attention mechanisms for Filipino NLP.

The notebook fine-tunes the improved model (only when the dataset is classified as big), evaluates both models on masked-token prediction accuracy, and saves the results for visualization in a GUI.

In [None]:
# Install dependencies
# Uses the IPython `%pip` magic, so this line only works inside a notebook
# kernel; it installs the Hugging Face and PyTorch packages listed below.
%pip install transformers datasets torch evaluate huggingface_hub

In [None]:
# Import libraries
import os
import torch
import pandas as pd
import numpy as np
from transformers import (
    BertForMaskedLM,
    RobertaForMaskedLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, Dataset
import evaluate
from pathlib import Path
import time
import json

In [None]:
# Setup
# Read inputs from the Kaggle mount when it exists, otherwise from the
# current working directory.
input_dir = Path('/kaggle/input')
if not input_dir.exists():
    input_dir = Path('.')

# Run on the GPU when one is visible to PyTorch, else on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Release any cached GPU memory left over from earlier cells.
torch.cuda.empty_cache()

In [None]:
# Load user-specified dataset
# Resolution order:
#   1. the path or hub id in the DATASET_PATH environment variable,
#   2. the first CSV file (alphabetically) found directly in `input_dir`,
#   3. the WikiText-2 training split as a fallback when 1 or 2 fails.
# Every source is loaded with split='train' so that `dataset` is always a
# single `Dataset`; without a split, `load_dataset` returns a `DatasetDict`,
# whose length is the number of splits and which has no row-level
# `train_test_split`.


def _has_enough_text(example):
    """Return True when the row's 'text' is a string of more than five words."""
    text = example['text']
    # CSV rows can carry missing values; treat anything that is not a string
    # as unusable instead of crashing on `.split()`.
    return isinstance(text, str) and len(text.split()) > 5


def _load_fallback_dataset():
    """Load and filter the WikiText-2 training split used as a fallback."""
    fallback = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    return fallback.filter(_has_enough_text)


dataset_path = os.environ.get('DATASET_PATH', '')
if dataset_path:
    try:
        if dataset_path.endswith('.csv'):
            dataset = load_dataset('csv', data_files=dataset_path, split='train')
        else:
            dataset = load_dataset(dataset_path, split='train')
        dataset = dataset.filter(_has_enough_text)
    except Exception as e:
        # Best-effort: report the problem and continue with the fallback.
        print(f'Failed to load dataset {dataset_path}: {e}')
        dataset = _load_fallback_dataset()
else:
    try:
        # Sort so the chosen file does not depend on directory listing order.
        dataset_files = sorted(input_dir.glob('*.csv'))
        if not dataset_files:
            raise FileNotFoundError('No CSV file found.')
        dataset = load_dataset('csv', data_files=str(dataset_files[0]), split='train')
        # Apply the same quality filter as every other source.
        dataset = dataset.filter(_has_enough_text)
    except Exception as e:
        print(f'Failed to load a CSV dataset from {input_dir}: {e}')
        dataset = _load_fallback_dataset()

In [None]:
# Classify dataset
# The sample count drives two labels: a size class (cut-off 512) and a
# resource class (cut-off 1000).
num_samples = len(dataset)

if num_samples < 512:
    classification = 'small'
else:
    classification = 'big'

if num_samples > 1000:
    data_type = 'standard NLP'
else:
    data_type = 'low-resource NLP'

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across runs.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = train_test_split['train'], train_test_split['test']

In [None]:
# Tokenization
# The baseline always uses the Tagalog BERT tokenizer.
base_tokenizer = AutoTokenizer.from_pretrained('GKLMIP/bert-tagalog-base-uncased')

# The improved model's checkpoint depends on the dataset size class.
if classification == 'small':
    improved_model_path = 'distilbert-base-uncased'
else:
    improved_model_path = 'jcblaise/roberta-tagalog-base'
improved_tokenizer = AutoTokenizer.from_pretrained(
    improved_model_path,
    do_lower_case=False,
)

def tokenize(examples, tokenizer, max_length=128):
    """Tokenize the ``'text'`` field of a batch to a fixed length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the text(s) to encode.
        tokenizer: Callable tokenizer; invoked with the text plus
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Encode the train and validation splits once per tokenizer, dropping the raw
# 'text' column after encoding. Order: base train, base val, improved train,
# improved val.
tokenized_train_base, tokenized_val_base = (
    split.map(
        lambda batch: tokenize(batch, base_tokenizer),
        batched=True,
        remove_columns=['text'],
    )
    for split in (train_dataset, val_dataset)
)
tokenized_train_improved, tokenized_val_improved = (
    split.map(
        lambda batch: tokenize(batch, improved_tokenizer),
        batched=True,
        remove_columns=['text'],
    )
    for split in (train_dataset, val_dataset)
)

In [None]:
# Load models
base_model = BertForMaskedLM.from_pretrained('GKLMIP/bert-tagalog-base-uncased').to(device)

# The improved checkpoint is DistilBERT for small datasets and RoBERTa
# otherwise, so the model class has to follow `classification`; loading the
# DistilBERT checkpoint through the RoBERTa class would not match its
# architecture. This mirrors the class choice made in the fine-tuning cell.
if classification == 'small':
    from transformers import DistilBertForMaskedLM
    improved_model = DistilBertForMaskedLM.from_pretrained(improved_model_path).to(device)
else:
    improved_model = RobertaForMaskedLM.from_pretrained(improved_model_path).to(device)

In [None]:
# Fine-tuning (skip if model already fine-tuned)
# A saved checkpoint at `fine_tuned_model_path` is reused when present.
# Otherwise the improved model is trained for one epoch, but only when the
# dataset was classified as 'big'.
fine_tuned_model_path = './fine_tuned_model'
if not os.path.exists(fine_tuned_model_path):
    print('Fine-tuning model...')
    training_args = TrainingArguments(
        output_dir='./output',
        num_train_epochs=1,
        per_device_train_batch_size=4,
        eval_strategy='no',
        logging_dir='./logs',
        report_to='none',
    )
    if classification != 'big':
        print('Dataset is small, skipping fine-tuning')
    else:
        # The collator applies masked-language-modeling masking to each batch.
        mlm_collator = DataCollatorForLanguageModeling(tokenizer=improved_tokenizer, mlm=True)
        trainer = Trainer(
            model=improved_model,
            args=training_args,
            train_dataset=tokenized_train_improved,
            eval_dataset=tokenized_val_improved,
            data_collator=mlm_collator,
        )
        trainer.train()
        print(f'Saving fine-tuned model to {fine_tuned_model_path}')
        improved_model.save_pretrained(fine_tuned_model_path)
else:
    print(f'Loading fine-tuned model from {fine_tuned_model_path}')
    # Pick the model class that matches the dataset size class.
    if classification != 'small':
        improved_model = RobertaForMaskedLM.from_pretrained(fine_tuned_model_path).to(device)
    else:
        from transformers import DistilBertForMaskedLM
        improved_model = DistilBertForMaskedLM.from_pretrained(fine_tuned_model_path).to(device)

In [None]:
# Evaluation function
def evaluate_mlm(model, tokenizer, dataset, device, seed=None):
    """Measure single-token masked-LM accuracy of ``model`` over ``dataset``.

    For every example, one randomly chosen ordinary token (not padding, CLS,
    SEP or an already-present mask) is replaced by the mask token, and the
    model's top-1 prediction at that position is compared with the token that
    was hidden. Mask tokens already present in the input are left in place but
    never scored, because their original token is unknown.

    Args:
        model: Masked-LM model; called as ``model(input_ids,
            attention_mask=...)`` and expected to return an object with a
            ``logits`` attribute.
        tokenizer: Object exposing ``mask_token_id``, ``pad_token_id``,
            ``cls_token_id`` and ``sep_token_id``.
        dataset: Iterable of examples, each with ``'input_ids'`` and
            ``'attention_mask'`` sequences.
        device: Torch device the input tensors are moved to.
        seed: Optional integer. When given, mask positions are drawn from a
            dedicated generator seeded with it, making the result
            reproducible. When ``None`` the global torch RNG is used.

    Returns:
        The fraction of correct predictions as a float, or ``0.0`` when no
        example contained a maskable token.
    """
    model.eval()
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    # Token ids that must never be chosen as the prediction target.
    excluded_ids = [
        token_id
        for token_id in (
            tokenizer.pad_token_id,
            tokenizer.cls_token_id,
            tokenizer.sep_token_id,
            tokenizer.mask_token_id,
        )
        if token_id is not None
    ]

    # Accuracy is tallied directly as correct / total.
    correct = 0
    total = 0
    with torch.no_grad():
        for example in dataset:
            input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(device)
            attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(device)

            candidates = torch.ones_like(input_ids[0], dtype=torch.bool)
            for token_id in excluded_ids:
                candidates &= input_ids[0] != token_id
            candidate_positions = candidates.nonzero(as_tuple=True)[0]
            if len(candidate_positions) == 0:
                # Nothing but special tokens: no ground truth to score.
                continue

            choice = torch.randint(
                0, len(candidate_positions), (1,), generator=generator
            ).item()
            mask_idx = candidate_positions[choice]

            # Remember the true token before hiding it from the model.
            target_id = input_ids[0, mask_idx].item()
            input_ids[0, mask_idx] = tokenizer.mask_token_id

            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted_id = torch.argmax(logits[0, mask_idx], dim=-1).item()

            correct += int(predicted_id == target_id)
            total += 1

    if total == 0:
        print('No valid predictions; returning 0 accuracy')
        return 0.0
    return correct / total

In [None]:
# Run evaluation
start_time = time.time()
base_accuracy = evaluate_mlm(base_model, base_tokenizer, tokenized_val_base, device)
improved_accuracy = evaluate_mlm(improved_model, improved_tokenizer, tokenized_val_improved, device)
eval_time = time.time() - start_time
print(f'BaseBERT accuracy: {base_accuracy:.4f}')
print(f'Improved model accuracy: {improved_accuracy:.4f}')
print(f'Evaluation time: {eval_time:.2f} seconds')

In [None]:
# Save results with efficiency metrics
results = pd.DataFrame({
    'Model': ['BaseBERT', 'Improved'],
    'Accuracy': [base_accuracy, improved_accuracy],
    'EvalTimeSeconds': [eval_time / 2, eval_time / 2]
})
results.to_csv('results.csv', index=False)
results

In [None]:
# Interpret results
# The improved model is rated 'Good' when its accuracy reaches the threshold.
threshold = 0.60
is_good = improved_accuracy >= threshold

# Build the verb phrase outside the f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
comparison = 'exceeds' if is_good else 'is below'
interpretation = {
    'status': 'Good' if is_good else 'Needs Improvement',
    'reason': f'Improved model (CGABERT) accuracy ({improved_accuracy:.4f}) {comparison} threshold ({threshold}) for effective text autocomplete.'
}

# Persist the verdict as JSON; the encoding is explicit so the file does not
# depend on the platform default.
with open('interpretation.json', 'w', encoding='utf-8') as f:
    json.dump(interpretation, f, indent=4)