In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: show which interpreter the kernel is running before the heavy cell executes\n",
    "import sys\n",
    "\n",
    "interpreter_version = sys.version\n",
    "print(f'Python version: {interpreter_version}')\n",
    "print('Kernel execution started')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import torch\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import subprocess\n",
    "import sys\n",
    "import logging\n",
    "from pathlib import Path\n",
    "\n",
    "# Set up logging\n",
    "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')\n",
    "logger = logging.getLogger(__name__)\n",
    "logger.info(\"Starting kernel execution\")\n",
    "\n",
    "# Install dependencies with error handling\n",
    "def install_packages():\n",
    "    \"\"\"Install each pinned dependency with pip, one subprocess call per requirement.\n",
    "\n",
    "    A failed install is logged and reported with a printed warning; the loop then\n",
    "    carries on with the remaining requirements instead of raising.\n",
    "    \"\"\"\n",
    "    pinned_requirements = (\n",
    "        \"transformers==4.44.2\",\n",
    "        \"datasets==3.0.1\",\n",
    "        \"torch==2.4.1\",\n",
    "        \"evaluate==0.4.3\",\n",
    "        \"huggingface_hub==0.25.2\",\n",
    "        \"peft==0.13.2\",\n",
    "        \"psutil==6.0.0\",\n",
    "    )\n",
    "    pip_install = [sys.executable, \"-m\", \"pip\", \"install\"]\n",
    "    for requirement in pinned_requirements:\n",
    "        command = pip_install + [requirement, \"--quiet\"]\n",
    "        try:\n",
    "            subprocess.check_call(command)\n",
    "        except subprocess.CalledProcessError as e:\n",
    "            logger.error(f\"Failed to install {requirement}: {e}\")\n",
    "            print(f\"Warning: Failed to install {requirement}. Proceeding with available packages.\")\n",
    "        else:\n",
    "            logger.info(f\"Successfully installed {requirement}\")\n",
    "\n",
    "install_packages()\n",
    "\n",
    "# Verify imports\n",
    "try:\n",
    "    from transformers import (\n",
    "        BertForMaskedLM,\n",
    "        RobertaForMaskedLM,\n",
    "        AutoTokenizer,\n",
    "        Trainer,\n",
    "        TrainingArguments,\n",
    "        DataCollatorForLanguageModeling\n",
    "    )\n",
    "    from datasets import load_dataset, Dataset\n",
    "    import evaluate\n",
    "    import time\n",
    "    import json\n",
    "    import psutil\n",
    "    from peft import LoraConfig, get_peft_model\n",
    "    import torch.nn.functional as F\n",
    "    logger.info(\"All imports successful\")\n",
    "except ImportError as e:\n",
    "    logger.error(f\"Import error: {e}\")\n",
    "    raise\n",
    "\n",
    "# Setup\n",
    "# Use the Kaggle input mount when it exists; otherwise work relative to the current directory\n",
    "kaggle_input_dir = Path('/kaggle/input')\n",
    "input_dir = kaggle_input_dir if kaggle_input_dir.exists() else Path('.')\n",
    "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
    "quant_device = torch.device('cpu')  # Quantized models run on CPU\n",
    "logger.info(f'Using device: {device} for main models, {quant_device} for quantized models')\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "# Baseline memory\n",
    "bytes_per_mb = 1024**2\n",
    "baseline_memory = psutil.Process().memory_info().rss / bytes_per_mb  # MB\n",
    "logger.info(f\"Baseline memory: {baseline_memory:.2f} MB\")\n",
    "\n",
    "# Load dataset\n",
    "def _is_usable_text(example):\n",
    "    \"\"\"Keep rows whose 'text' is a non-blank string with more than five whitespace-separated words.\"\"\"\n",
    "    text = example['text']\n",
    "    # CSV cells can be empty (None); treat those as unusable instead of crashing on .strip()\n",
    "    return isinstance(text, str) and text.strip() != '' and len(text.split()) > 5\n",
    "\n",
    "def _load_csv_dataset(csv_path):\n",
    "    \"\"\"Load one CSV file as a filtered Dataset.\"\"\"\n",
    "    # split='train' makes load_dataset return a Dataset rather than a DatasetDict, which the\n",
    "    # len(), slicing and train_test_split() calls further down require.\n",
    "    return load_dataset('csv', data_files=str(csv_path), split='train').filter(_is_usable_text)\n",
    "\n",
    "def _load_wikitext_fallback():\n",
    "    \"\"\"Load the filtered wikitext-2 train split used when no custom dataset is available.\"\"\"\n",
    "    wikitext = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train').filter(_is_usable_text)\n",
    "    logger.info(\"Fell back to wikitext dataset\")\n",
    "    return wikitext\n",
    "\n",
    "dataset_path = os.environ.get('DATASET_PATH', '')\n",
    "loaded_from_csv = False  # set only after a CSV was actually loaded and filtered\n",
    "if dataset_path:\n",
    "    try:\n",
    "        if dataset_path.endswith('.csv'):\n",
    "            dataset = _load_csv_dataset(dataset_path)\n",
    "            loaded_from_csv = True\n",
    "        else:\n",
    "            dataset = load_dataset(dataset_path, split='train').filter(_is_usable_text)\n",
    "        logger.info(f\"Loaded dataset from {dataset_path}\")\n",
    "    except Exception as e:\n",
    "        logger.warning(f'Failed to load dataset {dataset_path}: {e}')\n",
    "        dataset = _load_wikitext_fallback()\n",
    "else:\n",
    "    # Kaggle mounts each attached dataset in its own subdirectory of /kaggle/input, so look one\n",
    "    # level down when nothing sits directly in input_dir. Sorting makes the choice deterministic.\n",
    "    csv_candidates = sorted(input_dir.glob('*.csv')) or sorted(input_dir.glob('*/*.csv'))\n",
    "    try:\n",
    "        if not csv_candidates:\n",
    "            raise FileNotFoundError('No CSV file found; falling back to wikitext.')\n",
    "        dataset = _load_csv_dataset(csv_candidates[0])\n",
    "        loaded_from_csv = True\n",
    "        logger.info(f\"Loaded dataset from {csv_candidates[0]}\")\n",
    "    except Exception as e:\n",
    "        # Record why the fallback happened instead of swallowing the error silently\n",
    "        logger.warning(f'No usable CSV dataset under {input_dir}: {e}')\n",
    "        dataset = _load_wikitext_fallback()\n",
    "\n",
    "# Log dataset details\n",
    "num_samples = len(dataset)\n",
    "logger.info(f\"Dataset size: {num_samples} samples\")\n",
    "sample_texts = dataset[:5]['text']\n",
    "logger.info(f\"Sample texts: {sample_texts}\")\n",
    "# Label by what was actually loaded, not by the requested path: a failed CSV load that fell back\n",
    "# to wikitext must not be reported as Tagalog, and an auto-discovered CSV must not be reported\n",
    "# as wikitext.\n",
    "# NOTE(review): a non-CSV hub dataset is still labelled \"English (wikitext)\", as before - confirm intent.\n",
    "language = \"Tagalog\" if loaded_from_csv else \"English (wikitext)\"\n",
    "logger.info(f\"Dataset language: {language}\")\n",
    "\n",
    "# Classify dataset\n",
    "if num_samples < 512:\n",
    "    classification = 'small'\n",
    "else:\n",
    "    classification = 'big'\n",
    "if num_samples < 1000:\n",
    "    data_type = 'low-resource NLP'\n",
    "else:\n",
    "    data_type = 'standard NLP'\n",
    "# Fixed seed keeps the 80/20 split identical between runs\n",
    "train_test_split = dataset.train_test_split(test_size=0.2, seed=42)\n",
    "train_dataset, val_dataset = train_test_split['train'], train_test_split['test']\n",
    "\n",
    "# Tokenization\n",
    "base_tokenizer = AutoTokenizer.from_pretrained('GKLMIP/bert-tagalog-base-uncased')\n",
    "if classification == 'small':\n",
    "    improved_model_path = 'distilbert-base-uncased'\n",
    "else:\n",
    "    improved_model_path = 'jcblaise/roberta-tagalog-base'\n",
    "improved_tokenizer = AutoTokenizer.from_pretrained(improved_model_path, do_lower_case=False)\n",
    "\n",
    "def tokenize(examples, tokenizer):\n",
    "    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)\n",
    "\n",
    "# Filter non-Tagalog tokens if using wikitext\n",
    "def filter_non_tagalog(examples, tokenizer):\n",
    "    \"\"\"Decide whether one example survives the wikitext-only token filter.\n",
    "\n",
    "    Always True unless the module-level language is \"English (wikitext)\". Otherwise the\n",
    "    example is kept only if every non-special token is in the vocabulary, is not a '##'\n",
    "    continuation piece, and decodes to ASCII-only text.\n",
    "    \"\"\"\n",
    "    if language != \"English (wikitext)\":\n",
    "        return True\n",
    "    token_ids = tokenizer(examples['text'], truncation=True, max_length=64).input_ids\n",
    "    vocab = set(tokenizer.get_vocab().keys())\n",
    "    pieces = tokenizer.convert_ids_to_tokens(token_ids)\n",
    "    special_tokens = [tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token]\n",
    "    special_ids = [tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id]\n",
    "    for token_id, piece in zip(token_ids, pieces):\n",
    "        if piece in special_tokens:\n",
    "            continue\n",
    "        if piece not in vocab or piece.startswith('##'):\n",
    "            return False\n",
    "        if token_id in special_ids:\n",
    "            # Special ids are exempt from the ASCII check\n",
    "            continue\n",
    "        if any(ord(ch) >= 128 for ch in tokenizer.decode([token_id])):\n",
    "            return False\n",
    "    return True\n",
    "\n",
    "if language == \"English (wikitext)\":\n",
    "    def _keep_example(example):\n",
    "        return filter_non_tagalog(example, base_tokenizer)\n",
    "\n",
    "    train_dataset = train_dataset.filter(_keep_example)\n",
    "    val_dataset = val_dataset.filter(_keep_example)\n",
    "    logger.info(f\"Filtered dataset size: train={len(train_dataset)}, val={len(val_dataset)}\")\n",
    "\n",
    "def _tokenize_base(batch):\n",
    "    return tokenize(batch, base_tokenizer)\n",
    "\n",
    "def _tokenize_improved(batch):\n",
    "    return tokenize(batch, improved_tokenizer)\n",
    "\n",
    "# Same map settings for all four splits: batched calls, raw text column dropped afterwards\n",
    "map_options = {'batched': True, 'remove_columns': ['text']}\n",
    "tokenized_train_base = train_dataset.map(_tokenize_base, **map_options)\n",
    "tokenized_val_base = val_dataset.map(_tokenize_base, **map_options)\n",
    "tokenized_train_improved = train_dataset.map(_tokenize_improved, **map_options)\n",
    "tokenized_val_improved = val_dataset.map(_tokenize_improved, **map_options)\n",
    "\n",
    "# Validate tokens\n",
    "def validate_tokens(dataset, tokenizer):\n",
    "    \"\"\"Count samples containing a non-special token missing from the tokenizer vocabulary.\n",
    "\n",
    "    Logs the count and returns True only when no such sample was found.\n",
    "    \"\"\"\n",
    "    known_tokens = set(tokenizer.get_vocab().keys())\n",
    "    special_tokens = [tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token]\n",
    "    samples_with_unknown = 0\n",
    "    for sample in dataset:\n",
    "        for piece in tokenizer.convert_ids_to_tokens(sample['input_ids']):\n",
    "            if piece in special_tokens:\n",
    "                continue\n",
    "            if piece not in known_tokens:\n",
    "                # One unknown token is enough to flag the sample\n",
    "                samples_with_unknown += 1\n",
    "                break\n",
    "    logger.info(f\"Invalid tokens found: {samples_with_unknown}/{len(dataset)} samples\")\n",
    "    return samples_with_unknown == 0\n",
    "\n",
    "logger.info(\"Validating base tokenizer tokens...\")\n",
    "base_tokens_valid = validate_tokens(tokenized_val_base, base_tokenizer)\n",
    "logger.info(\"Validating improved tokenizer tokens...\")\n",
    "improved_tokens_valid = validate_tokens(tokenized_val_improved, improved_tokenizer)\n",
    "\n",
    "# Load models\n",
    "# AutoModelForMaskedLM instantiates whichever architecture the checkpoint's config declares, so\n",
    "# the DistilBERT checkpoint chosen for small datasets is no longer forced into a RoBERTa class.\n",
    "from transformers import AutoModelForMaskedLM\n",
    "\n",
    "base_model = BertForMaskedLM.from_pretrained('GKLMIP/bert-tagalog-base-uncased').to(device)\n",
    "\n",
    "# Fine-tuning\n",
    "fine_tuned_model_path = '/kaggle/working/fine_tuned_model' if Path('/kaggle/working').exists() else './fine_tuned_model'\n",
    "if os.path.exists(fine_tuned_model_path):\n",
    "    # Reuse a previous run's result; the pretrained checkpoint is not loaded at all in this case\n",
    "    logger.info(f'Loading fine-tuned model from {fine_tuned_model_path}')\n",
    "    improved_model = AutoModelForMaskedLM.from_pretrained(fine_tuned_model_path).to(device)\n",
    "elif classification == 'big':\n",
    "    logger.info('Fine-tuning model with LoRA...')\n",
    "    improved_model = AutoModelForMaskedLM.from_pretrained(improved_model_path).to(device)\n",
    "\n",
    "    # Apply LoRA for fine-tuning\n",
    "    lora_config = LoraConfig(\n",
    "        r=8,\n",
    "        lora_alpha=16,\n",
    "        target_modules=[\"query\", \"value\"],\n",
    "        lora_dropout=0.1,\n",
    "        bias=\"none\"\n",
    "    )\n",
    "    improved_model = get_peft_model(improved_model, lora_config)\n",
    "    improved_model.print_trainable_parameters()\n",
    "\n",
    "    training_args = TrainingArguments(\n",
    "        output_dir='/kaggle/working/output' if Path('/kaggle/working').exists() else './output',\n",
    "        num_train_epochs=1,\n",
    "        per_device_train_batch_size=4,\n",
    "        eval_strategy='no',\n",
    "        logging_dir='/kaggle/working/logs' if Path('/kaggle/working').exists() else './logs',\n",
    "        report_to='none',\n",
    "        # Mixed precision needs a GPU; requesting it on the CPU fallback makes TrainingArguments fail\n",
    "        fp16=torch.cuda.is_available()\n",
    "    )\n",
    "    trainer = Trainer(\n",
    "        model=improved_model,\n",
    "        args=training_args,\n",
    "        train_dataset=tokenized_train_improved,\n",
    "        data_collator=DataCollatorForLanguageModeling(tokenizer=improved_tokenizer, mlm=True)\n",
    "    )\n",
    "    trainer.train()\n",
    "    # Fold the LoRA adapters back into the base weights. The saved directory then holds a complete\n",
    "    # model that from_pretrained() can reload, and the later quantization step operates on plain\n",
    "    # Linear layers instead of adapter wrappers.\n",
    "    improved_model = improved_model.merge_and_unload()\n",
    "    logger.info(f'Saving fine-tuned model to {fine_tuned_model_path}')\n",
    "    improved_model.save_pretrained(fine_tuned_model_path)\n",
    "else:\n",
    "    # No training happens for small datasets, so the pretrained model is used as-is (no LoRA wrapper)\n",
    "    logger.info('Dataset is small, skipping fine-tuning')\n",
    "    improved_model = AutoModelForMaskedLM.from_pretrained(improved_model_path).to(device)\n",
    "\n",
    "# Evaluation function with focal loss and batched inference\n",
    "def evaluate_mlm(model, tokenizer, dataset, device, gamma=2.0, batch_size=16):\n",
    "    \"\"\"Score a masked-language model on mask filling over a tokenized dataset.\n",
    "\n",
    "    Rows that contain no mask token get one randomly chosen position masked (pad, CLS and\n",
    "    SEP positions are never chosen); the model is then scored on recovering the original token.\n",
    "\n",
    "    Args:\n",
    "        model: Called as model(input_ids, attention_mask=...); its result must expose .logits.\n",
    "        tokenizer: Supplies the mask/pad/cls/sep token ids.\n",
    "        dataset: Sliceable dataset whose batches expose 'input_ids' and 'attention_mask'.\n",
    "        device: Device the batch tensors are moved to.\n",
    "        gamma: Focusing exponent of the focal loss that is logged per sample.\n",
    "        batch_size: Number of rows per forward pass.\n",
    "\n",
    "    Returns:\n",
    "        dict with keys 'accuracy', 'f1', 'perplexity', 'latency' and 'memory'. When no prediction\n",
    "        could be made the metrics are 0.0 and the perplexity is infinity.\n",
    "    \"\"\"\n",
    "    model.eval()\n",
    "    accuracy_metric = evaluate.load('accuracy')\n",
    "    f1_metric = evaluate.load('f1')\n",
    "    predictions, labels = [], []\n",
    "    perplexity_scores = []\n",
    "    latencies = []\n",
    "    memory_usages = []\n",
    "\n",
    "    for i in range(0, len(dataset), batch_size):\n",
    "        batch = dataset[i:i+batch_size]\n",
    "        start_time = time.time()\n",
    "        input_ids = torch.tensor(batch['input_ids']).to(device)\n",
    "        attention_mask = torch.tensor(batch['attention_mask']).to(device)\n",
    "        logger.info(f\"Batch {i//batch_size}: input_ids shape = {input_ids.shape}, attention_mask shape = {attention_mask.shape}\")\n",
    "\n",
    "        # One (batch row, masked positions, target token ids) entry per row that will be scored.\n",
    "        # The row index is stored explicitly so logits and targets stay aligned when a row is skipped.\n",
    "        masked_rows = []\n",
    "        for j in range(input_ids.size(0)):\n",
    "            mask_positions = (input_ids[j] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]\n",
    "            if len(mask_positions) == 0:\n",
    "                valid_indices = (input_ids[j] != tokenizer.pad_token_id) & (input_ids[j] != tokenizer.cls_token_id) & (input_ids[j] != tokenizer.sep_token_id)\n",
    "                valid_indices = valid_indices.nonzero(as_tuple=True)[0]\n",
    "                if len(valid_indices) == 0:\n",
    "                    continue\n",
    "                mask_idx = valid_indices[torch.randint(0, len(valid_indices), (1,)).item()].item()\n",
    "                original_token = input_ids[j, mask_idx].item()\n",
    "                input_ids[j, mask_idx] = tokenizer.mask_token_id\n",
    "                masked_rows.append((j, [mask_idx], [original_token]))\n",
    "            else:\n",
    "                # NOTE(review): rows that already contain mask tokens are scored against the ids\n",
    "                # found at those positions (the mask id), as before; the true token is not available here.\n",
    "                masked_rows.append((j, mask_positions.cpu().tolist(), input_ids[j, mask_positions].cpu().tolist()))\n",
    "        if not masked_rows:\n",
    "            logger.info(f\"Batch {i//batch_size}: No valid tokens, skipping\")\n",
    "            continue\n",
    "\n",
    "        with torch.no_grad():\n",
    "            outputs = model(input_ids, attention_mask=attention_mask)\n",
    "            logits = outputs.logits\n",
    "            for row, positions, target_ids in masked_rows:\n",
    "                row_logits = logits[row, positions]\n",
    "                predicted_token_id = torch.argmax(row_logits, dim=-1)\n",
    "                # Clamping bounds every negative log-probability to [0, 100]\n",
    "                log_probs = torch.clamp(F.log_softmax(row_logits, dim=-1), min=-100, max=0)\n",
    "                try:\n",
    "                    neg_log_prob = -log_probs[torch.arange(len(target_ids)), target_ids]\n",
    "                    perplexity = torch.exp(neg_log_prob.mean())\n",
    "                    if not torch.isfinite(perplexity):\n",
    "                        logger.info(f\"Sample {i+row}: Non-finite perplexity, skipping\")\n",
    "                        continue\n",
    "                    perplexity_scores.append(perplexity.item())\n",
    "                    logger.info(f\"Sample {i+row}: Perplexity = {perplexity.item():.4f}, Neg log prob = {neg_log_prob.mean().item():.4f}\")\n",
    "                except Exception as e:\n",
    "                    logger.error(f\"Sample {i+row}: Perplexity error: {e}\")\n",
    "                    continue\n",
    "                ce_loss = F.cross_entropy(row_logits, torch.tensor(target_ids).to(device), reduction='none')\n",
    "                pt = torch.exp(-ce_loss)\n",
    "                # Honour the caller-supplied focusing exponent instead of a hard-coded 2.0\n",
    "                focal_loss = (1 - pt) ** gamma * ce_loss\n",
    "                logger.info(f\"Sample {i+row}: Focal loss = {focal_loss.mean().item():.4f}\")\n",
    "                predictions.extend(predicted_token_id.cpu().numpy().tolist())\n",
    "                labels.extend(target_ids)\n",
    "\n",
    "        # Batch wall time and resident memory are attributed evenly to the rows that were scored\n",
    "        latency = (time.time() - start_time) / len(masked_rows)\n",
    "        latencies.extend([latency] * len(masked_rows))\n",
    "        memory = (psutil.Process().memory_info().rss / 1024**2) - baseline_memory\n",
    "        memory_usages.extend([memory] * len(masked_rows))\n",
    "        del input_ids, attention_mask, logits, outputs\n",
    "        torch.cuda.empty_cache()\n",
    "\n",
    "    if not predictions:\n",
    "        logger.warning('No valid predictions; returning 0 metrics')\n",
    "        return {'accuracy': 0.0, 'f1': 0.0, 'perplexity': float('inf'), 'latency': 0.0, 'memory': 0.0}\n",
    "    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']\n",
    "    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')['f1']\n",
    "    avg_perplexity = np.mean(perplexity_scores) if perplexity_scores else float('inf')\n",
    "    avg_latency = np.mean(latencies)\n",
    "    avg_memory = np.mean(memory_usages)\n",
    "    return {'accuracy': accuracy, 'f1': f1, 'perplexity': avg_perplexity, 'latency': avg_latency, 'memory': avg_memory}\n",
    "\n",
    "# Run evaluation with quantization\n",
    "from torch.quantization import quantize_dynamic\n",
    "logger.info(\"Applying dynamic quantization (qint8) for CPU backend\")\n",
    "# Only Linear layers are quantized; both models are moved to the CPU first\n",
    "quantizable_layers = {torch.nn.Linear}\n",
    "base_model_cpu = base_model.to('cpu')\n",
    "base_model_quantized = quantize_dynamic(base_model_cpu, quantizable_layers, dtype=torch.qint8).to(quant_device)\n",
    "improved_model_cpu = improved_model.to('cpu')\n",
    "improved_model_quantized = quantize_dynamic(improved_model_cpu, quantizable_layers, dtype=torch.qint8).to(quant_device)\n",
    "\n",
    "\n",
    "# Log model size\n",
    "def get_model_size(model):\n",
    "    torch.save(model.state_dict(), \"temp.pt\")\n",
    "    size = os.path.getsize(\"temp.pt\") / 1024**2\n",
    "    os.remove(\"temp.pt\")\n",
    "    return size\n",
    "\n",
    "logger.info(f\"BaseBERT quantized size: {get_model_size(base_model_quantized):.2f} MB\")\n",
    "logger.info(f\"Improved quantized size: {get_model_size(improved_model_quantized):.2f} MB\")\n",
    "\n",
    "# Time each model on its own so EvalTimeSeconds holds a measured duration per model\n",
    "# rather than half of the combined time.\n",
    "start_time = time.time()\n",
    "base_metrics = evaluate_mlm(base_model_quantized, base_tokenizer, tokenized_val_base, quant_device, batch_size=16)\n",
    "base_eval_time = time.time() - start_time\n",
    "improved_start_time = time.time()\n",
    "improved_metrics = evaluate_mlm(improved_model_quantized, improved_tokenizer, tokenized_val_improved, quant_device, batch_size=16)\n",
    "improved_eval_time = time.time() - improved_start_time\n",
    "eval_time = time.time() - start_time  # combined duration of both evaluations\n",
    "\n",
    "logger.info(f\"BaseBERT metrics: {base_metrics}\")\n",
    "logger.info(f\"Improved model metrics: {improved_metrics}\")\n",
    "logger.info(f\"Evaluation time: {eval_time:.2f} seconds\")\n",
    "\n",
    "# Save results with local fallback\n",
    "output_dir = '/kaggle/working' if Path('/kaggle/working').exists() else '.'\n",
    "results = pd.DataFrame({\n",
    "    'Model': ['BaseBERT', 'Improved'],\n",
    "    'Accuracy': [base_metrics['accuracy'], improved_metrics['accuracy']],\n",
    "    'F1': [base_metrics['f1'], improved_metrics['f1']],\n",
    "    'Perplexity': [base_metrics['perplexity'], improved_metrics['perplexity']],\n",
    "    'LatencySeconds': [base_metrics['latency'], improved_metrics['latency']],\n",
    "    'MemoryMB': [base_metrics['memory'], improved_metrics['memory']],\n",
    "    'EvalTimeSeconds': [base_eval_time, improved_eval_time]\n",
    "})\n",
    "results.to_csv(os.path.join(output_dir, 'results.csv'), index=False)\n",
    "logger.info(f\"Results saved: {results}\")\n",
    "\n",
    "# Interpret results\n",
    "threshold = 0.60\n",
    "improved_accuracy = improved_metrics['accuracy']\n",
    "is_good = improved_accuracy >= threshold\n",
    "if is_good:\n",
    "    status, comparison = 'Good', 'exceeds'\n",
    "else:\n",
    "    status, comparison = 'Needs Improvement', 'is below'\n",
    "interpretation = {\n",
    "    'status': status,\n",
    "    'reason': f'Improved model (CGABERT) accuracy ({improved_accuracy:.4f}) {comparison} threshold ({threshold}) for effective text autocomplete.',\n",
    "    'dataset': {'size': num_samples, 'language': language, 'samples': sample_texts[:3]},\n",
    "    'metrics': improved_metrics\n",
    "}\n",
    "# Written next to results.csv in the same output directory\n",
    "interpretation_path = os.path.join(output_dir, 'interpretation.json')\n",
    "with open(interpretation_path, 'w') as f:\n",
    "    json.dump(interpretation, f, indent=4)\n",
    "logger.info(\"Interpretation saved\")\n",
    "\n",
    "logger.info(\"Kernel executed successfully\")\n",
    "print(\"Execution completed. Check results.csv and interpretation.json in\", output_dir)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}