In [None]:
import os, sys, platform, subprocess, importlib

def v(pkg):
    """Report the version of an importable module.

    Args:
        pkg: Dotted module name passed to ``importlib.import_module``.

    Returns:
        The module's ``__version__`` attribute, or a string of the form
        ``"Not installed / import error: <ExceptionName>"`` when the import
        fails or the module has no ``__version__`` attribute.
    """
    try:
        module = importlib.import_module(pkg)
        version = module.__version__
    except Exception as err:
        # Any failure (missing package, broken import, no __version__) is
        # reported by exception type instead of aborting the report.
        version = f"Not installed / import error: {type(err).__name__}"
    return version

# --- Interpreter and host information ---
print("=== SYSTEM ===")
python_version = sys.version.replace("\n", " ")  # sys.version may span several lines
for label, value in (
    ("Python:", python_version),
    ("Platform:", platform.platform()),
    ("Processor:", platform.processor()),
):
    print(label, value)

# --- GPU list plus the first lines of the nvidia-smi table ---
print("\n=== NVIDIA / CUDA (driver) ===")
smi_command = "nvidia-smi -L && nvidia-smi | head -n 5"
try:
    # The command runs through bash because it relies on `&&` and a pipe.
    print(subprocess.check_output(["bash", "-lc", smi_command], text=True))
except Exception as err:
    print("nvidia-smi not available:", err)

# --- PyTorch build and visible CUDA devices ---
print("\n=== PYTORCH ===")
try:
    import torch

    print("torch:", torch.__version__)
    print("torch.version.cuda:", torch.version.cuda)
    print("cuda available:", torch.cuda.is_available())
    print("device_count:", torch.cuda.device_count())
    for gpu_index in range(torch.cuda.device_count()):
        gpu_props = torch.cuda.get_device_properties(gpu_index)
        total_mem_gib = gpu_props.total_memory / 1024**3  # bytes -> GiB
        print(f"GPU{gpu_index}: {gpu_props.name} | total_mem_GB={total_mem_gib:.2f}")
except Exception as err:
    # Covers a failed import as well as any error raised by the queries above.
    print("torch import error:", err)

# --- Versions of the Hugging Face training stack ---
print("\n=== HF / TRAINING STACK ===")
for module_name in ("transformers", "datasets", "tokenizers", "accelerate", "safetensors"):
    print(f"{module_name}:", v(module_name))

# --- Other packages; (label shown, module actually imported) ---
print("\n=== OTHER COMMON ===")
for label, module_name in (
    ("numpy", "numpy"),
    ("protobuf", "google.protobuf"),
    ("tensorflow", "tensorflow"),
):
    print(f"{label}:", v(module_name))

# --- Environment variables relevant to the run; unset ones print as None ---
print("\n=== ENV (useful flags) ===")
for env_name in ("CUDA_VISIBLE_DEVICES", "PYTORCH_CUDA_ALLOC_CONF", "TOKENIZERS_PARALLELISM"):
    print(f"{env_name}={os.environ.get(env_name)}")


=== SYSTEM ===
Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
Processor: x86_64

=== NVIDIA / CUDA (driver) ===
GPU 0: NVIDIA A100-SXM4-80GB (UUID: GPU-35bd50b8-78d4-5e85-fe90-427588c21673)
Mon Dec 15 12:40:59 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |


=== PYTORCH ===
torch: 2.9.0+cu126
torch.version.cuda: 12.6
cuda available: True
device_count: 1
GPU0: NVIDIA A100-SXM4-80GB | total_mem_GB=79.32

=== HF / TRAINING STACK ===
transformers: 4.57.3
datasets: 4.0.0
tokenizers: 0.22.1
accelerate: 1.12.0
safetensors: 0.7.0

=== OTHER COMMON ===
numpy: 2.0.2
protobuf: 5.29.5
tensorflow: 2.19.0

=== ENV 

In [None]:
# ===== COLAB RUNTIME'I YENİDEN BAŞLAT =====
# 1. Runtime -> Restart runtime
# 2. Sonra bu kodu çalıştır:

# ===== KÜTÜPHANELERİ YENİDEN YÜKLE =====
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers datasets accelerate

print("✅ Kütüphaneler yüklendi. Lütfen runtime'ı yeniden başlatın:")
print("   Runtime -> Restart runtime")


Found existing installation: torch 2.9.0+cu126
Uninstalling torch-2.9.0+cu126:
  Successfully uninstalled torch-2.9.0+cu126
Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Found existing installation: torchaudio 2.9.0+cu126
Uninstalling torchaudio-2.9.0+cu126:
  Successfully uninstalled torchaudio-2.9.0+cu126
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading htt

✅ Kütüphaneler yüklendi. Lütfen runtime'ı yeniden başlatın:
   Runtime -> Restart runtime


In [None]:
# ===== LEGAL-BERTURK FULL TRAINING (COLAB - FIX) =====

import warnings
warnings.filterwarnings('ignore')

import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from transformers.trainer_utils import get_last_checkpoint
from google.colab import drive

# Version check: report the PyTorch build and whether CUDA is usable
print("🔍 Versiyon kontrolü:")
print(f"PyTorch: {torch.__version__}")
cuda_is_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_is_available}")
cuda_version_label = torch.version.cuda if cuda_is_available else 'N/A'
print(f"CUDA version: {cuda_version_label}")

banner_rule = "=" * 70
print("\n" + banner_rule)
print("🏛️  LEGAL-BERTURK FULL TRAINING (COLAB)")
print(banner_rule)

# ===== GOOGLE DRIVE MOUNT =====
print("\n📁 Google Drive bağlanıyor...")
drive.mount('/content/drive')
print("✅ Drive bağlandı!")

# ===== PATHS =====
# Checkpoints and the final model are both written under one Drive folder.
DRIVE_BASE = '/content/drive/MyDrive/colab_output/legal-bert-training'
OUTPUT_DIR = DRIVE_BASE + '/checkpoints'
FINAL_MODEL_DIR = DRIVE_BASE + '/final-model'

for required_dir in (OUTPUT_DIR, FINAL_MODEL_DIR):
    os.makedirs(required_dir, exist_ok=True)
print(f"✅ Checkpoint: {OUTPUT_DIR}")
print(f"✅ Final model: {FINAL_MODEL_DIR}")

# ===== 1. LOAD CORPUS =====
print("\n📚 1/5: Corpus yükleniyor...")

# Location of the corpus text file on the mounted Drive
corpus_path = '/content/drive/MyDrive/legal_corpus_mixed.txt'

# Stop early with upload instructions when the corpus file is missing
if not os.path.exists(corpus_path):
    missing_corpus_help = (
        "⚠️  Corpus bulunamadı. Lütfen yükleyin:",
        f"   1. Dosyayı Drive'ınıza yükleyin: {corpus_path}",
        "   2. Ya da aşağıdaki komutu kullanın:",
        "\n   from google.colab import files",
        "   uploaded = files.upload()",
    )
    for help_line in missing_corpus_help:
        print(help_line)
    raise FileNotFoundError(f"Corpus bulunamadı: {corpus_path}")

# Load the file with the 'text' builder as a single 'train' split
corpus_files = {'train': corpus_path}
dataset = load_dataset('text', data_files=corpus_files, split='train', cache_dir='/content/cache')
print(f"✅ Corpus: {len(dataset):,} satır")

# ===== 2. TOKENIZER & MODEL =====
print("\n🤖 2/5: BERTurk yükleniyor...")

# Tokenizer and masked-LM weights come from the same pretrained checkpoint
# and share one local download cache.
pretrained_name = "dbmdz/bert-base-turkish-cased"
hub_cache_dir = '/content/model_cache'
tokenizer = AutoTokenizer.from_pretrained(pretrained_name, cache_dir=hub_cache_dir)
model = AutoModelForMaskedLM.from_pretrained(pretrained_name, cache_dir=hub_cache_dir)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)
print(f"✅ Model yüklendi - Device: {device}")

# ===== 3. TOKENIZATION =====
print("\n🔤 3/5: Tokenization...")

def tokenize_function(examples):
    """Tokenize a batch of corpus lines with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the lines to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch; every sequence is
        truncated and padded to 512 tokens and a special-tokens mask is requested.
    """
    # NOTE(review): padding every line to max_length here looks more expensive
    # than letting the data collator pad per batch — confirm before changing.
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )
    return tokenizer(examples['text'], **encode_options)

# Tokenize the corpus in batches with two worker processes; the raw 'text'
# column is dropped and cached results are reused when available.
map_options = {
    "batched": True,
    "num_proc": 2,
    "remove_columns": ['text'],
    "desc": "Tokenizing",
    "load_from_cache_file": True,
}
tokenized_dataset = dataset.map(tokenize_function, **map_options)
print(f"✅ Tokenization: {len(tokenized_dataset):,} örnek")

# ===== 4. DATA COLLATOR =====
print("\n⚙️  4/5: Data collator...")
# Masked-language-modeling collator configured with mlm_probability=0.15
collator_options = {"tokenizer": tokenizer, "mlm": True, "mlm_probability": 0.15}
data_collator = DataCollatorForLanguageModeling(**collator_options)
print("✅ Data collator hazır")

# ===== 5. TRAINING SETUP =====
print("\n⚙️  5/5: Training ayarları...")

# Checkpoint check: look in OUTPUT_DIR for an earlier checkpoint so that an
# interrupted run can be resumed instead of restarted.
last_checkpoint = None
try:
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)
except Exception as err:
    # Best effort: a failed lookup must not block training, but the reason is
    # reported instead of being hidden. `except Exception` (not a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate.
    print(f"⚠️  Checkpoint kontrolü başarısız: {err}")

# Report the outcome in both cases. Previously the "new training" message was
# printed only when the lookup raised, never when no checkpoint simply existed.
if last_checkpoint:
    print(f"🔄 Checkpoint bulundu: {last_checkpoint}")
else:
    print("🆕 Yeni training başlatılıyor")

# Hyper-parameters for the run, grouped by concern.
use_fp16 = torch.cuda.is_available()  # fp16 is requested only when CUDA is available
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=False,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=3,
    save_safetensors=True,
    # Optimisation
    num_train_epochs=1,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    fp16=use_fp16,
    # Logging and data loading
    logging_steps=200,
    logging_first_step=True,
    report_to="none",
    dataloader_num_workers=2,
)

# Bundle the model, arguments, tokenized data and collator into the Trainer.
# NOTE(review): recent transformers releases appear to prefer `processing_class`
# over the `tokenizer` keyword — confirm against the installed version.
trainer_components = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_components)

# ===== 6. TRAINING =====
# The banner values are read from `training_args` so they cannot drift from the
# real configuration (the hard-coded text claimed 3 epochs and a batch of
# 32 (8×4) while the arguments specify 1 epoch and 16×4).
per_device_batch = training_args.per_device_train_batch_size
accumulation_steps = training_args.gradient_accumulation_steps
print("\n" + "="*70)
print("🚀 TRAINING BAŞLIYOR!")
print("="*70)
print(f"📊 Dataset: {len(tokenized_dataset):,} örnek")
print(f"📊 Epochs: {training_args.num_train_epochs:g}")
# Per-device figure: batch size times gradient-accumulation steps.
print(f"📊 Effective batch: {per_device_batch * accumulation_steps} ({per_device_batch}×{accumulation_steps})")
print(f"💾 Checkpoint her {training_args.save_steps:g} step: {OUTPUT_DIR}")
print("="*70 + "\n")

try:
    # `last_checkpoint` is None when nothing was found, which is also the
    # default of `resume_from_checkpoint`, so one call covers both cases.
    trainer.train(resume_from_checkpoint=last_checkpoint or None)

    print("\n✅ TRAINING TAMAMLANDI!")

    # Save the final model together with its tokenizer
    print("\n💾 Final model kaydediliyor...")
    trainer.save_model(FINAL_MODEL_DIR)
    tokenizer.save_pretrained(FINAL_MODEL_DIR)

    print(f"✅ Model kaydedildi: {FINAL_MODEL_DIR}")

except KeyboardInterrupt:
    # Manual stop: point the user at the checkpoint directory.
    print("\n⚠️  Training durduruldu!")
    print(f"💾 Checkpoint'ler: {OUTPUT_DIR}")

except Exception as e:
    # Deliberate best effort: report the error and where the checkpoints are
    # rather than raising out of the cell.
    print(f"\n❌ Hata: {e}")
    print(f"💾 Checkpoint'ler korundu: {OUTPUT_DIR}")


🔍 Versiyon kontrolü:
PyTorch: 2.7.1+cu118
CUDA available: True
CUDA version: 11.8

🏛️  LEGAL-BERTURK FULL TRAINING (COLAB)

📁 Google Drive bağlanıyor...
Mounted at /content/drive
✅ Drive bağlandı!
✅ Checkpoint: /content/drive/MyDrive/colab_output/legal-bert-training/checkpoints
✅ Final model: /content/drive/MyDrive/colab_output/legal-bert-training/final-model

📚 1/5: Corpus yükleniyor...


Generating train split: 0 examples [00:00, ? examples/s]

✅ Corpus: 1,294,228 satır

🤖 2/5: BERTurk yükleniyor...


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

✅ Model yüklendi - Device: cuda

🔤 3/5: Tokenization...


Tokenizing (num_proc=2):   0%|          | 0/1294228 [00:00<?, ? examples/s]

✅ Tokenization: 1,294,228 örnek

⚙️  4/5: Data collator...
✅ Data collator hazır

⚙️  5/5: Training ayarları...
🔄 Checkpoint bulundu: /content/drive/MyDrive/colab_output/legal-bert-training/checkpoints/checkpoint-16000

🚀 TRAINING BAŞLIYOR!
📊 Dataset: 1,294,228 örnek
📊 Epochs: 3
📊 Effective batch: 32 (8×4)
💾 Checkpoint her 2000 step: /content/drive/MyDrive/colab_output/legal-bert-training/checkpoints



There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


Step,Training Loss
16200,0.8156
16400,0.7896
16600,0.8051
16800,0.795
17000,0.7791
17200,0.7859
17400,0.7869
17600,0.7954
17800,0.8058
18000,0.7738



✅ TRAINING TAMAMLANDI!

💾 Final model kaydediliyor...
✅ Model kaydedildi: /content/drive/MyDrive/colab_output/legal-bert-training/final-model
