In [1]:
"""
mBART-50 Fine-tuning with NLLB-200 Configuration
Optimized for Colab Pro+ (A100/V100)
"""

# ============================================================================
# 1. 환경 설정 및 라이브러리 설치
# ============================================================================
print("="*70)
print("mBART-50 Fine-tuning (NLLB-200 Configuration)")
print("="*70)

!pip install transformers datasets sentencepiece accelerate -q

import pandas as pd
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from google.colab import files, drive
import time
from datetime import timedelta
import os

# ============================================================================
# 2. GPU 확인
# ============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\n{'='*70}")
print("GPU Information")
print(f"{'='*70}")
print(f"Device: {device}")
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Total Memory: {gpu_memory:.1f} GB")

    # Colab Pro+ 확인
    if "A100" in gpu_name or "V100" in gpu_name:
        print("✓ Colab Pro+ GPU detected!")
    else:
        print("⚠ Warning: This configuration requires Colab Pro+ (A100/V100)")

# ============================================================================
# 3. Google Drive 마운트
# ============================================================================
print(f"\n{'='*70}")
print("Mounting Google Drive")
print(f"{'='*70}")
drive.mount('/content/drive')

# 저장 경로 설정
OUTPUT_DIR = "/content/drive/MyDrive/mbart_legal_nllb_config"
FINAL_DIR = "/content/drive/MyDrive/mbart_legal_nllb_final"

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✓ Output directory: {OUTPUT_DIR}")

# ============================================================================
# 4. 데이터 파일 업로드
# ============================================================================
print(f"\n{'='*70}")
print("Upload Training Data")
print(f"{'='*70}")
print("Please upload train_v2.csv and dev_v2.csv:")

uploaded = files.upload()

# 업로드된 파일 확인
uploaded_files = list(uploaded.keys())
print(f"\n✓ Uploaded files: {uploaded_files}")

# 데이터 로드
if 'train_v2.csv' not in uploaded_files or 'dev_v2.csv' not in uploaded_files:
    raise FileNotFoundError("train_v2.csv and dev_v2.csv are required!")

train_df = pd.read_csv('train_v2.csv')
val_df = pd.read_csv('dev_v2.csv')

print(f"\n{'='*70}")
print("Dataset Information")
print(f"{'='*70}")
print(f"Train samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")
print(f"\nSample data:")
print(train_df.head(2))

# ============================================================================
# 5. 모델 및 토크나이저 로드
# ============================================================================
print(f"\n{'='*70}")
print("Loading Model and Tokenizer")
print(f"{'='*70}")

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
print(f"Model: {MODEL_NAME}")

tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang="ru_RU",
    tgt_lang="ko_KR"
)
print("✓ Tokenizer loaded")

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
print(f"✓ Model loaded: {model.num_parameters():,} parameters")

# ============================================================================
# 6. 데이터 전처리 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Data Preprocessing (NLLB-200 Config)")
print(f"{'='*70}")

# NLLB-200 설정값
MAX_LENGTH = 256  # 최대 시퀀스 길이

def preprocess_function(examples):
    """NLLB-200 방식의 전처리"""
    tokenizer.src_lang = "ru_RU"

    # Source 언어 토크나이징
    inputs = tokenizer(
        examples['ru'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False
    )

    # Target 언어 토크나이징
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['ko'],
            max_length=MAX_LENGTH,
            truncation=True,
            padding=False
        )

    inputs['labels'] = labels['input_ids']
    return inputs

# 데이터셋 변환
train_dataset = Dataset.from_pandas(train_df).map(
    preprocess_function,
    batched=True,
    remove_columns=train_df.columns.tolist()
)

val_dataset = Dataset.from_pandas(val_df).map(
    preprocess_function,
    batched=True,
    remove_columns=val_df.columns.tolist()
)

# Data Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

print(f"✓ Train dataset: {len(train_dataset):,} samples")
print(f"✓ Validation dataset: {len(val_dataset):,} samples")
print(f"✓ MAX_LENGTH: {MAX_LENGTH}")

# ============================================================================
# 7. 학습 설정 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Training Configuration (NLLB-200 Settings)")
print(f"{'='*70}")

training_args = Seq2SeqTrainingArguments(
    # 기본 설정
    output_dir=OUTPUT_DIR,

    # 학습 설정
    num_train_epochs=3,                      # 에포크 수
    per_device_train_batch_size=8,           # 배치 크기 (GPU 당)
    gradient_accumulation_steps=2,           # 그래디언트 누적 (실질 배치 16)

    # 학습률 및 최적화
    learning_rate=1e-5,                      # 학습률 (1×10⁻⁵)
    warmup_steps=300,                        # 워밍업 단계

    # 로깅 및 저장
    logging_steps=100,                       # 로그 기록 간격
    save_steps=500,                          # 모델 저장 간격
    eval_steps=500,                          # 검증 실행 간격

    # 저장 전략
    save_strategy="steps",
    evaluation_strategy="steps",
    save_total_limit=3,                      # 최대 체크포인트 수

    # 메모리 최적화
    fp16=True,                               # FP16 혼합 정밀도

    # 기타
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    predict_with_generate=False,             # 평가 시 생성 비활성화 (속도 향상)
)

print(f"""
Configuration Summary:
{'='*70}
Model: {MODEL_NAME}
Epochs: 3
Batch size per device: 8
Gradient accumulation: 2
Effective batch size: 16
Learning rate: 1e-5
Warmup steps: 300
MAX_LENGTH: 256
FP16: Enabled
{'='*70}
""")

# ============================================================================
# 8. Trainer 초기화
# ============================================================================
print(f"\n{'='*70}")
print("Initializing Trainer")
print(f"{'='*70}")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("✓ Trainer initialized")

# GPU 캐시 정리
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")

# ============================================================================
# 9. 학습 실행
# ============================================================================
print(f"\n{'='*70}")
print("TRAINING START")
print(f"{'='*70}")

start_time = time.time()
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

try:
    # 학습 시작
    train_result = trainer.train()

    # 학습 완료
    success = True

    print(f"\n{'='*70}")
    print("TRAINING COMPLETED!")
    print(f"{'='*70}")

except torch.cuda.OutOfMemoryError as e:
    print(f"\n{'='*70}")
    print("OOM ERROR!")
    print(f"{'='*70}")
    print("Out of memory error occurred.")
    print("\nOptions:")
    print("  1. Reduce batch_size to 4")
    print("  2. Increase gradient_accumulation_steps to 4")
    print("  3. Reduce MAX_LENGTH to 128")
    success = False

except Exception as e:
    print(f"\n{'='*70}")
    print("ERROR!")
    print(f"{'='*70}")
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    success = False

# 학습 시간 계산
duration = time.time() - start_time
duration_str = str(timedelta(seconds=int(duration)))

# ============================================================================
# 10. 모델 저장 및 결과 정리
# ============================================================================
if success:
    print(f"\n{'='*70}")
    print("Saving Model")
    print(f"{'='*70}")

    # 최종 모델 저장
    trainer.save_model(FINAL_DIR)
    tokenizer.save_pretrained(FINAL_DIR)

    print(f"✓ Model saved to: {FINAL_DIR}")

    # 학습 결과 요약
    summary = f"""
{'='*70}
mBART-50 Fine-tuning Results (NLLB-200 Configuration)
{'='*70}

Dataset:
  - Train samples: {len(train_dataset):,}
  - Validation samples: {len(val_dataset):,}

Model:
  - Name: {MODEL_NAME}
  - Parameters: {model.num_parameters():,}

Configuration:
  - Epochs: 3
  - Batch size: 8
  - Gradient accumulation: 2
  - Effective batch size: 16
  - Learning rate: 1e-5
  - Warmup steps: 300
  - MAX_LENGTH: 256
  - FP16: Enabled

Training:
  - Duration: {duration_str}
  - Final loss: {train_result.training_loss:.4f}

Saved:
  - Location: {FINAL_DIR}

{'='*70}
Training completed successfully!
{'='*70}
"""

    print(summary)

    # 요약 파일 저장
    with open(f"{FINAL_DIR}/training_summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary)

    print(f"✓ Summary saved to: {FINAL_DIR}/training_summary.txt")

    # 학습 로그 저장
    if hasattr(trainer.state, 'log_history'):
        import json
        with open(f"{FINAL_DIR}/training_logs.json", 'w', encoding='utf-8') as f:
            json.dump(trainer.state.log_history, f, indent=2)
        print(f"✓ Training logs saved to: {FINAL_DIR}/training_logs.json")

    print(f"\n{'='*70}")
    print("SUCCESS! 🎉")
    print(f"{'='*70}")

else:
    print(f"\n{'='*70}")
    print("TRAINING FAILED")
    print(f"{'='*70}")
    print(f"Duration: {duration_str}")
    print("\nPlease check the error messages above and try again.")

# ============================================================================
# 11. 메모리 정리
# ============================================================================
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\n✓ GPU memory cleared")

print(f"\n{'='*70}")
print("Script completed")
print(f"{'='*70}")

mBART-50 Fine-tuning (NLLB-200 Configuration)


KeyboardInterrupt: 

In [1]:
"""
mBART-50 Fine-tuning with NLLB-200 Configuration
Optimized for Colab Pro+ (A100/V100)
"""

# ============================================================================
# 1. 환경 설정 및 라이브러리 설치
# ============================================================================
print("="*70)
print("mBART-50 Fine-tuning (NLLB-200 Configuration)")
print("="*70)

!pip install transformers datasets sentencepiece accelerate -q

import pandas as pd
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from google.colab import files, drive
import time
from datetime import timedelta
import os

# ============================================================================
# 2. GPU 확인
# ============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\n{'='*70}")
print("GPU Information")
print(f"{'='*70}")
print(f"Device: {device}")
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Total Memory: {gpu_memory:.1f} GB")

    # Colab Pro+ 확인
    if "A100" in gpu_name or "V100" in gpu_name:
        print("✓ Colab Pro+ GPU detected!")
    else:
        print("⚠ Warning: This configuration requires Colab Pro+ (A100/V100)")

# ============================================================================
# 3. Google Drive 마운트
# ============================================================================
print(f"\n{'='*70}")
print("Mounting Google Drive")
print(f"{'='*70}")
drive.mount('/content/drive')

# 저장 경로 설정
OUTPUT_DIR = "/content/drive/MyDrive/mbart_legal_nllb_config"
FINAL_DIR = "/content/drive/MyDrive/mbart_legal_nllb_final"

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✓ Output directory: {OUTPUT_DIR}")

# ============================================================================
# 4. 데이터 파일 업로드
# ============================================================================
print(f"\n{'='*70}")
print("Upload Training Data")
print(f"{'='*70}")
print("Please upload train_v2.csv and dev_v2.csv:")

uploaded = files.upload()

# 업로드된 파일 확인
uploaded_files = list(uploaded.keys())
print(f"\n✓ Uploaded files: {uploaded_files}")

# 데이터 로드
if 'train_v2.csv' not in uploaded_files or 'dev_v2.csv' not in uploaded_files:
    raise FileNotFoundError("train_v2.csv and dev_v2.csv are required!")

train_df = pd.read_csv('train_v2.csv')
val_df = pd.read_csv('dev_v2.csv')

print(f"\n{'='*70}")
print("Dataset Information")
print(f"{'='*70}")
print(f"Train samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")
print(f"\nSample data:")
print(train_df.head(2))

# ============================================================================
# 5. 모델 및 토크나이저 로드
# ============================================================================
print(f"\n{'='*70}")
print("Loading Model and Tokenizer")
print(f"{'='*70}")

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
print(f"Model: {MODEL_NAME}")

tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang="ru_RU",
    tgt_lang="ko_KR"
)
print("✓ Tokenizer loaded")

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
print(f"✓ Model loaded: {model.num_parameters():,} parameters")

# ============================================================================
# 6. 데이터 전처리 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Data Preprocessing (NLLB-200 Config)")
print(f"{'='*70}")

# NLLB-200 설정값
MAX_LENGTH = 256  # 최대 시퀀스 길이

def preprocess_function(examples):
    """NLLB-200 방식의 전처리"""
    tokenizer.src_lang = "ru_RU"

    # Source 언어 토크나이징
    inputs = tokenizer(
        examples['ru'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False
    )

    # Target 언어 토크나이징
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['ko'],
            max_length=MAX_LENGTH,
            truncation=True,
            padding=False
        )

    inputs['labels'] = labels['input_ids']
    return inputs

# 데이터셋 변환
train_dataset = Dataset.from_pandas(train_df).map(
    preprocess_function,
    batched=True,
    remove_columns=train_df.columns.tolist()
)

val_dataset = Dataset.from_pandas(val_df).map(
    preprocess_function,
    batched=True,
    remove_columns=val_df.columns.tolist()
)

# Data Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

print(f"✓ Train dataset: {len(train_dataset):,} samples")
print(f"✓ Validation dataset: {len(val_dataset):,} samples")
print(f"✓ MAX_LENGTH: {MAX_LENGTH}")

# ============================================================================
# 7. 학습 설정 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Training Configuration (NLLB-200 Settings)")
print(f"{'='*70}")

training_args = Seq2SeqTrainingArguments(
    # 기본 설정
    output_dir=OUTPUT_DIR,

    # 학습 설정
    num_train_epochs=3,                      # 에포크 수
    per_device_train_batch_size=8,           # 배치 크기 (GPU 당)
    gradient_accumulation_steps=2,           # 그래디언트 누적 (실질 배치 16)

    # 학습률 및 최적화
    learning_rate=1e-5,                      # 학습률 (1×10⁻⁵)
    warmup_steps=300,                        # 워밍업 단계

    # 로깅 및 저장
    logging_steps=100,                       # 로그 기록 간격
    save_steps=500,                          # 모델 저장 간격
    eval_steps=500,                          # 검증 실행 간격

    # 저장 전략
    save_strategy="steps",
    evaluation_strategy="steps",
    save_total_limit=3,                      # 최대 체크포인트 수

    # 메모리 최적화
    fp16=True,                               # FP16 혼합 정밀도

    # 기타
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    predict_with_generate=False,             # 평가 시 생성 비활성화 (속도 향상)
)

print(f"""
Configuration Summary:
{'='*70}
Model: {MODEL_NAME}
Epochs: 3
Batch size per device: 8
Gradient accumulation: 2
Effective batch size: 16
Learning rate: 1e-5
Warmup steps: 300
MAX_LENGTH: 256
FP16: Enabled
{'='*70}
""")

# ============================================================================
# 8. Trainer 초기화
# ============================================================================
print(f"\n{'='*70}")
print("Initializing Trainer")
print(f"{'='*70}")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("✓ Trainer initialized")

# GPU 캐시 정리
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")

# ============================================================================
# 9. 학습 실행
# ============================================================================
print(f"\n{'='*70}")
print("TRAINING START")
print(f"{'='*70}")

start_time = time.time()
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

try:
    # 학습 시작
    train_result = trainer.train()

    # 학습 완료
    success = True

    print(f"\n{'='*70}")
    print("TRAINING COMPLETED!")
    print(f"{'='*70}")

except torch.cuda.OutOfMemoryError as e:
    print(f"\n{'='*70}")
    print("OOM ERROR!")
    print(f"{'='*70}")
    print("Out of memory error occurred.")
    print("\nOptions:")
    print("  1. Reduce batch_size to 4")
    print("  2. Increase gradient_accumulation_steps to 4")
    print("  3. Reduce MAX_LENGTH to 128")
    success = False

except Exception as e:
    print(f"\n{'='*70}")
    print("ERROR!")
    print(f"{'='*70}")
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    success = False

# 학습 시간 계산
duration = time.time() - start_time
duration_str = str(timedelta(seconds=int(duration)))

# ============================================================================
# 10. 모델 저장 및 결과 정리
# ============================================================================
if success:
    print(f"\n{'='*70}")
    print("Saving Model")
    print(f"{'='*70}")

    # 최종 모델 저장
    trainer.save_model(FINAL_DIR)
    tokenizer.save_pretrained(FINAL_DIR)

    print(f"✓ Model saved to: {FINAL_DIR}")

    # 학습 결과 요약
    summary = f"""
{'='*70}
mBART-50 Fine-tuning Results (NLLB-200 Configuration)
{'='*70}

Dataset:
  - Train samples: {len(train_dataset):,}
  - Validation samples: {len(val_dataset):,}

Model:
  - Name: {MODEL_NAME}
  - Parameters: {model.num_parameters():,}

Configuration:
  - Epochs: 3
  - Batch size: 8
  - Gradient accumulation: 2
  - Effective batch size: 16
  - Learning rate: 1e-5
  - Warmup steps: 300
  - MAX_LENGTH: 256
  - FP16: Enabled

Training:
  - Duration: {duration_str}
  - Final loss: {train_result.training_loss:.4f}

Saved:
  - Location: {FINAL_DIR}

{'='*70}
Training completed successfully!
{'='*70}
"""

    print(summary)

    # 요약 파일 저장
    with open(f"{FINAL_DIR}/training_summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary)

    print(f"✓ Summary saved to: {FINAL_DIR}/training_summary.txt")

    # 학습 로그 저장
    if hasattr(trainer.state, 'log_history'):
        import json
        with open(f"{FINAL_DIR}/training_logs.json", 'w', encoding='utf-8') as f:
            json.dump(trainer.state.log_history, f, indent=2)
        print(f"✓ Training logs saved to: {FINAL_DIR}/training_logs.json")

    print(f"\n{'='*70}")
    print("SUCCESS! 🎉")
    print(f"{'='*70}")

else:
    print(f"\n{'='*70}")
    print("TRAINING FAILED")
    print(f"{'='*70}")
    print(f"Duration: {duration_str}")
    print("\nPlease check the error messages above and try again.")

# ============================================================================
# 11. 메모리 정리
# ============================================================================
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\n✓ GPU memory cleared")

print(f"\n{'='*70}")
print("Script completed")
print(f"{'='*70}")

mBART-50 Fine-tuning (NLLB-200 Configuration)

GPU Information
Device: cuda
GPU: NVIDIA A100-SXM4-40GB
Total Memory: 42.5 GB
✓ Colab Pro+ GPU detected!

Mounting Google Drive
Mounted at /content/drive
✓ Output directory: /content/drive/MyDrive/mbart_legal_nllb_config

Upload Training Data
Please upload train_v2.csv and dev_v2.csv:


Saving dev_v2.csv to dev_v2.csv
Saving train_v2.csv to train_v2.csv

✓ Uploaded files: ['dev_v2.csv', 'train_v2.csv']

Dataset Information
Train samples: 8,000
Validation samples: 1,000

Sample data:
   id                                                 ru  \
0   1  Российская Федерация - Россия есть демократиче...   
1   2  Наименования Российская Федерация и Россия рав...   

                                             ko  
0  러시아연방, 즉 러시아는 공화국 통치 형태를 갖춘 민주주의 연방 법치 국가이다.  
1                 러시아연방과 러시아라는 명칭은 동일한 의미를 지닌다.  

Loading Model and Tokenizer
Model: facebook/mbart-large-50-many-to-many-mmt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

✓ Tokenizer loaded


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

✓ Model loaded: 610,879,488 parameters

Data Preprocessing (NLLB-200 Config)


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

✓ Train dataset: 8,000 samples
✓ Validation dataset: 1,000 samples
✓ MAX_LENGTH: 256

Training Configuration (NLLB-200 Settings)


TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [2]:
"""
mBART-50 Fine-tuning with NLLB-200 Configuration
Optimized for Colab Pro+ (A100/V100)
"""

# ============================================================================
# 1. 환경 설정 및 라이브러리 설치
# ============================================================================
print("="*70)
print("mBART-50 Fine-tuning (NLLB-200 Configuration)")
print("="*70)

!pip install transformers datasets sentencepiece accelerate -q

import pandas as pd
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from google.colab import files, drive
import time
from datetime import timedelta
import os

# ============================================================================
# 2. GPU 확인
# ============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\n{'='*70}")
print("GPU Information")
print(f"{'='*70}")
print(f"Device: {device}")
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Total Memory: {gpu_memory:.1f} GB")

    # Colab Pro+ 확인
    if "A100" in gpu_name or "V100" in gpu_name:
        print("✓ Colab Pro+ GPU detected!")
    else:
        print("⚠ Warning: This configuration requires Colab Pro+ (A100/V100)")

# ============================================================================
# 3. Google Drive 마운트
# ============================================================================
print(f"\n{'='*70}")
print("Mounting Google Drive")
print(f"{'='*70}")
drive.mount('/content/drive')

# 저장 경로 설정
OUTPUT_DIR = "/content/drive/MyDrive/mbart_legal_nllb_config"
FINAL_DIR = "/content/drive/MyDrive/mbart_legal_nllb_final"

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✓ Output directory: {OUTPUT_DIR}")

# ============================================================================
# 4. 데이터 파일 업로드
# ============================================================================
print(f"\n{'='*70}")
print("Upload Training Data")
print(f"{'='*70}")
print("Please upload train_v2.csv and dev_v2.csv:")

uploaded = files.upload()

# 업로드된 파일 확인
uploaded_files = list(uploaded.keys())
print(f"\n✓ Uploaded files: {uploaded_files}")

# 데이터 로드
if 'train_v2.csv' not in uploaded_files or 'dev_v2.csv' not in uploaded_files:
    raise FileNotFoundError("train_v2.csv and dev_v2.csv are required!")

train_df = pd.read_csv('train_v2.csv')
val_df = pd.read_csv('dev_v2.csv')

print(f"\n{'='*70}")
print("Dataset Information")
print(f"{'='*70}")
print(f"Train samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")
print(f"\nSample data:")
print(train_df.head(2))

# ============================================================================
# 5. 모델 및 토크나이저 로드
# ============================================================================
print(f"\n{'='*70}")
print("Loading Model and Tokenizer")
print(f"{'='*70}")

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
print(f"Model: {MODEL_NAME}")

tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang="ru_RU",
    tgt_lang="ko_KR"
)
print("✓ Tokenizer loaded")

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
print(f"✓ Model loaded: {model.num_parameters():,} parameters")

# ============================================================================
# 6. 데이터 전처리 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Data Preprocessing (NLLB-200 Config)")
print(f"{'='*70}")

# NLLB-200 설정값
MAX_LENGTH = 256  # 최대 시퀀스 길이

def preprocess_function(examples):
    """NLLB-200 방식의 전처리"""
    tokenizer.src_lang = "ru_RU"

    # Source 언어 토크나이징
    inputs = tokenizer(
        examples['ru'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False
    )

    # Target 언어 토크나이징
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['ko'],
            max_length=MAX_LENGTH,
            truncation=True,
            padding=False
        )

    inputs['labels'] = labels['input_ids']
    return inputs

# 데이터셋 변환
train_dataset = Dataset.from_pandas(train_df).map(
    preprocess_function,
    batched=True,
    remove_columns=train_df.columns.tolist()
)

val_dataset = Dataset.from_pandas(val_df).map(
    preprocess_function,
    batched=True,
    remove_columns=val_df.columns.tolist()
)

# Data Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

print(f"✓ Train dataset: {len(train_dataset):,} samples")
print(f"✓ Validation dataset: {len(val_dataset):,} samples")
print(f"✓ MAX_LENGTH: {MAX_LENGTH}")

# ============================================================================
# 7. 학습 설정 (NLLB-200 Configuration) - ✅ 수정됨
# ============================================================================
print(f"\n{'='*70}")
print("Training Configuration (NLLB-200 Settings)")
print(f"{'='*70}")

training_args = Seq2SeqTrainingArguments(
    # 기본 설정
    output_dir=OUTPUT_DIR,

    # 학습 설정
    num_train_epochs=3,                      # 에포크 수
    per_device_train_batch_size=8,           # 배치 크기 (GPU 당)
    gradient_accumulation_steps=2,           # 그래디언트 누적 (실질 배치 16)

    # 학습률 및 최적화
    learning_rate=1e-5,                      # 학습률 (1×10⁻⁵)
    warmup_steps=300,                        # 워밍업 단계

    # 로깅 및 저장
    logging_steps=100,                       # 로그 기록 간격
    save_steps=500,                          # 모델 저장 간격
    eval_steps=500,                          # 검증 실행 간격

    # 저장 전략
    save_strategy="steps",
    eval_strategy="steps",                   # ✅ 수정: evaluation_strategy → eval_strategy
    save_total_limit=3,                      # 최대 체크포인트 수

    # 메모리 최적화
    fp16=True,                               # FP16 혼합 정밀도

    # 기타
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    predict_with_generate=False,             # 평가 시 생성 비활성화 (속도 향상)
)

print(f"""
Configuration Summary:
{'='*70}
Model: {MODEL_NAME}
Epochs: 3
Batch size per device: 8
Gradient accumulation: 2
Effective batch size: 16
Learning rate: 1e-5
Warmup steps: 300
MAX_LENGTH: 256
FP16: Enabled
{'='*70}
""")

# ============================================================================
# 8. Trainer 초기화
# ============================================================================
print(f"\n{'='*70}")
print("Initializing Trainer")
print(f"{'='*70}")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("✓ Trainer initialized")

# GPU 캐시 정리
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")

# ============================================================================
# 9. 학습 실행
# ============================================================================
print(f"\n{'='*70}")
print("TRAINING START")
print(f"{'='*70}")

start_time = time.time()
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

try:
    # 학습 시작
    train_result = trainer.train()

    # 학습 완료
    success = True

    print(f"\n{'='*70}")
    print("TRAINING COMPLETED!")
    print(f"{'='*70}")

except torch.cuda.OutOfMemoryError as e:
    print(f"\n{'='*70}")
    print("OOM ERROR!")
    print(f"{'='*70}")
    print("Out of memory error occurred.")
    print("\nOptions:")
    print("  1. Reduce batch_size to 4")
    print("  2. Increase gradient_accumulation_steps to 4")
    print("  3. Reduce MAX_LENGTH to 128")
    success = False

except Exception as e:
    print(f"\n{'='*70}")
    print("ERROR!")
    print(f"{'='*70}")
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    success = False

# 학습 시간 계산
duration = time.time() - start_time
duration_str = str(timedelta(seconds=int(duration)))

# ============================================================================
# 10. 모델 저장 및 결과 정리
# ============================================================================
if success:
    print(f"\n{'='*70}")
    print("Saving Model")
    print(f"{'='*70}")

    # 최종 모델 저장
    trainer.save_model(FINAL_DIR)
    tokenizer.save_pretrained(FINAL_DIR)

    print(f"✓ Model saved to: {FINAL_DIR}")

    # 학습 결과 요약
    summary = f"""
{'='*70}
mBART-50 Fine-tuning Results (NLLB-200 Configuration)
{'='*70}

Dataset:
  - Train samples: {len(train_dataset):,}
  - Validation samples: {len(val_dataset):,}

Model:
  - Name: {MODEL_NAME}
  - Parameters: {model.num_parameters():,}

Configuration:
  - Epochs: 3
  - Batch size: 8
  - Gradient accumulation: 2
  - Effective batch size: 16
  - Learning rate: 1e-5
  - Warmup steps: 300
  - MAX_LENGTH: 256
  - FP16: Enabled

Training:
  - Duration: {duration_str}
  - Final loss: {train_result.training_loss:.4f}

Saved:
  - Location: {FINAL_DIR}

{'='*70}
Training completed successfully!
{'='*70}
"""

    print(summary)

    # 요약 파일 저장
    with open(f"{FINAL_DIR}/training_summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary)

    print(f"✓ Summary saved to: {FINAL_DIR}/training_summary.txt")

    # 학습 로그 저장
    if hasattr(trainer.state, 'log_history'):
        import json
        with open(f"{FINAL_DIR}/training_logs.json", 'w', encoding='utf-8') as f:
            json.dump(trainer.state.log_history, f, indent=2)
        print(f"✓ Training logs saved to: {FINAL_DIR}/training_logs.json")

    print(f"\n{'='*70}")
    print("SUCCESS! 🎉")
    print(f"{'='*70}")

else:
    print(f"\n{'='*70}")
    print("TRAINING FAILED")
    print(f"{'='*70}")
    print(f"Duration: {duration_str}")
    print("\nPlease check the error messages above and try again.")

# ============================================================================
# 11. 메모리 정리
# ============================================================================
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\n✓ GPU memory cleared")

print(f"\n{'='*70}")
print("Script completed")
print(f"{'='*70}")

mBART-50 Fine-tuning (NLLB-200 Configuration)

GPU Information
Device: cuda
GPU: NVIDIA A100-SXM4-40GB
Total Memory: 42.5 GB
✓ Colab Pro+ GPU detected!

Mounting Google Drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Output directory: /content/drive/MyDrive/mbart_legal_nllb_config

Upload Training Data
Please upload train_v2.csv and dev_v2.csv:


Saving dev_v2.csv to dev_v2 (1).csv
Saving train_v2.csv to train_v2 (1).csv

✓ Uploaded files: ['dev_v2 (1).csv', 'train_v2 (1).csv']


FileNotFoundError: train_v2.csv and dev_v2.csv are required!

In [3]:
"""
mBART-50 Fine-tuning with NLLB-200 Configuration
Optimized for Colab Pro+ (A100/V100)
"""

# ============================================================================
# 1. 환경 설정 및 라이브러리 설치
# ============================================================================
print("="*70)
print("mBART-50 Fine-tuning (NLLB-200 Configuration)")
print("="*70)

!pip install transformers datasets sentencepiece accelerate -q

import pandas as pd
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from google.colab import files, drive
import time
from datetime import timedelta
import os

# ============================================================================
# 2. GPU 확인
# ============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\n{'='*70}")
print("GPU Information")
print(f"{'='*70}")
print(f"Device: {device}")
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Total Memory: {gpu_memory:.1f} GB")

    # Colab Pro+ 확인
    if "A100" in gpu_name or "V100" in gpu_name:
        print("✓ Colab Pro+ GPU detected!")
    else:
        print("⚠ Warning: This configuration requires Colab Pro+ (A100/V100)")

# ============================================================================
# 3. Google Drive 마운트
# ============================================================================
print(f"\n{'='*70}")
print("Mounting Google Drive")
print(f"{'='*70}")
drive.mount('/content/drive')

# 저장 경로 설정
OUTPUT_DIR = "/content/drive/MyDrive/mbart_legal_nllb_config"
FINAL_DIR = "/content/drive/MyDrive/mbart_legal_nllb_final"

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✓ Output directory: {OUTPUT_DIR}")

# ============================================================================
# 4. 데이터 파일 업로드
# ============================================================================
print(f"\n{'='*70}")
print("Upload Training Data")
print(f"{'='*70}")
print("Please upload train_v2.csv and dev_v2.csv:")

uploaded = files.upload()

# 업로드된 파일 확인
uploaded_files = list(uploaded.keys())
print(f"\n✓ Uploaded files: {uploaded_files}")

# 정확한 파일명 체크
if 'train_v2.csv' not in uploaded_files:
    print("\n❌ Error: 'train_v2.csv' not found!")
    print(f"Available files: {uploaded_files}")
    raise FileNotFoundError("Please upload train_v2.csv")

if 'dev_v2.csv' not in uploaded_files:
    print("\n❌ Error: 'dev_v2.csv' not found!")
    print(f"Available files: {uploaded_files}")
    raise FileNotFoundError("Please upload dev_v2.csv")

print("✓ train_v2.csv found")
print("✓ dev_v2.csv found")

# 데이터 로드
train_df = pd.read_csv('train_v2.csv')
val_df = pd.read_csv('dev_v2.csv')

print(f"\n{'='*70}")
print("Dataset Information")
print(f"{'='*70}")
print(f"Train samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")
print(f"\nSample data:")
print(train_df.head(2))

# ============================================================================
# 5. 모델 및 토크나이저 로드
# ============================================================================
print(f"\n{'='*70}")
print("Loading Model and Tokenizer")
print(f"{'='*70}")

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
print(f"Model: {MODEL_NAME}")

tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang="ru_RU",
    tgt_lang="ko_KR"
)
print("✓ Tokenizer loaded")

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
print(f"✓ Model loaded: {model.num_parameters():,} parameters")

# ============================================================================
# 6. 데이터 전처리 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Data Preprocessing (NLLB-200 Config)")
print(f"{'='*70}")

# NLLB-200 설정값
MAX_LENGTH = 256  # 최대 시퀀스 길이

def preprocess_function(examples):
    """NLLB-200 방식의 전처리"""
    tokenizer.src_lang = "ru_RU"

    # Source 언어 토크나이징
    inputs = tokenizer(
        examples['ru'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False
    )

    # Target 언어 토크나이징
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['ko'],
            max_length=MAX_LENGTH,
            truncation=True,
            padding=False
        )

    inputs['labels'] = labels['input_ids']
    return inputs

# 데이터셋 변환
train_dataset = Dataset.from_pandas(train_df).map(
    preprocess_function,
    batched=True,
    remove_columns=train_df.columns.tolist()
)

val_dataset = Dataset.from_pandas(val_df).map(
    preprocess_function,
    batched=True,
    remove_columns=val_df.columns.tolist()
)

# Data Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

print(f"✓ Train dataset: {len(train_dataset):,} samples")
print(f"✓ Validation dataset: {len(val_dataset):,} samples")
print(f"✓ MAX_LENGTH: {MAX_LENGTH}")

# ============================================================================
# 7. 학습 설정 (NLLB-200 Configuration)
# ============================================================================
print(f"\n{'='*70}")
print("Training Configuration (NLLB-200 Settings)")
print(f"{'='*70}")

training_args = Seq2SeqTrainingArguments(
    # 기본 설정
    output_dir=OUTPUT_DIR,

    # 학습 설정
    num_train_epochs=3,                      # 에포크 수
    per_device_train_batch_size=8,           # 배치 크기 (GPU 당)
    gradient_accumulation_steps=2,           # 그래디언트 누적 (실질 배치 16)

    # 학습률 및 최적화
    learning_rate=1e-5,                      # 학습률 (1×10⁻⁵)
    warmup_steps=300,                        # 워밍업 단계

    # 로깅 및 저장
    logging_steps=100,                       # 로그 기록 간격
    save_steps=500,                          # 모델 저장 간격
    eval_steps=500,                          # 검증 실행 간격

    # 저장 전략
    save_strategy="steps",
    eval_strategy="steps",
    save_total_limit=3,                      # 최대 체크포인트 수

    # 메모리 최적화
    fp16=True,                               # FP16 혼합 정밀도

    # 기타
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    predict_with_generate=False,             # 평가 시 생성 비활성화 (속도 향상)
)

print(f"""
Configuration Summary:
{'='*70}
Model: {MODEL_NAME}
Epochs: 3
Batch size per device: 8
Gradient accumulation: 2
Effective batch size: 16
Learning rate: 1e-5
Warmup steps: 300
MAX_LENGTH: 256
FP16: Enabled
{'='*70}
""")

# ============================================================================
# 8. Trainer 초기화
# ============================================================================
print(f"\n{'='*70}")
print("Initializing Trainer")
print(f"{'='*70}")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("✓ Trainer initialized")

# GPU 캐시 정리
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")

# ============================================================================
# 9. 학습 실행
# ============================================================================
print(f"\n{'='*70}")
print("TRAINING START")
print(f"{'='*70}")

start_time = time.time()
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

try:
    # 학습 시작
    train_result = trainer.train()

    # 학습 완료
    success = True

    print(f"\n{'='*70}")
    print("TRAINING COMPLETED!")
    print(f"{'='*70}")

except torch.cuda.OutOfMemoryError as e:
    print(f"\n{'='*70}")
    print("OOM ERROR!")
    print(f"{'='*70}")
    print("Out of memory error occurred.")
    print("\nOptions:")
    print("  1. Reduce batch_size to 4")
    print("  2. Increase gradient_accumulation_steps to 4")
    print("  3. Reduce MAX_LENGTH to 128")
    success = False

except Exception as e:
    print(f"\n{'='*70}")
    print("ERROR!")
    print(f"{'='*70}")
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    success = False

# 학습 시간 계산
duration = time.time() - start_time
duration_str = str(timedelta(seconds=int(duration)))

# ============================================================================
# 10. 모델 저장 및 결과 정리
# ============================================================================
if success:
    print(f"\n{'='*70}")
    print("Saving Model")
    print(f"{'='*70}")

    # 최종 모델 저장
    trainer.save_model(FINAL_DIR)
    tokenizer.save_pretrained(FINAL_DIR)

    print(f"✓ Model saved to: {FINAL_DIR}")

    # 학습 결과 요약
    summary = f"""
{'='*70}
mBART-50 Fine-tuning Results (NLLB-200 Configuration)
{'='*70}

Dataset:
  - Train samples: {len(train_dataset):,}
  - Validation samples: {len(val_dataset):,}

Model:
  - Name: {MODEL_NAME}
  - Parameters: {model.num_parameters():,}

Configuration:
  - Epochs: 3
  - Batch size: 8
  - Gradient accumulation: 2
  - Effective batch size: 16
  - Learning rate: 1e-5
  - Warmup steps: 300
  - MAX_LENGTH: 256
  - FP16: Enabled

Training:
  - Duration: {duration_str}
  - Final loss: {train_result.training_loss:.4f}

Saved:
  - Location: {FINAL_DIR}

{'='*70}
Training completed successfully!
{'='*70}
"""

    print(summary)

    # 요약 파일 저장
    with open(f"{FINAL_DIR}/training_summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary)

    print(f"✓ Summary saved to: {FINAL_DIR}/training_summary.txt")

    # 학습 로그 저장
    if hasattr(trainer.state, 'log_history'):
        import json
        with open(f"{FINAL_DIR}/training_logs.json", 'w', encoding='utf-8') as f:
            json.dump(trainer.state.log_history, f, indent=2)
        print(f"✓ Training logs saved to: {FINAL_DIR}/training_logs.json")

    print(f"\n{'='*70}")
    print("SUCCESS! 🎉")
    print(f"{'='*70}")

else:
    print(f"\n{'='*70}")
    print("TRAINING FAILED")
    print(f"{'='*70}")
    print(f"Duration: {duration_str}")
    print("\nPlease check the error messages above and try again.")

# ============================================================================
# 11. 메모리 정리
# ============================================================================
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\n✓ GPU memory cleared")

print(f"\n{'='*70}")
print("Script completed")
print(f"{'='*70}")

mBART-50 Fine-tuning (NLLB-200 Configuration)

GPU Information
Device: cuda
GPU: NVIDIA A100-SXM4-40GB
Total Memory: 42.5 GB
✓ Colab Pro+ GPU detected!

Mounting Google Drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Output directory: /content/drive/MyDrive/mbart_legal_nllb_config

Upload Training Data
Please upload train_v2.csv and dev_v2.csv:


Saving dev_v2.csv to dev_v2.csv
Saving train_v2.csv to train_v2.csv

✓ Uploaded files: ['dev_v2.csv', 'train_v2.csv']
✓ train_v2.csv found
✓ dev_v2.csv found

Dataset Information
Train samples: 8,000
Validation samples: 1,000

Sample data:
   id                                                 ru  \
0   1  Российская Федерация - Россия есть демократиче...   
1   2  Наименования Российская Федерация и Россия рав...   

                                             ko  
0  러시아연방, 즉 러시아는 공화국 통치 형태를 갖춘 민주주의 연방 법치 국가이다.  
1                 러시아연방과 러시아라는 명칭은 동일한 의미를 지닌다.  

Loading Model and Tokenizer
Model: facebook/mbart-large-50-many-to-many-mmt
✓ Tokenizer loaded
✓ Model loaded: 610,879,488 parameters

Data Preprocessing (NLLB-200 Config)


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

✓ Train dataset: 8,000 samples
✓ Validation dataset: 1,000 samples
✓ MAX_LENGTH: 256

Training Configuration (NLLB-200 Settings)

Configuration Summary:
Model: facebook/mbart-large-50-many-to-many-mmt
Epochs: 3
Batch size per device: 8
Gradient accumulation: 2
Effective batch size: 16
Learning rate: 1e-5
Warmup steps: 300
MAX_LENGTH: 256
FP16: Enabled


Initializing Trainer


  trainer = Seq2SeqTrainer(


✓ Trainer initialized
✓ GPU cache cleared

TRAINING START
Start time: 2025-11-05 08:31:31



Step,Training Loss,Validation Loss
500,1.3835,1.261342
1000,1.1035,1.091912
1500,0.9518,1.059713


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



TRAINING COMPLETED!

Saving Model
✓ Model saved to: /content/drive/MyDrive/mbart_legal_nllb_final

mBART-50 Fine-tuning Results (NLLB-200 Configuration)

Dataset:
  - Train samples: 8,000
  - Validation samples: 1,000

Model:
  - Name: facebook/mbart-large-50-many-to-many-mmt
  - Parameters: 610,879,488

Configuration:
  - Epochs: 3
  - Batch size: 8
  - Gradient accumulation: 2
  - Effective batch size: 16
  - Learning rate: 1e-5
  - Warmup steps: 300
  - MAX_LENGTH: 256
  - FP16: Enabled

Training:
  - Duration: 0:09:00
  - Final loss: 1.3560

Saved:
  - Location: /content/drive/MyDrive/mbart_legal_nllb_final

Training completed successfully!

✓ Summary saved to: /content/drive/MyDrive/mbart_legal_nllb_final/training_summary.txt
✓ Training logs saved to: /content/drive/MyDrive/mbart_legal_nllb_final/training_logs.json

SUCCESS! 🎉

✓ GPU memory cleared

Script completed
