# MediTranslator v3 En→Vi Training

This notebook trains the v3_en2vi model (278M parameters) for English→Vietnamese medical text translation.

In [None]:
# Cell 1: Install Dependencies
!pip install -q torch torchaudio torchvision
!pip install -q wandb pyyaml tqdm

In [None]:
# Cell 2: Clone Repository
import os
if not os.path.exists('MediTranslator'):
    !git clone https://github.com/MothMalone/MediTranslator.git
%cd MediTranslator
print('Repository cloned successfully')

In [None]:
# Cell 3: Setup Data Paths
# Copies the parallel train/test text files from the Kaggle input dataset into
# the directory DATA_OUTPUT, then lists what is there.
import os
import shutil

# Setup paths
DATA_INPUT = '/kaggle/input/mediatranslator-training-data'
DATA_OUTPUT = './data/raw_opus100'  # relative to the current working directory
os.makedirs(DATA_OUTPUT, exist_ok=True)

# Copy training data files
files = ['train.en.txt', 'train.vi.txt', 'public_test.en.txt', 'public_test.vi.txt']

# Fail early with a single message naming every missing input, rather than
# aborting part-way through the copy loop on the first missing file.
missing = [name for name in files if not os.path.isfile(os.path.join(DATA_INPUT, name))]
if missing:
    missing_list = ', '.join(missing)
    raise FileNotFoundError(f'Missing input file(s) in {DATA_INPUT}: {missing_list}')

for file in files:
    src = os.path.join(DATA_INPUT, file)
    dst = os.path.join(DATA_OUTPUT, file)
    shutil.copy(src, dst)
    size = os.path.getsize(dst) / 1024 / 1024  # bytes -> MiB
    print(f'✓ Copied {file} ({size:.2f} MB)')

print('\n✅ Data setup complete!')
print(f'Files in {DATA_OUTPUT}:')
# os.listdir order is arbitrary; sort for a stable, readable listing.
for f in sorted(os.listdir(DATA_OUTPUT)):
    print(f'  - {f}')

In [None]:
# Cell 4: Login to W&B (for tracking)
import getpass
import os

import wandb

# SECURITY: the API key must never be hardcoded in the notebook. A key was
# previously committed in this cell; treat it as leaked and revoke it in the
# W&B account settings.
#
# The key is read from the WANDB_API_KEY environment variable; if that is not
# set, it is requested interactively without echoing it to the cell output.
WANDB_API_KEY = os.environ.get('WANDB_API_KEY', '').strip()
if not WANDB_API_KEY:
    WANDB_API_KEY = getpass.getpass('W&B API key: ').strip()
if not WANDB_API_KEY:
    raise RuntimeError('No W&B API key provided; set the WANDB_API_KEY environment variable.')

# Export the key so that child processes started from this notebook inherit it.
os.environ['WANDB_API_KEY'] = WANDB_API_KEY
wandb.login(key=WANDB_API_KEY, anonymous="never")
print('✅ Logged in to W&B! Loss will be tracked.')
print('View your training at: https://wandb.ai/joshuafoshua-university-of-engineering-and-technology-hanoi/nlp-transformer-mt')

In [None]:
# Cell 5: Train Model with v3_en2vi Config
import os
os.chdir('/kaggle/working/MediTranslator')
!python scripts/train.py --config experiments/v3_en2vi/config.yaml

## Step 6: Monitor Checkpoints (Run While Training)

**Checkpoints are being saved to your local experiments folder DURING training!**

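Run the cell below to list the checkpoints written so far. Note that the training cell above blocks the kernel until it finishes, so within this notebook session the cell below only executes once training has completed or been interrupted; re-run it at any time to refresh the listing.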


In [None]:
import os
import glob
import time
from datetime import datetime

# Monitor checkpoints being saved while training
local_experiments_dir = '/kaggle/working/MediTranslator/experiments/v3_en2vi/checkpoints'
kaggle_output_dir = '/kaggle/output/v3_en2vi/checkpoints'

# Save interval quoted in the status messages below.
# NOTE(review): presumably mirrors the save interval in the training config — confirm.
CHECKPOINT_INTERVAL_STEPS = 20_000


def _checkpoints_oldest_first(directory):
    """Return the ``*.pt`` files in ``directory`` ordered by modification time.

    Ordering by mtime (with the path as a tie-breaker) makes the last element
    the most recently written file. A plain name sort would not: for example
    ``step_100000.pt`` sorts before ``step_20000.pt``.
    """
    paths = glob.glob(os.path.join(directory, '*.pt'))
    return sorted(paths, key=lambda path: (os.path.getmtime(path), path))


def _size_mb(path):
    """Return the size of the file at ``path`` in MiB."""
    return os.path.getsize(path) / (1024**2)


print("=" * 70)
print("📊 CHECKPOINT MONITORING (REAL-TIME)")
print("=" * 70)

# Create directories if they don't exist
os.makedirs(local_experiments_dir, exist_ok=True)
os.makedirs(kaggle_output_dir, exist_ok=True)

print("\n🔍 Checking for checkpoints...\n")

# Collect the checkpoint files present in each directory.
local_ckpts = _checkpoints_oldest_first(local_experiments_dir)
kaggle_ckpts = _checkpoints_oldest_first(kaggle_output_dir)

print("📂 LOCAL EXPERIMENTS FOLDER:")
print(f"   Path: {local_experiments_dir}")
print(f"   Checkpoints: {len(local_ckpts)}")

if local_ckpts:
    print("\n   Files:")
    for ckpt_path in local_ckpts:
        filename = os.path.basename(ckpt_path)
        size_mb = _size_mb(ckpt_path)
        mtime = os.path.getmtime(ckpt_path)
        time_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
        print(f"   ✓ {filename} ({size_mb:.1f} MB) - {time_str}")
else:
    print(f"   ⏳ Waiting for first checkpoint (every {CHECKPOINT_INTERVAL_STEPS:,} steps)...")

print("\n📤 KAGGLE OUTPUT FOLDER:")
print(f"   Path: {kaggle_output_dir}")
print(f"   Backed up: {len(kaggle_ckpts)}")

if kaggle_ckpts:
    total_size = sum(_size_mb(p) for p in kaggle_ckpts)
    print(f"   Total size: {total_size:.1f} MB")
else:
    print("   ⏳ Waiting for backup (happens after first save)...")

print("\n" + "=" * 70)
print("✨ SUMMARY:")
print("=" * 70)
if local_ckpts:
    print("✅ Checkpoints ARE being saved during training!")
    # Last element is the most recently modified file (see _checkpoints_oldest_first).
    print(f"   Latest: {os.path.basename(local_ckpts[-1])}")
    print("   Ready to use or download from Kaggle Output tab")
else:
    print("⏳ Training in progress...")
    print(f"   First checkpoint will appear after {CHECKPOINT_INTERVAL_STEPS:,} steps")
    # The former "Approx {20000 // 250} batches with batch size 64" line was
    # removed: it printed "80 batches", contradicting the step count above.
    print("   Run this cell again to see updates!")
print("=" * 70)


## Training Complete!

Checkpoints saved to: `experiments/v3_en2vi/checkpoints/`

Logs available at: `experiments/v3_en2vi/logs/`