# MediTranslator v3 En→Vi LoRA Training

This notebook fine-tunes the v3_en2vi model using LoRA (Low-Rank Adaptation) for parameter-efficient English→Vietnamese medical text translation.


In [None]:
# Cell 1: Install Dependencies
!pip install -q torch torchaudio torchvision
!pip install -q wandb pyyaml tqdm

In [None]:
# Cell 2: Clone Repository
import os

# Clone the transformer-v3 repo (which is the MediTranslator project)
if not os.path.exists('MediTranslator'):
    !git clone https://github.com/aCoderChild/transformer-v3.git MediTranslator

%cd MediTranslator
print('Repository cloned successfully')


In [None]:
# Cell 3: Setup Data Paths
# Copies the training / public-test text files from the Kaggle input dataset
# into <repo>/data/raw and reports what was (and was not) copied.
import os
import shutil

print("=" * 70)
print("üì• SETTING UP DATA")
print("=" * 70)

# Use absolute paths to avoid working directory issues
MEDI_REPO = '/kaggle/working/MediTranslator'
DATA_INPUT = '/kaggle/input/mediatranslator-training-data'
DATA_OUTPUT = os.path.join(MEDI_REPO, 'data', 'raw')

# First, verify input exists
if not os.path.exists(DATA_INPUT):
    print(f"\n‚ùå ERROR: Input dataset not found at {DATA_INPUT}")
    print(f"   Make sure 'mediatranslator-training-data' is connected in Kaggle Input")
else:
    print(f"\n‚úì Input dataset found: {DATA_INPUT}")
    input_files = os.listdir(DATA_INPUT)
    print(f"  Files available: {input_files}")

# Create output directory with absolute path
os.makedirs(DATA_OUTPUT, exist_ok=True)
print(f"\n‚úì Output directory created: {DATA_OUTPUT}")

# Copy training data files
files = ['train.en.txt', 'train.vi.txt', 'public_test.en.txt', 'public_test.vi.txt']

# Names of files that could not be copied (missing in the input or copy error);
# used below so the final status line reflects what actually happened.
failed_files = []

print(f"\nüìã Copying files:")
for filename in files:
    src = os.path.join(DATA_INPUT, filename)
    dst = os.path.join(DATA_OUTPUT, filename)

    if not os.path.exists(src):
        print(f"  ‚úó {filename} - NOT FOUND in input")
        failed_files.append(filename)
        continue

    try:
        shutil.copy2(src, dst)
        size = os.path.getsize(dst) / 1024 / 1024
        print(f"  ‚úì {filename} ({size:.2f} MB)")
    except Exception as e:
        # Best-effort: keep copying the remaining files, but remember the failure
        print(f"  ‚úó {filename} - ERROR: {e}")
        failed_files.append(filename)

# Only claim success when every expected file was copied
if failed_files:
    print(f"\n‚ùå Data setup incomplete - {len(failed_files)} file(s) not copied: {failed_files}")
else:
    print("\n‚úÖ Data setup complete!")

print(f"Files in {DATA_OUTPUT}:")
if os.path.exists(DATA_OUTPUT):
    for f in os.listdir(DATA_OUTPUT):
        full_path = os.path.join(DATA_OUTPUT, f)
        size = os.path.getsize(full_path) / 1024 / 1024
        print(f"  ‚úì {f} ({size:.2f} MB)")
else:
    print("  ‚úó Directory not found")

print("=" * 70)


In [None]:
# Cell 3.5: Verify Data Setup
# Lists the contents of ./data/raw (relative to the current working directory)
# and reports whether the four expected text files are present.
import os

banner = "=" * 70
print(banner)
print("üîç VERIFYING DATA SETUP")
print(banner)

# Directory that is expected to hold the copied data files
data_raw_dir = './data/raw'
print(f"\n‚úì Checking: {data_raw_dir}")

if not os.path.exists(data_raw_dir):
    # Nothing to inspect yet - point the reader back at the copy step
    print(f"  ‚úó Directory does not exist")
    print(f"  Please run Cell 3 first to set up data")
else:
    files_in_raw = os.listdir(data_raw_dir)
    print(f"  ‚úì Directory exists")
    print(f"  ‚úì Files found: {len(files_in_raw)}")
    for entry in files_in_raw:
        entry_bytes = os.path.getsize(os.path.join(data_raw_dir, entry))
        size_mb = entry_bytes / (1024**2)
        print(f"    - {entry} ({size_mb:.1f} MB)")

    # Compare against the files this notebook expects, keeping their listed order
    required_files = ['train.en.txt', 'train.vi.txt', 'public_test.en.txt', 'public_test.vi.txt']
    present = set(files_in_raw)
    missing = [name for name in required_files if name not in present]

    if not missing:
        print(f"\n‚úÖ All required files present!")
    else:
        print(f"\n‚ö†Ô∏è  Missing files: {missing}")
        print(f"   Please re-run Cell 3 to copy them")

print("\n" + banner)


In [None]:
# Cell 4: Login to W&B (for tracking)
import getpass
import os

import wandb

# SECURITY: the API key must never be written into the notebook itself.
# A key that has been committed in a notebook should be treated as leaked and
# revoked in the W&B account settings.
#
# Lookup order:
#   1. WANDB_API_KEY environment variable
#   2. Kaggle secret named WANDB_API_KEY (Add-ons -> Secrets), when on Kaggle
#   3. Interactive prompt (input is not echoed)
WANDB_API_KEY = os.environ.get('WANDB_API_KEY')

if not WANDB_API_KEY:
    try:
        # Only importable inside Kaggle kernels
        from kaggle_secrets import UserSecretsClient
        WANDB_API_KEY = UserSecretsClient().get_secret('WANDB_API_KEY')
    except ImportError:
        pass  # not running on Kaggle - fall through to the prompt
    except Exception as e:
        # On Kaggle, but the secret is not attached / not readable
        print(f'Kaggle secret WANDB_API_KEY unavailable ({type(e).__name__}: {e})')

if not WANDB_API_KEY:
    WANDB_API_KEY = getpass.getpass('Enter your W&B API key: ').strip()

if not WANDB_API_KEY:
    raise ValueError(
        'No W&B API key provided. Set the WANDB_API_KEY environment variable '
        'or add a Kaggle secret named WANDB_API_KEY.'
    )

# Export the key so code that reads it from the environment can find it
os.environ['WANDB_API_KEY'] = WANDB_API_KEY
wandb.login(key=WANDB_API_KEY, anonymous="never")
print('‚úÖ Logged in to W&B! Loss will be tracked.')
print('View your training at: https://wandb.ai/joshuafoshua-university-of-engineering-and-technology-hanoi/nlp-transformer-mt')

In [None]:
# Cell 4.5: Setup Checkpoint Backup (Google Drive if running in Colab)
# Sets DRIVE_AVAILABLE to True only when Google Drive was mounted, and
# computes the path of the local backup directory.
import os

print("=" * 70)
print("üì§ SETTING UP CHECKPOINT BACKUP")
print("=" * 70)

# Try to mount Google Drive if running in Colab
DRIVE_AVAILABLE = False
try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so this is not a Colab runtime
    print("\n‚ö†Ô∏è  Not running in Colab (this is fine)")
    print("  Checkpoints will save to local computer or Kaggle output")
else:
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as mount_error:
        # Running in Colab but the mount itself failed: report the real cause
        # instead of claiming we are not in Colab.
        print(f"\n‚ö†Ô∏è  Google Drive mount failed: {mount_error}")
        print("  Checkpoints will save to local computer or Kaggle output")
    else:
        DRIVE_AVAILABLE = True
        print("\n‚úì Google Drive mounted successfully!")
        print("  Checkpoints will auto-save to: Google Drive/MediTranslator_Checkpoints/")

# Setup local backup directory
# (only the path is computed here; this cell does not create the directory)
local_backup = os.path.expanduser("~/MediTranslator_Backups")
print(f"\n  Local backup directory: {local_backup}")

print("\n‚úì Backup setup complete!")
print("  During training, checkpoints will be saved to:")
print("    1. Local computer (preferred)")
print("    2. Google Drive (if Colab + mounted)")
print("    3. Kaggle output (if running on Kaggle)")
print("=" * 70)

## Step 6: Monitor LoRA Training Progress (Run While Training)

**LoRA Checkpoints are being saved to your local experiments folder DURING training!**

Run this cell while training is happening to see checkpoints appear in real-time.


In [None]:
# Summary of checkpoint status.
import glob
import os

# This cell reads `local_ckpts`. If no earlier cell in this kernel session has
# defined it, the cell would fail with NameError, so fall back to scanning the
# checkpoint directory directly (path is relative to the current working
# directory).
# NOTE(review): the directory comes from this notebook's closing notes; the
# checkpoint file naming is not known here, so every entry is listed - confirm.
CHECKPOINT_DIR = os.path.join('experiments', 'v3_en2vi', 'checkpoints')
if 'local_ckpts' not in globals():
    # Oldest first, so that local_ckpts[-1] is the most recently written entry
    local_ckpts = sorted(
        glob.glob(os.path.join(CHECKPOINT_DIR, '*')),
        key=os.path.getmtime,
    )

print("\n" + "=" * 70)
print("‚ú® SUMMARY:")
print("=" * 70)
if local_ckpts:
    print(f"‚úÖ Checkpoints ARE being saved during training!")
    print(f"   Latest: {os.path.basename(local_ckpts[-1])}")
    print(f"   Ready to use or download from Kaggle Output tab")
else:
    print(f"‚è≥ Training in progress...")
    print(f"   First checkpoint will appear after 500 steps")
    print(f"   Checkpoints saved every 500 training steps automatically")
    print(f"   Run this cell again to see updates!")
print("=" * 70)

## Training Complete!

Checkpoints saved to: `experiments/v3_en2vi/checkpoints/`

Logs available at: `experiments/v3_en2vi/logs/`