In [2]:
# Cell 1: Complete Environment Setup for Kaggle
# ========================================================

# 1. First, clean up everything
!pip uninstall -y torch torchvision torchaudio transformers peft bitsandbytes numpy 2>/dev/null || echo "No packages to uninstall"
!pip cache purge

# 2. Install NumPy first (must be done before other packages)
!pip install -q numpy==1.26.4  # Critical for compatibility

# 3. Install PyTorch with CUDA 12.1 (Kaggle's version)
!pip install -q torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121

# 4. Install transformer-related packages with compatible versions
!pip install -q transformers==4.41.2 peft==0.10.0 datasets==2.18.0 accelerate==0.29.1
!pip install -q bitsandbytes==0.43.0 einops==0.7.0

# 5. Handle gymnasium separately to avoid conflicts
!pip install -q gymnasium==0.29.0 --no-deps  # Force this version without dependencies

# 6. Verify installations
import os
import sys
import subprocess
import psutil
import numpy as np
import torch
import torchvision

print("\n=== Core Package Versions ===")
print(f"Python: {sys.version}")
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"Torchvision: {torchvision.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"\n=== CUDA Information ===")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory/1024**3:.2f} GB")
else:
    print("\n⚠️ CUDA not available - attempting repair...")
    !pip install -q --force-reinstall torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
    import torch
    print(f"\nAfter reinstall - CUDA available: {torch.cuda.is_available()}")

# 7. Now import transformer-related packages
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)

print("\n=== Transformer Packages Loaded Successfully ===")

Found existing installation: torch 2.2.1+cu121
Uninstalling torch-2.2.1+cu121:
  Successfully uninstalled torch-2.2.1+cu121
Found existing installation: torchvision 0.17.1+cu121
Uninstalling torchvision-0.17.1+cu121:
  Successfully uninstalled torchvision-0.17.1+cu121
Found existing installation: torchaudio 2.2.1+cu121
Uninstalling torchaudio-2.2.1+cu121:
  Successfully uninstalled torchaudio-2.2.1+cu121
Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
Found existing installation: peft 0.10.0
Uninstalling peft-0.10.0:
  Successfully uninstalled peft-0.10.0
Found existing installation: bitsandbytes 0.43.0
Uninstalling bitsandbytes-0.43.0:
  Successfully uninstalled bitsandbytes-0.43.0
Found existing installation: numpy 2.1.2
Uninstalling numpy-2.1.2:
  Successfully uninstalled numpy-2.1.2
Files removed: 30 (778.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m92.9 MB

In [5]:
# Cell 2: Model Loading
# =====================

# Define MODEL_NAME at the top of the cell (should match what you used in Cell 1)
MODEL_NAME = "microsoft/phi-1.5"  # Add this line

def print_memory():
    """Memory usage diagnostics for the environment"""
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU Memory: {gpu_mem:.2f}GB", end=" | ")
    ram = psutil.virtual_memory()
    print(f"RAM: {ram.percent}% ({ram.used/1024**3:.1f}/{ram.total/1024**3:.1f}GB)")

# Define MODEL_NAME at the top of the cell (should match what you used in Cell 1)
MODEL_NAME = "microsoft/phi-1.5"  # Add this line

def load_model(model_name):
    """Load model with improved error handling and phi-1.5 specific settings"""
    print(f"\n=== Loading Model: {model_name} ===")
    print_memory()
    
    # Phi-1.5 specific configuration
    trust_remote_code = True  # Required for phi-1.5
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    
    # Quantization config for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype
    )
    
    try:
        print("Attempting quantized load...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            trust_remote_code=trust_remote_code,
            device_map="auto",
            torch_dtype=torch_dtype
        )
        
        print("\n✅ Model loaded successfully!")
        print_memory()
        return model
        
    except Exception as e:
        print(f"\n❌ Model loading failed: {str(e)}")
        print("Attempting standard load without quantization...")
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                device_map="auto" if torch.cuda.is_available() else None,
                torch_dtype=torch_dtype
            )
            print("\n✅ Model loaded successfully without quantization!")
            print_memory()
            return model
        except Exception as e:
            print(f"\n❌ Standard load failed: {str(e)}")
            print("Attempting CPU-only fallback...")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                device_map="cpu",
                torch_dtype=torch.float32
            )
            print("\n✅ Model loaded on CPU")
            print_memory()
            return model

model = load_model(MODEL_NAME)


=== Loading Model: microsoft/phi-1.5 ===
RAM: 5.3% (1.2/31.4GB)
Attempting quantized load...

❌ Model loading failed: No GPU found. A GPU is needed for quantization.
Attempting standard load without quantization...

✅ Model loaded successfully without quantization!
RAM: 22.1% (6.5/31.4GB)


In [12]:
# Cell 3: Tokenizer Setup
# =======================

def load_tokenizer(model_name):
    """Load and configure tokenizer"""
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            padding_side="right"
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully")
        return tokenizer
    except Exception as e:
        print(f"Tokenizer loading failed: {str(e)}")
        raise

tokenizer = load_tokenizer(MODEL_NAME)

Tokenizer loaded successfully


In [18]:
# Cell 4: Robust Data Preparation - Fixed Version
# =============================================

# 0. Set critical environment variables first
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"

In [19]:
# 1. FIRST CELL - Force clean environment setup
!pip uninstall -y numpy torch -qqq
!pip install --no-cache-dir numpy==1.26.4 torch==2.2.1 --force-reinstall --ignore-installed

# Force reload numpy from the installed location
import sys
import site
from importlib import reload
for module in list(sys.modules):
    if 'numpy' in module:
        del sys.modules[module]

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting torch==2.2.1
  Downloading torch-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting filelock (from torch==2.2.1)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.2.1)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch==2.2.1)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch==2.2.1)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch==2.2.1)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch==2.2.1)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x8

In [20]:
# 2. SECOND CELL - Import with verification
import numpy as np
import torch
print(f"NumPy path: {np.__file__}")  # Verify path is in /usr/local/lib
print(f"NumPy version: {np.__version__}")  
print(f"PyTorch version: {torch.__version__}")

# Enhanced numpy test
def test_numpy():
    try:
        arr = np.array([1, 2, 3])
        assert arr.sum() == 6
        # Test array conversion
        torch.tensor(arr)
        return True
    except Exception as e:
        print(f"NumPy test failed: {str(e)}")
        return False

if not test_numpy():
    # Nuclear option - modify sys.path
    import site
    site.addsitedir('/usr/local/lib/python3.11/site-packages')
    import numpy as np
    print(f"Reinstalled NumPy version: {np.__version__}")

# 3. Import remaining libraries
import re
from datasets import Dataset

# 4. Text cleaning function
def clean_text(text):
    """Enhanced text cleaning function"""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s.,;!?\'"-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 5. Dataset preparation with multiple fallbacks
def prepare_dataset(file_path="/kaggle/input/0515-txt", max_samples=1000):
    """Prepare dataset with robust error handling"""
    try:
        # Check if path exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Path not found: {file_path}")
            
        # Handle directory case
        if os.path.isdir(file_path):
            txt_files = [f for f in os.listdir(file_path) 
                        if f.endswith('.txt') and os.path.isfile(os.path.join(file_path, f))]
            if not txt_files:
                raise ValueError("No .txt files found in directory")
            
            # Read first found txt file
            with open(os.path.join(file_path, txt_files[0]), 'r', encoding='utf-8') as f:
                lines = [clean_text(line) for line in f if len(line.split()) > 3][:max_samples]
        else:
            # Handle single file case
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = [clean_text(line) for line in f if len(line.split()) > 3][:max_samples]
                
        return Dataset.from_dict({"text": lines})
    except Exception as e:
        print(f"Dataset preparation failed: {str(e)}")
        return Dataset.from_dict({"text": ["Sample text " + str(i) for i in range(10)]})

NumPy path: /usr/local/lib/python3.11/dist-packages/numpy/__init__.py
NumPy version: 1.26.4
PyTorch version: 2.2.1+cu121
NumPy test failed: Could not infer dtype of numpy.int64
Reinstalled NumPy version: 1.26.4


In [21]:
# 3. THIRD CELL - Dataset processing with workarounds
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def safe_tokenize(examples):
    """Tokenization with explicit numpy workarounds"""
    try:
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            padding="max_length",
            return_tensors="pt"
        )
        # Convert to lists explicitly
        return {
            "input_ids": tokenized["input_ids"].tolist(),
            "attention_mask": tokenized["attention_mask"].tolist(),
            "labels": tokenized["input_ids"].tolist()
        }
    except RuntimeError as e:
        if "Numpy is not available" in str(e):
            # Fallback using pure Python
            return {
                "input_ids": [[0]*512],
                "attention_mask": [[1]*512],
                "labels": [[0]*512]
            }
        raise

try:
    print("\n=== Starting Processing ===")
    dataset = prepare_dataset()
    
    # Small batch test first
    test_batch = dataset.select(range(2))
    test_tokenized = test_batch.map(safe_tokenize, batched=True)
    
    # If test succeeds, process full dataset
    tokenized_dataset = dataset.map(safe_tokenize, batched=True, batch_size=4)
    tokenized_dataset.set_format(type='torch')
    
    print("✅ Processing completed successfully!")
    
except Exception as e:
    print(f"\n❌ Error: {str(e)}")
    print("Creating minimal fallback dataset...")
    tokenized_dataset = Dataset.from_dict({
        "input_ids": [[0,1,2,3]],
        "attention_mask": [[1,1,1,1]],
        "labels": [[0,1,2,3]]
    })
    tokenized_dataset.set_format(type='torch')


=== Starting Processing ===
Dataset preparation failed: Can't pickle <class 'numpy.ndarray'>: it's not the same object as numpy.ndarray

❌ Error: Can't pickle <class 'numpy.ndarray'>: it's not the same object as numpy.ndarray
Creating minimal fallback dataset...


  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


PicklingError: Can't pickle <class 'numpy.ndarray'>: it's not the same object as numpy.ndarray

In [None]:
# Cell 5: Training Configuration
# =============================

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# LoRA configuration
peft_config = LoraConfig(
    r=16,  # Increased rank for better adaptation
    lora_alpha=32,
    target_modules=["Wqkv", "out_proj", "fc1", "fc2"],  # Phi-1.5 specific modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training arguments optimized for Kaggle
training_args = TrainingArguments(
    output_dir="/kaggle/working/phi1.5-lora-results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,  # Reduced for Kaggle
    learning_rate=2e-5,
    optim="adamw_torch",
    logging_steps=10,
    save_steps=500,
    fp16=torch.cuda.is_available(),
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    report_to="none"
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

# Print trainable parameters
model.print_trainable_parameters()

In [None]:
# Cell 6: Training Execution
# =========================

def train_model(model, tokenized_dataset, training_args):
    """Execute the training process"""
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    
    print("Starting training...")
    print_memory()
    trainer.train()
    print("Training completed!")
    return trainer

trainer = train_model(model, tokenized_dataset, training_args)

# Cell 7: Model Saving
# ====================

def save_artifacts(model, tokenizer, output_dir="/kaggle/working/phi1.5-lora-trained"):
    """Save all training artifacts"""
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # Save training arguments
    training_args.save_to_json(f"{output_dir}/training_args.json")
    print(f"Model and artifacts saved to {output_dir}")

save_artifacts(model, tokenizer)