In [31]:
import time
# Wall-clock timestamp (seconds since the epoch) taken when the notebook starts running.
notebook_start = time.time()

In [32]:
# Minimal working version with maximum compatibility
!pip install -q \
    transformers==4.30.2 \
    torch==2.1.0 \
    numpy==1.23.5 \
    --ignore-installed

from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print("Model loaded successfully")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlcrate 0.2.0 requires pathos, which is not installed.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 2.1.0 which is incompatible.
torchvision 0.17.1+cu121 requires torch==2.2.1, but you have torch 2.1.0 which is incompatible.
datasets 2.14.6 requires fsspec[http]<=2023.10.0,>=2023.1.0, but you have fsspec 2025.5.0 which is incompatible.
mkl-umath 0.1.1 requires numpy<1.27.0,>=1.26.4, but you have numpy 1.23.5 which is incompatible.
mkl-random 1.2.4 requires numpy<1.27.0,>=1.26.4, but you have numpy 1.23.5 which is incompatible.
mkl-fft 1.3.8 requires numpy<1.27.0,>=1.26.4, but you have numpy 1.23.5 which is incompatible.
pylibcudf-cu12 25.2.2 requir

ImportError: cannot import name 'get_cached_models' from 'transformers.utils' (/usr/local/lib/python3.11/dist-packages/transformers/utils/__init__.py)

In [None]:
MODEL_NAME = "gpt2"

# Tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="right", trust_remote_code=False)
tokenizer.pad_token = tokenizer.eos_token

# Model: automatic device placement, no remote code execution.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", trust_remote_code=False)

print("Model loaded successfully!")

In [None]:
# Placeholder corpus: the same sentence repeated ten times
from datasets import Dataset

dummy_data = {"text": ["This is a sample text." for _ in range(10)]}
dataset = Dataset.from_dict(dummy_data)
print(f"Dummy dataset size: {len(dataset)}")

In [None]:
# Read the papers CSV from the Kaggle working directory; on any failure fall
# back to an empty frame that still has the expected columns.
try:
    df = pd.read_csv("/kaggle/working/papers_database.csv")
    print(f"✅ Loaded {len(df)} papers from /kaggle/working/papers_database.csv")

    # Columns the rest of the notebook reads
    required_columns = ['id', 'title', 'source_url', 'full_text']
    missing_columns = list(filter(lambda name: name not in df.columns, required_columns))

    if len(missing_columns) > 0:
        print(f"⚠️ Missing columns: {missing_columns}")
        # Back-fill each absent column with empty strings
        for col in missing_columns:
            df[col] = ""

except Exception as err:
    print(f"❌ CSV load failed: {err}")
    # Same schema, zero rows
    df = pd.DataFrame(columns=['id', 'title', 'source_url', 'full_text'])
    print("Created empty dataframe as fallback")

# Preview the first two rows
print("\nSample data:")
print(df.head(2))
# Process text into chunks
def chunk_text(text, chunk_size=512):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Chunking is by character count, not by token count.

    Args:
        text: The string to split. Anything that is not a ``str`` (``None``,
            ``NaN``/``pd.NA`` from an empty CSV cell, numbers, lists, ...) is
            treated as "no text".
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings in original order; empty when there is no text.

    Raises:
        ValueError: If ``chunk_size`` is not positive while there is text to split.
    """
    # Non-string cells would crash on len() or be ambiguous under pd.notna
    # (list-likes); none of them carry text worth chunking.
    if not isinstance(text, str):
        return []
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# One record per chunk, tagged with the title of the paper it came from
samples = [
    {"text": piece, "source": paper.get('title', '')}
    for _, paper in df.iterrows()
    for piece in chunk_text(paper.get('full_text', ''))
]

# Cap at the first 1000 chunks; with no chunks at all, use placeholder text instead
if samples:
    dataset = Dataset.from_list(samples[:1000])
else:
    dataset = Dataset.from_dict({"text": ["Sample text"]*10})
print(f"Dataset size: {len(dataset)}")

In [None]:
MODEL_NAME = "gpt2"

# Tokenizer: right padding, with EOS reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="right")
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization settings: NF4 quant type, float16 compute dtype
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)

# Load the model with that quantization config and automatic device placement
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")

In [None]:
# Tokenize dataset
def _tokenize_for_causal_lm(batch):
    """Tokenize a batch of texts and attach causal-LM labels.

    Each example is truncated/padded to 512 tokens. ``labels`` is a copy of
    ``input_ids`` in which padded positions (attention_mask == 0) are set to
    -100 so they are excluded from the loss. Without a ``labels`` column the
    model returns no loss and training cannot proceed.
    """
    encoded = tokenizer(batch["text"], truncation=True, max_length=512, padding="max_length")
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


tokenized_dataset = dataset.map(
    _tokenize_for_causal_lm,
    batched=True,
    # Drop the raw columns (e.g. "text", "source"): they are strings and
    # cannot be turned into tensors for training.
    remove_columns=dataset.column_names
)
tokenized_dataset.set_format(type='torch')

# LoRA config
# GPT-2 has no `q_proj` / `v_proj` modules (those names belong to LLaMA/OPT-style
# architectures); its attention block uses one fused query/key/value projection
# named `c_attn`. Targeting names that do not exist in the base model makes
# `get_peft_model` fail, so the adapter is attached to `c_attn` instead.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training hyperparameters, collected in one mapping and unpacked into TrainingArguments
training_options = dict(
    output_dir="/kaggle/working/results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=10,
    save_steps=500,
)
training_args = TrainingArguments(**training_options)

# Prepare the model for k-bit training, then wrap it with the LoRA adapter config
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
# Report how many parameters are trainable
model.print_trainable_parameters()

In [None]:
# Wire the model, the training arguments and the tokenized dataset into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

print("Starting training...")
trainer.train()

In [None]:
# Save the trained model through the Trainer into /kaggle/working/final_model.
trainer.save_model("/kaggle/working/final_model")
# Recursively zip /kaggle/working/results (the Trainer's output_dir) into results.zip.
!zip -r /kaggle/working/results.zip /kaggle/working/results
print("\nFinal outputs:")
# List the working directory with human-readable sizes to show what was produced.
!ls -lh /kaggle/working/

In [None]:
def _missing_adapter_files(directory):
    """Return the PEFT adapter files that are absent from ``directory``.

    An adapter directory needs ``adapter_config.json`` plus a weights file,
    which may be ``adapter_model.safetensors`` or ``adapter_model.bin``.
    """
    import os

    present = set(os.listdir(directory))
    missing = []
    if "adapter_config.json" not in present:
        missing.append("adapter_config.json")
    if not present & {"adapter_model.safetensors", "adapter_model.bin"}:
        missing.append("adapter_model.safetensors")
    return missing


def save_model_artifacts(
    model,
    tokenizer,
    training_args: "Optional[TrainingArguments]" = None,
    output_dir: str = "/kaggle/working/gpt2-lora-trained"
) -> str:
    """
    Save the model, tokenizer and (optionally) training arguments, then verify.

    The LoRA adapter is deliberately NOT merged before saving. For a
    PEFT-wrapped model, ``save_pretrained`` is expected to write the adapter
    config and adapter weights rather than a copy of the base model.

    Args:
        model: Object exposing ``save_pretrained(output_dir, safe_serialization=...)``.
        tokenizer: Object exposing ``save_pretrained(output_dir)``.
        training_args: Optional training arguments; serialised to
            ``training_args.json`` through ``to_dict()`` or ``to_json_string()``.
            Failures here are reported as warnings and do not abort the save.
        output_dir: Target directory, created if it does not exist.

    Returns:
        ``output_dir``, unchanged.
    """
    # Local imports keep the function usable regardless of which cells ran before it.
    import json
    import os

    os.makedirs(output_dir, exist_ok=True)
    print(f"\n💾 Saving model artifacts to: {output_dir}")

    print("💽 Saving model and adapter...")
    # No explicit state_dict: save_pretrained already serialises the model's
    # own state, so materialising model.state_dict() first adds nothing.
    model.save_pretrained(output_dir, safe_serialization=True)

    print("🔤 Saving tokenizer...")
    tokenizer.save_pretrained(output_dir)

    if training_args is not None:
        print("📝 Saving training arguments...")
        try:
            args_path = os.path.join(output_dir, "training_args.json")
            if hasattr(training_args, 'to_dict'):
                with open(args_path, "w") as f:
                    json.dump(training_args.to_dict(), f, indent=2)
            elif hasattr(training_args, 'to_json_string'):
                with open(args_path, "w") as f:
                    f.write(training_args.to_json_string())
            else:
                print("⚠️ Warning: TrainingArguments has no serialization method")
        except Exception as e:
            # Best effort only: the model itself is already on disk.
            print(f"⚠️ Warning: Failed to save training args - {str(e)}")

    # Verify the adapter files were saved
    missing_files = _missing_adapter_files(output_dir)
    if missing_files:
        print(f"\n⚠️ Warning: Missing adapter files: {missing_files}")
        if hasattr(model, "peft_config"):
            # A real alternative: fall back to the non-safetensors weight
            # format. (There is no `adapter_only` argument to save_pretrained;
            # passing one just repeats the first call.)
            print("Trying alternative save method...")
            model.save_pretrained(output_dir, safe_serialization=False)
            still_missing = _missing_adapter_files(output_dir)
            if still_missing:
                print(f"⚠️ Warning: Adapter files still missing after retry: {still_missing}")
        else:
            print("Model has no PEFT adapter attached; adapter files are not expected.")

    print("\n🔍 Verifying saved files:")
    for file in sorted(os.listdir(output_dir)):
        size = os.path.getsize(os.path.join(output_dir, file)) / 1024
        print(f"- {file} ({size:.2f} KB)")

    return output_dir

In [None]:
def _read_adapter_base_model(model_path):
    """Return ``base_model_name_or_path`` from ``adapter_config.json`` in ``model_path``.

    Returns ``None`` when the file is missing, unreadable, not valid JSON, or
    does not carry a non-empty value for that key.
    """
    import json
    import os

    config_file = os.path.join(model_path, "adapter_config.json")
    try:
        with open(config_file, "r", encoding="utf-8") as handle:
            return json.load(handle).get("base_model_name_or_path") or None
    except (OSError, ValueError, AttributeError):
        return None


def _load_causal_lm(source, local_files_only):
    """Load a causal LM from ``source`` with automatic device placement.

    Uses float16 when CUDA is available and float32 otherwise.
    """
    return AutoModelForCausalLM.from_pretrained(
        source,
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        local_files_only=local_files_only
    )


def load_and_test_model(
    model_path: str = "/kaggle/working/gpt2-lora-trained",
    max_length: int = 160,
    test_prompts: "Optional[list]" = None,
    is_peft_model: bool = True
):
    """
    Load a saved model (optionally a PEFT adapter) and run sample generations.

    Args:
        model_path: Directory holding the tokenizer and either adapter files
            or a full model.
        max_length: ``max_length`` passed to every generation call.
        test_prompts: Prompts to generate from; a built-in list is used when None.
        is_peft_model: When True and adapter files are present, the base model
            is loaded first, the adapter is applied on top, and the result is
            merged for inference. Otherwise ``model_path`` is loaded as a
            regular model.

    Returns:
        ``(model, tokenizer)``.

    Raises:
        ValueError: If ``model_path`` does not exist.
        Exception: Any loading or generation error is re-raised after
            debugging information has been printed.
    """
    # Local import keeps the function usable regardless of which cells ran before it.
    import os

    print(f"\n🔍 Preparing to load model from: {model_path}")

    if not os.path.exists(model_path):
        raise ValueError(f"Model directory {model_path} does not exist")

    # Show directory contents for debugging
    print("\n📂 Model directory contents:")
    for f in sorted(os.listdir(model_path)):
        size = os.path.getsize(os.path.join(model_path, f)) / 1024
        print(f"- {f} ({size:.2f} KB)")

    try:
        print("\n🔄 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True
        )

        print("\n🔄 Loading model...")
        adapter_files = []
        if is_peft_model:
            adapter_files = [f for f in os.listdir(model_path) if f.startswith('adapter_')]
            if not adapter_files:
                print("⚠️ No adapter files found. Loading as regular model.")

        if adapter_files:
            print(f"Found adapter files: {adapter_files}")
            # An adapter directory holds only the adapter and tokenizer, not
            # the base weights, so take the base model from the name recorded
            # in adapter_config.json. Fall back to model_path only when that
            # name is unavailable.
            base_source = _read_adapter_base_model(model_path) or model_path
            print(f"Loading base model from: {base_source}")
            base_model = _load_causal_lm(
                base_source,
                # A hub id may need the hub or its cache; a local directory must not.
                local_files_only=os.path.isdir(base_source)
            )

            # Then load the PEFT adapter on top of the base model
            model = PeftModel.from_pretrained(
                base_model,
                model_path,
                local_files_only=True
            )

            # Merge and unload for inference
            model = model.merge_and_unload()
        else:
            # Regular model, or PEFT requested with no adapter files present
            model = _load_causal_lm(model_path, local_files_only=True)

        print("\n🎉 Model loaded successfully!")

        # Default test prompts if none provided
        if test_prompts is None:
            test_prompts = [
                "What is hardware wallet?? ",
                "What is Proof of Work (PoW)?? ",
                "What is cryptography?? ",
                "What is Peer-to-Peer (P2P)?? ",
                "What is block chain?? ",
                "What is private key?? "
            ]

        # No `device` argument: placement is already handled by device_map="auto"
        print("\n🚀 Creating text generation pipeline...")
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )

        print("\n🧪 Running generation tests...")
        for i, prompt in enumerate(test_prompts, 1):
            print(f"\n🔹 Test {i}: {prompt}")
            output = pipe(
                prompt,
                max_length=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_return_sequences=1,
                repetition_penalty=1.2
            )
            print("💬 Response:", output[0]['generated_text'])

        return model, tokenizer

    except Exception as e:
        print(f"\n❌ Critical error loading model: {str(e)}")
        print("\n🛠️ Debugging info:")
        print(f"- Path: {os.path.abspath(model_path)}")
        print(f"- Directory exists: {os.path.exists(model_path)}")
        if os.path.exists(model_path):
            print("- Contents:", os.listdir(model_path))
        raise

In [None]:
# Main execution: save the trained artifacts, then reload them twice, first with
# the function's built-in prompts and then with a custom prompt list.
if __name__ == "__main__":
    model_path = "/kaggle/working/gpt2-lora-trained"

    # Extra prompts for the second test pass
    custom_prompts = [
        "What is software wallet, and what's the difference between hardware and software wallet? ",
        "What is PoW? ",
        "Explain PoW in 1 sentence. ",
        "Describe the key features of PoW using 3 words. ",
        "What is PoM? Is it something related to cryptography? ",
        "What is a cryptographic product? ",
        "What is P2P? ",
        "What is block chain? ",
        "What is public key, and what's the difference between private and public key? "
    ]

    # Save model artifacts (uses the function's default output directory)
    save_model_artifacts(model, tokenizer, training_args)

    # None selects the default prompts inside load_and_test_model
    for prompt_set in (None, custom_prompts):
        load_and_test_model(model_path, test_prompts=prompt_set, is_peft_model=True)

In [None]:
# Report how long the whole notebook took, measured from notebook_start
notebook_end = time.time()
elapsed_seconds = notebook_end - notebook_start
print(f"Total notebook execution time: {elapsed_seconds:.2f} seconds")