In [None]:
# ============================================================================
# STEP 3B: Fix PyTorch/NCCL Compatibility Issue
# ============================================================================

import subprocess
import sys
import os

print("=== PYTORCH/NCCL COMPATIBILITY FIX ===")
print("Issue: ncclMemFree symbol not found")
print("Solution: Reinstall PyTorch with proper CUDA compatibility")

def run_command(cmd, description):
    """Run command with proper error handling"""
    print(f"\n🔄 {description}...")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=600)
        if result.returncode == 0:
            print(f"✅ {description} completed successfully")
            if result.stdout.strip():
                print(f"Output: {result.stdout.strip()[:200]}...")
        else:
            print(f"❌ {description} failed")
            print(f"Error: {result.stderr}")
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        print(f"⏰ {description} timed out")
        return False
    except Exception as e:
        print(f"❌ Exception in {description}: {e}")
        return False

# Step 1: Clean uninstall PyTorch and related packages
print("\n" + "="*50)
print("STEP 1: Clean uninstall conflicting packages")
print("="*50)

uninstall_packages = [
    "torch",
    "torchvision", 
    "torchaudio",
    "transformers",
    "accelerate"
]

for package in uninstall_packages:
    cmd = f"{sys.executable} -m pip uninstall {package} -y"
    run_command(cmd, f"Uninstalling {package}")

# Step 2: Clear pip cache
print("\n" + "="*50)
print("STEP 2: Clear pip cache")
print("="*50)

run_command(f"{sys.executable} -m pip cache purge", "Clearing pip cache")

# Step 3: Install PyTorch with specific CUDA version for H100
print("\n" + "="*50)
print("STEP 3: Install PyTorch with CUDA 12.4 compatibility")
print("="*50)

# Install PyTorch with specific CUDA version
pytorch_cmd = f"{sys.executable} -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124"
run_command(pytorch_cmd, "Installing PyTorch with CUDA 12.4")

# Step 4: Install transformers and accelerate
print("\n" + "="*50)
print("STEP 4: Install core packages")
print("="*50)

core_packages = [
    "transformers>=4.43.0",
    "accelerate>=0.21.0"
]

for package in core_packages:
    cmd = f"{sys.executable} -m pip install {package}"
    run_command(cmd, f"Installing {package}")

# Step 5: Install remaining packages
print("\n" + "="*50)
print("STEP 5: Install remaining packages")
print("="*50)

remaining_packages = [
    "bitsandbytes>=0.41.0",
    "scipy",
    "sentencepiece",
    "protobuf"
]

for package in remaining_packages:
    cmd = f"{sys.executable} -m pip install {package}"
    run_command(cmd, f"Installing {package}")

print("\n" + "="*50)
print("PYTORCH REINSTALLATION COMPLETE!")
print("="*50)
print("🔄 Please RESTART your kernel now")
print("🔄 Then run the verification script again")
print("="*50)

## Verify Installation and Test Compatibility

In [None]:
print("=== STEP 3: VERIFICATION ===")

# Test 1: Basic imports
print("\n🔍 Test 1: Basic package imports...")
try:
    import torch
    print(f"✅ PyTorch {torch.__version__} - CUDA: {torch.cuda.is_available()}")
except ImportError as e:
    print(f"❌ PyTorch import failed: {e}")

try:
    import transformers
    print(f"✅ Transformers {transformers.__version__}")
except ImportError as e:
    print(f"❌ Transformers import failed: {e}")

try:
    import accelerate
    print(f"✅ Accelerate {accelerate.__version__}")
except ImportError as e:
    print(f"❌ Accelerate import failed: {e}")

try:
    import peft
    print(f"✅ PEFT {peft.__version__}")
except ImportError as e:
    print(f"❌ PEFT import failed: {e}")

try:
    import bitsandbytes
    print(f"✅ BitsAndBytes {bitsandbytes.__version__}")
except ImportError as e:
    print(f"❌ BitsAndBytes import failed: {e}")

try:
    import trl
    print(f"✅ TRL {trl.__version__}")
except ImportError as e:
    print(f"❌ TRL import failed: {e}")

try:
    import datasets
    print(f"✅ Datasets {datasets.__version__}")
except ImportError as e:
    print(f"❌ Datasets import failed: {e}")

# Test 2: Advanced imports
print("\n🔍 Test 2: Advanced component imports...")
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    print("✅ Core transformers components")
except ImportError as e:
    print(f"❌ Transformers components failed: {e}")

try:
    from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
    print("✅ PEFT components")
except ImportError as e:
    print(f"❌ PEFT components failed: {e}")

try:
    from trl import SFTTrainer
    print("✅ TRL SFTTrainer")
except ImportError as e:
    print(f"❌ TRL SFTTrainer failed: {e}")

# Test 3: GPU and CUDA compatibility
print("\n🔍 Test 3: GPU compatibility...")
if torch.cuda.is_available():
    print(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
    print(f"✅ CUDA version: {torch.version.cuda}")
    print(f"✅ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    
    # Test tensor operations
    try:
        x = torch.randn(100, 100).cuda()
        y = torch.randn(100, 100).cuda()
        z = torch.matmul(x, y)
        print("✅ Basic CUDA tensor operations work")
    except Exception as e:
        print(f"❌ CUDA tensor operations failed: {e}")
else:
    print("❌ CUDA not available")

# Test 4: BitsAndBytes compatibility
print("\n🔍 Test 4: BitsAndBytes compatibility...")
try:
    import bitsandbytes as bnb
    # Test if BitsAndBytes can create quantization config
    from transformers import BitsAndBytesConfig
    config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
    print("✅ BitsAndBytes quantization config works")
except Exception as e:
    print(f"❌ BitsAndBytes compatibility issue: {e}")

# Test 5: Check Llama-3.1 model accessibility
print("\n🔍 Test 5: Llama-3.1 model access...")
try:
    from transformers import AutoTokenizer
    # Try to load tokenizer (this tests HF access and model compatibility)
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.1-8B-Instruct",
        token="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"
    )
    print("✅ Llama-3.1 tokenizer loads successfully")
    print(f"✅ Vocab size: {tokenizer.vocab_size}")
except Exception as e:
    print(f"❌ Llama-3.1 access issue: {e}")

# Summary
print("\n" + "="*50)
print("VERIFICATION SUMMARY")
print("="*50)

# Check if all critical components are working
critical_tests = [
    torch.cuda.is_available(),
    'transformers' in globals(),
    'bitsandbytes' in globals(),
    'trl' in globals(),
    'peft' in globals()
]

if all(critical_tests):
    print("🎉 ALL CRITICAL TESTS PASSED!")
    print("✅ Ready to proceed with Step 4: Dataset preparation")
else:
    print("⚠️  Some tests failed. Please review errors above.")
    print("❌ NOT ready for training yet")

print("\nNext: If all tests pass, we'll proceed to Step 4")

## Validate with sample training dataset

In [None]:
# ============================================================================
# STEP 4: Dataset Preparation and Processing
# ============================================================================

import json
import pandas as pd
from datasets import Dataset, DatasetDict
import torch

print("=== STEP 4: DATASET PREPARATION ===")

# First, let's create your dataset file from the provided data
def create_dataset_file():
    """Create the JSONL dataset file from your provided data"""
    
    # Your dataset entries (sample from the document you provided)
    dataset_entries = [
        {
            "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
            "instruction": "Under Job Code for the same grade FTE we can see - PM and FC job codes. what is the difference between two",
            "output": "PM job code indicates Full Time Employee and FC job code indicates Full Time Contractor."
        },
        {
            "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
            "instruction": "Process to convert CWR to FTE associates",
            "output": "Raise New Demand New Demand and mark requirement type as CWR Conversion and SO type as FTE. Rest of the Information will remain same like FTE SO creation."
        },
        {
            "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
            "instruction": "How to created CWR so",
            "output": "To create a CWR Contractor Workforce Request SO follow these steps: 1. In OneC open the Quick SOgtgt Raise new demand gtgt New demand . 2. Fill in the fields of Project amp Assignment and Location as we fill in for a regular Staffing SO 3. Select SO type as CWRgtgt Service defauls to Custom service gtgt Select grade and choose a Non CCA demand role that is closer to your requirement For the list of NON CCA role codes and job codes Please refer to the worksheet embedded in the PM Handbook gtgt Select the job code with CW00 available in drop down list based on grade selection amp add the Technical skill individually. 4. Fill in the reset of the details and click on Submit to raise the SO request.  If you need any further assistance with creating a CWR SO please reach out to the SOSimplificationPM@cognizant.com  Explore additional resources about SO Simplification on Be.Cognizant."
        },
        {
            "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
            "instruction": "unable to select subcontractor",
            "output": "To create a CWR Contractor Workforce Request SO follow these steps: 1. In OneC open the Quick SOgtgt Raise new demand gtgt New demand . 2. Fill in the fields of Project amp Assignment and Location as we fill in for a regular Staffing SO 3. Select SO type as CWRgtgt service defaults to Custom service gtgt Select grade and choose a Non CCA demand role that is closer to your requirement For the list of NON CCA role codes and job codes Please refer to the worksheet embedded in the PM Handbook gtgt Select the job code with CW00 available in drop down list based on grade selection amp add the Technical skill individually. 4. Fill in the reset of the details and click on Submit to raise the SO request.  If you need any further assistance with creating a CWR SO please reach out to the SOSimplificationPM@cognizant.com Explore additional resources about SO Simplification on Be.Cognizant."
        },
        {
            "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
            "instruction": "How to raise request for CWR SO",
            "output": "To create a CWR Contractor Workforce Request SO follow these steps: 1. In OneC open the Quick SOgtgt Raise new demand gtgt New demand . 2. Fill in the fields of Project amp Assignment and Location as we fill in for a regular Staffing SO 3. Select SO type as CWRgtgt Service defauls to Custom service gtgt Select grade and choose a Non CCA demand role that is closer to your requirement For the list of NON CCA role codes and job codes Please refer to the worksheet embedded in the PM Handbook gtgt Select the job code with CW00 available in drop down list based on grade selection amp add the Technical skill individually. 4. Fill in the reset of the details and click on Submit to raise the SO request.  If you need any further assistance with creating a CWR SO please reach out to the SOSimplificationPM@cognizant.com .Explore additional resources about SO Simplification on Be.Cognizant."
        }
    ]
    
    # Save to JSONL file
    with open('azure_instruction_dataset.jsonl', 'w', encoding='utf-8') as f:
        for entry in dataset_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    
    print(f"✅ Created dataset file with {len(dataset_entries)} sample entries")
    print("📁 File saved as: azure_instruction_dataset.jsonl")
    
    return len(dataset_entries)

# Create the dataset file
num_entries = create_dataset_file()

# Data processing functions
def convert_to_conversational_format(data_entry):
    """Convert system/instruction/output format to Llama-3.1 conversational format"""
    system_message = data_entry.get('system', '')
    instruction = data_entry.get('instruction', '')
    output = data_entry.get('output', '')
    
    # Create conversational format suitable for Llama-3.1
    conversation = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output}<|eot_id|>"""
    
    return conversation

def load_and_process_dataset(file_path, test_size=0.2):
    """Load and process the dataset"""
    print(f"\n🔄 Loading dataset from {file_path}...")
    
    # Load JSONL data
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    entry = json.loads(line.strip())
                    data.append(entry)
                except json.JSONDecodeError as e:
                    print(f"⚠️  Skipping malformed line {line_num}: {e}")
                    continue
        
        print(f"✅ Loaded {len(data)} entries")
        
    except FileNotFoundError:
        print(f"❌ File {file_path} not found!")
        return None
    
    if len(data) == 0:
        print("❌ No valid data found!")
        return None
    
    # Convert to conversational format
    print("🔄 Converting to conversational format...")
    conversations = []
    
    for i, entry in enumerate(data):
        try:
            conv = convert_to_conversational_format(entry)
            conversations.append({"text": conv})
        except Exception as e:
            print(f"⚠️  Skipping entry {i}: {e}")
            continue
    
    print(f"✅ Converted {len(conversations)} conversations")
    
    # Create HuggingFace dataset
    dataset = Dataset.from_list(conversations)
    
    # Split into train/validation
    if len(conversations) > 1:
        dataset = dataset.train_test_split(test_size=test_size, seed=42)
        print(f"📊 Train examples: {len(dataset['train'])}")
        print(f"📊 Validation examples: {len(dataset['test'])}")
    else:
        # For single example, create minimal split
        dataset = DatasetDict({
            'train': dataset,
            'test': dataset.select([0])  # Use same example for validation
        })
        print(f"📊 Single example mode - using same data for train/validation")
    
    return dataset

# Load and process the dataset
dataset = load_and_process_dataset('azure_instruction_dataset.jsonl')

if dataset is not None:
    print("\n✅ Dataset processing completed successfully!")
    
    # Show sample conversation
    print("\n" + "="*50)
    print("SAMPLE CONVERSATION:")
    print("="*50)
    sample_text = dataset['train'][0]['text']
    print(sample_text[:500] + "..." if len(sample_text) > 500 else sample_text)
    
    # Show dataset statistics
    print("\n" + "="*50)
    print("DATASET STATISTICS:")
    print("="*50)
    print(f"📈 Total training examples: {len(dataset['train'])}")
    print(f"📈 Total validation examples: {len(dataset['test'])}")
    
    # Calculate average text length
    lengths = [len(example['text']) for example in dataset['train']]
    avg_length = sum(lengths) / len(lengths)
    max_length = max(lengths)
    min_length = min(lengths)
    
    print(f"📏 Average text length: {avg_length:.0f} characters")
    print(f"📏 Max text length: {max_length} characters")
    print(f"📏 Min text length: {min_length} characters")
    
    print("\n✅ Ready for Step 5: Model loading and training!")
    
else:
    print("❌ Dataset processing failed!")
    print("Please check the dataset file and try again.")

## Install Required Packages

In [None]:
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
%pip install -q transformers>=4.43.0
%pip install -q accelerate>=0.21.0
%pip install -q peft>=0.4.0
%pip install -q bitsandbytes>=0.41.0
%pip install -q trl>=0.7.0
%pip install -q datasets
%pip install -q scipy
%pip install -q tensorboard
%pip install -q wandb
%pip install -q sentencepiece
%pip install -q protobuf


## Import Libraries and Setup

In [3]:

import os
from datasets import Dataset, DatasetDict
import torch
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

# Setup device and check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"GPU Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB" if torch.cuda.is_available() else "No GPU")


Using device: cuda
GPU Name: NVIDIA H100 NVL
GPU Memory: 99.87 GB


In [12]:
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
HF_TOKEN = "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"

# QLoRA configuration - optimized for H100
qlora_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA configuration - balanced for instruction tuning
lora_config = LoraConfig(
    r=64,                    # Rank - higher for better performance
    lora_alpha=16,           # Alpha parameter for LoRA scaling
    target_modules=[         # Target all attention layers
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.1,        # Dropout for regularization
    bias="none",             # No bias training
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False
)

# Training hyperparameters - optimized for H100 with 40 cores
training_config = {
    "output_dir": "./llama-3.1-8b-corporate-assistant",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,    # Optimal for H100
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,     # Effective batch size = 4*4*4 = 64
    "gradient_checkpointing": True,
    "optim": "paged_adamw_32bit",
    "logging_steps": 10,
    "learning_rate": 3e-4,               # Optimal for QLoRA
    "weight_decay": 0.001,
    "fp16": False,
    "bf16": True,                        # Use bf16 for H100
    "max_grad_norm": 0.3,
    "max_steps": -1,
    "warmup_ratio": 0.03,
    "group_by_length": True,
    "lr_scheduler_type": "cosine",
    "report_to": "tensorboard",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_steps": 100,
    "eval_steps": 100,
    "save_total_limit": 3,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    "dataloader_num_workers": 8,         # Utilize multiple cores
    "remove_unused_columns": False,
    "push_to_hub": False,
    "hub_model_id": None,
    "hub_strategy": "every_save"
}

print("Configuration loaded successfully!")

Configuration loaded successfully!


In [None]:
# def create_sample_dataset():
#     """Create the azure_instruction_dataset.jsonl file with sample corporate data"""
    
#     print("📝 Creating azure_instruction_dataset.jsonl with sample data...")
    
#     # Sample dataset entries from your corporate assistant domain
#     dataset_entries = [
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "Under Job Code for the same grade FTE we can see - PM and FC job codes. what is the difference between two",
#             "output": "PM job code indicates Full Time Employee and FC job code indicates Full Time Contractor."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "Process to convert CWR to FTE associates",
#             "output": "Raise New Demand New Demand and mark requirement type as CWR Conversion and SO type as FTE. Rest of the Information will remain same like FTE SO creation."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "How to created CWR so",
#             "output": "To create a CWR Contractor Workforce Request SO follow these steps: 1. In OneC open the Quick SOgtgt Raise new demand gtgt New demand . 2. Fill in the fields of Project amp Assignment and Location as we fill in for a regular Staffing SO 3. Select SO type as CWRgtgt Service defauls to Custom service gtgt Select grade and choose a Non CCA demand role that is closer to your requirement For the list of NON CCA role codes and job codes Please refer to the worksheet embedded in the PM Handbook gtgt Select the job code with CW00 available in drop down list based on grade selection amp add the Technical skill individually. 4. Fill in the reset of the details and click on Submit to raise the SO request.  If you need any further assistance with creating a CWR SO please reach out to the SOSimplificationPM@cognizant.com  Explore additional resources about SO Simplification on Be.Cognizant."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "unable to select subcontractor",
#             "output": "To create a CWR Contractor Workforce Request SO follow these steps: 1. In OneC open the Quick SOgtgt Raise new demand gtgt New demand . 2. Fill in the fields of Project amp Assignment and Location as we fill in for a regular Staffing SO 3. Select SO type as CWRgtgt service defaults to Custom service gtgt Select grade and choose a Non CCA demand role that is closer to your requirement For the list of NON CCA role codes and job codes Please refer to the worksheet embedded in the PM Handbook gtgt Select the job code with CW00 available in drop down list based on grade selection amp add the Technical skill individually. 4. Fill in the reset of the details and click on Submit to raise the SO request.  If you need any further assistance with creating a CWR SO please reach out to the SOSimplificationPM@cognizant.com Explore additional resources about SO Simplification on Be.Cognizant."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "How to raise request for CWR SO",
#             "output": "To create a CWR Contractor Workforce Request SO follow these steps: 1. In OneC open the Quick SOgtgt Raise new demand gtgt New demand . 2. Fill in the fields of Project amp Assignment and Location as we fill in for a regular Staffing SO 3. Select SO type as CWRgtgt Service defauls to Custom service gtgt Select grade and choose a Non CCA demand role that is closer to your requirement For the list of NON CCA role codes and job codes Please refer to the worksheet embedded in the PM Handbook gtgt Select the job code with CW00 available in drop down list based on grade selection amp add the Technical skill individually. 4. Fill in the reset of the details and click on Submit to raise the SO request.  If you need any further assistance with creating a CWR SO please reach out to the SOSimplificationPM@cognizant.com .Explore additional resources about SO Simplification on Be.Cognizant."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "Unable to create opportunity id in winzone",
#             "output": "To create new winzone opportunity please reach out Account Manager/ Client Partner/ CRM for further assistance."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "How to view hardcopy of I-140 approval notice",
#             "output": "1.  I-140 approval notices are shipped to cognizant office within two to four weeks once the status updated in the USCIS site. 2.  As the I-140 is a company-owned petition original hardcopy cannot be shared."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "Are there any migration benefits available in Google",
#             "output": "Yes we have multiple benefits and credits available for migration from on-perm to google cloud. Please reach out to ProcurementITCloud@cognizant.com for detailed information."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "What equipment do I need for a telemedicine appointment",
#             "output": "Youll need a device with a camera and microphone such as a smartphone tablet or computer and a reliable internet connection. Available at MHC."
#         },
#         {
#             "system": "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.",
#             "instruction": "Do we still need to validate the SOs through email for GGM SOs",
#             "output": "Currently for APAC ASEAN amp UK region Pre-defined templates details for GGM SOs has been removed from validation procedure. Region wise process will be automated soon for GGM SOs validation."
#         }
#     ]
    
#     # Write to JSONL file
#     filename = "azure_instruction_dataset.jsonl"
#     with open(filename, 'w', encoding='utf-8') as f:
#         for entry in dataset_entries:
#             f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    
#     print(f"✅ Created {filename} with {len(dataset_entries)} training examples")
#     return len(dataset_entries)

In [None]:
# def preprocess_dataset(file_path, test_size=0.1):
#     """Load and preprocess the dataset"""
#     print(f"🔄 Loading dataset from {file_path}...")
    
#     # Check if file exists
#     if not os.path.exists(file_path):
#         print(f"❌ File {file_path} not found!")
#         return None
    
#     # Load JSONL data
#     raw_data = []
#     try:
#         with open(file_path, 'r', encoding='utf-8') as f:
#             for line_num, line in enumerate(f, 1):
#                 try:
#                     entry = json.loads(line.strip())
#                     raw_data.append(entry)
#                 except json.JSONDecodeError as e:
#                     print(f"⚠️  Skipping malformed line {line_num}: {e}")
#                     continue
        
#         print(f"✅ Loaded {len(raw_data)} examples")
        
#     except Exception as e:
#         print(f"❌ Error loading file: {e}")
#         return None
    
#     if len(raw_data) == 0:
#         print("❌ No valid data found!")
#         return None
    
#     # Convert to conversational format
#     print("🔄 Converting to conversational format...")
#     conversations = []
#     for entry in raw_data:
#         try:
#             conv = convert_to_conversational_format(entry)
#             conversations.append({"text": conv})
#         except Exception as e:
#             print(f"⚠️  Error converting entry: {e}")
#             continue
    
#     print(f"✅ Converted {len(conversations)} conversations")
    
#     # Create dataset
#     dataset = Dataset.from_list(conversations)
    
#     # Split into train/validation
#     if len(conversations) > 1:
#         dataset = dataset.train_test_split(test_size=test_size, seed=42)
#     else:
#         # For single example, duplicate for validation
#         dataset = DatasetDict({
#             'train': dataset,
#             'test': dataset.select([0])
#         })
    
#     print(f"📊 Train examples: {len(dataset['train'])}")
#     print(f"📊 Validation examples: {len(dataset['test'])}")
    
#     return dataset

# # Step 5D: Execute dataset creation and loading
# print("Starting dataset creation and processing...")

# # Create the dataset file
# num_entries = create_sample_dataset()

# # Set dataset path
# DATASET_PATH = "./azure_instruction_dataset.jsonl"

# # Load and preprocess dataset
# dataset = preprocess_dataset(DATASET_PATH)

# if dataset is not None:
#     print("\n✅ Dataset processing completed successfully!")
    
#     # Display sample
#     print("\n" + "="*50)
#     print("SAMPLE CONVERSATION:")
#     print("="*50)
#     sample_text = dataset['train'][0]['text']
#     print(sample_text[:500] + "..." if len(sample_text) > 500 else sample_text)
    
#     # Calculate statistics
#     print("\n" + "="*50)
#     print("DATASET STATISTICS:")
#     print("="*50)
    
#     # Text length analysis
#     train_lengths = [len(example['text']) for example in dataset['train']]
#     avg_length = sum(train_lengths) / len(train_lengths)
#     max_length = max(train_lengths)
#     min_length = min(train_lengths)
    
#     print(f"📈 Total training examples: {len(dataset['train'])}")
#     print(f"📈 Total validation examples: {len(dataset['test'])}")
#     print(f"📏 Average text length: {avg_length:.0f} characters")
#     print(f"📏 Max text length: {max_length} characters")
#     print(f"📏 Min text length: {min_length} characters")
    
#     # Estimate token count (rough approximation: 1 token ≈ 4 characters)
#     avg_tokens = avg_length // 4
#     max_tokens = max_length // 4
    
#     print(f"🔤 Estimated avg tokens: {avg_tokens}")
#     print(f"🔤 Estimated max tokens: {max_tokens}")
    
#     if max_tokens > 2048:
#         print("⚠️  Warning: Some examples may exceed 2048 tokens")
#         print("💡 Consider reducing max_seq_length in training config")
    
#     print("\n🎯 Recommended max_seq_length for training: " + str(min(2048, max_tokens + 100)))
    
# else:
#     print("❌ Dataset creation failed!")
#     print("Please check the errors above and try again.")

# print("\n" + "="*50)
# print("DATASET PREPARATION COMPLETE!")
# print("="*50)
# if dataset is not None:
#     print("✅ Ready to proceed with model loading and training")
#     print("✅ Dataset variable 'dataset' is now available for training")
# else:
#     print("❌ Dataset preparation failed - cannot proceed with training")

In [5]:
def load_jsonl_data(file_path):
    """Load data from JSONL file"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

In [6]:
def convert_to_conversational_format(data_entry):
    """Convert system/instruction/output format to conversational format"""
    system_message = data_entry.get('system', '')
    instruction = data_entry.get('instruction', '')
    output = data_entry.get('output', '')
    
    # Create conversational format suitable for Llama-3.1
    conversation = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output}<|eot_id|>"""
    
    return conversation

In [7]:
def preprocess_dataset(file_path, test_size=0.1):
    """Load and preprocess the dataset"""
    print("Loading dataset...")
    raw_data = load_jsonl_data(file_path)
    print(f"Loaded {len(raw_data)} examples")
    
    # Convert to conversational format
    print("Converting to conversational format...")
    conversations = []
    for entry in raw_data:
        conv = convert_to_conversational_format(entry)
        conversations.append({"text": conv})
    
    # Create dataset
    dataset = Dataset.from_list(conversations)
    
    # Split into train/validation
    dataset = dataset.train_test_split(test_size=test_size, seed=42)
    
    print(f"Train examples: {len(dataset['train'])}")
    print(f"Validation examples: {len(dataset['test'])}")
    
    return dataset

In [8]:
# Load your dataset (replace with your file path)
DATASET_PATH = "/home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/azure_instruction_dataset.jsonl"  # Update this path
dataset = preprocess_dataset(DATASET_PATH)

# Display sample
print("\n=== Sample Conversation ===")
print(dataset['train'][0]['text'][:500] + "...")


Loading dataset...
Loaded 1186 examples
Converting to conversational format...
Train examples: 1067
Validation examples: 119

=== Sample Conversation ===
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions.<|eot_id|><|start_header_id|>user<|end_header_id|>

How do customers gain access to a subscription in CSP<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Workloads/ subscriptions are hosted  on customer domain. Customer has access to the workloads through Azure Portal.<|eot_id|>...


In [9]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    trust_remote_code=True
)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model with QLoRA...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=qlora_config,
    device_map="auto",
    token=HF_TOKEN,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA
model = get_peft_model(model, lora_config)

Loading tokenizer...
Loading model with QLoRA...


Fetching 4 files: 100%|██████████| 4/4 [03:05<00:00, 46.30s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


In [10]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params:,} || All params: {all_param:,} || Trainable%: {100 * trainable_params / all_param:.2f}")

print_trainable_parameters(model)


Trainable params: 167,772,160 || All params: 4,708,372,480 || Trainable%: 3.56


In [19]:
# ============================================================================
# TRL Version Diagnosis and Correct SFTTrainer Syntax
# ============================================================================

import trl
from trl import SFTTrainer
import inspect

print("=== TRL VERSION DIAGNOSIS ===")
print(f"TRL version: {trl.__version__}")

# Check SFTTrainer parameters
print("\n=== SFTTrainer ACCEPTED PARAMETERS ===")
sig = inspect.signature(SFTTrainer.__init__)
params = list(sig.parameters.keys())
print("Accepted parameters:")
for param in params:
    print(f"  • {param}")

print("\n=== CORRECT TRAINER INITIALIZATION ===")

# Version-specific fixes
trl_version = trl.__version__

if trl_version.startswith('0.4') or trl_version.startswith('0.5') or trl_version.startswith('0.6'):
    print("Using TRL 0.4-0.6 syntax:")
    print("""
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=lora_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    max_seq_length=2048,
)
""")

elif trl_version.startswith('0.7') or trl_version.startswith('0.8'):
    print("Using TRL 0.7-0.8 syntax:")
    print("""
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=lora_config,
    formatting_func=lambda x: x["text"],
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    max_seq_length=2048,
)
""")

else:
    print(f"Using TRL {trl_version} (newer) syntax:")
    print("""
# Try this first:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    peft_config=lora_config,
    max_seq_length=2048,
    packing=False,
)

# If above fails, try this minimal version:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
)
""")

print("\n=== ALTERNATIVE: Use DataCollatorForLanguageModeling ===")
print("""
# If SFTTrainer keeps failing, use standard Trainer:
from transformers import Trainer, DataCollatorForLanguageModeling

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=2048)

tokenized_train = dataset["train"].map(tokenize_function, batched=True)
tokenized_eval = dataset["test"].map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Use standard Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)
""")

print("\n" + "="*50)
print("RECOMMENDED ACTION:")
print("="*50)
print("1. Run this diagnostic script first")
print("2. Check your TRL version")
print("3. Use the appropriate syntax shown above")
print("4. If SFTTrainer still fails, use the standard Trainer alternative")

=== TRL VERSION DIAGNOSIS ===
TRL version: 0.18.1

=== SFTTrainer ACCEPTED PARAMETERS ===
Accepted parameters:
  • self
  • model
  • args
  • data_collator
  • train_dataset
  • eval_dataset
  • processing_class
  • compute_loss_func
  • compute_metrics
  • callbacks
  • optimizers
  • optimizer_cls_and_kwargs
  • preprocess_logits_for_metrics
  • peft_config
  • formatting_func

=== CORRECT TRAINER INITIALIZATION ===
Using TRL 0.18.1 (newer) syntax:

# Try this first:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    peft_config=lora_config,
    max_seq_length=2048,
    packing=False,
)

# If above fails, try this minimal version:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
)


=== ALTERNATIVE: Use DataCollatorForLanguageModeling ===

# If SFTTrainer keeps failing, use standard Trainer:
from tra

In [21]:
# Create training arguments
training_args = TrainingArguments(**training_config)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=lora_config,
    formatting_func=lambda x: x["text"],  # This tells it which field to use
)

print("Trainer initialized successfully!")

Applying formatting function to train dataset: 100%|██████████| 1067/1067 [00:00<00:00, 35104.97 examples/s]
Converting train dataset to ChatML: 100%|██████████| 1067/1067 [00:00<00:00, 74317.45 examples/s]
Adding EOS to train dataset: 100%|██████████| 1067/1067 [00:00<00:00, 62758.69 examples/s]
Tokenizing train dataset: 100%|██████████| 1067/1067 [00:00<00:00, 4423.63 examples/s]
Truncating train dataset: 100%|██████████| 1067/1067 [00:00<00:00, 476148.78 examples/s]
Applying formatting function to eval dataset: 100%|██████████| 119/119 [00:00<00:00, 26963.54 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 119/119 [00:00<00:00, 44018.18 examples/s]
Adding EOS to eval dataset: 100%|██████████| 119/119 [00:00<00:00, 42813.71 examples/s]
Tokenizing eval dataset: 100%|██████████| 119/119 [00:00<00:00, 4162.82 examples/s]
Truncating eval dataset: 100%|██████████| 119/119 [00:00<00:00, 83090.09 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Sinc

Trainer initialized successfully!


In [None]:
# ============================================================================
# Fix: TypeError: 'method' object is not subscriptable
# ============================================================================

print("=== DEBUGGING TRAINER ERROR ===")

# Step 1: Check dataset format
print("🔍 Checking dataset format...")
print("Sample from train dataset:")
print(dataset["train"][0])
print("\nDataset columns:", dataset["train"].column_names)

# Step 2: Fix the formatting function
print("\n🔧 Creating proper formatting function...")

def formatting_prompts_func(examples):
    """Proper formatting function for TRL 0.18.1"""
    if isinstance(examples, dict):
        # Single example
        if "text" in examples:
            return examples["text"]
        else:
            return str(examples)
    else:
        # Batch of examples
        texts = []
        for example in examples:
            if isinstance(example, dict) and "text" in example:
                texts.append(example["text"])
            else:
                texts.append(str(example))
        return texts

# Step 3: Alternative - Process dataset first
print("🔄 Processing dataset for compatibility...")

def process_dataset_for_sft(dataset):
    """Process dataset to ensure compatibility"""
    
    # Check if it's already in the right format
    if "text" in dataset.column_names:
        print("✅ Dataset already has 'text' column")
        return dataset
    
    # If not, create it
    def add_text_column(examples):
        # Assuming the dataset has the conversation format
        return {"text": examples.get("text", str(examples))}
    
    return dataset.map(add_text_column)

# Process datasets
processed_train = process_dataset_for_sft(dataset["train"])
processed_eval = process_dataset_for_sft(dataset["test"])

print("✅ Datasets processed")

# Step 4: Create trainer with better error handling
print("\n🏋️ Creating trainer with improved setup...")

try:
    # Option 1: Simple formatting function
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=processed_train,
        eval_dataset=processed_eval,
        peft_config=lora_config,
        formatting_func=formatting_prompts_func,
    )
    print("✅ Trainer created successfully with formatting_func")
    
except Exception as e:
    print(f"❌ Option 1 failed: {e}")
    
    try:
        # Option 2: Without formatting function (auto-detect)
        trainer = SFTTrainer(
            model=model,
            args=training_args,
            train_dataset=processed_train,
            eval_dataset=processed_eval,
            peft_config=lora_config,
        )
        print("✅ Trainer created successfully without formatting_func")
        
    except Exception as e2:
        print(f"❌ Option 2 failed: {e2}")
        
        # Option 3: Manual tokenization approach
        print("🔄 Falling back to manual tokenization...")
        
        from transformers import Trainer, DataCollatorForLanguageModeling
        
        # Tokenize the datasets manually
        def tokenize_function(examples):
            # Handle both single and batch
            if isinstance(examples["text"], str):
                texts = [examples["text"]]
            else:
                texts = examples["text"]
            
            return tokenizer(
                texts,
                truncation=True,
                padding=False,  # Will be done by data collator
                max_length=1536,  # Based on your data analysis
                return_tensors=None
            )
        
        # Tokenize datasets
        tokenized_train = processed_train.map(
            tokenize_function,
            batched=True,
            remove_columns=processed_train.column_names,
        )
        
        tokenized_eval = processed_eval.map(
            tokenize_function,
            batched=True,
            remove_columns=processed_eval.column_names,
        )
        
        # Data collator for language modeling
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,  # We're doing causal LM, not masked LM
            return_tensors="pt",
        )
        
        # Create standard trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_eval,
            data_collator=data_collator,
        )
        
        print("✅ Trainer created with manual tokenization approach")

print("\n" + "="*50)
print("TRAINER SETUP COMPLETE!")
print("="*50)
print("Now try: trainer.train()")

In [22]:
print("Starting training...")
print("="*50)

# Train the model
trainer.train()

print("Training completed!")

Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,1.2835,1.643767
2,1.1379,1.516374
3,0.4957,1.597736
4,0.2829,1.678621
5,0.1855,1.878496


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TypeError: 'method' object is not subscriptable

In [None]:
output_dir = "./llama-3.1-8b-corporate-assistant-final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

# Save training metrics
import json
with open(f"{output_dir}/training_metrics.json", "w") as f:
    json.dump(trainer.state.log_history, f, indent=2)

print("Training metrics saved!")

In [None]:
# Load the fine-tuned model for testing
print("Loading fine-tuned model for testing...")

# Create pipeline for inference
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
def test_model(instruction, system_prompt="You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions."):
    """Test the fine-tuned model with a sample instruction"""
    
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    
    response = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    
    return response[0]['generated_text'][len(prompt):]

# Test with sample questions
test_cases = [
    "How to raise a staffing SO request?",
    "What is the process for H1B nomination?",
    "How can I access Quick SO application?",
    "Unable to create SO in the system"
]

print("\n=== Model Testing ===")
for test_case in test_cases:
    print(f"\nQ: {test_case}")
    print(f"A: {test_model(test_case)}")
    print("-" * 50)

In [None]:
def evaluate_model_performance():
    """Evaluate the model on validation set"""
    print("Evaluating model performance...")
    
    eval_results = trainer.evaluate()
    
    print("\n=== Evaluation Results ===")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")
    
    return eval_results

# Run evaluation
eval_results = evaluate_model_performance()

In [None]:
def prepare_for_deployment():
    """Prepare model artifacts for Azure deployment"""
    
    deployment_dir = "./deployment_artifacts"
    
    # Create deployment directory
    import os
    os.makedirs(deployment_dir, exist_ok=True)
    
    # Save model and tokenizer
    model.save_pretrained(f"{deployment_dir}/model")
    tokenizer.save_pretrained(f"{deployment_dir}/tokenizer")
    
    # Create deployment config
    deployment_config = {
        "model_name": "llama-3.1-8b-corporate-assistant",
        "model_version": "1.0",
        "framework": "transformers",
        "python_version": "3.9",
        "requirements": [
            "torch>=2.0.1",
            "transformers>=4.31.0",
            "peft>=0.4.0",
            "bitsandbytes>=0.40.2",
            "accelerate>=0.21.0"
        ],
        "inference_config": {
            "max_new_tokens": 256,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
            "repetition_penalty": 1.1
        }
    }
    
    with open(f"{deployment_dir}/deployment_config.json", "w") as f:
        json.dump(deployment_config, f, indent=2)
    
    print(f"Deployment artifacts saved to {deployment_dir}")
    print("Ready for Azure deployment!")
    
    return deployment_dir

# Prepare deployment artifacts
deployment_path = prepare_for_deployment()

print("\n" + "="*50)
print("FINE-TUNING COMPLETED SUCCESSFULLY!")
print("="*50)
print(f"✅ Model saved to: ./llama-3.1-8b-corporate-assistant-final")
print(f"✅ Deployment artifacts: {deployment_path}")
print(f"✅ Training metrics saved")
print(f"✅ Model tested and validated")
print("\nNext steps: Deploy to Azure ML for inferencing")