In [None]:
# Install required packages with proper versions
!pip install -q -U transformers peft accelerate datasets bitsandbytes trl

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    Trainer  # Import Trainer class
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import json
import os

# Configuration
os.environ["WANDB_DISABLED"] = "true"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
data_path = "/content/qaparis.json"

# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with quantization
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

# Improved LoRA config
peft_config = LoraConfig(
    r=16,  # Increased from 8
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,  # Increased from 0.05
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Load and prepare data
with open(data_path) as f:
    data = json.load(f)

def format_instruction(sample):
    return f"<|system|>\nYou are a CLI expert assistant.</s>\n<|user|>\n{sample['question']}</s>\n<|assistant|>\n{sample['answer']}</s>"

dataset = Dataset.from_list(data)
dataset = dataset.map(lambda x: {"text": format_instruction(x)})

def tokenize(sample):
    return tokenizer(
        sample["text"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

dataset = dataset.map(tokenize, batched=True)

# Remove the original columns after tokenization
dataset = dataset.remove_columns(['question', 'answer', 'text'])


# Improved training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,  # Increased from 4
    gradient_accumulation_steps=2,  # Reduced from 4
    learning_rate=1e-4,  # Reduced from 2e-4
    num_train_epochs=3,  # Increased from 1
    logging_steps=10,
    save_strategy="no",
    optim="paged_adamw_8bit",  # Better optimizer for 4-bit
    warmup_ratio=0.1,  # Add warmup
    lr_scheduler_type="cosine",  # Better learning rate schedule
    report_to="none",
    fp16=True,
    remove_unused_columns=False  # Keep this as False to control column removal manually
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()

# Save adapter
model.save_pretrained("lora_adapter")

trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.0284
20,1.1183
30,0.8233


In [None]:
!python agent.py "List all Python files in the current directory recursively."

2025-06-19 10:16:17.081090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750328177.111055   13607 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750328177.120497   13607 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[1;33m[DRY RUN][0m [1;36mpython -m find -r[0m
[1;32mCommand:[0m python -m find -r

Use:
python -m find -r

Explanation:
-m: runs the command as a module (Python script)
-r: recursively list all files in the current directory

Example:
$ python -m find -r
```
.
├─

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import json
import re
from rouge_score import rouge_scorer
import warnings

warnings.filterwarnings("ignore")

class EnhancedEvaluator:
    def __init__(self):
        self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Load models
        self.base_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        self.ft_model = PeftModel.from_pretrained(self.base_model, "lora_adapter")

        # Create pipelines
        self.base_pipe = pipeline(
            "text-generation",
            model=self.base_model,
            tokenizer=self.tokenizer,
            device_map="auto"
        )
        self.ft_pipe = pipeline(
            "text-generation",
            model=self.ft_model,
            tokenizer=self.tokenizer,
            device_map="auto"
        )

        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

        # Test prompts with multiple acceptable solutions
        self.test_prompts = [
            {
                "prompt": "Create a new Git branch and switch to it.",
                "valid_commands": [
                    r"git checkout -b \w+",
                    r"git branch \w+ && git checkout \w+",
                    r"git switch -c \w+"
                ]
            },
            {
                "prompt": "Compress the folder reports into reports.tar.gz.",
                "valid_commands": [
                    r"tar -czf reports.tar.gz reports",
                    r"tar -cvzf reports.tar.gz reports"
                ]
            },
            {
                "prompt": "List all Python files in the current directory recursively.",
                "valid_commands": [
                    r"find . -name '\*.py'",
                    r"ls -R \| grep .py",
                    r"grep -r --include='\*.py' '' ."
                ]
            },
            {
                "prompt": "Set up a virtual environment and install requests.",
                "valid_commands": [
                    r"python -m venv venv && source venv/bin/activate && pip install requests",
                    r"virtualenv venv && . venv/bin/activate && pip install requests"
                ]
            },
            {
                "prompt": "Fetch only the first ten lines of a file named output.log.",
                "valid_commands": [
                    r"head -n 10 output.log",
                    r"sed -n '1,10p' output.log"
                ]
            },
            # Edge cases
            {
                "prompt": "Delete all files with .tmp extension in /tmp directory safely.",
                "valid_commands": [
                    r"find /tmp -name '\*.tmp' -type f -delete",
                    r"find /tmp -name '\*.tmp' -exec rm {} \+"
                ]
            },
            {
                "prompt": "Find and replace text 'old_text' with 'new_text' in all .py files recursively.",
                "valid_commands": [
                    r"find . -name '\*.py' -exec sed -i 's/old_text/new_text/g' {} \+",
                    r"grep -rl 'old_text' --include='\*.py' . | xargs sed -i 's/old_text/new_text/g'"
                ]
            }
        ]

    def extract_command(self, response):
        """Improved command extraction with regex patterns"""
        # Look for code blocks
        code_match = re.search(r'```(?:bash|sh)?\n(.*?)\n```', response, re.DOTALL)
        if code_match:
            commands = [c.strip() for c in code_match.group(1).split('\n') if c.strip()]
            if commands:
                return commands[0]

        # Look for command-like lines
        command_pattern = r'^\s*(?:git|tar|find|python|head|sed|rm)\s+.*'
        for line in response.split('\n'):
            line = line.strip()
            if re.match(command_pattern, line):
                return line

        return response.strip()

    def generate_response(self, pipe, prompt):
        """Generate response with better prompt engineering"""
        system_prompt = (
            "<|system|>\nYou are a CLI expert assistant. Respond ONLY with the exact command needed "
            "to complete the task. Do not include any explanations, comments, or examples.\n</s>\n"
        )
        user_prompt = f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
        full_prompt = system_prompt + user_prompt

        try:
            response = pipe(
                full_prompt,
                max_new_tokens=50,
                do_sample=False,
                temperature=0.01,
                top_k=1
            )[0]["generated_text"][len(full_prompt):].strip()
            return response
        except Exception as e:
            return f"Error: {str(e)}"

    def command_is_valid(self, command, valid_patterns):
        """Check if command matches any valid pattern"""
        if not command or "Error" in command:
            return False
        return any(re.match(pattern, command) for pattern in valid_patterns)

    def score_plan_quality(self, command, valid_patterns):
        """Improved quality scoring (0-2)"""
        if not command or "Error" in command:
            return 0

        # Score 2: Command matches a valid pattern exactly
        if any(re.fullmatch(pattern, command) for pattern in valid_patterns):
            return 2

        # Score 1: Command contains key elements
        key_terms = ["git", "tar", "find", "python", "head", "sed", "rm"]
        if any(term in command for term in key_terms):
            return 1

        return 0

    def evaluate(self):
        """Run complete evaluation"""
        results = []

        print("Running Enhanced Evaluation...")
        print("=" * 50)

        for test in self.test_prompts:
            prompt = test["prompt"]
            valid_patterns = test["valid_commands"]

            print(f"\nTest: {prompt}")
            print("-" * 50)

            # Generate responses
            base_response = self.generate_response(self.base_pipe, prompt)
            ft_response = self.generate_response(self.ft_pipe, prompt)

            # Extract commands
            base_cmd = self.extract_command(base_response)
            ft_cmd = self.extract_command(ft_response)

            # Calculate ROUGE-L
            base_rouge = self.scorer.score(prompt, base_cmd)['rougeL'].fmeasure
            ft_rouge = self.scorer.score(prompt, ft_cmd)['rougeL'].fmeasure

            # Score quality
            base_quality = self.score_plan_quality(base_cmd, valid_patterns)
            ft_quality = self.score_plan_quality(ft_cmd, valid_patterns)

            # Check validity
            base_valid = self.command_is_valid(base_cmd, valid_patterns)
            ft_valid = self.command_is_valid(ft_cmd, valid_patterns)

            print(f"Base response: {base_cmd}")
            print(f"Fine-tuned:    {ft_cmd}")
            print(f"Valid? Base: {base_valid}, Fine-tuned: {ft_valid}")
            print(f"ROUGE-L: Base: {base_rouge:.3f}, Fine-tuned: {ft_rouge:.3f}")
            print(f"Quality: Base: {base_quality}/2, Fine-tuned: {ft_quality}/2")

            results.append({
                "prompt": prompt,
                "base_command": base_cmd,
                "ft_command": ft_cmd,
                "base_valid": base_valid,
                "ft_valid": ft_valid,
                "base_rouge": base_rouge,
                "ft_rouge": ft_rouge,
                "base_quality": base_quality,
                "ft_quality": ft_quality
            })

        # Save results
        with open("evaluation_results.json", "w") as f:
            json.dump(results, f, indent=2)

        # Generate summary
        self.generate_summary(results)

        return results

    def generate_summary(self, results):
        """Generate evaluation summary"""
        avg_base_rouge = sum(r["base_rouge"] for r in results) / len(results)
        avg_ft_rouge = sum(r["ft_rouge"] for r in results) / len(results)
        avg_base_quality = sum(r["base_quality"] for r in results) / len(results)
        avg_ft_quality = sum(r["ft_quality"] for r in results) / len(results)

        base_valid_count = sum(1 for r in results if r["base_valid"])
        ft_valid_count = sum(1 for r in results if r["ft_valid"])

        print("\n" + "=" * 50)
        print("EVALUATION SUMMARY")
        print("=" * 50)
        print(f"Valid Commands:")
        print(f"  Base Model: {base_valid_count}/{len(results)}")
        print(f"  Fine-tuned: {ft_valid_count}/{len(results)}")

        print(f"\nAverage ROUGE-L Score:")
        print(f"  Base Model: {avg_base_rouge:.3f}")
        print(f"  Fine-tuned: {avg_ft_rouge:.3f}")
        print(f"  Improvement: {((avg_ft_rouge - avg_base_rouge) / avg_base_rouge * 100):.1f}%")

        print(f"\nAverage Quality Score (0-2):")
        print(f"  Base Model: {avg_base_quality:.1f}")
        print(f"  Fine-tuned: {avg_ft_quality:.1f}")
        print(f"  Improvement: {((avg_ft_quality - avg_base_quality) / (avg_base_quality + 0.001) * 100):.1f}%")

if __name__ == "__main__":
    evaluator = EnhancedEvaluator()
    results = evaluator.evaluate()

Device set to use cuda:0
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Running Enhanced Evaluation...

Test: Create a new Git branch and switch to it.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: git checkout -b new_branch
Fine-tuned:    git checkout -b new_branch
Valid? Base: True, Fine-tuned: True
ROUGE-L: Base: 0.286, Fine-tuned: 0.286
Quality: Base: 2/2, Fine-tuned: 2/2

Test: Compress the folder reports into reports.tar.gz.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: tar -czvf reports.tar.gz reports
Fine-tuned:    tar -czvf reports.tar.gz reports
Valid? Base: False, Fine-tuned: False
ROUGE-L: Base: 0.429, Fine-tuned: 0.429
Quality: Base: 1/2, Fine-tuned: 1/2

Test: List all Python files in the current directory recursively.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: python -m find -r
Fine-tuned:    python -m find -r
Valid? Base: False, Fine-tuned: False
ROUGE-L: Base: 0.154, Fine-tuned: 0.154
Quality: Base: 1/2, Fine-tuned: 1/2

Test: Set up a virtual environment and install requests.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: $ virtualenv -p python3 env
$ source env/bin/activate
$ pip install requests
```

Explanation:

1. Set up a virtual environment.
2. Activate the virtual environment.
Fine-tuned:    $ virtualenv -p python3 env
$ source env/bin/activate
$ pip install requests
```

Explanation:

1. Set up a virtual environment.
2. Activate the virtual environment.
Valid? Base: False, Fine-tuned: False
ROUGE-L: Base: 0.323, Fine-tuned: 0.323
Quality: Base: 1/2, Fine-tuned: 1/2

Test: Fetch only the first ten lines of a file named output.log.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: cat output.log | head -10
Fine-tuned:    cat output.log | head -10
Valid? Base: False, Fine-tuned: False
ROUGE-L: Base: 0.235, Fine-tuned: 0.235
Quality: Base: 1/2, Fine-tuned: 1/2

Test: Delete all files with .tmp extension in /tmp directory safely.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: Use: rm -rf /tmp/.tmp*

Explanation:
-r: recursive (delete all files and subdirectories)
-f: force (delete even if file is not a directory)
-i: interactive
Fine-tuned:    Use: rm -rf /tmp/.tmp*

Explanation:
-r: recursive (delete all files and subdirectories)
-f: force (delete even if file is not a directory)
-i: interactive
Valid? Base: False, Fine-tuned: False
ROUGE-L: Base: 0.229, Fine-tuned: 0.229
Quality: Base: 1/2, Fine-tuned: 1/2

Test: Find and replace text 'old_text' with 'new_text' in all .py files recursively.
--------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base response: Use: find. -type f -name '*.py' -exec sed -i "s/old_text/new_text/g" {} \;

Example:
    find. -type f -name '*.py'
Fine-tuned:    Use: find. -type f -name '*.py' -exec sed -i "s/old_text/new_text/g" {} \;

Example:
    find. -type f -name '*.py'
Valid? Base: False, Fine-tuned: False
ROUGE-L: Base: 0.343, Fine-tuned: 0.343
Quality: Base: 1/2, Fine-tuned: 1/2

EVALUATION SUMMARY
Valid Commands:
  Base Model: 1/7
  Fine-tuned: 1/7

Average ROUGE-L Score:
  Base Model: 0.285
  Fine-tuned: 0.285
  Improvement: 0.0%

Average Quality Score (0-2):
  Base Model: 1.1
  Fine-tuned: 1.1
  Improvement: 0.0%
