In [None]:
# fine tuning retraing parameters 
from pathlib import Path
import logging
from typing import Dict, List, Optional, Union
import yaml
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, 
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
from torch.cuda.amp import autocast
import psutil
import json

class LLMPipeline:
    def __init__(self, config_path: str):
        """Initialize the LLM pipeline with configuration."""
        self.config = self._load_config(config_path)
        self.setup_logging()
        self.tokenizer = None
        self.model = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
    def _load_config(self, config_path: str) -> Dict:
        """Load configuration from YAML file."""
        with open(config_path, 'r') as f:
            return yaml.safe_load(f)
    
    def setup_logging(self):
        """Configure logging with rotation and formatting."""
        logging.basicConfig(
            level=self.config.get('logging_level', 'INFO'),
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('pipeline.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def preprocess_data(self, texts: List[str]) -> Dataset:
        """Preprocess and tokenize input texts."""
        try:
            # Initialize tokenizer if not already done
            if self.tokenizer is None:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    self.config['model_name'],
                    use_fast=True
                )

            # Basic text cleaning
            cleaned_texts = [
                text.strip().replace('\n', ' ').replace('\r', ' ')
                for text in texts
            ]

            # Create dataset
            dataset = Dataset.from_dict({'text': cleaned_texts})

            # Tokenization function
            def tokenize_function(examples):
                return self.tokenizer(
                    examples['text'],
                    padding='max_length',
                    truncation=True,
                    max_length=self.config['max_length']
                )

            # Apply tokenization
            tokenized_dataset = dataset.map(
                tokenize_function,
                batched=True,
                num_proc=4,
                remove_columns=['text']
            )

            return tokenized_dataset

        except Exception as e:
            self.logger.error(f"Error in preprocessing: {str(e)}")
            raise

    def setup_model(self):
        """Initialize model with PEFT configuration."""
        try:
            # Load base model
            base_model = AutoModelForCausalLM.from_pretrained(
                self.config['model_name'],
                torch_dtype=torch.float16,
                device_map='auto'
            )

            # Configure LoRA
            lora_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                r=8,  # LoRA attention dimension
                lora_alpha=32,
                lora_dropout=0.1,
                target_modules=["q_proj", "v_proj"]
            )

            # Apply LoRA
            self.model = get_peft_model(base_model, lora_config)
            self.model.to(self.device)

        except Exception as e:
            self.logger.error(f"Error in model setup: {str(e)}")
            raise

    def train(self, dataset: Dataset):
        """Train the model using PEFT."""
        try:
            training_args = TrainingArguments(
                output_dir="./results",
                per_device_train_batch_size=4,
                gradient_accumulation_steps=4,
                learning_rate=2e-4,
                num_train_epochs=3,
                fp16=True,
                logging_steps=100,
                save_strategy="steps",
                save_steps=200,
                evaluation_strategy="steps",
                eval_steps=200,
                save_total_limit=3,
            )

            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=dataset,
                data_collator=DataCollatorForLanguageModeling(
                    tokenizer=self.tokenizer,
                    mlm=False
                )
            )

            trainer.train()
            
        except Exception as e:
            self.logger.error(f"Error during training: {str(e)}")
            raise

    def inference(self, text: str) -> str:
        """Perform inference with resource monitoring."""
        try:
            # Monitor resource usage
            cpu_percent = psutil.cpu_percent()
            memory = psutil.Process().memory_info().rss / 1024 / 1024  # MB

            # Log resource usage
            self.logger.info(f"CPU Usage: {cpu_percent}%, Memory: {memory:.2f}MB")

            # Tokenize input
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.config['max_length']
            ).to(self.device)

            # Generate with automatic mixed precision
            with autocast():
                outputs = self.model.generate(
                    inputs.input_ids,
                    max_length=self.config['max_length'],
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True
                )

            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        except Exception as e:
            self.logger.error(f"Error during inference: {str(e)}")
            raise

    def save_model(self, path: str):
        """Save the model and configuration."""
        try:
            # Create directory if it doesn't exist
            save_path = Path(path)
            save_path.mkdir(parents=True, exist_ok=True)

            # Save model
            self.model.save_pretrained(save_path)
            self.tokenizer.save_pretrained(save_path)

            # Save configuration
            with open(save_path / 'pipeline_config.json', 'w') as f:
                json.dump(self.config, f)

            self.logger.info(f"Model saved successfully to {path}")

        except Exception as e:
            self.logger.error(f"Error saving model: {str(e)}")
            raise

    def load_model(self, path: str):
        """Load a saved model and configuration."""
        try:
            load_path = Path(path)
            
            # Load configuration
            with open(load_path / 'pipeline_config.json', 'r') as f:
                self.config = json.load(f)

            # Load tokenizer and model
            self.tokenizer = AutoTokenizer.from_pretrained(load_path)
            self.model = AutoModelForCausalLM.from_pretrained(
                load_path,
                torch_dtype=torch.float16,
                device_map='auto'
            )

            self.logger.info(f"Model loaded successfully from {path}")

        except Exception as e:
            self.logger.error(f"Error loading model: {str(e)}")
            raise

In [None]:
# didnt really work
import subprocess
import time

def run_local_ollama_model(model_name, prompt, max_tokens=100, timeout=60):
    """
    Runs a local Ollama model with controlled response length.

    Args:
        model_name (str): Model name (e.g., "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L")
        prompt (str): Input prompt for the model
        max_tokens (int): Maximum number of tokens in response (default: 100)
        timeout (int): Maximum time to wait (seconds)

    Returns:
        str: Model's output or error message
    """
    start_time = time.time()
    
    try:
        # Add concise instruction to the prompt
        constrained_prompt = f"{prompt} Please answer concisely and keep responses under {max_tokens} tokens."
        
        process = subprocess.run(
            ["ollama", "run", model_name, "--num-predict", str(max_tokens)],
            input=constrained_prompt,
            capture_output=True,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return "Error: Response timed out."

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")
    
    return process.stdout.strip()

# Example usage with length constraints
model = "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L"
prompt_text = "Explain the benefits of running models locally."

# Get response with max 50 tokens
response = run_local_ollama_model(model, prompt_text, max_tokens=50)
print("AI Response:", response)

python(74097) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Execution time: 0.21 seconds
AI Response: 


In [None]:
# use llama 
import subprocess

# Define the prompt for the AI
prompt = "What is the capital of France?"

# Run the Ollama model and send the prompt
process = subprocess.run(
    ["ollama", "run", "huihui_ai/llama3.2-abliterate:1b"],
    input=prompt,
    capture_output=True,
    text=True
)

# Print the response
print("AI Response:", process.stdout)


python(74294) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


KeyboardInterrupt: 

In [None]:
# works no pickle
import subprocess
import time

def run_local_ollama_model(model_name, prompt, timeout=60):
    """
    Runs a local Ollama model with a given prompt and returns its response.

    Args:
        model_name (str): The full name of the model (e.g., "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L").
        prompt (str): The question or input prompt for the model.
        timeout (int): Maximum time (in seconds) to wait for the response.

    Returns:
        str: The model's output or an error message if it times out.
    """
    start_time = time.time()
    
    try:
        # Execute the Ollama model command locally
        process = subprocess.run(
            ["ollama", "run", model_name],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return "Error: The model took too long to respond. Please try again."

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")
    
    return process.stdout.strip()

# --- Template Usage Example ---
model = "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L"
prompt_text = "Explain the benefits of running models locally."

# Call the function and capture the response
response = run_local_ollama_model(model, prompt_text)

# Print the model's response
print("AI Response:", response)


Execution time: 33.51 seconds
AI Response: Running models locally has several key benefits compared to running them in the cloud or on a remote server:

1. Faster development and testing: Local machine setup allows you to quickly run your model on a real dataset, without waiting for resources to be provisioned. You can iterate and experiment faster by making changes to code/hyperparameters and re-running your model immediately. This enables more frequent development cycles.

2. More control over environment: Running models locally gives you full control over the machine setup - OS, hardware, software versions, etc. You can fine-tune the environment to match requirements of your specific model. Cloud setups often have a default generic environment that may not always match your needs. 

3. Reduced latency for interactive use: With local deployment, there is no need to wait for cloud resources to be provisioned and booted up before running models interactively on them. You can quickly sp

In [None]:
# works creates pickle
import subprocess
import time

def run_local_ollama_model(model_name, prompt, max_tokens=100, timeout=60):
    """
    Runs a local Ollama model with controlled response length.

    Args:
        model_name (str): Model name (e.g., "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L")
        prompt (str): Input prompt for the model
        max_tokens (int): Maximum number of tokens in response (default: 100)
        timeout (int): Maximum time to wait (seconds)

    Returns:
        str: Model's output or error message
    """
    start_time = time.time()
    
    try:
        # Add concise instruction to the prompt
        constrained_prompt = f"{prompt} Please answer concisely and keep responses under {max_tokens} tokens."
        
        process = subprocess.run(
            ["ollama", "run", model_name, "--num-predict", str(max_tokens)],
            input=constrained_prompt,
            capture_output=True,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return "Error: Response timed out."

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")
    
    return process.stdout.strip()

# Example usage with length constraints
model = "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L"
prompt_text = "Explain the benefits of running models locally."

# Get response with max 50 tokens
response = run_local_ollama_model(model, prompt_text, max_tokens=50)
print("AI Response:", response)

In [None]:
# takes 2 mins initially
import subprocess
import time
import pickle
import os

def run_local_ollama_model(model_name, prompt, timeout=60):
    """
    Runs a local Ollama model with a given prompt and returns its response.
    
    The prompt is expected to include any chain-of-thought instructions.
    
    Args:
        model_name (str): The full name of the model (e.g., "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L").
        prompt (str): The complete prompt for the model.
        timeout (int): Maximum time (in seconds) to wait for the response.
    
    Returns:
        str: The model's output or an error message if it times out.
    """
    start_time = time.time()
    
    try:
        process = subprocess.run(
            ["ollama", "run", model_name],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return "Error: The model took too long to respond. Please try again."

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")
    return process.stdout.strip()

def append_response_to_pickle(new_response, filename="responses.pkl"):
    """
    Appends a new response to a pickle file. Creates a new file if it doesn't exist.
    
    Args:
        new_response (str): The model's response to store.
        filename (str): The pickle file name.
    """
    if os.path.exists(filename):
        with open(filename, "rb") as f:
            responses = pickle.load(f)
    else:
        responses = []
    
    responses.append(new_response)
    
    with open(filename, "wb") as f:
        pickle.dump(responses, f)
    print(f"Response appended to {filename}.")

# --- Interactive Template Usage Example ---
model = "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L"

print("Enter your prompts to interact with the model. Type 'exit' to quit.")

while True:
    user_prompt = input("Your prompt: ")
    if user_prompt.lower() in ["exit", "quit"]:
        print("Exiting interactive loop.")
        break
    
    # Construct a prompt that adds chain-of-thought instructions and instructs the model
    # to produce a concise final answer (approximately half the length of a typical output).
    augmented_prompt = (
        "Please provide a detailed chain-of-thought explanation for your reasoning, "
        "then give your final answer. However, ensure that the final answer is concise—roughly half the length of a typical response.\n\n"
        f"Question: {user_prompt}"
    )
    
    # Run the model with the augmented prompt
    response = run_local_ollama_model(model, augmented_prompt)
    print("AI Response:\n", response)
    
    # Append the response to the pickle file
    append_response_to_pickle(response)


Enter your prompts to interact with the model. Type 'exit' to quit.


python(75524) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Execution time: 58.97 seconds
AI Response:
 Here is an outline with 3 main points and 4 deep sub-sections for each main point:

Main Point 1: The Importance of Sleep  
1) Sleep is essential for physical health. Lack of sleep can lead to obesity, diabetes, high blood pressure, heart disease and stroke.  
2) Sleep helps the body repair and recover from the day's activities. Sufficient sleep allows tissues to regenerate and immune system to reset.
3) Quality sleep promotes cognitive function. Proper rest supports memory formation and prevents mental decline with age.

Deep Sub-Point 1a: The Stages of Sleep
i) Non-rapid eye movement (NREM) sleep - Deep, regenerative stage that lasts about 20% of total sleep time. Body relaxes as breathing slows down.  
ii) Rapid eye movement (REM) sleep - Vivid, dream-filled stage where the brain is more active than in waking state. Muscles are relaxed.
iii) Alpha and theta waves - Hypnotic brainwaves associated with deep relaxation during NREM sleep. Key 

python(76399) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Execution time: 20.14 seconds
AI Response:
 To find the nth Fibonacci number, we can use a recursive formula with base cases of 0 and 1. The recursive formula is:

Fn = Fn-1 + Fn-2 for n > 2 
       = 0 if n=0
       = 1 if n=1

Proof: Consider the closed form expression for the nth Fibonacci number as derived by Lucas (1890):

Fn = Fib(n) = (φ^n - (1-φ)^n)/(φ-1), where φ = (1 + √5)/2 is the golden ratio. 

Now, using the Binomial Theorem expanded to 3rd degree:
Fib(n) = Σ i=0 to n ((C(n,i)) * Fib(i-1) * (-1)^i) / i!  where C(n,i) = (n choose i) is the binomial coefficient.

For large n (>10), this recursive formula matches the closed form expression almost exactly up to machine precision loss:
| Fn - Fib(n) | < 5e-3 for all n > 10

The proof shows these formulas generate the same results but through different logical processes, one a simple recurrence relation and the other an extended Binomial expansion. Both converge to Fibonacci numbers as expected.
Response appended to responses.p

In [2]:
import subprocess
import time
import pickle
from datetime import datetime
import os

HISTORY_FILE = "ollama_history.pkl"

def save_to_pickle(response_data):
    """Save response data to pickle file with timestamp"""
    try:
        # Try to load existing history
        with open(HISTORY_FILE, "rb") as f:
            history = pickle.load(f)
    except (FileNotFoundError, EOFError):
        # Create new history with creation timestamp
        history = {
            'created_at': datetime.now().isoformat(),
            'interactions': []
        }
    
    # Append new interaction
    history['interactions'].append(response_data)
    
    # Save updated history
    with open(HISTORY_FILE, "wb") as f:
        pickle.dump(history, f)

def run_local_ollama_model(model_name, prompt, max_tokens=100, timeout=60):
    """
    Runs a local Ollama model and saves responses to timestamped pickle file.
    """
    start_time = time.time()
    response_data = {
        'timestamp': datetime.now().isoformat(),
        'model': model_name,
        'prompt': prompt,
        'response': '',
        'execution_time': 0
    }

    try:
        constrained_prompt = f"{prompt} Please answer concisely and keep responses under {max_tokens} tokens."
        
        process = subprocess.run(
            ["ollama", "run", model_name, "--num-predict", str(max_tokens)],
            input=constrained_prompt,
            capture_output=True,
            text=True,
            timeout=timeout
        )
        
        response = process.stdout.strip()
        elapsed_time = time.time() - start_time
        
        # Update response data
        response_data.update({
            'response': response,
            'execution_time': round(elapsed_time, 2)
        })
        
        # Save to pickle
        save_to_pickle(response_data)
        
        print(f"Execution time: {elapsed_time:.2f} seconds")
        return response
        
    except subprocess.TimeoutExpired:
        response_data['response'] = "Error: Response timed out."
        save_to_pickle(response_data)
        return response_data['response']

def load_history():
    """Load and print interaction history"""
    try:
        with open(HISTORY_FILE, "rb") as f:
            history = pickle.load(f)
            print(f"History created at: {history['created_at']}")
            for idx, interaction in enumerate(history['interactions'], 1):
                print(f"\nInteraction {idx} ({interaction['timestamp']}):")
                print(f"Model: {interaction['model']}")
                print(f"Prompt: {interaction['prompt']}")
                print(f"Response: {interaction['response']}")
                print(f"Execution time: {interaction['execution_time']}s")
    except FileNotFoundError:
        print("No history found")

# Example usage
if __name__ == "__main__":
    model = "hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L"
    prompt_text = "Explain the benefits of running models locally."
    
    # Get response
    response = run_local_ollama_model(model, prompt_text, max_tokens=50)
    print("AI Response:", response)
    
    # Get another response
    response2 = run_local_ollama_model(model, "What is transfer learning?", max_tokens=30)
    print("\nSecond Response:", response2)
    
    # View history
    print("\n=== Interaction History ===")
    load_history()

Execution time: 0.10 seconds
AI Response: 
Execution time: 0.02 seconds

Second Response: 

=== Interaction History ===
History created at: 2025-02-16T21:11:31.131597

Interaction 1 (2025-02-16T21:11:31.002505):
Model: hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L
Prompt: Explain the benefits of running models locally.
Response: 
Execution time: 0.13s

Interaction 2 (2025-02-16T21:11:31.133212):
Model: hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L
Prompt: What is transfer learning?
Response: 
Execution time: 0.01s

Interaction 3 (2025-02-16T21:12:05.029014):
Model: hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L
Prompt: Explain the benefits of running models locally.
Response: 
Execution time: 0.1s

Interaction 4 (2025-02-16T21:12:05.131621):
Model: hf.co/TheDrummer/Gemmasutra-Mini-2B-v1-GGUF:Q3_K_L
Prompt: What is transfer learning?
Response: 
Execution time: 0.02s
