In [None]:
# cl2

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import dspy

# Debugging: Print Python and Library Versions
import sys
print("Python Version:", sys.version)
print("Torch Version:", torch.__version__)
print("Transformers Version:", transformers.__version__)
print("DSPy Version:", dspy.__version__)

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Using device: {device}")

# Load the CSV
csv_path = "./151_ideas_updated.csv"
try:
    # Load CSV with extra diagnostics
    data = pd.read_csv(csv_path, on_bad_lines='skip')
    print("\nCSV Load Successful")
    print("CSV Columns:", list(data.columns))
    print("Total Rows:", len(data))
    print("\nFirst few rows:\n", data.head())

except Exception as e:
    print(f"CSV Loading Error: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Model and Tokenizer Loading with Extensive Debugging
huggingface_model = 'facebook/opt-350m'
try:
    print("\nLoading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
    print("Tokenizer Loaded Successfully")

    print("\nLoading Model...")
    model = AutoModelForCausalLM.from_pretrained(
        huggingface_model, 
        torch_dtype=torch.float16 if device == "cuda" else torch.float32
    ).to(device)
    print("Model Loaded Successfully")

    print("\nCreating Pipeline...")
    text_generator = pipeline(
        'text-generation', 
        model=model, 
        tokenizer=tokenizer,
        device=0 if device == "cuda" else -1
    )
    print("Pipeline Created Successfully")

except Exception as e:
    print(f"Model Loading Error: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Basic Pipeline Test
try:
    print("\nTesting Text Generation...")
    test_prompt = "Hello, how are you?"
    result = text_generator(test_prompt, max_length=50)
    print("Generation Test Result:")
    print(result)

except Exception as e:
    print(f"Generation Test Error: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# cl kinda works

import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import dspy
import torch

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the CSV
csv_path = "./151_ideas_updated.csv"
try:
    data = pd.read_csv(csv_path, on_bad_lines='skip')

    # Clean and prepare the data
    data = data.dropna().reset_index(drop=True)
    print("CSV Data Columns:", list(data.columns))
    print("First few rows:\n", data.head())

except Exception as e:
    print(f"Error loading CSV: {e}")
    exit(1)

# Load Hugging Face model and tokenizer
huggingface_model = 'facebook/opt-350m'
try:
    tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
    model = AutoModelForCausalLM.from_pretrained(
        huggingface_model, 
        torch_dtype=torch.float16 if device == "cuda" else torch.float32
    ).to(device)

    # Create text generation pipeline with explicit device
    text_generator = pipeline(
        'text-generation', 
        model=model, 
        tokenizer=tokenizer,
        device=0 if device == "cuda" else -1  # use GPU if available
    )

except Exception as e:
    print(f"Error loading model: {e}")
    exit(1)

# Custom Language Model for DSPy
class SimpleLLM(dspy.LM):
    def __init__(self, generator):
        self.generator = generator
    
    def __call__(self, prompt, **kwargs):
        try:
            # Generate response
            response = self.generator(
                prompt, 
                max_length=100, 
                num_return_sequences=1
            )[0]['generated_text']
            return response
        except Exception as e:
            print(f"Generation error: {e}")
            return "I'm having trouble generating a response."

# Define a simple signature for the chatbot
class ChatbotSignature(dspy.Signature):
    """Generate a helpful and concise response to a user's query."""
    query = dspy.InputField()
    response = dspy.OutputField(desc="Helpful and relevant answer")

# Configure DSPy with the custom language model
dspy.settings.configure(lm=SimpleLLM(text_generator))

# Create a prediction module
chatbot = dspy.Predict(ChatbotSignature)

# Interactive chat loop
def chat():
    print("DSPy Chatbot: Hello! I'm ready to help. Type 'exit' to quit.")
    print("Loaded dataset columns:", list(data.columns))
    
    while True:
        try:
            user_input = input("You: ")
            
            # Exit condition
            if user_input.lower() == 'exit':
                print("DSPy Chatbot: Goodbye!")
                break
            
            # Enhance prompt with context from data if possible
            # Assumes there's a column that might be relevant
            context_column = data.columns[0] if len(data.columns) > 0 else None
            
            if context_column:
                # Add some context from the dataset
                enhanced_prompt = f"Using information from the dataset, {user_input}"
            else:
                enhanced_prompt = user_input
            
            # Generate response
            response = chatbot(query=enhanced_prompt)
            print("DSPy Chatbot:", response.response)
        
        except Exception as e:
            print(f"Chat error: {e}")

# Run the chatbot
if __name__ == "__main__":
    chat()

In [None]:
# m2 claude optimized 

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import dspy

# Optimized for M2 Mac
huggingface_model = 'distilgpt2'  # Smaller, more lightweight model

# Load the CSV
csv_path = "./151_ideas_updated.csv"
data = pd.read_csv(csv_path, on_bad_lines='skip')

# Clean the data
data = data.dropna().reset_index(drop=True)

# Load tokenizer and model with MPS (Metal Performance Shaders) support for M2
try:
    tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
    
    # Ensure pad token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Use MPS for M2 Mac
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    
    model = AutoModelForCausalLM.from_pretrained(
        huggingface_model,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    ).to(device)

    # Create text generation pipeline
    text_generator = pipeline(
        'text-generation', 
        model=model, 
        tokenizer=tokenizer,
        device=device
    )

except Exception as e:
    print(f"Model loading error: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Custom Language Model for DSPy
class SimpleLLM(dspy.LM):
    def __init__(self, generator):
        self.generator = generator
    
    def __call__(self, prompt, **kwargs):
        try:
            # Generate response
            responses = self.generator(
                prompt, 
                max_length=100, 
                num_return_sequences=1,
                do_sample=True,
                temperature=0.7
            )
            return responses[0]['generated_text']
        except Exception as e:
            print(f"Generation error: {e}")
            return "I'm having trouble generating a response."

# Define chatbot signature
class ChatbotSignature(dspy.Signature):
    """Generate a helpful and concise response to a user's query."""
    query = dspy.InputField()
    response = dspy.OutputField(desc="Helpful and relevant answer")

# Configure DSPy
dspy.settings.configure(lm=SimpleLLM(text_generator))

# Create prediction module
chatbot = dspy.Predict(ChatbotSignature)

# Interactive chat loop
def chat():
    print("DSPy Chatbot: Hello! I'm ready to help. Type 'exit' to quit.")
    print("Dataset columns:", list(data.columns))
    
    while True:
        try:
            user_input = input("You: ")
            
            if user_input.lower() == 'exit':
                print("DSPy Chatbot: Goodbye!")
                break
            
            # Add context from the dataset
            context = data['Ideas'].sample(1).values[0]
            enhanced_prompt = f"Context from dataset: {context}\n\nUser query: {user_input}"
            
       
       
            # Generate response
            response = chatbot(query=enhanced_prompt)
            print("DSPy Chatbot:", response.response)
        
        except Exception as e:
            print(f"Chat error: {e}")

# Run the chatbot
if __name__ == "__main__":
    chat()

DSPy Chatbot: Hello! I'm ready to help. Type 'exit' to quit.
Dataset columns: ['Ideas', 'Theme a', 'Theme-b', 'Theme-c', 'Unnamed: 4', 'Unnamed: 5']


In [6]:
# claude - 12-16-24

import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import dspy

# Configuration & Data Loading
huggingface_model = 'facebook/opt-350m'
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Load and clean your CSV dataset
csv_path = "./151_ideas_updated.csv"

try:
    # Load and preprocess the dataset
    data = pd.read_csv(csv_path, on_bad_lines='skip')
    data = data.drop(columns=['Unnamed: 4', 'Unnamed: 5'], errors='ignore')  # Drop unnecessary columns
    data = data.dropna().reset_index(drop=True)  # Remove NaN values and reset index

    # Convert to Hugging Face Dataset and split into train and test sets
    dataset = Dataset.from_pandas(data)
    dataset = dataset.train_test_split(test_size=0.2)
    trainset, devset = dataset['train'], dataset['test']

    print(f"Trainset size: {len(trainset)}, Devset size: {len(devset)}")
except pd.errors.ParserError as e:
    print(f"Error parsing CSV file: {e}")

# Load Hugging Face tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
model = AutoModelForCausalLM.from_pretrained(huggingface_model)

# Create a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Custom Language Model using DSPy LM
class HuggingFaceLM(dspy.LM):
    def __init__(self, generator, **kwargs):
        self.generator = generator
        self.kwargs = kwargs
    
    def generate(self, prompt, **kwargs):
        """Generate output text given a prompt."""
        try:
            # Merge any additional kwargs with defaults
            final_kwargs = {**self.kwargs, **kwargs}
            output = self.generator(prompt, max_length=50, **final_kwargs)
            return output[0]['generated_text']
        except Exception as e:
            print(f"Error during generation: {e}")
            return ""

# Configure DSPy settings
dspy.settings.configure(
    lm=HuggingFaceLM(text_generator, temperature=0.7),  # Custom Hugging Face LM
    rm=colbertv2_wiki17_abstracts                        # Retrieval model
)

# Define the Basic Question-Answering Signature
class BasicQA(dspy.Signature):
    """Answer questions with short factual answers."""
    question = dspy.InputField(desc="A question to answer")
    answer = dspy.OutputField(desc="A concise answer between 1 and 5 words")

# Example pipeline to generate predictions
example = devset[0]
question = example['Ideas']

# Define DSPy Prediction Pipeline
generate_answer = dspy.Predict(BasicQA)

# Execute pipeline with an example question
try:
    result = generate_answer(question=question)
    print(f"Question: {question}")
    print(f"Predicted Answer: {result.answer}")
except Exception as e:
    print(f"Error during DSPy pipeline execution: {e}")


Trainset size: 119, Devset size: 30


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use mps:0


Error during DSPy pipeline execution: 'HuggingFaceLM' object has no attribute 'cache'


In [26]:
# g2 

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Example Question with Answer
example = devset[0]
question = example['Ideas']

# Create the kwargs wrapper (WITHOUT temperature)
class LMWithKwargs:
    def __init__(self, model):
        self.model = model

    def __call__(self, *args, **kwargs):
        return self.model(*args) # call with just *args

# Load the actual language model and wrap it
model = AutoModelForCausalLM.from_pretrained(huggingface_model)
lm_with_kwargs = LMWithKwargs(model)

# Now configure dspy with the wrapper
dspy.settings.configure(lm=lm_with_kwargs, rm=colbertv2_wiki17_abstracts)

# Load tokenizer from Hugging Face (needed for tokenization)
tokenizer = AutoTokenizer.from_pretrained(huggingface_model)

# Tokenize the question
question_ids = tokenizer(question, return_tensors='pt').input_ids
question_text = tokenizer.decode(question_ids.squeeze(), skip_special_tokens=True)

# Generate Response with tokenized question
generate_answer = dspy.Predict(BasicQA)
pred = generate_answer(question=question_text)
print(f"Question: {question}\nPredicted Answer: {pred.answer}")

# Tokenize the predicted answer (if it's a string)
if isinstance(pred.answer, str): #check if its a string
    labels = tokenizer(pred.answer, return_tensors='pt').input_ids
elif isinstance(pred.answer, list): #check if its a list
    labels = tokenizer(pred.answer[0], return_tensors='pt').input_ids
else:
    labels = None
    print("pred.answer is not a string or list")
# Tokenize the example
inputs = tokenizer(question, return_tensors='pt')

# Prepare dataset in Hugging Face format (only if labels is not None)
if labels is not None:
    hf_dataset = DatasetDict({
        'train': Dataset.from_dict({
            'input_ids': [inputs['input_ids'].squeeze().tolist()],
            'attention_mask': [inputs['attention_mask'].squeeze().tolist()],
            'labels': [labels.squeeze().tolist()]
        }),
        'validation': Dataset.from_dict({
            'input_ids': [inputs['input_ids'].squeeze().tolist()],
            'attention_mask': [inputs['attention_mask'].squeeze().tolist()],
            'labels': [labels.squeeze().tolist()]
        })
    })

    # Print to verify
    print(hf_dataset)
else:
    print("Skipping hf_dataset creation due to invalid labels")

AttributeError: 'LMWithKwargs' object has no attribute 'kwargs'

In [19]:
# chain gem

# 3. Chatbot with Chain of Thought
print("\n### Generate Response with Chain of Thought ###\n")
generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
pred = generate_answer_with_chain_of_thought(question=example.question)
print(f"Question: {example.question}\nThought: {pred.rationale.split('.', 1)[1].strip()}\nPredicted Answer: {pred.answer}")


AttributeError: 'dict' object has no attribute 'question'

In [13]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer
import dspy

# Configuration & Data Loading
huggingface_model = 'facebook/opt-350m'
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
dspy.settings.configure(lm=tokenizer, rm=colbertv2_wiki17_abstracts)  # Configure with tokenizer for simplicity

# Load and clean your CSV dataset
csv_path = "./151_ideas_updated.csv"
try:
    # Load CSV with proper column handling
    data = pd.read_csv(csv_path, on_bad_lines='skip')
    
    # Drop columns with unnamed or unnecessary data
    data = data.drop(columns=['Unnamed: 4', 'Unnamed: 5'], errors='ignore')
    
    # Drop rows with missing values and reset the index
    data = data.dropna().reset_index(drop=True)
    
    # Convert the cleaned DataFrame to a Hugging Face Dataset
    dataset = Dataset.from_pandas(data)
    dataset = dataset.train_test_split(test_size=0.2)  # Split into train and validation sets

    # Prepare the train and dev sets
    trainset = dataset['train']
    devset = dataset['test']

    # Print the size and first few elements of each set
    print(len(trainset), len(devset))
    print(f"Trainset Data {trainset[:5]}")
    print(f"Devset Data {devset[:5]}")

except pd.errors.ParserError as e:
    print(f"Error parsing CSV file: {e}")


In [14]:
# Define BasicQA class
class BasicQA(dspy.Signature):  # A. Signature
    """Answer questions with short factoid answers."""
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

# Example Question with Answer
example = devset[0]
question = example['Ideas']

# Generate Response
generate_answer = dspy.Predict(BasicQA)
pred = generate_answer(question=question)
print(f"Question: {question}\nPredicted Answer: {pred.answer}")

# Load tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
model = AutoModelForCausalLM.from_pretrained(huggingface_model)

# Tokenize the example
inputs = tokenizer(question, return_tensors='pt')
labels = tokenizer(pred.answer, return_tensors='pt')

# Prepare dataset in Hugging Face format
hf_dataset = DatasetDict({
    'train': Dataset.from_dict({
        'input_ids': [inputs['input_ids'].squeeze().tolist()],
        'attention_mask': [inputs['attention_mask'].squeeze().tolist()],
        'labels': [labels['input_ids'].squeeze().tolist()]
    }),
    'validation': Dataset.from_dict({
        'input_ids': [inputs['input_ids'].squeeze().tolist()],
        'attention_mask': [inputs['attention_mask'].squeeze().tolist()],
        'labels': [labels['input_ids'].squeeze().tolist()]
    })
})

# Print to verify
print(hf_dataset)


AttributeError: 'GPT2TokenizerFast' object has no attribute 'kwargs'