In [None]:
import torch
from transformers import pipeline
import pandas as pd
import re

class HuggingFaceLanguageModel:
    """
    A wrapper around the Hugging Face transformers pipeline for text generation.
    Enhanced to support longer, more detailed responses.
    """
    def __init__(self, model_name="facebook/opt-350m", **kwargs):
        """
        Initialize the language model with more parameters for longer responses.
        
        Args:
            model_name (str): Hugging Face model name. Defaults to a larger model.
            **kwargs: Additional configuration parameters for the pipeline.
        """
        self.generator = pipeline(
            "text-generation", 
            model=model_name, 
            device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
            **kwargs
        )

    def __call__(self, prompt: str, **kwargs):
        """
        Generate a response with more control over length and creativity.
        
        Args:
            prompt (str): Input prompt for text generation
            **kwargs: Customizable generation parameters
        
        Returns:
            str: Generated text response
        """
        # Enhanced set of kwargs for more control
        valid_kwargs = {
            "max_length": kwargs.get("max_length", 500),  # Increased default length
            "num_return_sequences": kwargs.get("num_return_sequences", 1),
            "do_sample": kwargs.get("do_sample", True),
            "temperature": kwargs.get("temperature", 0.8),  # Slightly higher for more creativity
            "top_k": kwargs.get("top_k", 50),  # Added top_k sampling
            "top_p": kwargs.get("top_p", 0.95),  # Nucleus sampling
            "truncation": kwargs.get("truncation", True),
            "no_repeat_ngram_size": kwargs.get("no_repeat_ngram_size", 2)  # Reduce repetition
        }
        
        # Generate the response
        try:
            response = self.generator(
                prompt,
                **valid_kwargs
            )[0]["generated_text"]

            # More sophisticated text cleaning
            response = response.replace(prompt, "").strip()
            print(f"[DEBUG] Generated Response Length: {len(response)} characters")
            return response
        
        except Exception as e:
            print(f"[ERROR] Generation failed: {e}")
            return "I apologize, but I encountered an error generating a response."

class DataProcessor:
    """
    A class to process CSV data with enhanced search capabilities.
    """
    def __init__(self, csv_path: str, column_to_search: str):
        """
        Initialize data processor with CSV file.
        
        Args:
            csv_path (str): Path to the CSV file
            column_to_search (str): Column to perform primary search
        """
        print(f"[DEBUG] Loading CSV from: {csv_path}")
        self.df = pd.read_csv(csv_path, quotechar='"', escapechar='\\', 
                               skipinitialspace=True, on_bad_lines='skip')
        self.column_to_search = column_to_search

        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")
        print(f"[DEBUG] Loaded DataFrame Columns: {self.df.columns.tolist()}")

    def flexible_search(self, query: str, top_n: int = 10):
        """
        Advanced search method with multiple matching strategies and relevance ranking.
        
        Args:
            query (str): Search query
            top_n (int): Number of top results to return
        
        Returns:
            pd.DataFrame: Ranked search results
        """
        def calculate_relevance(text, query):
            """Calculate relevance score for text."""
            if pd.isna(text):
                return 0
            
            text_lower = str(text).lower()
            query_lower = query.lower()
            
            # Multiple relevance factors
            relevance_factors = [
                10 if query_lower in text_lower else 0,           # Exact match
                5 if any(word.strip() in text_lower for word in query_lower.split()) else 0,  # Partial word match
                3 if re.search(re.escape(query_lower), text_lower) else 0  # Regex match
            ]
            
            return sum(relevance_factors)
        
        # Apply relevance scoring
        self.df['search_relevance'] = self.df[self.column_to_search].apply(
            lambda text: calculate_relevance(text, query)
        )
        
        # Sort and return top results
        results = self.df[self.df['search_relevance'] > 0].sort_values(
            'search_relevance', ascending=False
        ).head(top_n)
        
        # Drop the temporary relevance column
        results = results.drop(columns=['search_relevance'])
        
        return results

    def retrieve(self, query: str, top_n: int = 10):
        """
        Retrieve top relevant results for a query.
        
        Args:
            query (str): Search query
            top_n (int): Number of top results to return
        
        Returns:
            pd.DataFrame: Top relevant results
        """
        print(f"[DEBUG] Searching for query: {query}")
        results = self.flexible_search(query, top_n)
        print(f"[DEBUG] Retrieved {len(results)} matching rows")
        return results

    def preprocess(self, question):
        """
        Preprocess the input question.
        
        Args:
            question (str): Input question
        
        Returns:
            str: Preprocessed question
        """
        preprocessed_question = question.lower().strip()
        print(f"[DEBUG] Preprocessed Question: {preprocessed_question}")
        return preprocessed_question

class RAG:
    """
    Enhanced Retrieval-Augmented Generation class.
    """
    def __init__(self, lm, csv_path: str, column_to_search: str):
        """
        Initialize RAG with language model and data processor.
        
        Args:
            lm: Language model instance
            csv_path (str): Path to CSV file
            column_to_search (str): Column to search in CSV
        """
        print(f"[DEBUG] Initializing RAG with CSV: {csv_path} and Column: {column_to_search}")
        self.lm = lm
        self.processor = DataProcessor(csv_path, column_to_search)

    def create_comprehensive_prompt(self, context, question):
        """
        Create a more structured and comprehensive prompt.
        
        Args:
            context (str): Retrieved context
            question (str): User's question
        
        Returns:
            str: Enhanced prompt for generation
        """
        prompt_template = f"""Given the following context:
{context}

Question: {question}

Please provide a detailed, comprehensive, and insightful answer. Explain the key points thoroughly, provide context, and if applicable, offer practical insights or actionable advice. Use a clear, engaging writing style and ensure the response is informative and helpful.

Detailed Answer:"""
        
        return prompt_template

    def respond(self, context, question):
        """
        Generate a response using context and question.
        
        Args:
            context (str): Retrieved context
            question (str): User's question
        
        Returns:
            str: Generated response
        """
        if context.strip() == "No relevant context found.":
            prompt = f"""Question: {question}

Despite having no specific context, please provide a comprehensive and informative answer. Draw upon general knowledge and offer insights that address the question thoroughly.

Detailed Answer:"""
        else:
            prompt = self.create_comprehensive_prompt(context, question)
        
        print(f"[DEBUG] Prompt Sent to Model:\n{prompt}")
        response = self.lm(prompt, max_length=800)  # Longer max length for detailed response
        print(f"[DEBUG] Model Response Length: {len(response)} characters")
        return response

    def forward(self, question):
        """
        Main method to process a question and generate a response.
        
        Args:
            question (str): User's input question
        
        Returns:
            str: Generated response
        """
        # Preprocess the question
        question = self.processor.preprocess(question)

        # Retrieve relevant context
        context_df = self.processor.retrieve(question, top_n=10)
        if not context_df.empty:
            print(f"[DEBUG] Retrieved Context DataFrame:\n{context_df}")
            context = context_df.to_string(index=False)
        else:
            print("[DEBUG] No relevant context found.")
            context = "No relevant context found."

        # Generate a response using the LM
        response = self.respond(context=context, question=question)
        print(f"[DEBUG] Final Generated Response Length: {len(response)} characters")
        return response

# Example usage
if __name__ == "__main__":
    # Initialize Language Model with a larger model for better responses
    lm = HuggingFaceLanguageModel(model_name="facebook/opt-350m")
    
    # CSV and search configuration
    csv_path = "./151_ideas_updated.csv"
    column_to_search = "Ideas"

    # Create RAG instance
    rag = RAG(lm, csv_path, column_to_search)

    # Test queries
    test_queries = [
        "how do you describe expression",
        "maximize beauty",
        "meditation techniques"
    ]

    # Example usage
    sample_query = "meditation techniques"
    print(f"--- Query: {sample_query} ---")
    response = rag.forward(sample_query)
    print("\nFull Response:")
    print(response)

    # Optionally, test multiple queries
    for query in test_queries:
        print(f"\n--- Query: {query} ---")
        print(rag.forward(query))

pytorch_model.bin:   4%|3         | 105M/2.63G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [7]:
# works but response lame

import torch
from transformers import pipeline
import pandas as pd
import re

class HuggingFaceLanguageModel:
    """
    A wrapper around the Hugging Face transformers pipeline for text generation.
    """
    def __init__(self, model_name="gpt2", **kwargs):
        self.generator = pipeline("text-generation", model=model_name, **kwargs)

    def __call__(self, prompt: str, **kwargs):
        # Filter out any unsupported kwargs for the generate method
        valid_kwargs = {key: kwargs[key] for key in ["max_length", "num_return_sequences", "do_sample", "temperature"] if key in kwargs}
        
        # Generate the response
        response = self.generator(
            prompt,
            max_length=valid_kwargs.get("max_length", 100),
            num_return_sequences=valid_kwargs.get("num_return_sequences", 1),
            do_sample=valid_kwargs.get("do_sample", True),
            temperature=valid_kwargs.get("temperature", 0.7)
        )[0]["generated_text"]

        # Clean up the generated text
        response = response.replace(prompt, "").strip()
        return response

class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights.
    """
    def __init__(self, csv_path: str, column_to_search: str):
        self.df = pd.read_csv(csv_path, quotechar='"', escapechar='\\', skipinitialspace=True, on_bad_lines='skip')
        self.column_to_search = column_to_search

        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

    def safe_contains(self, text, query):
        if not isinstance(text, str):
            return False
        return query.lower() in text.lower()

    def retrieve(self, query: str):
        results = self.df[
            self.df[self.column_to_search].apply(
                lambda x: self.safe_contains(str(x), query)
            )
        ]
        return results

    def preprocess(self, question):
        # Example preprocessing: you can customize this as needed
        return question.lower()

    def postprocess(self, context):
        # Example postprocessing: you can customize this as needed
        return context

class RAG:
    """
    Retrieval-Augmented Generation class to combine context retrieval and generation.
    """
    def __init__(self, lm, csv_path: str, column_to_search: str):
        self.lm = lm
        self.processor = DataProcessor(csv_path, column_to_search)

    def respond(self, context, question):
        # Combine context and question to form a prompt
        prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
        return self.lm(prompt)
        print(f"Retrieved Context:\n{context}")
        print(f"Prompt Sent:\n{prompt}")


    def forward(self, question):
        # Preprocess the question
        question = self.processor.preprocess(question)

        # Retrieve relevant context
        context_df = self.processor.retrieve(question)
        context = context_df.head(5).to_string(index=False) if not context_df.empty else "No relevant context found."

        # Generate a response using the LM
        return self.respond(context=context, question=question)

# Example usage
if __name__ == "__main__":
    
# facebook/opt-350m
# facebook/opt-1.3b 2.3 gb

    lm = HuggingFaceLanguageModel(model_name="facebook/opt-125m")
    csv_path = "./151_ideas_updated.csv"  # Replace with your actual CSV path
    column_to_search = "Ideas"  # Replace with your column name

    rag = RAG(lm, csv_path, column_to_search)

    question = "How do you maximize the beauty?"
    print("Answer:")
    print(rag.forward(question))

  



config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Answer:


model.safetensors:   0%|          | 0.00/251M [00:00<?, ?B/s]

when you don't have to think about it, I have a good idea.
I'm not saying you won't have to think about it, but I do think you should think about it.

Thanks for your help.

I'm wondering if I could share a few tips:
1. Make sure you're not using a single image, even if you know where to


In [10]:
import torch
from transformers import pipeline
import pandas as pd
import re

class HuggingFaceLanguageModel:
    """
    A wrapper around the Hugging Face transformers pipeline for text generation.
    """
    def __init__(self, model_name="gpt2", **kwargs):
        self.generator = pipeline("text-generation", model=model_name, **kwargs)

    def __call__(self, prompt: str, **kwargs):
        # Filter out any unsupported kwargs for the generate method
        valid_kwargs = {key: kwargs[key] for key in ["max_length", "num_return_sequences", "do_sample", "temperature"] if key in kwargs}
        
        # Generate the response
        response = self.generator(
            prompt,
            max_length=valid_kwargs.get("max_length", 150),  # Increased max_length for richer responses
            num_return_sequences=valid_kwargs.get("num_return_sequences", 1),
            do_sample=valid_kwargs.get("do_sample", True),
            temperature=valid_kwargs.get("temperature", 0.7)
        )[0]["generated_text"]

        # Clean up the generated text
        response = response.replace(prompt, "").strip()
        print(f"[DEBUG] Generated Response: {response}")  # Debug output
        return response

class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights.
    """
    def __init__(self, csv_path: str, column_to_search: str):
        print(f"[DEBUG] Loading CSV from: {csv_path}")
        self.df = pd.read_csv(csv_path, quotechar='"', escapechar='\\', skipinitialspace=True, on_bad_lines='skip')
        self.column_to_search = column_to_search

        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")
        print(f"[DEBUG] Loaded DataFrame Columns: {self.df.columns.tolist()}")

    def safe_contains(self, text, query):
        if not isinstance(text, str):
            return False
        return query.lower() in text.lower()

    def retrieve(self, query: str):
        print(f"[DEBUG] Searching for query: {query}")
        results = self.df[
            self.df[self.column_to_search].apply(
                lambda x: self.safe_contains(str(x), query)
            )
        ]
        print(f"[DEBUG] Retrieved {len(results)} matching rows")
        return results

    def preprocess(self, question):
        # Example preprocessing: you can customize this as needed
        preprocessed_question = question.lower()
        print(f"[DEBUG] Preprocessed Question: {preprocessed_question}")
        return preprocessed_question

    def postprocess(self, context):
        # Example postprocessing: you can customize this as needed
        print(f"[DEBUG] Postprocessing Context: {context}")
        return context

class RAG:
    """
    Retrieval-Augmented Generation class to combine context retrieval and generation.
    """
    def __init__(self, lm, csv_path: str, column_to_search: str):
        print(f"[DEBUG] Initializing RAG with CSV: {csv_path} and Column: {column_to_search}")
        self.lm = lm
        self.processor = DataProcessor(csv_path, column_to_search)

    def respond(self, context, question):
        # Combine context and question to form a prompt
        if context.strip() == "No relevant context found.":
            prompt = f"The following question is provided without relevant context:\nQuestion: {question}\nAnswer:"
        else:
            prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
        
        print(f"[DEBUG] Prompt Sent to Model:\n{prompt}")
        response = self.lm(prompt)
        print(f"[DEBUG] Model Response:\n{response}")
        return response

    def forward(self, question):
        # Preprocess the question
        question = self.processor.preprocess(question)

        # Retrieve relevant context
        context_df = self.processor.retrieve(question)
        if not context_df.empty:
            print(f"[DEBUG] Retrieved Context DataFrame:\n{context_df.head(5)}")
        else:
            print("[DEBUG] No relevant context found.")

        context = context_df.head(5).to_string(index=False) if not context_df.empty else "No relevant context found."
        print(f"[DEBUG] Final Context for Model:\n{context}")

        # Generate a response using the LM
        response = self.respond(context=context, question=question)
        print(f"[DEBUG] Final Generated Response:\n{response}")
        return response

# Example usage
if __name__ == "__main__":
    # facebook/opt-350m
    # facebook/opt-1.3b 2.3 gb

    lm = HuggingFaceLanguageModel(model_name="facebook/opt-350m")
    csv_path = "./151_ideas_updated.csv"  # Replace with your actual CSV path
    column_to_search = "Ideas"  # Replace with your column name

    rag = RAG(lm, csv_path, column_to_search)

    question = "how do you describe expression?"
    print("Answer:")
    print(rag.forward(question))


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[DEBUG] Initializing RAG with CSV: ./151_ideas_updated.csv and Column: Ideas
[DEBUG] Loading CSV from: ./151_ideas_updated.csv
[DEBUG] Loaded DataFrame Columns: ['Ideas', 'Theme a', 'Theme-b', 'Theme-c', 'Unnamed: 4', 'Unnamed: 5']
Answer:
[DEBUG] Preprocessed Question: how do you describe expression?
[DEBUG] Searching for query: how do you describe expression?
[DEBUG] Retrieved 0 matching rows
[DEBUG] No relevant context found.
[DEBUG] Final Context for Model:
No relevant context found.
[DEBUG] Prompt Sent to Model:
The following question is provided without relevant context:
Question: how do you describe expression?
Answer:
[DEBUG] Generated Response: When you talk about "expression" you are using the word
"expression" to describe the meaning of the
subject's words.
For example, imagine that you are talking about something
that is an expression of a thing.
You could describe the
subject's words as:
"You can't say that the tree doesn't contain flowers."
or "The tree doesn't contain fl

In [14]:
# chain of thought added

import pandas as pd
import re
from transformers import pipeline
import dspy



class HuggingFaceLanguageModel:
    """
    Custom Language Model wrapper for Hugging Face transformers
    """
    def __init__(self, model_name='distilgpt2', **kwargs):
        # Ensure default temperature and other parameters
        self.kwargs = {
            "temperature": 0.7,  # Default temperature
            **kwargs
        }
        self.generator = pipeline(
            'text-generation', 
            model=model_name, 
            device=0,  # Use GPU if available
            truncation=True
        )

    def __call__(self, prompt: str, **kwargs):
        # Merge default kwargs with any overrides
        effective_kwargs = {**self.kwargs, **kwargs}
        response = self.generator(
            prompt, 
            max_length=100,  # Shorter max length
            num_return_sequences=1,
            **effective_kwargs
        )[0]['generated_text']

        # Clean up the generated text
        response = response.replace(prompt, '').strip()
        return response


class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        self.df = pd.read_csv(csv_path, 
                              quotechar='"', 
                              escapechar='\\', 
                              skipinitialspace=True,
                              on_bad_lines='skip')
        self.column_to_search = column_to_search

        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

        self.language_model = HuggingFaceLanguageModel(model_name=model_name)

    def safe_contains(self, text, query):
        if not isinstance(text, str):
            return False
        return query.lower() in text.lower()

    def retrieve(self, query: str):
        results = self.df[
            self.df[self.column_to_search].apply(
                lambda x: self.safe_contains(str(x), query)
            )
        ]
        return results

    def generate_query(self, context: list, question: str):
        context_str = str(context) if context else "No previous context"
        prompt = (
            f"Context: {context_str}\n"
            f"Question: {question}\n"
            "Generate a concise keyword or phrase to search for relevant information."
        )
        return self.language_model(prompt)

    def generate_answer(self, context: pd.DataFrame, question: str):
        context_text = context.head(5).to_string(index=False)
        prompt = (
            f"Context: {context_text}\n"
            f"Question: {question}\n"
            "Provide a clear and concise answer based on the context."
        )
        return self.language_model(prompt)

    def forward(self, question: str):
        context, queries = [], [question]

        for hop in range(2):
            query = self.generate_query(context=context, question=question)
            query = query[:50]
            if query.lower() in [q.lower() for q in queries]:
                query = question.split()[:2]

            print(f"Generated Query: {query}")

            hop_results = self.retrieve(query)
            if not hop_results.empty:
                context.append(hop_results)
            queries.append(query)

        context_df = pd.concat(context).drop_duplicates() if context else self.df
        return self.generate_answer(context=context_df, question=question)

    def suggest_improvements(self, query: str):
        context = self.retrieve(query)

        if context.empty:
            return {"query": query, "suggestions": "No results found."}

        prompt = (
            f"Context: {context.to_string(index=False)}\n"
            f"Past Query: {query}\n"
            "Suggest refinements to improve the search query."
        )
        suggestions = self.language_model(prompt)
        return {"query": query, "suggestions": suggestions}



class RAG(dspy.Module):
    """
    Retrieval-Augmented Generation module.
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        self.processor = DataProcessor(csv_path, column_to_search, model_name)

        # Define a custom Hugging Face LM and set it in dspy settings
        self.lm = HuggingFaceLanguageModel(model_name=model_name)
        dspy.settings.lm = self.lm  # Set the LM globally for dspy

        # Initialize the chain of thought with the LM
        self.respond = dspy.ChainOfThought('context, question -> response', lm=self.lm)

    def forward(self, question: str):
        # Retrieve relevant context
        context = self.processor.retrieve(question).head(5).to_dict('records')
        if not context:
            context = [{'message': 'No relevant context found.'}]
        
        # Generate a response using the LM
        return self.respond(context=context, question=question)


# Example Usage
if __name__ == "__main__":
    csv_path = './151_ideas_updated.csv'
    column_to_search = 'Ideas'  # Adjust to the appropriate column

    rag = RAG(csv_path=csv_path, column_to_search=column_to_search)

    question = "What does it mean to Maximize the Beauty?"
    print("\nAnswer:")
    print(rag.forward(question))

    print("\nSuggestions for Query Improvement:")
    processor = rag.processor
    improvement_suggestions = processor.suggest_improvements(question)
    print(improvement_suggestions["suggestions"])


 		You are using the client HuggingFaceLanguageModel, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb



Answer:


ValueError: The following `model_kwargs` are not used by the model: ['lm'] (note: typos in the generate arguments will also show up in this list)

In [7]:
import pandas as pd
import re
from transformers import pipeline


class HuggingFaceLanguageModel:
    """
    Custom Language Model wrapper for Hugging Face transformers
    """
    def __init__(self, model_name='distilgpt2'):  # Use a lightweight model for efficiency
        self.generator = pipeline('text-generation', 
                                  model=model_name, 
                                  device=0,  # Use GPU if available
                                  truncation=True)

    def __call__(self, prompt: str, **kwargs):
        response = self.generator(
            prompt, 
            max_length=len(prompt) + 250,   # Shorter max length
            num_return_sequences=1,
            **kwargs
        )[0]['generated_text']
        
        # Clean up the generated text
        response = response.replace(prompt, '').strip()
        return response


class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        # Load CSV data with more robust parsing
        self.df = pd.read_csv(csv_path, 
                               quotechar='"', 
                               escapechar='\\', 
                               skipinitialspace=True,
                               on_bad_lines='skip')
        self.column_to_search = column_to_search

        # Verify the column exists
        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

        # Initialize the language model
        self.language_model = HuggingFaceLanguageModel(model_name=model_name)

    def generate_query(self, context: list, question: str):
        """
        Generate a query using context and a question.
        """
        # Specific prompt to generate a focused query
        context_str = str(context) if context else "No previous context"
        prompt = (
            f"Context: {context_str}\n"
            f"Question: {question}\n"
            "Generate a very short, specific keyword or phrase to search for relevant information. "
            "Focus on the key concept directly related to the question."
        )
        return self.language_model(prompt)

    def safe_contains(self, text, query):
        """
        Safely check if query is in text, using simple string matching
        """
        if not isinstance(text, str):
            return False
        
        # Use simple case-insensitive substring search
        return query.lower() in text.lower()

    def retrieve(self, query: str):
        """
        Retrieve data from the CSV based on the generated query.
        """
        # Use a custom search function to avoid regex parsing issues
        results = self.df[
            self.df[self.column_to_search].apply(
                lambda x: self.safe_contains(str(x), query)
            )
        ]
        return results

    def generate_answer(self, context: pd.DataFrame, question: str):
        """
        Generate an answer based on retrieved context.
        """
        # Truncate context if it's too large
        context_text = context.head(5).to_string(index=False)
        
        prompt = (
            f"Context: {context_text}\n"
            f"Question: {question}\n"
            "Based on the given context, provide a clear and concise answer. "
            "Focus on explaining the key concept directly related to the question."
        )
        return self.language_model(prompt)

    def forward(self, question: str):
        """
        Multi-hop process for query generation, retrieval, and summarization.
        """
        context, queries = [], [question]

        for hop in range(2):
            # Generate a query based on existing context
            query = self.generate_query(context=context, question=question)
            
            # Ensure query is not empty or too similar
            query = query[:50]  # Truncate to reasonable length
            if query.lower() in [q.lower() for q in queries]:
                query = question.split()[:2]  # Fallback to first two words
            
            print(f"Generated Query: {query}")
            
            # Retrieve results and add to context
            hop_results = self.retrieve(query)
            if not hop_results.empty:
                context.append(hop_results)
            queries.append(query)

        # Generate final answer from collected context
        context_df = pd.concat(context).drop_duplicates() if context else self.df
        return self.generate_answer(context=context_df, question=question)

    def suggest_improvements(self, query: str):
        """
        Suggest improvements to refine the search query.
        """
        # Perform a search and collect context
        context = self.retrieve(query)

        if context.empty:
            return {"query": query, "suggestions": "No results found, so suggestions cannot be provided."}

        # Generate improvement suggestions
        prompt = (
            f"Context: {context.to_string(index=False)}\n"
            f"Past Query: {query}\n"
            "Instruction: Suggest ways to improve or refine the search query for better results."
        )
        suggestions = self.language_model(prompt)
        return {"query": query, "suggestions": suggestions}


# Example Usage
csv_path = './151_ideas_updated.csv'
column_to_search = 'Ideas'  # Adjust to the appropriate column
processor = DataProcessor(csv_path=csv_path, column_to_search=column_to_search)

# Multi-hop query generation and answering
question = "what is it to Maximize the Beauty?"
answer = processor.forward(question)
print("\nFinal Answer:")
print(answer)

# Suggestions for query improvement
improvement_suggestions = processor.suggest_improvements(question)
print("\nSuggestions for Improvement:")
print(improvement_suggestions["suggestions"])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Query: As you can see, the following keywords are added t
Generated Query: This is usually done through keyword search, but a


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



Final Answer:


Suggestions for Improvement:
No results found, so suggestions cannot be provided.


In [5]:
# from docs rag 2

import pandas as pd
from transformers import pipeline


class HuggingFaceLanguageModel:
    """
    Custom Language Model wrapper for Hugging Face transformers
    """
    def __init__(self, model_name='distilgpt2'):  # Use a lightweight model for efficiency
        self.generator = pipeline('text-generation', model=model_name)

    def __call__(self, prompt: str, **kwargs):
        response = self.generator(prompt, max_length=150, **kwargs)[0]['generated_text']
        return response


class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        # Load CSV data
        self.df = pd.read_csv(csv_path)
        self.column_to_search = column_to_search

        # Verify the column exists
        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

        # Initialize the language model
        self.language_model = HuggingFaceLanguageModel(model_name=model_name)

    def generate_query(self, context: list, question: str):
        """
        Generate a query using context and a question.
        """
        prompt = (
            f"Context: {context}\n"
            f"Question: {question}\n"
            "Generate a distinct and concise query to retrieve relevant data."
        )
        return self.language_model(prompt)

    def retrieve(self, query: str):
        """
        Retrieve data from the CSV based on the generated query.
        """
        results = self.df[self.df[self.column_to_search].str.contains(query, case=False, na=False)]
        return results

    def generate_answer(self, context: pd.DataFrame, question: str):
        """
        Generate an answer based on retrieved context.
        """
        context_text = context.to_string(index=False)
        prompt = (
            f"Context: {context_text}\n"
            f"Question: {question}\n"
            "Provide a detailed summary of the retrieved data."
        )
        return self.language_model(prompt)

    def forward(self, question: str):
        """
        Multi-hop process for query generation, retrieval, and summarization.
        """
        context, queries = [], [question]

        for hop in range(2):
            query = self.generate_query(context=context, question=question)
            
            # Assertions and suggestions
            if len(query) >= 100:
                print("FAIL! ✗ Query too long. Regenerating with updated prompt...")
                query = self.generate_query(context=context, question=f"{question} (Keep it concise)")

            if query in queries:
                print(f"FAIL! ✗ Query not distinct from previous attempts. Regenerating...")
                query = self.generate_query(
                    context=context,
                    question=f"{question} (Avoid similarity with previous attempts: {queries})"
                )

            print(f"Generated Query: {query}")
            context.append(self.retrieve(query))
            queries.append(query)

        # Generate final answer from collected context
        context_df = pd.concat(context).drop_duplicates()
        return self.generate_answer(context=context_df, question=question)

    def suggest_improvements(self, query: str):
        """
        Suggest improvements to refine the search query.
        """
        # Perform a search and collect context
        context = self.retrieve(query)

        if context.empty:
            return {"query": query, "suggestions": "No results found, so suggestions cannot be provided."}

        # Generate improvement suggestions
        prompt = (
            f"Context: {context.to_string(index=False)}\n"
            f"Past Query: {query}\n"
            "Instruction: Suggest ways to improve or refine the search query for better results."
        )
        suggestions = self.language_model(prompt)
        return {"query": query, "suggestions": suggestions}


# Example Usage
csv_path = './151_ideas_updated.csv'
column_to_search = 'Ideas'  # Adjust to the appropriate column
processor = DataProcessor(csv_path=csv_path, column_to_search=column_to_search)

# Multi-hop query generation and answering
question = "Maximize the Beauty"
answer = processor.forward(question)
print("\nFinal Answer:")
print(answer)

# Suggestions for query improvement
improvement_suggestions = processor.suggest_improvements(question)
print("\nSuggestions for Improvement:")
print(improvement_suggestions["suggestions"])


ParserError: Error tokenizing data. C error: Expected 6 fields in line 110, saw 9
