In [7]:
import pandas as pd
import re
from transformers import pipeline


class HuggingFaceLanguageModel:
    """
    Custom Language Model wrapper for Hugging Face transformers
    """
    def __init__(self, model_name='distilgpt2'):  # Use a lightweight model for efficiency
        self.generator = pipeline('text-generation', 
                                  model=model_name, 
                                  device=0,  # Use GPU if available
                                  truncation=True)

    def __call__(self, prompt: str, **kwargs):
        response = self.generator(
            prompt, 
            max_length=len(prompt) + 250,   # Shorter max length
            num_return_sequences=1,
            **kwargs
        )[0]['generated_text']
        
        # Clean up the generated text
        response = response.replace(prompt, '').strip()
        return response


class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        # Load CSV data with more robust parsing
        self.df = pd.read_csv(csv_path, 
                               quotechar='"', 
                               escapechar='\\', 
                               skipinitialspace=True,
                               on_bad_lines='skip')
        self.column_to_search = column_to_search

        # Verify the column exists
        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

        # Initialize the language model
        self.language_model = HuggingFaceLanguageModel(model_name=model_name)

    def generate_query(self, context: list, question: str):
        """
        Generate a query using context and a question.
        """
        # Specific prompt to generate a focused query
        context_str = str(context) if context else "No previous context"
        prompt = (
            f"Context: {context_str}\n"
            f"Question: {question}\n"
            "Generate a very short, specific keyword or phrase to search for relevant information. "
            "Focus on the key concept directly related to the question."
        )
        return self.language_model(prompt)

    def safe_contains(self, text, query):
        """
        Safely check if query is in text, using simple string matching
        """
        if not isinstance(text, str):
            return False
        
        # Use simple case-insensitive substring search
        return query.lower() in text.lower()

    def retrieve(self, query: str):
        """
        Retrieve data from the CSV based on the generated query.
        """
        # Use a custom search function to avoid regex parsing issues
        results = self.df[
            self.df[self.column_to_search].apply(
                lambda x: self.safe_contains(str(x), query)
            )
        ]
        return results

    def generate_answer(self, context: pd.DataFrame, question: str):
        """
        Generate an answer based on retrieved context.
        """
        # Truncate context if it's too large
        context_text = context.head(5).to_string(index=False)
        
        prompt = (
            f"Context: {context_text}\n"
            f"Question: {question}\n"
            "Based on the given context, provide a clear and concise answer. "
            "Focus on explaining the key concept directly related to the question."
        )
        return self.language_model(prompt)

    def forward(self, question: str):
        """
        Multi-hop process for query generation, retrieval, and summarization.
        """
        context, queries = [], [question]

        for hop in range(2):
            # Generate a query based on existing context
            query = self.generate_query(context=context, question=question)
            
            # Ensure query is not empty or too similar
            query = query[:50]  # Truncate to reasonable length
            if query.lower() in [q.lower() for q in queries]:
                query = question.split()[:2]  # Fallback to first two words
            
            print(f"Generated Query: {query}")
            
            # Retrieve results and add to context
            hop_results = self.retrieve(query)
            if not hop_results.empty:
                context.append(hop_results)
            queries.append(query)

        # Generate final answer from collected context
        context_df = pd.concat(context).drop_duplicates() if context else self.df
        return self.generate_answer(context=context_df, question=question)

    def suggest_improvements(self, query: str):
        """
        Suggest improvements to refine the search query.
        """
        # Perform a search and collect context
        context = self.retrieve(query)

        if context.empty:
            return {"query": query, "suggestions": "No results found, so suggestions cannot be provided."}

        # Generate improvement suggestions
        prompt = (
            f"Context: {context.to_string(index=False)}\n"
            f"Past Query: {query}\n"
            "Instruction: Suggest ways to improve or refine the search query for better results."
        )
        suggestions = self.language_model(prompt)
        return {"query": query, "suggestions": suggestions}


# Example Usage
csv_path = './151_ideas_updated.csv'
column_to_search = 'Ideas'  # Adjust to the appropriate column
processor = DataProcessor(csv_path=csv_path, column_to_search=column_to_search)

# Multi-hop query generation and answering
question = "what is it to Maximize the Beauty?"
answer = processor.forward(question)
print("\nFinal Answer:")
print(answer)

# Suggestions for query improvement
improvement_suggestions = processor.suggest_improvements(question)
print("\nSuggestions for Improvement:")
print(improvement_suggestions["suggestions"])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Query: As you can see, the following keywords are added t
Generated Query: This is usually done through keyword search, but a


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



Final Answer:


Suggestions for Improvement:
No results found, so suggestions cannot be provided.


In [5]:
# from docs rag 2

import pandas as pd
from transformers import pipeline


class HuggingFaceLanguageModel:
    """
    Custom Language Model wrapper for Hugging Face transformers
    """
    def __init__(self, model_name='distilgpt2'):  # Use a lightweight model for efficiency
        self.generator = pipeline('text-generation', model=model_name)

    def __call__(self, prompt: str, **kwargs):
        response = self.generator(prompt, max_length=150, **kwargs)[0]['generated_text']
        return response


class DataProcessor:
    """
    A class to process CSV data, search, summarize, and analyze insights
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        # Load CSV data
        self.df = pd.read_csv(csv_path)
        self.column_to_search = column_to_search

        # Verify the column exists
        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

        # Initialize the language model
        self.language_model = HuggingFaceLanguageModel(model_name=model_name)

    def generate_query(self, context: list, question: str):
        """
        Generate a query using context and a question.
        """
        prompt = (
            f"Context: {context}\n"
            f"Question: {question}\n"
            "Generate a distinct and concise query to retrieve relevant data."
        )
        return self.language_model(prompt)

    def retrieve(self, query: str):
        """
        Retrieve data from the CSV based on the generated query.
        """
        results = self.df[self.df[self.column_to_search].str.contains(query, case=False, na=False)]
        return results

    def generate_answer(self, context: pd.DataFrame, question: str):
        """
        Generate an answer based on retrieved context.
        """
        context_text = context.to_string(index=False)
        prompt = (
            f"Context: {context_text}\n"
            f"Question: {question}\n"
            "Provide a detailed summary of the retrieved data."
        )
        return self.language_model(prompt)

    def forward(self, question: str):
        """
        Multi-hop process for query generation, retrieval, and summarization.
        """
        context, queries = [], [question]

        for hop in range(2):
            query = self.generate_query(context=context, question=question)
            
            # Assertions and suggestions
            if len(query) >= 100:
                print("FAIL! ✗ Query too long. Regenerating with updated prompt...")
                query = self.generate_query(context=context, question=f"{question} (Keep it concise)")

            if query in queries:
                print(f"FAIL! ✗ Query not distinct from previous attempts. Regenerating...")
                query = self.generate_query(
                    context=context,
                    question=f"{question} (Avoid similarity with previous attempts: {queries})"
                )

            print(f"Generated Query: {query}")
            context.append(self.retrieve(query))
            queries.append(query)

        # Generate final answer from collected context
        context_df = pd.concat(context).drop_duplicates()
        return self.generate_answer(context=context_df, question=question)

    def suggest_improvements(self, query: str):
        """
        Suggest improvements to refine the search query.
        """
        # Perform a search and collect context
        context = self.retrieve(query)

        if context.empty:
            return {"query": query, "suggestions": "No results found, so suggestions cannot be provided."}

        # Generate improvement suggestions
        prompt = (
            f"Context: {context.to_string(index=False)}\n"
            f"Past Query: {query}\n"
            "Instruction: Suggest ways to improve or refine the search query for better results."
        )
        suggestions = self.language_model(prompt)
        return {"query": query, "suggestions": suggestions}


# Example Usage
csv_path = './151_ideas_updated.csv'
column_to_search = 'Ideas'  # Adjust to the appropriate column
processor = DataProcessor(csv_path=csv_path, column_to_search=column_to_search)

# Multi-hop query generation and answering
question = "Maximize the Beauty"
answer = processor.forward(question)
print("\nFinal Answer:")
print(answer)

# Suggestions for query improvement
improvement_suggestions = processor.suggest_improvements(question)
print("\nSuggestions for Improvement:")
print(improvement_suggestions["suggestions"])


ParserError: Error tokenizing data. C error: Expected 6 fields in line 110, saw 9
