In [6]:
import pandas as pd
from transformers import pipeline

class HuggingFaceLanguageModel:
    """
    Custom Language Model wrapper for Hugging Face transformers
    """
    def __init__(self, model_name='distilgpt2'):  # Use a smaller model
        self.generator = pipeline('text-generation', model=model_name)

    def __call__(self, prompt: str, **kwargs):
        response = self.generator(prompt, max_length=175, **kwargs)[0]['generated_text']
        return response


class DataProcessor:
    """
    DSPy-style module for processing CSV data and generating insights
    """
    def __init__(self, csv_path: str, column_to_search: str, model_name='distilgpt2'):
        # Load CSV data
        self.df = pd.read_csv(csv_path, usecols=[0, 1, 2, 3, 4, 5])  # Use only the first 6 columns
        self.column_to_search = column_to_search

        # Assertion: Ensure the search column exists in the DataFrame
        assert self.column_to_search in self.df.columns, f"Column '{self.column_to_search}' not found in the CSV."

        # Configure Hugging Face Language Model
        self.language_model = HuggingFaceLanguageModel(model_name=model_name)

    def search_data(self, query: str):
        """
        Search CSV data and generate insights
        """
        # Filter dataframe based on query
        results = self.df[self.df[self.column_to_search].str.contains(query, case=False, na=False)]

        # Generate summary using language model
        summary_prompt = f"Summarize these {len(results)} results about '{query}': {results.to_string()}"
        summary = self.language_model(summary_prompt)

        return {
            'query_results': results,
            'result_count': len(results),
            'summary': summary
        }

    def advanced_analysis(self, query: str, additional_columns: list = None):
        """
        Perform more advanced analysis with multiple columns
        """
        results = self.search_data(query)

        if additional_columns:
            # Aggregate additional columns if specified
            aggregations = {col: ['mean', 'count'] for col in additional_columns 
                            if self.df[col].dtype in ['int64', 'float64']}

            additional_stats = results['query_results'].agg(aggregations)
            results['additional_stats'] = additional_stats

        return results

    def suggest_improvements(self, query: str):
        """
        Suggest improvements based on search results
        """
        results = self.search_data(query)
        suggestion_prompt = (
            f"Based on these {len(results['query_results'])} results, suggest ways to refine the search query '{query}' "
            f"to improve insights or better filter data."
        )
        suggestions = self.language_model(suggestion_prompt)
        return {'query': query, 'suggestions': suggestions}

# Initialize the processor
csv_path = './151_ideas_updated.csv'
column_to_search = 'Ideas'
processor = DataProcessor(csv_path=csv_path, column_to_search=column_to_search)

# Test with a query
query = "Maximize the Beauty"
results = processor.search_data(query)
print("Search Results:")
print(results['query_results'])
print("\nSummary:", results['summary'])

# Optional: Advanced analysis
# advanced_results = processor.advanced_analysis(query, additional_columns=['Column1', 'Column2'])
# print("\nAdvanced Analysis:", advanced_results)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


ValueError: Input length of input_ids is 175, but `max_length` is set to 175. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.