# In [None]:
# pip install dspy-ai transformers pandas

import dspy
import pandas as pd
from transformers import pipeline
from typing import List, Dict, Any


class HuggingFaceLanguageModel(dspy.LM):
    """
    Custom Language Model wrapper for Hugging Face transformers.

    Wraps a ``transformers`` text-generation pipeline so it can be handed to
    ``dspy.settings.configure(lm=...)`` and called directly with a prompt.
    """
    def __init__(self, model_name='facebook/opt-350m'):
        """
        Build the text-generation pipeline for ``model_name``.

        Args:
            model_name: Hugging Face model identifier passed to ``pipeline``.
        """
        # Initialise the dspy.LM base class so whatever state it keeps for the
        # model is set up before the wrapper is used.
        # NOTE(review): assumes dspy.LM.__init__ accepts the model name as its
        # first positional argument -- confirm against the installed version.
        super().__init__(model_name)
        self.generator = pipeline('text-generation', model=model_name)

    def __call__(self, prompt: str, **kwargs):
        """
        Generate text for ``prompt`` and wrap it in a ``dspy.Prediction``.

        Args:
            prompt: Text handed to the generation pipeline.
            **kwargs: Extra generation arguments forwarded to the pipeline.
                If neither ``max_length`` nor ``max_new_tokens`` is supplied,
                ``max_new_tokens=150`` is used.

        Returns:
            ``dspy.Prediction`` whose ``text`` field is the pipeline's
            ``generated_text`` for the first returned sequence.
        """
        # A hard-coded max_length next to **kwargs raised a duplicate-keyword
        # TypeError when the caller passed its own limit; max_length also
        # counts the prompt tokens, so long prompts left no room to generate.
        # Default to a budget of *new* tokens instead, unless the caller chose.
        if 'max_length' not in kwargs and 'max_new_tokens' not in kwargs:
            kwargs['max_new_tokens'] = 150

        # Convert Hugging Face pipeline output to DSPy format
        response = self.generator(prompt, **kwargs)[0]['generated_text']
        return dspy.Prediction(text=response)


class DataProcessor(dspy.Module):
    """
    DSPy Module for processing CSV data and generating insights.

    Loads a CSV into a DataFrame, filters rows by matching a query against one
    column, and uses a Hugging Face language model to produce text summaries
    and suggestions about the matches.
    """
    def __init__(self, csv_path: str, column_to_search: str):
        """
        Load the CSV and configure DSPy with the Hugging Face model.

        Args:
            csv_path: Path of the CSV file to load.
            column_to_search: Name of the column that queries are matched
                against.

        Raises:
            ValueError: If ``column_to_search`` is not a column of the CSV.
        """
        super().__init__()

        # Load CSV data
        self.df = pd.read_csv(csv_path)
        self.column_to_search = column_to_search

        # Validate with an explicit raise: `assert` is stripped under
        # `python -O`, which would let a bad column name through.
        if self.column_to_search not in self.df.columns:
            raise ValueError(f"Column '{self.column_to_search}' not found in the CSV.")

        # Configure DSPy with Hugging Face Language Model
        self.language_model = HuggingFaceLanguageModel()
        dspy.settings.configure(lm=self.language_model)

    def _filter_rows(self, query: str) -> pd.DataFrame:
        """Return the rows whose search column contains ``query``."""
        # case=False: case-insensitive match; na=False: missing values never
        # match. `regex` is left at the pandas default, so the query is
        # interpreted as a regular expression.
        mask = self.df[self.column_to_search].str.contains(query, case=False, na=False)
        return self.df[mask]

    def search_data(self, query: str) -> Dict[str, Any]:
        """
        Search CSV data and generate insights.

        Args:
            query: Pattern matched (case-insensitively) against the search
                column.

        Returns:
            Dict with ``'query_results'`` (matching rows as a DataFrame),
            ``'result_count'`` (number of matching rows) and ``'summary'``
            (text produced by the language model).
        """
        # Filter dataframe based on query
        results = self._filter_rows(query)

        # Generate summary using language model
        summary_prompt = f"Summarize these {len(results)} results about '{query}': {results.to_string()}"
        summary = self.language_model(summary_prompt).text

        return {
            'query_results': results,
            'result_count': len(results),
            'summary': summary
        }

    def advanced_analysis(self, query: str, additional_columns: List[str] = None):
        """
        Perform more advanced analysis with multiple columns.

        Args:
            query: Pattern passed to ``search_data``.
            additional_columns: Optional column names to aggregate over the
                matching rows. Names that are not in the CSV or are not
                numeric are ignored.

        Returns:
            The ``search_data`` result dict. When ``additional_columns`` is
            given, it also carries ``'additional_stats'``: a DataFrame with
            ``mean`` and ``count`` rows for each usable column (empty if none
            of the requested columns is usable).
        """
        results = self.search_data(query)

        if additional_columns:
            # Keep only columns that exist and are numeric; indexing a missing
            # column would raise KeyError, and a fixed dtype list would skip
            # numeric dtypes such as int32 or float32.
            numeric_columns = [
                col for col in additional_columns
                if col in self.df.columns
                and pd.api.types.is_numeric_dtype(self.df[col])
            ]

            if numeric_columns:
                aggregations = {col: ['mean', 'count'] for col in numeric_columns}
                additional_stats = results['query_results'].agg(aggregations)
            else:
                # Aggregating with an empty mapping fails, so return an empty
                # frame with the same row labels instead.
                additional_stats = pd.DataFrame(index=['mean', 'count'])

            results['additional_stats'] = additional_stats

        return results

    def suggest_improvements(self, query: str) -> Dict[str, Any]:
        """
        Suggest improvements based on search results.

        Args:
            query: Pattern matched against the search column.

        Returns:
            Dict with the original ``'query'`` and the model-generated
            ``'suggestions'`` text.
        """
        # Only the match count is needed here, so filter directly instead of
        # calling search_data, which would also generate an unused summary.
        match_count = len(self._filter_rows(query))
        suggestion_prompt = (
            f"Based on these {match_count} results, suggest ways to refine the search query '{query}' "
            f"to improve insights or better filter data."
        )
        suggestions = self.language_model(suggestion_prompt).text
        return {'query': query, 'suggestions': suggestions}

    def optimize_themes(self, query: str) -> Dict[str, Any]:
        """
        Use DSPy optimizer to suggest the best theme combinations for a query.

        Args:
            query: Echoed back in the result; it does not influence the
                optimization set up below.

        Returns:
            Dict with ``'query'`` and ``'optimized_themes'`` (whatever
            ``optimizer.solve()`` returns).
        """
        # NOTE(review): this relies on `dspy.Optimizer` exposing
        # add_variable / set_objective / solve. Confirm the installed dspy
        # version actually provides that class; if not, this import raises
        # ImportError.
        from dspy import Optimizer

        # Define optimization variables and constraints
        optimizer = Optimizer()
        optimizer.add_variable("fun", weight=1.0)
        optimizer.add_variable("rational", weight=0.5)
        optimizer.add_variable("positive", weight=1.5)
        optimizer.add_variable("spicy", weight=0.8)

        # Simulate an objective function
        optimizer.set_objective(
            objective_function=lambda x: x["fun"] + x["positive"] - 0.5 * x["spicy"]
        )

        # Solve for the best theme combination
        solution = optimizer.solve()
        return {
            'query': query,
            'optimized_themes': solution
        }


def main(csv_path: str = './151_ideas_updated.csv', column_to_search: str = 'description'):
    """
    Run an example session: search, advanced analysis, suggestions, themes.

    Args:
        csv_path: Path of the CSV file to analyse.
        column_to_search: Column that the example query is matched against.

    Any exception raised along the way is reported on stdout rather than
    propagated, since this is the top-level entry point of the script.
    """
    query = 'example query'
    try:
        # Initialize processor with your CSV and specify search column.
        # The path literal previously carried a stray '("' prefix and a
        # trailing '"', so it could never resolve.
        processor = DataProcessor(
            csv_path=csv_path,
            column_to_search=column_to_search
        )

        # Simple search
        simple_result = processor.search_data(query)
        print("Simple Search Results:")
        print(simple_result['query_results'])
        print("\nSummary:", simple_result['summary'])

        # Advanced analysis with multiple columns
        advanced_result = processor.advanced_analysis(
            query=query,
            additional_columns=['price', 'rating']
        )
        print("\nAdvanced Analysis:")
        # .get: print None instead of raising if no stats were produced.
        print(advanced_result.get('additional_stats'))

        # Suggestions for improvements
        suggestions = processor.suggest_improvements(query)
        print("\nSuggestions for Query Refinement:")
        print(suggestions['suggestions'])

        # Optimized themes
        optimized = processor.optimize_themes(query)
        print("\nOptimized Themes:")
        print(optimized['optimized_themes'])

    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"An error occurred: {e}")


# Run the example only when executed as a script, not when imported.
if __name__ == '__main__':
    main()


# Output:
# An error occurred: [Errno 2] No such file or directory: 'your_data.csv'
