In [13]:
import os
import json
from typing import Dict, Union
import importlib.util

def check_module(module_name: str) -> bool:
    """Check if a Python module is installed."""
    return importlib.util.find_spec(module_name) is not None

def universal_analyzer(
    input_path: str = 'dataspace/input/',
    output_file: str = 'dataspace/analysis_results.json',
    api_key: str = '',
    system_role: str = "You are a highly skilled analyst with expertise in multiple domains.",
    analysis_task: str = "Analyze this content and provide key insights, patterns, and recommendations.",
    additional_instructions: str = "",
    model: str = "gpt-4",
    max_tokens: int = 2000,
    temperature: float = 0.7,
    chunk_size: int = 4000  # Maximum characters per API call
) -> Dict:
    """
    A universal analyzer that can process various file types and provide AI-powered analysis using OpenAI.

    Parameters:
    - input_path (str): Path to the input file or directory. Defaults to 'dataspace/input/'.
    - output_file (str): Path to the output JSON file. Defaults to 'dataspace/analysis_results.json'.
    - api_key (str): OpenAI API key. Required.
    - system_role (str): System role description for the AI assistant. Defaults to a generic analyst role.
    - analysis_task (str): The analysis task to be performed. Defaults to a general analysis prompt.
    - additional_instructions (str): Any additional instructions for the AI assistant.
    - model (str): The OpenAI model to use. Defaults to 'gpt-4'.
    - max_tokens (int): Maximum number of tokens for the AI response. Defaults to 2000.
    - temperature (float): Sampling temperature for the AI response. Defaults to 0.7.
    - chunk_size (int): Maximum number of characters per API call. Defaults to 4000.

    Returns:
    - Dict: A dictionary containing the analysis results.
    """

    # Check for required modules
    installed_modules = {}
    required_modules = ['docx', 'pdfplumber', 'pandas', 'openpyxl']
    for module in required_modules:
        installed_modules[module] = check_module(module)

    # Define supported file types based on installed modules
    supported_file_types = ['.txt']
    if installed_modules.get('docx'):
        supported_file_types.append('.docx')
        import docx
    if installed_modules.get('pdfplumber'):
        supported_file_types.append('.pdf')
        import pdfplumber
    if installed_modules.get('pandas'):
        supported_file_types.extend(['.csv', '.xlsx'])
        import pandas as pd
    else:
        raise ImportError("The 'pandas' module is required for this function.")

    if not api_key:
        raise ValueError("OpenAI API key is required.")

    # Initialize OpenAI client
    import openai
    openai.api_key = api_key

    analysis_results = {}

    def read_file_content(file_path: str) -> str:
        """Reads the content of a file based on its extension."""
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif file_extension == '.docx' and installed_modules.get('docx'):
            doc = docx.Document(file_path)
            return '\n'.join([para.text for para in doc.paragraphs])
        elif file_extension == '.pdf' and installed_modules.get('pdfplumber'):
            with pdfplumber.open(file_path) as pdf:
                pages = [page.extract_text() for page in pdf.pages]
                return '\n'.join(pages)
        elif file_extension == '.csv' and installed_modules.get('pandas'):
            df = pd.read_csv(file_path)
            return df.to_csv(index=False)
        elif file_extension == '.xlsx' and installed_modules.get('pandas'):
            df = pd.read_excel(file_path)
            return df.to_csv(index=False)
        else:
            raise ValueError(f"Unsupported file type or required module not installed for: {file_extension}")

    def analyze_content(content: str, filename: str) -> str:
        """Helper function to analyze content using OpenAI API."""
        try:
            # Split content into chunks if it's too long
            content_chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
            
            full_analysis = []

            for i, chunk in enumerate(content_chunks):
                # Prepare messages for the AI assistant
                messages = [
                    {"role": "system", "content": system_role},
                    {"role": "user", "content": f"Task: {analysis_task}\n\nAdditional Instructions: {additional_instructions}\n\nAnalyzing file: {filename}\n\nContent Part {i+1}/{len(content_chunks)}:\n{chunk}"}
                ]
                
                response = openai.ChatCompletion.create(
                    model=model,
                    messages=messages,
                    max_tokens=max_tokens,
                    temperature=temperature
                )
                
                full_analysis.append(response['choices'][0]['message']['content'])
            
            # Consolidate analysis if multiple chunks
            if len(full_analysis) > 1:
                summary_messages = [
                    {"role": "system", "content": system_role},
                    {"role": "user", "content": f"Please provide a consolidated summary of all the previous analyses:\n\n{''.join(full_analysis)}"}
                ]
                
                summary_response = openai.ChatCompletion.create(
                    model=model,
                    messages=summary_messages,
                    max_tokens=max_tokens,
                    temperature=temperature
                )
                
                return summary_response['choices'][0]['message']['content']
            
            return full_analysis[0]
            
        except Exception as e:
            return f"Error during analysis: {str(e)}"

    try:
        # Create output directory if it doesn't exist
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        # Add module availability to results
        analysis_results['system_info'] = {
            'installed_modules': installed_modules,
            'supported_file_types': supported_file_types
        }

        # Process single file or directory
        if os.path.isfile(input_path):
            file_extension = os.path.splitext(input_path)[1].lower()
            if file_extension in supported_file_types:
                content = read_file_content(input_path)
                analysis = analyze_content(content, os.path.basename(input_path))
                analysis_results[os.path.basename(input_path)] = {
                    'file_path': input_path,
                    'analysis': analysis
                }
            else:
                analysis_results[os.path.basename(input_path)] = {
                    'file_path': input_path,
                    'error': f"Unsupported file type: {file_extension}"
                }
        else:
            # Process directory
            for root, _, files in os.walk(input_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    file_extension = os.path.splitext(file)[1].lower()
                    
                    if file_extension in supported_file_types:
                        content = read_file_content(file_path)
                        analysis = analyze_content(content, file)
                        analysis_results[file] = {
                            'file_path': file_path,
                            'analysis': analysis
                        }
                    else:
                        analysis_results[file] = {
                            'file_path': file_path,
                            'error': f"Unsupported file type: {file_extension}"
                        }

        # Save results to JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(analysis_results, f, indent=4, ensure_ascii=False)

        return analysis_results

    except Exception as e:
        error_result = {'error': str(e)}
        
        # Save error to JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(error_result, f, indent=4, ensure_ascii=False)
            
        return error_result


In [18]:
# Ensure you have the necessary modules installed:
# pip install openai pandas docx pdfplumber openpyxl

# Set your OpenAI API key
api_key = os.getenv("OPENAI_API_KEY")
input_path='/Users/an/Library/CloudStorage/OneDrive-Personal/Documents/GitHub/SLEGO-Project/slegospace/dataspace/historical_data.csv'

# Call the universal_analyzer function
results = universal_analyzer(
    input_path= input_path,  # Update this path to your input directory or file
    output_file='dataspace/analysis_results.json',
    api_key=api_key,
    analysis_task="Provide a summary and key insights from the content.",
    additional_instructions="Focus on any financial data and trends.",
    model="gpt-4o",  # Use "gpt-3.5-turbo" if you don't have access to gpt-4
    max_tokens=1500,
    temperature=0.5
)


# The analysis results are now saved in 'dataspace/analysis_results.json'


In [19]:
results

{'system_info': {'installed_modules': {'docx': True,
   'pdfplumber': False,
   'pandas': True,
   'openpyxl': True},
  'supported_file_types': ['.txt', '.docx', '.csv', '.xlsx']},
 'historical_data.csv': {'file_path': '/Users/an/Library/CloudStorage/OneDrive-Personal/Documents/GitHub/SLEGO-Project/slegospace/dataspace/historical_data.csv',
  'analysis': 'Error during analysis: \n\nYou tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.\n\nYou can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. \n\nAlternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`\n\nA detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742\n'}}