In [21]:
import yfinance as yf
import pandas as pd
import json
import os
from openai import OpenAI
from typing import Union, Dict, Any, List

def __check_module(module_name: str) -> bool:
    """Private function to check if a Python module is installed.

    Parameters:
    - module_name: Import name of the module (e.g. 'pandas', 'docx').

    Returns:
    - True if an import spec can be located for the module, False otherwise.
      The module itself is not imported by this check (only parent packages
      of a dotted name are).
    """
    import importlib.util

    try:
        return importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec raises ModuleNotFoundError when the parent package of a
        # dotted name is missing, and ValueError when an already-imported
        # module has __spec__ set to None. Either way there is no usable
        # spec, so report "not installed" instead of crashing the probe.
        return False

def __ensure_directory_exists(file_path: str) -> None:
    """Private function to ensure directory exists.

    Parameters:
    - file_path: Path of a file that is about to be written. Only its
      directory component is created; the file itself is not touched.

    A bare filename (no directory component) is a no-op.
    """
    directory = os.path.dirname(file_path)
    if directory:
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test and makes repeated calls idempotent.
        os.makedirs(directory, exist_ok=True)

def __read_file_content(file_path: str, installed_modules: Dict[str, bool]) -> str:
    """Private function to read content from various file types.

    Parameters:
    - file_path: Path of the file to read. The reader is chosen from the
      lower-cased file extension.
    - installed_modules: Mapping of optional module name -> availability flag.
      '.docx' requires the 'docx' flag; '.csv' and '.xlsx' require 'pandas'.

    Returns:
    - The file content as text: raw text for '.txt', pretty-printed JSON for
      '.json', newline-joined paragraph text for '.docx', and the
      DataFrame.to_string() rendering for '.csv' / '.xlsx'.

    Raises:
    - ValueError("Unsupported file type: ...") when the extension is unknown
      or its optional module is flagged as unavailable.
    - ValueError("Error reading ... file: ...") when reading or parsing fails;
      the original exception is attached as __cause__.
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    # Tolerate a missing mapping; every optional reader is then unavailable.
    installed_modules = installed_modules or {}

    try:
        if file_extension == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()

        if file_extension == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # ensure_ascii=False keeps non-ASCII text readable instead of
            # turning it into \uXXXX escape sequences.
            return json.dumps(data, indent=2, ensure_ascii=False)

        if file_extension == '.docx' and installed_modules.get('docx'):
            import docx
            doc = docx.Document(file_path)
            return '\n'.join(para.text for para in doc.paragraphs)

        if file_extension == '.csv' and installed_modules.get('pandas'):
            return pd.read_csv(file_path).to_string()

        if file_extension == '.xlsx' and installed_modules.get('pandas'):
            return pd.read_excel(file_path).to_string()

    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise ValueError(f"Error reading {file_extension} file: {str(e)}") from e

    # Raised outside the try block so this message is not re-wrapped as a
    # read error by the handler above.
    raise ValueError(f"Unsupported file type: {file_extension}")

def __analyze_content(client: "OpenAI", content: str, filename: str, 
                     system_role: str, analysis_task: str, 
                     additional_instructions: str, model: str,
                     max_tokens: int, temperature: float, 
                     chunk_size: int) -> str:
    """Private function to analyze content using OpenAI API.

    The content is cut into pieces of at most `chunk_size` characters and each
    piece is sent as its own chat-completion request. When more than one piece
    was analyzed, one additional request asks the model to consolidate the
    partial analyses.

    Parameters:
    - client: Object exposing `chat.completions.create(...)`.
    - content: Text to analyze.
    - filename: Name shown to the model in the prompt.
    - system_role: System message used for every request.
    - analysis_task / additional_instructions: Inserted into each chunk prompt.
    - model, max_tokens, temperature: Passed through to the API call.
    - chunk_size: Maximum characters per chunk; must be positive.

    Returns:
    - The analysis text (the consolidated summary when several chunks were
      needed). On any failure, no exception is raised; a string starting with
      "Error during analysis: " is returned instead.
    """
    try:
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not content:
            # Without this guard an empty file surfaced as the opaque
            # "list index out of range".
            raise ValueError("no content to analyze")

        # Split content into manageable chunks
        content_chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        full_analysis = []

        # Analyze each chunk
        for i, chunk in enumerate(content_chunks):
            messages = [
                {"role": "system", "content": system_role},
                {"role": "user", "content": f"""
Task: {analysis_task}
Additional Instructions: {additional_instructions}
Analyzing file: {filename}
Content Part {i+1}/{len(content_chunks)}:

{chunk}
                """}
            ]
            
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
            
            # Treat a missing message body as empty text so the join below
            # cannot fail on None.
            full_analysis.append(response.choices[0].message.content or "")
        
        # A single chunk needs no consolidation pass.
        if len(full_analysis) == 1:
            return full_analysis[0]

        # Separate the partial analyses with blank lines so they do not run
        # into each other inside the consolidation prompt.
        combined_analyses = "\n\n".join(full_analysis)
        summary_messages = [
            {"role": "system", "content": system_role},
            {"role": "user", "content": f"Please provide a consolidated summary of these analyses:\n\n{combined_analyses}"}
        ]
        
        summary_response = client.chat.completions.create(
            model=model,
            messages=summary_messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        
        return summary_response.choices[0].message.content
        
    except Exception as e:
        # Deliberate best-effort: callers get the failure as text.
        return f"Error during analysis: {str(e)}"

def universal_analyzer(
    input_path: str = 'dataspace/input/',
    output_file: str = 'dataspace/analysis_results.json',
    api_key: str = '',
    system_role: str = "You are a highly skilled analyst with expertise in multiple domains.",
    analysis_task: str = "Analyze this content and provide key insights, patterns, and recommendations.",
    additional_instructions: str = "",
    model: str = "gpt-4",
    max_tokens: int = 2000,
    temperature: float = 0.7,
    chunk_size: int = 4000
) -> Dict:
    """
    Analyzes various file types using OpenAI's GPT models.
    
    Parameters:
    - input_path: Path to file or directory to analyze
    - output_file: Where to save analysis results
    - api_key: OpenAI API key
    - system_role: Role description for the AI
    - analysis_task: Main analysis task description
    - additional_instructions: Extra analysis instructions
    - model: OpenAI model to use
    - max_tokens: Maximum tokens in response
    - temperature: Response randomness (0-1)
    - chunk_size: Maximum characters per API call
    
    Returns:
    - Dict containing analysis results and metadata. 'system_info' holds the
      detected optional modules and supported extensions. Every processed file
      gets an entry with 'file_path' plus either 'analysis' or 'error'. For a
      single file the entry key is its basename; for a directory it is the
      path relative to input_path, so same-named files in different
      subfolders do not overwrite each other.
    - If the run as a whole fails (missing API key, nonexistent input path,
      unexpected exception) the returned dict is {'error': <message>}; it is
      also written to output_file when possible.
    """
    
    try:
        # Probe optional dependencies. 'pdfplumber' is probed and reported in
        # system_info, but this function registers no '.pdf' reader.
        required_modules = ['docx', 'pdfplumber', 'pandas', 'openpyxl']
        installed_modules = {module: __check_module(module) for module in required_modules}

        supported_file_types = ['.txt', '.json']  # Add JSON support
        if installed_modules.get('docx'):
            supported_file_types.append('.docx')
        if installed_modules.get('pandas'):
            supported_file_types.append('.csv')
            # pandas needs openpyxl to read .xlsx files, so only advertise
            # .xlsx when both are available.
            if installed_modules.get('openpyxl'):
                supported_file_types.append('.xlsx')

        if not api_key:
            raise ValueError("OpenAI API key is required")

        # os.walk() yields nothing for a missing path, which used to produce
        # an "empty but successful" result; fail loudly instead.
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input path does not exist: {input_path}")

        # Initialize client and results
        client = OpenAI(api_key=api_key)
        analysis_results = {
            'system_info': {
                'installed_modules': installed_modules,
                'supported_file_types': supported_file_types
            }
        }

        def analyze_file(file_path: str) -> Dict:
            """Read and analyze one file; return its result entry."""
            file_extension = os.path.splitext(file_path)[1].lower()
            if file_extension not in supported_file_types:
                return {
                    'file_path': file_path,
                    'error': f"Unsupported file type: {file_extension}"
                }
            try:
                content = __read_file_content(file_path, installed_modules)
            except Exception as read_error:
                # One unreadable file must not abort the whole run and
                # discard the results collected so far.
                return {'file_path': file_path, 'error': str(read_error)}
            analysis = __analyze_content(
                client, content, os.path.basename(file_path),
                system_role, analysis_task, additional_instructions,
                model, max_tokens, temperature, chunk_size
            )
            return {'file_path': file_path, 'analysis': analysis}

        # Process single file
        if os.path.isfile(input_path):
            analysis_results[os.path.basename(input_path)] = analyze_file(input_path)

        # Process directory
        else:
            for root, dirs, files in os.walk(input_path):
                dirs.sort()  # deterministic traversal order
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    # Equals the bare filename for files directly inside
                    # input_path; includes the subfolder otherwise.
                    result_key = os.path.relpath(file_path, input_path)
                    analysis_results[result_key] = analyze_file(file_path)

        # Save results
        __ensure_directory_exists(output_file)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(analysis_results, f, indent=4, ensure_ascii=False)
            
        return analysis_results

    except Exception as e:
        error_result = {'error': str(e)}
        try:
            __ensure_directory_exists(output_file)
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(error_result, f, indent=4, ensure_ascii=False)
        except OSError as save_error:
            # Still return the original error to the caller even when the
            # error report itself cannot be written.
            error_result['save_error'] = str(save_error)
        return error_result

In [22]:
# Required packages. Note that the importable `docx` module is provided by the
# `python-docx` package, not by the PyPI package named `docx`:
# %pip install openai pandas python-docx pdfplumber openpyxl

# Read the OpenAI API key from the environment; never hardcode it in the notebook.
api_key = os.getenv("OPENAI_API_KEY")

# File or directory to analyze. Defaults to a path relative to the working
# directory (same base folder as output_file below) instead of a
# user-specific absolute path; set ANALYZER_INPUT_PATH to point elsewhere.
input_path = os.getenv("ANALYZER_INPUT_PATH", os.path.join("dataspace", "news.json"))

# Call the universal_analyzer function
results = universal_analyzer(
    input_path=input_path,
    output_file='dataspace/analysis_results.json',
    api_key=api_key,
    analysis_task="Provide a summary and key insights from the content.",
    additional_instructions="Focus on any financial data and trends.",
    model="gpt-4o",  # Any chat-completions model your API key can access
    max_tokens=1500,
    temperature=0.5
)


# The analysis results are now saved in 'dataspace/analysis_results.json'


In [33]:
results

{'system_info': {'installed_modules': {'docx': True,
   'pdfplumber': False,
   'pandas': True,
   'openpyxl': True},
  'supported_file_types': ['.txt', '.json', '.docx', '.csv', '.xlsx']},
 'news.json': {'file_path': '/Users/an/Library/CloudStorage/OneDrive-Personal/Documents/GitHub/SLEGO-Project/slegospace/dataspace/news.json',
  'analysis': 'The consolidated summary of the analyses from the news.json file highlights several key themes and insights within the financial and technology sectors:\n\n1. **AI Investment and Market Dynamics**:\n   - There\'s a notable focus on artificial intelligence (AI) investments, with companies like Amazon, Alphabet, Apple, Microsoft, and Nvidia actively increasing their spending in this area. This surge reflects a strategic emphasis on AI as a driver for future innovation and growth. However, there is also a mention of fading initial excitement around AI, suggesting that companies need to demonstrate tangible results to sustain investor interest.\n\n2