In [2]:
# Imports
import os
import json
import sys
from pathlib import Path
from datetime import datetime
import google.generativeai as genai


# Treat the directory two levels above the kernel's current working directory
# as the project root and put it first on sys.path so the `analysis.*`
# imports below can be resolved.
# NOTE(review): this depends on where the kernel was started, not on where
# the notebook file lives — confirm the expected launch directory.
PROJECT_ROOT = Path.cwd().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from analysis.load_data import load_dataset
# NOTE(review): wildcard import — none of the names it provides are visibly
# referenced in the cells below; consider importing explicit names or removing.
from analysis.insights.prompt_templates import *
from analysis.utils.preprocessing import prepare_reddit_for_llm, create_statistical_summary_for_llm
from analysis.utils.evaluation import analyze_comprehensive_mental_health

# Presumably pulls GEMINI_API_KEY (read in the Gemini configuration cell) from
# a local .env file into the process environment — TODO confirm.
from dotenv import load_dotenv
load_dotenv()

ModuleNotFoundError: No module named 'insights'

In [None]:
# Configure Gemini
# Resolve the API key up front so that a missing key fails in this cell with
# a clear message instead of surfacing later as an opaque error on the first
# model request. GOOGLE_API_KEY is accepted as a fallback because
# genai.configure() itself falls back to that variable when api_key is None,
# so environments that relied on that behaviour keep working.
gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise EnvironmentError(
        "No Gemini API key found: set GEMINI_API_KEY (for example in a .env file) "
        "and re-run the imports cell before running this cell."
    )

genai.configure(api_key=gemini_api_key)

# NOTE(review): the model name is hard-coded; verify that 'gemini-pro' is still
# served by the installed google-generativeai version / API.
model = genai.GenerativeModel('gemini-pro')

# NOTE(review): nothing in this cell sends a request, so this message only
# confirms that the client object was constructed — TODO confirm.
print("Gemini model loaded successfully")

In [3]:
# Load the Reddit, CDC, WHO suicide and mental-health-care datasets.
# load_dataset is called once per key, in the order listed, and each result
# is bound to the name in the matching position on the left-hand side.
reddit_df, cdc_df, who_suicide_df, mental_health_care_df = (
    load_dataset(source_key)
    for source_key in ('reddit', 'cdc', 'who_suicide', 'mental_health_care')
)

INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
INFO:analysis.load_data:Loaded 722 rows from reddit_extract.sql
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
INFO:analysis.load_data:Loaded 72 rows from cdc_extract.sql
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
INFO:analysis.load_data:Loaded 38316 rows from who_suicide_extract.sql
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3

In [None]:
# Build the two inputs handed to the LLM: a Reddit sample and a statistical
# summary derived from the CDC, WHO suicide and mental-health-care frames.
reddit_sample = prepare_reddit_for_llm(reddit_df)
statistical_data = create_statistical_summary_for_llm(cdc_df, who_suicide_df, mental_health_care_df)

# Flatten the summary into plain text: for every dataset a "\n<name>:\n"
# header, followed by one "- <key>: <value>\n" line per statistic.
stats_text = "".join(
    f"\n{source_name}:\n"
    + "".join(f"- {metric}: {reading}\n" for metric, reading in source_stats.items())
    for source_name, source_stats in statistical_data.items()
)


In [None]:
# Hand the Reddit sample and the formatted statistics to the model.
comprehensive_analysis = analyze_comprehensive_mental_health(reddit_sample, stats_text, model)

# A falsy result is reported as a failure; anything else is printed under an
# 80-character banner.
if not comprehensive_analysis:
    print("Analysis failed")
else:
    rule = "=" * 80
    print(
        "\n" + rule,
        "COMPREHENSIVE MENTAL HEALTH DATA ANALYSIS",
        rule,
        comprehensive_analysis,
        sep="\n",
    )

In [None]:
# Save analysis results
# Writes the LLM analysis plus metadata about every input source to a dated
# JSON report. Nothing is written when the analysis result is falsy.
if comprehensive_analysis:
    # Read the clock once so the report's timestamp and the date in the file
    # name cannot disagree when the cell runs across midnight.
    run_time = datetime.now()

    # Resolved against the kernel's current working directory.
    reports_dir = '../outputs/reports'
    os.makedirs(reports_dir, exist_ok=True)

    # Create comprehensive report with metadata from all sources
    report = {
        'timestamp': run_time.isoformat(),
        'analysis_type': 'comprehensive_mental_health',
        'data_sources': {
            # Number of '---' occurrences in the sample text (same value as
            # len(split('---')) - 1); presumably one separator per post —
            # TODO confirm against prepare_reddit_for_llm.
            'reddit_sample_size': reddit_sample.count('---'),
            'statistical_sources': list(statistical_data.keys()),
            # len() is already 0 for an empty frame, so no extra guard is needed.
            'cdc_records': len(cdc_df),
            'who_records': len(who_suicide_df),
            'care_records': len(mental_health_care_df)
        },
        'analysis': comprehensive_analysis,
        'data_summary': statistical_data
    }

    # The name carries only the date, so a second run on the same day
    # overwrites the earlier report.
    filename = f"{reports_dir}/comprehensive_mental_health_{run_time.strftime('%Y%m%d')}.json"

    # default=str stringifies any value the json module cannot serialise itself.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)

else:
    print("No analysis to save - comprehensive analysis failed")