In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import sys
from pathlib import Path


PROJECT_ROOT = Path.cwd().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from analysis.load_data import load_dataset
from analysis.utils.preprocessing import prepare_time_series, prepare_who_time_series
from analysis.config.model_config import FORECASTING_CONFIG
from analysis.forecasting.forecasting_utils import train_prophet_model, evaluate_forecast_model, save_forecast_results, cross_validate_timeseries

ERROR:prophet.plot:Importing plotly failed. Interactive plots will not work.


In [2]:
# Load CDC data
cdc_df = load_dataset('cdc')

# Load Google Trends data
trends_df = load_dataset('trends')  

# Load WHO suicide data
who_suicide_df = load_dataset('who_suicide')

INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
ERROR:analysis.load_data:Error executing query from /Users/Andrew/Desktop/Computer Science/Mental_Health_Project/pipeline/snowflake/cdc_sql/cdc_extract.sql: Execution failed on sql '-- Use CDC Schema
USE SCHEMA MENTAL_HEALTH.CDC;

-- Select all CDC data
SELECT * FROM CDC_PROCESSED;': 000008 (0A000): 01bf7e76-0000-376b-0024-9b8b0016f006: Actual statement count 2 did not match the desired statement count 1.
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
ERROR:analysis.load_data:Error executing query from /Users/Andrew/Desktop/Computer Science/Mental_Health_Projec

In [None]:

# Prepare CDC anxiety data
cdc_ts = prepare_time_series(cdc_df, 'date', 'anxiety')

# Prepare Google Trends data
trends_ts = prepare_time_series(trends_df, 'date', 'interest')

# Prepare WHO suicide data
who_suicide_ts = prepare_who_time_series(who_suicide_df, 'year', 'suicides_no')

# Convert all to Prophet format
datasets_to_forecast = {}

# CDC data
cdc_prophet = cdc_ts.reset_index()
cdc_prophet.columns = ['ds', 'y']
datasets_to_forecast['cdc_anxiety'] = cdc_prophet

# Google Trends data
trends_prophet = trends_ts.reset_index()
trends_prophet.columns = ['ds', 'y']
datasets_to_forecast['google_trends'] = trends_prophet

# WHO data
who_prophet = who_suicide_ts.reset_index()
who_prophet.columns = ['ds', 'y']
datasets_to_forecast['who_suicides'] = who_prophet

# Plot all time series for comparison
fig, axes = plt.subplots(3, 1, figsize=(15, 12))

for i, (name, data) in enumerate(datasets_to_forecast.items()):
    axes[i].plot(data['ds'], data['y'])
    axes[i].set_title(f'{name.replace("_", " ").title()} Over Time')
    axes[i].set_xlabel('Date')
    axes[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Train Prophet model
config = FORECASTING_CONFIG['prophet']
forecast_results = {}

for dataset_name, prophet_data in datasets_to_forecast.items():
    # Train model using utility function
    model = train_prophet_model(prophet_data, config)
    
    # Generate forecast (different periods based on data frequency)
    if dataset_name == 'who_suicides':
        periods = 5  # 5 years ahead for annual data
    else:
        periods = 90  # 90 days ahead for higher frequency data
    
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)
    
    # Store results
    forecast_results[dataset_name] = {
        'model': model,
        'forecast': forecast,
        'training_data': prophet_data
    }

In [None]:
# Visualize all forecasts
fig, axes = plt.subplots(len(forecast_results), 2, figsize=(20, 6*len(forecast_results)))

for i, (dataset_name, results) in enumerate(forecast_results.items()):
    model = results['model']
    forecast = results['forecast']
    
    # Main forecast plot
    fig1 = model.plot(forecast, ax=axes[i,0])
    axes[i,0].set_title(f'{dataset_name.replace("_", " ").title()} Forecast')
    axes[i,0].set_ylabel('Value')
    
    # Components plot
    fig2 = model.plot_components(forecast, ax=axes[i,1] if len(forecast_results) == 1 else None)
    if len(forecast_results) > 1:
        # For multiple subplots, create separate components plot
        plt.figure(figsize=(12, 8))
        model.plot_components(forecast)
        plt.suptitle(f'{dataset_name.replace("_", " ").title()} Components')
        plt.show()

plt.tight_layout()
plt.show()

In [None]:
# Evaluate model performance
evaluation_results = {}

for dataset_name, results in forecast_results.items():
    training_data = results['training_data']
    
    if len(training_data) > 30:
        # Split data for validation
        train_size = int(len(training_data) * 0.8)
        train_data = training_data[:train_size]
        test_data = training_data[train_size:]
        
        # Train evaluation model
        eval_model = train_prophet_model(train_data, config)
        
        # Evaluate performance
        metrics = evaluate_forecast_model(eval_model, train_data, test_data, model_type='prophet')
        evaluation_results[dataset_name] = metrics
        
        print("Model Evaluation Metrics:")
        for metric, value in metrics.items():
            print(f"  {metric.upper()}: {value:.3f}")
    else:
        print(f"Insufficient data for evaluation ({len(training_data)} points)")
        evaluation_results[dataset_name] = None

In [None]:
# Save all models/results
timestamp = pd.Timestamp.now().strftime("%Y%m%d")

for dataset_name, results in forecast_results.items():
    model = results['model'] 
    forecast = results['forecast']
    
    filename = f'{dataset_name}_{timestamp}'
    save_forecast_results(model, forecast, filename)

In [None]:
# Cross-validation
cv_results = {}

for dataset_name, results in forecast_results.items():
    training_data = results['training_data']

    # Adjust minimum data requirements based on frequency
    min_required = 365 if dataset_name != 'who_suicides' else 15  # 15 years for annual data
    
    if len(training_data) > min_required:
        try:
            cv_output, cv_metrics = cross_validate_timeseries(training_data, config)
            cv_results[dataset_name] = cv_metrics
            
            print("Cross-validation metrics summary:")
            print(cv_metrics[['mape', 'rmse']].describe())
            
        except Exception as e:
            print(f"Cross-validation failed: {e}")
            cv_results[dataset_name] = None
    else:
        print(f"Need more data for cross-validation ({len(training_data)}/{min_required} points)")
        cv_results[dataset_name] = None