In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import sys
from pathlib import Path


# The project root is taken to be two directories above the notebook's working
# directory; putting it first on sys.path lets the `analysis.*` imports below
# resolve without installing the package.
_notebook_dir = Path.cwd()
PROJECT_ROOT = _notebook_dir.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from analysis.load_data import load_dataset
from analysis.utils.preprocessing import prepare_time_series, prepare_who_time_series
from analysis.config.model_config import FORECASTING_CONFIG
from analysis.forecasting.forecasting_utils import train_prophet_model, evaluate_forecast_model, save_forecast_results, cross_validate_timeseries

ERROR:prophet.plot:Importing plotly failed. Interactive plots will not work.


In [2]:
def _preview(frame, length_label):
    """Print the row count, column names and first rows of a loaded dataset.

    Args:
        frame: DataFrame returned by ``load_dataset``.
        length_label: Text printed in front of the row count.
    """
    print(length_label, len(frame))
    print(f"Columns: {list(frame.columns)}")
    print(f"\nFirst few rows:")
    print(frame.head())


# News articles
news_df = load_dataset('news')
_preview(news_df, "News Length: ")

# Suicide demographics
suicide_demo_df = load_dataset('suicide_demographics')
_preview(suicide_demo_df, "\nSuicide Demographics Length: ")

# WHO suicide statistics
who_suicide_df = load_dataset('who_suicide')
_preview(who_suicide_df, "\nWHO Suicide Length: ")

INFO:botocore.credentials:Found credentials in environment variables.
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
INFO:analysis.load_data:Loaded 4 rows from news_extract.sql


News Length:  4
Columns: ['DATE', 'ARTICLE_COUNT', 'SAMPLE_HEADLINES', 'SOURCES']

First few rows:
         DATE  ARTICLE_COUNT  \
0  2025-10-22             93   
1  2025-10-23             92   
2  2025-10-24             32   
3  2025-10-25             97   

                                    SAMPLE_HEADLINES  \
0  Mexico to Tax Mature Video Games | Pregnant IC...   
1  Villains To Be Removed at Disney Park: Will Ma...   
2  Psychologist Warns of Donald Trump’s ‘Massive ...   
3  Particulate matter pollutant levels cross Indi...   

                                             SOURCES  
0  NBC News, Landezine.com, Catholicnewsagency.co...  
1  Yahoo Entertainment, TheStranger.com, Rolling ...  
2  Bemorewithless.com, SFGate, New York Post, For...  
3  ABC News (AU), Daily Signal, XDA Developers, A...  


INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
INFO:analysis.load_data:Loaded 5484 rows from suicide_demographics_extract.sql



Suicide Demographics Length:  5484
Columns: ['indicator', 'unit', 'stub_name', 'stub_label', 'year', 'age', 'estimate', 'flag', 'demographic_category', 'demographic_value']

First few rows:
                 indicator                                               unit  \
0  Death rates for suicide  Deaths per 100,000 resident population, age-ad...   
1  Death rates for suicide  Deaths per 100,000 resident population, age-ad...   
2  Death rates for suicide  Deaths per 100,000 resident population, age-ad...   
3  Death rates for suicide  Deaths per 100,000 resident population, age-ad...   
4  Death rates for suicide  Deaths per 100,000 resident population, age-ad...   

  stub_name   stub_label  year       age  estimate  flag demographic_category  \
0     Total  All persons  1950  All ages      13.2  None                Total   
1     Total  All persons  1960  All ages      12.5  None                Total   
2     Total  All persons  1970  All ages      13.1  None                Total  

INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
INFO:analysis.load_data:Loaded 38316 rows from who_suicide_extract.sql



WHO Suicide Length:  38316
Columns: ['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicide_rate_per_100k']

First few rows:
   country  year     sex          age  suicides_no  population  \
0  Albania  1985  Female  15-24 years          0.0    277900.0   
1  Albania  1985  Female  25-34 years          0.0    246800.0   
2  Albania  1985  Female  35-54 years          0.0    267500.0   
3  Albania  1985  Female   5-14 years          0.0    298300.0   
4  Albania  1985  Female  55-74 years          0.0    138700.0   

   suicide_rate_per_100k  
0                    0.0  
1                    0.0  
2                    0.0  
3                    0.0  
4                    0.0  


In [None]:

# Build Prophet-ready frames (columns `ds`, `y`) for every source that is
# actually loaded in this kernel.
#
# `cdc_df` and `trends_df` are not created by any cell in this notebook, so
# referencing them directly raises NameError on "Restart & Run All". Each
# source is therefore looked up by variable name and skipped, with a message,
# when it has not been loaded.
#
# Spec layout: forecast key -> (raw frame variable, prep function, date column, value column)
_SOURCE_SPECS = {
    'cdc_anxiety': ('cdc_df', prepare_time_series, 'date', 'anxiety'),
    'google_trends': ('trends_df', prepare_time_series, 'date', 'interest'),
    'who_suicides': ('who_suicide_df', prepare_who_time_series, 'year', 'suicides_no'),
}

datasets_to_forecast = {}

for name, (frame_var, prepare, date_col, value_col) in _SOURCE_SPECS.items():
    raw_frame = globals().get(frame_var)
    if raw_frame is None:
        print(f"Skipping {name}: `{frame_var}` has not been loaded in this notebook")
        continue

    # NOTE(review): assumes the prep helpers return a date-indexed Series, so
    # reset_index() yields exactly two columns — confirm against
    # analysis.utils.preprocessing.
    prophet_frame = prepare(raw_frame, date_col, value_col).reset_index()
    prophet_frame.columns = ['ds', 'y']
    datasets_to_forecast[name] = prophet_frame

# Plot all available time series for comparison. The grid is sized to the
# number of datasets; squeeze=False keeps `axes` 2-D even for a single dataset.
if datasets_to_forecast:
    n_series = len(datasets_to_forecast)
    fig, axes = plt.subplots(n_series, 1, figsize=(15, 4 * n_series), squeeze=False)

    for ax, (name, data) in zip(axes[:, 0], datasets_to_forecast.items()):
        ax.plot(data['ds'], data['y'])
        ax.set_title(f'{name.replace("_", " ").title()} Over Time')
        ax.set_xlabel('Date')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
else:
    print("No datasets available to forecast")

In [None]:
# Train one Prophet model per dataset and forecast beyond the end of its history.
config = FORECASTING_CONFIG['prophet']
forecast_results = {}

# Forecast horizon per dataset as (periods, pandas frequency alias).
# make_future_dataframe() defaults to daily steps, so the frequency must be
# passed explicitly: without it, "5 periods" for the annual WHO series means
# 5 days ahead instead of 5 years.
# NOTE(review): 'YS' (year start) assumes prepare_who_time_series produces
# datetime-like `ds` values, one per year — confirm.
DEFAULT_HORIZON = (90, 'D')                 # 90 days ahead for higher-frequency data
FORECAST_HORIZONS = {'who_suicides': (5, 'YS')}  # 5 years ahead for annual data

for dataset_name, prophet_data in datasets_to_forecast.items():
    # Train model using utility function
    model = train_prophet_model(prophet_data, config)

    # Generate forecast at the dataset's own frequency
    periods, freq = FORECAST_HORIZONS.get(dataset_name, DEFAULT_HORIZON)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)

    # Store results for the plotting / evaluation / saving cells
    forecast_results[dataset_name] = {
        'model': model,
        'forecast': forecast,
        'training_data': prophet_data
    }

In [None]:
# Visualize all forecasts: one forecast figure and one components figure per dataset.
#
# Prophet.plot() accepts an `ax`, but Prophet.plot_components() does not: it
# always builds its own figure and returns it. The components figure is
# therefore created by Prophet and only titled here, rather than being placed
# into a shared subplot grid.
for dataset_name, results in forecast_results.items():
    model = results['model']
    forecast = results['forecast']
    pretty_name = dataset_name.replace("_", " ").title()

    # Main forecast plot
    forecast_fig, forecast_ax = plt.subplots(figsize=(20, 6))
    model.plot(forecast, ax=forecast_ax)
    forecast_ax.set_title(f'{pretty_name} Forecast')
    forecast_ax.set_ylabel('Value')
    forecast_fig.tight_layout()
    plt.show()

    # Components plot (trend / seasonalities)
    components_fig = model.plot_components(forecast, figsize=(12, 8))
    # y slightly above 1 keeps the suptitle clear of the top subplot
    components_fig.suptitle(f'{pretty_name} Components', y=1.02)
    plt.show()

In [None]:
# Hold-out evaluation: for each dataset with more than 30 observations, refit
# on the first 80% of rows and score on the remaining 20%. Datasets that are
# too short are recorded as None. Uses `config` from the training cell.
evaluation_results = {}

for dataset_name, results in forecast_results.items():
    history = results['training_data']
    n_points = len(history)

    # Guard: too little data to split meaningfully
    if not n_points > 30:
        print(f"Insufficient data for evaluation ({n_points} points)")
        evaluation_results[dataset_name] = None
        continue

    # Chronological 80/20 split (rows are taken in their existing order)
    split_at = int(n_points * 0.8)
    train_data, test_data = history[:split_at], history[split_at:]

    # Fit a separate model on the training portion only
    eval_model = train_prophet_model(train_data, config)

    # Score it against the held-out tail
    metrics = evaluate_forecast_model(eval_model, train_data, test_data, model_type='prophet')
    evaluation_results[dataset_name] = metrics

    print("Model Evaluation Metrics:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name.upper()}: {metric_value:.3f}")

In [None]:
# Persist every fitted model together with its forecast. Output names are
# "<dataset>_<YYYYMMDD>", stamped with the date this cell runs.
timestamp = pd.Timestamp.now().strftime("%Y%m%d")

for dataset_name, results in forecast_results.items():
    save_forecast_results(
        results['model'],
        results['forecast'],
        f'{dataset_name}_{timestamp}',
    )

In [None]:
# Time-series cross-validation per dataset. Datasets that are too short, or
# whose cross-validation raises, are recorded as None.
# Uses `config` from the training cell.
cv_results = {}

for dataset_name, results in forecast_results.items():
    training_data = results['training_data']

    # Minimum history depends on frequency: 15 points (years) for the annual
    # WHO series, 365 points for everything else.
    if dataset_name == 'who_suicides':
        min_required = 15
    else:
        min_required = 365

    n_points = len(training_data)
    if n_points <= min_required:
        print(f"Need more data for cross-validation ({n_points}/{min_required} points)")
        cv_results[dataset_name] = None
        continue

    try:
        cv_output, cv_metrics = cross_validate_timeseries(training_data, config)
        cv_results[dataset_name] = cv_metrics

        print("Cross-validation metrics summary:")
        print(cv_metrics[['mape', 'rmse']].describe())
    except Exception as e:
        # Best effort: report the failure and continue with the next dataset
        print(f"Cross-validation failed: {e}")
        cv_results[dataset_name] = None