# 02 - Multivariate Time Series Analysis

This notebook focuses on:
1. Time series decomposition for multiple variables
2. Stationarity testing for multivariate series
3. Cross-correlation analysis between features
4. Granger causality testing
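5. Cointegration analysis (`coint_johansen` is imported, but this step is not yet implemented in the cells below)
6. Feature engineering for multivariate time series (deferred to `03_multivariate_model_training.ipynb`)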


In [None]:
# Import libraries (note: the filter below silences all warnings for the whole notebook)
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss, grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
import os

# Set style
# The bare 'seaborn' style sheet was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is no longer shipped by later releases. Use whichever
# name this matplotlib installation provides so the cell runs on old and new
# versions alike.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

print("✅ All imports completed successfully!")


In [None]:
# Load prepared data from previous notebook
print("=== LOADING PREPARED DATA ===")

# Dataset configuration. Defined before the load so that `target` and
# `feature_columns` exist whether or not the files are found.
target = 'PM2.5'  # Default target for air quality data
feature_columns = []

try:
    # Load the prepared data splits; the first CSV column becomes a parsed date index.
    in_time_data = pd.read_csv('../data_multivariate/in_time_features.csv', index_col=0, parse_dates=True)
    out_of_time_data = pd.read_csv('../data_multivariate/out_of_time_features.csv', index_col=0, parse_dates=True)
except FileNotFoundError:
    print("❌ Prepared data files not found")
    print("Please run 01_multivariate_data_exploration.ipynb first")
    # Later cells check for None before doing any analysis.
    in_time_data = None
    out_of_time_data = None
else:
    print("✅ Data loaded successfully!")
    print(f"In-time data shape: {in_time_data.shape}")
    print(f"Out-of-time data shape: {out_of_time_data.shape}")
    print(f"Date range: {in_time_data.index.min()} to {out_of_time_data.index.max()}")

    # Every column except the target is treated as a feature.
    feature_columns = [col for col in in_time_data.columns if col != target]

    if target not in in_time_data.columns:
        # Without this warning a misnamed target silently turns every column into a feature.
        print(f"⚠️  Target column '{target}' not found in the in-time data; analyses that need the target will be skipped")

    print(f"Features: {len(feature_columns)}")
    print(f"Target: {target}")
    print(f"Feature columns: {feature_columns}")


In [None]:
# Multivariate Time Series Decomposition
def _component_strengths(decomposition, model_type):
    """
    Return (trend, seasonal, residual) variance shares for one decomposition.

    Each share is the variance of a component divided by the variance of the
    observed series. For a multiplicative model the components combine as
    observed = trend * seasonal * resid, so the seasonal and residual parts are
    unit-free factors and cannot be compared with the variance of the observed
    levels. In that case every series is log-transformed first, which turns the
    product into a sum and puts all components on the same scale.

    Parameters
    ----------
    decomposition : result object returned by seasonal_decompose
    model_type : str
        'additive' or 'multiplicative', matching the model that was fitted.

    Returns
    -------
    tuple of three floats: (trend share, seasonal share, residual share)
    """
    components = {
        'observed': decomposition.observed,
        'trend': decomposition.trend,
        'seasonal': decomposition.seasonal,
        'resid': decomposition.resid,
    }
    if model_type == 'multiplicative':
        # The multiplicative model is only fitted on strictly positive data, so the log is defined.
        components = {key: np.log(values) for key, values in components.items()}

    observed_variance = np.var(components['observed'])
    # Trend and residual carry NaNs at the edges of the moving-average window; drop them first.
    return tuple(
        np.var(components[key].dropna()) / observed_variance
        for key in ('trend', 'seasonal', 'resid')
    )


print("=== MULTIVARIATE TIME SERIES DECOMPOSITION ===")

if in_time_data is not None and not in_time_data.empty:
    # Select key variables for decomposition analysis
    key_variables = [target] + feature_columns[:3]  # Target + first 3 features

    print(f"Analyzing decomposition for: {key_variables}")

    # Candidate seasonal periods, expressed in number of observations.
    # NOTE(review): the daily/weekly/monthly reading (24/168/720) assumes hourly sampling — confirm.
    periods_to_test = [24, 168, 720]

    # variable -> {'period_<p>_<model>': {strengths..., 'decomposition': result}}
    decomposition_results = {}

    for variable in key_variables:
        if variable not in in_time_data.columns:
            continue

        print(f"\n--- {variable} Decomposition ---")

        ts_data = in_time_data[variable].dropna()
        if len(ts_data) < 100:  # Need sufficient data
            print(f"Skipping {variable}: insufficient data ({len(ts_data)} points)")
            continue

        variable_results = {}

        for period in periods_to_test:
            # Require at least two full cycles of the candidate period.
            if period >= len(ts_data) // 2:
                print(f"Period {period}: Skipped (too large for data length {len(ts_data)})")
                continue

            # Test both additive and multiplicative models
            for model_type in ['additive', 'multiplicative']:
                try:
                    decomposition = seasonal_decompose(ts_data, model=model_type, period=period)
                    trend_strength, seasonal_strength, residual_strength = _component_strengths(
                        decomposition, model_type
                    )
                except Exception as e:
                    # Best effort: e.g. the multiplicative model rejects zero/negative values.
                    print(f"  Period {period} ({model_type}): Failed - {str(e)[:50]}...")
                    continue

                variable_results[f'period_{period}_{model_type}'] = {
                    'trend_strength': trend_strength,
                    'seasonal_strength': seasonal_strength,
                    'residual_strength': residual_strength,
                    'decomposition': decomposition
                }

                print(f"  Period {period} ({model_type}): Trend={trend_strength:.4f}, Seasonal={seasonal_strength:.4f}, Residual={residual_strength:.4f}")

        decomposition_results[variable] = variable_results

    print(f"\n✅ Decomposition analysis completed for {len(decomposition_results)} variables")

else:
    print("❌ No data available for decomposition analysis")


In [None]:
# Stationarity Testing for Multivariate Series
print("=== STATIONARITY TESTING FOR MULTIVARIATE SERIES ===")

def test_stationarity_multivariate(series, name='Time Series'):
    """
    Test one series for stationarity with the ADF and KPSS tests and print a report.

    The two tests have opposite null hypotheses (ADF: unit root present;
    KPSS: series is stationary), so the decision rule is mirrored: ADF counts
    as "stationary" when its p-value is <= 0.05, KPSS when its p-value is
    >= 0.05.

    Parameters
    ----------
    series : pandas Series
        Values to test; missing values are dropped before testing.
    name : str
        Label used in the printed report.

    Returns
    -------
    dict
        Keys 'adf_statistic', 'adf_pvalue', 'adf_stationary',
        'kpss_statistic', 'kpss_pvalue', 'kpss_stationary'.
    """
    # Both tests run on the same NaN-free values; drop once.
    clean_series = series.dropna()

    print(f'\nResults for {name}:')
    print('-' * 50)

    # ADF Test
    adf_result = adfuller(clean_series)
    print('ADF Statistic:', adf_result[0])
    print('p-value:', adf_result[1])
    print('Critical Values:')
    for key, value in adf_result[4].items():
        print(f'\t{key}: {value}')

    # Small p-value rejects the unit-root null, i.e. evidence of stationarity.
    adf_stationary = adf_result[1] <= 0.05
    if adf_stationary:
        print("ADF Test: Series is stationary")
    else:
        print("ADF Test: Series is non-stationary")

    print('\n' + '-' * 30)

    # KPSS Test
    kpss_result = kpss(clean_series)
    print('KPSS Statistic:', kpss_result[0])
    print('p-value:', kpss_result[1])
    print('Critical Values:')
    for key, value in kpss_result[3].items():
        print(f'\t{key}: {value}')

    # Small p-value rejects the stationarity null, so here a LARGE p-value means stationary.
    kpss_stationary = kpss_result[1] >= 0.05
    if kpss_stationary:
        print("KPSS Test: Series is stationary")
    else:
        print("KPSS Test: Series is non-stationary")

    # Interpretation
    print('\n' + '=' * 50)
    print('INTERPRETATION:')
    if adf_stationary and kpss_stationary:
        print('✅ Both tests agree: Series is STATIONARY')
    elif not adf_stationary and not kpss_stationary:
        print('✅ Both tests agree: Series is NON-STATIONARY')
    elif adf_stationary:
        # ADF finds no unit root but KPSS rejects stationarity:
        # differencing (not detrending) is the indicated remedy.
        print('⚠️  Tests disagree: Series is likely DIFFERENCE-STATIONARY')
    else:
        # KPSS accepts stationarity but ADF cannot reject a unit root:
        # the series is stationary around a trend.
        print('⚠️  Tests disagree: Series is likely TREND-STATIONARY')

    return {
        'adf_statistic': adf_result[0],
        'adf_pvalue': adf_result[1],
        'adf_stationary': adf_stationary,
        'kpss_statistic': kpss_result[0],
        'kpss_pvalue': kpss_result[1],
        'kpss_stationary': kpss_stationary
    }

if in_time_data is not None and not in_time_data.empty:
    # Test stationarity for all variables.
    # Keys are column names for the original series and '<column>_diff' for first differences.
    stationarity_results = {}

    print("Testing stationarity for all variables...")

    # 'number' selects integer and float columns of any width, not just float64/int64.
    numeric_columns = in_time_data.select_dtypes(include='number').columns

    for column in numeric_columns:
        print("\n" + "="*60)
        print(f"TESTING STATIONARITY: {str(column).upper()}")
        print("="*60)

        try:
            stationarity_results[column] = test_stationarity_multivariate(
                in_time_data[column],
                f'{column} (Original)'
            )
        except ValueError as e:
            # A single untestable column (e.g. constant or too short) should not abort the whole cell.
            print(f"Skipping {column}: stationarity test failed - {str(e)[:50]}...")
            continue

        original_result = stationarity_results[column]
        if original_result['adf_stationary'] and original_result['kpss_stationary']:
            continue

        # Test first difference if original is non-stationary
        print("\n" + "="*60)
        print(f"TESTING FIRST DIFFERENCE: {str(column).upper()}")
        print("="*60)

        diff_series = in_time_data[column].diff().dropna()
        if len(diff_series) > 0:
            try:
                stationarity_results[f'{column}_diff'] = test_stationarity_multivariate(
                    diff_series,
                    f'{column} (First Difference)'
                )
            except ValueError as e:
                print(f"Skipping {column} (First Difference): stationarity test failed - {str(e)[:50]}...")

    # Summary of stationarity results
    print("\n" + "="*80)
    print("STATIONARITY TESTING SUMMARY")
    print("="*80)

    summary_columns = ['Variable', 'ADF_Stationary', 'KPSS_Stationary',
                       'ADF_pvalue', 'KPSS_pvalue', 'Overall_Stationary']
    summary_data = [
        {
            'Variable': var_name,
            'ADF_Stationary': results['adf_stationary'],
            'KPSS_Stationary': results['kpss_stationary'],
            'ADF_pvalue': results['adf_pvalue'],
            'KPSS_pvalue': results['kpss_pvalue'],
            'Overall_Stationary': results['adf_stationary'] and results['kpss_stationary']
        }
        for var_name, results in stationarity_results.items()
    ]
    # Explicit columns keep the frame well-formed even when nothing could be tested.
    summary_df = pd.DataFrame(summary_data, columns=summary_columns)

    if summary_df.empty:
        print("No numeric series could be tested")
        stationary_count = 0
        total_count = 0
    else:
        # to_string documents float_format as a one-parameter function.
        print(summary_df.to_string(index=False, float_format='{:.4f}'.format))

        # Count stationary vs non-stationary among the ORIGINAL variables only;
        # first-difference tests are reported separately so they do not inflate the totals.
        is_original = summary_df['Variable'].isin(in_time_data.columns)
        original_rows = summary_df[is_original]
        differenced_rows = summary_df[~is_original]

        stationary_count = int(original_rows['Overall_Stationary'].sum())
        total_count = len(original_rows)

        print("\nStationarity Summary:")
        print(f"Stationary series: {stationary_count}/{total_count}")
        print(f"Non-stationary series: {total_count - stationary_count}/{total_count}")
        if len(differenced_rows) > 0:
            differenced_stationary = int(differenced_rows['Overall_Stationary'].sum())
            print(f"Stationary after first differencing: {differenced_stationary}/{len(differenced_rows)}")

else:
    print("❌ No data available for stationarity testing")


In [None]:
# Cross-Correlation Analysis and Granger Causality Testing
print("=== CROSS-CORRELATION ANALYSIS AND GRANGER CAUSALITY TESTING ===")

if in_time_data is not None and not in_time_data.empty:
    target_available = target in in_time_data.columns
    # Integer/float columns of any width; anything else cannot be correlated.
    numeric_columns = set(in_time_data.select_dtypes(include='number').columns)

    # Cross-correlation analysis
    print("1. CROSS-CORRELATION ANALYSIS")
    print("-" * 40)

    # Calculate correlations between target and features
    if target_available:
        # feature -> Pearson correlation with the target at lag 0 (contemporaneous).
        # Lagged cross-correlations are NOT computed here.
        cross_correlations = {}

        for feature in feature_columns:
            if feature not in in_time_data.columns:
                continue
            if feature not in numeric_columns:
                print(f"  {feature} ↔ {target}: skipped (non-numeric column)")
                continue

            # Keep only rows where both the target and the feature are observed.
            pair = in_time_data[[target, feature]].dropna()
            if len(pair) < 2:
                print(f"  {feature} ↔ {target}: skipped (fewer than 2 overlapping observations)")
                continue

            correlation = np.corrcoef(pair[target], pair[feature])[0, 1]
            if np.isnan(correlation):
                # A constant series has zero variance; NaN would also break the abs() ranking below.
                print(f"  {feature} ↔ {target}: undefined (one of the series is constant)")
                continue

            cross_correlations[feature] = correlation
            print(f"  {feature} ↔ {target}: {correlation:.4f}")

        # Sort by absolute correlation
        sorted_correlations = sorted(cross_correlations.items(), key=lambda x: abs(x[1]), reverse=True)

        print(f"\nTop correlations with {target}:")
        for feature, corr in sorted_correlations[:5]:
            direction = "positive" if corr > 0 else "negative"
            print(f"  {feature}: {corr:.4f} ({direction})")

    # Granger Causality Testing
    print("\n2. GRANGER CAUSALITY TESTING")
    print("-" * 40)

    if target_available and len(feature_columns) > 0:
        print("Testing Granger causality from features to target...")

        # NOTE(review): Granger tests assume stationary inputs; the raw levels are used here
        # regardless of the stationarity results computed earlier in the notebook.
        granger_results = {}
        # Conservative lag selection: at most 10, about one lag per 20 observations, never below 1.
        max_lags = max(1, min(10, len(in_time_data) // 20))

        # Only the FIRST 5 features (column order, not correlation rank) are tested to limit runtime.
        for feature in feature_columns[:5]:
            if feature not in in_time_data.columns:
                continue

            try:
                # Rows where both series are observed, in original time order.
                test_data = in_time_data[[target, feature]].dropna()
                test_data.columns = ['target', 'feature']

                if len(test_data) <= max_lags * 2:  # Need sufficient data
                    print(f"  {feature} → {target}: skipped (only {len(test_data)} overlapping observations)")
                    continue

                # Column order matters: the test asks whether the SECOND column helps predict the first.
                gc_result = grangercausalitytests(test_data[['target', 'feature']],
                                                  maxlag=max_lags,
                                                  verbose=False)

                # F-test p-value for every tested lag (result keys are the lag numbers).
                p_values = [gc_result[lag][0]['ssr_ftest'][1] for lag in sorted(gc_result)]

                if p_values:
                    # Taking the minimum over lags is optimistic: no multiple-comparison correction is applied.
                    min_p_value = min(p_values)
                    granger_results[feature] = {
                        'min_p_value': min_p_value,
                        'significant': min_p_value < 0.05,
                        'p_values': p_values
                    }

                    significance = "Significant" if min_p_value < 0.05 else "Not significant"
                    print(f"  {feature} → {target}: p-value = {min_p_value:.4f} ({significance})")

            except Exception as e:
                print(f"  {feature} → {target}: Test failed - {str(e)[:50]}...")

        # Summary of Granger causality
        if granger_results:
            significant_features = [f for f, r in granger_results.items() if r['significant']]
            print("\nGranger Causality Summary:")
            print(f"  Features with significant causality to {target}: {len(significant_features)}")
            if significant_features:
                print(f"  Significant features: {significant_features}")
            else:
                print("  No significant Granger causality found")

    print("\n✅ Cross-correlation and Granger causality analysis completed")

else:
    print("❌ No data available for cross-correlation analysis")


In [None]:
# Summary and Next Steps
print("=== MULTIVARIATE TIME SERIES ANALYSIS COMPLETE ===")

if in_time_data is not None and not in_time_data.empty:
    print("✅ Multivariate time series analysis completed")
    print(f"✅ Dataset analyzed: {len(in_time_data)} observations")
    print(f"✅ Variables analyzed: {len(in_time_data.columns)}")
    print("✅ Time series decomposition performed")
    print("✅ Stationarity testing completed")
    print("✅ Cross-correlation analysis performed")
    print("✅ Granger causality testing completed")

    print("\n=== KEY FINDINGS ===")

    # Each findings line is printed only if the corresponding results variable
    # exists, i.e. the cell that produces it has been run in this kernel.

    # Stationarity summary
    if 'stationarity_results' in locals():
        # Entries keyed by a data column are tests on the original series; the other
        # entries ('<column>_diff') are first-difference tests. Count them separately
        # so the differenced series do not inflate the denominator.
        original_tests = [r for name, r in stationarity_results.items()
                          if name in in_time_data.columns]
        differenced_tests = [r for name, r in stationarity_results.items()
                             if name not in in_time_data.columns]

        stationary_count = sum(1 for r in original_tests
                               if r['adf_stationary'] and r['kpss_stationary'])
        total_count = len(original_tests)
        print(f"• Stationary series: {stationary_count}/{total_count}")

        if differenced_tests:
            differenced_stationary = sum(1 for r in differenced_tests
                                         if r['adf_stationary'] and r['kpss_stationary'])
            print(f"• Stationary after first differencing: {differenced_stationary}/{len(differenced_tests)}")

    # Cross-correlation summary
    if 'cross_correlations' in locals():
        strong_correlations = sum(1 for corr in cross_correlations.values() if abs(corr) > 0.5)
        print(f"• Strong correlations (|r| > 0.5): {strong_correlations}/{len(cross_correlations)}")

    # Granger causality summary
    if 'granger_results' in locals():
        significant_causality = sum(1 for r in granger_results.values() if r['significant'])
        print(f"• Significant Granger causality: {significant_causality}/{len(granger_results)}")

    print("\n=== NEXT STEPS ===")
    print("1. Proceed to 03_multivariate_model_training.ipynb")
    print("   - VAR (Vector Autoregression) models")
    print("   - VARMA (Vector ARMA) models")
    print("   - Machine learning models with multiple features")
    print("   - Deep learning approaches (LSTM, GRU)")
    print("   - Feature engineering for multivariate forecasting")

    print("\n2. Continue to 04_multivariate_model_evaluation.ipynb")
    print("   - Model comparison and selection")
    print("   - Ensemble methods for multivariate forecasting")
    print("   - Cross-validation for time series")
    print("   - Final model deployment recommendations")

    print("\n🎯 Ready to proceed with multivariate model training!")

else:
    print("❌ Analysis incomplete")
    print("Please ensure the data is loaded correctly from the previous notebook")
