# 01 - Multivariate Time Series Data Exploration

This notebook focuses on:
1. Loading and exploring multivariate time series datasets
2. Understanding multiple features per time step
3. Feature relationships and correlations
4. Time series characteristics and patterns
5. Data quality assessment and preprocessing


In [None]:
# Import libraries (note: all Python warnings are suppressed for the rest of the session)
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta

# Set style
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so use whichever name this installation offers
# and fall back to Matplotlib's default style if neither is present.
seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(seaborn_style)
sns.set_palette("husl")

print("✅ All imports completed successfully!")


In [None]:
# Dataset Selection and Loading
# Chooses one monitoring station and assembles `dataset_info`, a dict holding
# the CSV path, feature list, target column and time column for that station.
print("=== MULTIVARIATE TIME SERIES DATASET SELECTION ===")

# Available monitoring stations, mapped to their CSV file names.
# Every file follows the same naming pattern, so the mapping is generated from
# the station names instead of repeating the date range for each entry.
station_names = [
    'Aotizhongxin', 'Changping', 'Dingling', 'Dongsi',
    'Guanyuan', 'Gucheng', 'Huairou', 'Nongzhanguan',
    'Shunyi', 'Tiantan', 'Wanliu', 'Wanshouxigong',
]
stations = {
    name: f'PRSA_Data_{name}_20130301-20170228.csv'
    for name in station_names
}

# Dataset configuration
dataset_config = {
    'name': 'Beijing Multi-Site Air-Quality Data',
    'features': ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM'],
    'target': 'PM2.5',
    'time_col': 'datetime'
}

print("Available monitoring stations:")
for i, station_name in enumerate(stations, 1):
    print(f"{i:2d}. {station_name}")
print()

# Select station (default to Aotizhongxin)
selected_station = 'Aotizhongxin'  # Change this to select a different station

# Fail early with a readable message instead of a bare KeyError on a typo.
if selected_station not in stations:
    raise ValueError(
        f"Unknown station {selected_station!r}; choose one of: {', '.join(stations)}"
    )
selected_file = stations[selected_station]

print(f"Selected station: {selected_station}")
print(f"File: {selected_file}")
print(f"Target variable: {dataset_config['target']}")
print(f"Number of features: {len(dataset_config['features'])}")

# Create dataset info for compatibility
dataset_info = {
    'name': f"{dataset_config['name']} - {selected_station}",
    'file': f'../data_multivariate/{selected_file}',
    'features': dataset_config['features'],
    'target': dataset_config['target'],
    'time_col': dataset_config['time_col'],
    'station': selected_station
}


In [None]:
# Load the selected dataset
# Reads the CSV named by dataset_info['file'] into `df` and prints a quick
# overview: shape, columns, dtypes, first rows and summary statistics.
print(f"=== LOADING {dataset_info['name'].upper()} ===")

# Drop any `df` left over from an earlier run of this cell. Without this, a
# failed load would leave the old frame in place and the preprocessing cell's
# `'df' in locals()` check would silently process stale data.
try:
    del df
except NameError:
    pass

try:
    # Load the dataset
    df = pd.read_csv(dataset_info['file'])
    print("✅ Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Display basic info
    print("\nDataset Info:")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()

    # Display first few rows
    print("\nFirst 5 rows:")
    print(df.head())

    # Display basic statistics
    print("\nBasic Statistics:")
    print(df.describe())

except FileNotFoundError:
    print(f"❌ File not found: {dataset_info['file']}")
    print("Please run the download script first:")
    print("python ../data_multivariate/download_multivariate_data.py")
except Exception as e:
    # Best-effort: report the problem and leave `df` undefined so that the
    # guarded cells below skip their work instead of crashing.
    print(f"❌ Error loading dataset: {e}")


In [None]:
# Data Preprocessing and Time Series Setup
# Builds a datetime index from the year/month/day/hour columns, reports missing
# values, checks which configured features exist, and creates `df_features`.
print("=== DATA PREPROCESSING AND TIME SERIES SETUP ===")


def select_feature_columns(columns, features, target):
    """Split configured features by availability and build the matrix column list.

    Args:
        columns: Column labels present in the dataset.
        features: Configured feature names.
        target: Name of the target column.

    Returns:
        Tuple ``(available, missing, matrix_cols)``: the features found in
        ``columns``, the features not found, and ``available`` followed by
        ``target`` with duplicates removed (first occurrence kept).
    """
    present = set(columns)
    available = [f for f in features if f in present]
    missing = [f for f in features if f not in present]
    # The target may already be listed as a feature; appending it blindly
    # would select the same column twice and yield duplicate column labels.
    matrix_cols = list(dict.fromkeys(available + [target]))
    return available, missing, matrix_cols


if 'df' in locals() and not df.empty:
    # Handle time column
    time_col = dataset_info['time_col']

    # Convert to datetime (Beijing air quality data has separate year, month, day, hour columns)
    date_parts = ['year', 'month', 'day', 'hour']
    if all(part in df.columns for part in date_parts):
        df[time_col] = pd.to_datetime(df[date_parts])
        df = df.set_index(time_col)
        print("✅ Time columns converted to datetime and set as index")
        print(f"Date range: {df.index.min()} to {df.index.max()}")
        print(f"Time span: {df.index.max() - df.index.min()}")
    else:
        print("❌ Time columns not found in dataset")
        print(f"Available columns: {list(df.columns)}")

    # Check for missing values
    print("\nMissing Values:")
    missing_values = df.isnull().sum()
    missing_percent = (missing_values / len(df)) * 100

    missing_df = pd.DataFrame({
        'Missing Count': missing_values,
        'Missing Percentage': missing_percent
    })

    # Show only the columns that have gaps; skip the table when there are none.
    if missing_df['Missing Count'].sum() == 0:
        print("✅ No missing values found!")
    else:
        print(missing_df[missing_df['Missing Count'] > 0])

    # Select features and target
    features = dataset_info['features']
    target = dataset_info['target']

    # Check if features exist in dataset
    available_features, missing_features, feature_cols = select_feature_columns(
        df.columns, features, target
    )

    print("\nFeature Availability:")
    print(f"Available features: {len(available_features)}/{len(features)}")
    print(f"Available: {available_features}")
    if missing_features:
        print(f"Missing: {missing_features}")

    # Check target variable
    if target in df.columns:
        print(f"✅ Target variable '{target}' found")
        # Print the summary on its own lines so the Series repr stays aligned.
        print("Target statistics:")
        print(df[target].describe())
    else:
        print(f"❌ Target variable '{target}' not found")
        print(f"Available columns: {list(df.columns)}")

    # Create feature matrix
    if available_features and target in df.columns:
        df_features = df[feature_cols].copy()

        print(f"\n✅ Feature matrix created with {len(feature_cols)} columns")
        print(f"Feature matrix shape: {df_features.shape}")
    else:
        print("❌ Cannot create feature matrix - missing features or target")
        df_features = df.copy()

else:
    print("❌ No dataset loaded. Please run the previous cell first.")
