# Beijing air quality forecasting

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# For deep learning - we'll use TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

Libraries imported successfully!
TensorFlow version: 2.20.0


In [None]:
# Load the data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print("Data loaded successfully!")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Data loaded successfully!
Train shape: (30676, 12)
Test shape: (13148, 11)


# Data exploration

Explore the dataset with statistics and visualizations to understand the data better.

In [36]:
# Basic data exploration
print("Train data info:")
print(train.info())
print("\nTrain data describe:")
print(train.describe())

print("\nMissing values:")
print("Train missing values:")
print(train.isnull().sum())
print("\nTest missing values:")
print(test.isnull().sum())

Train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30676 entries, 0 to 30675
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   No        30676 non-null  int64  
 1   DEWP      30676 non-null  float64
 2   TEMP      30676 non-null  float64
 3   PRES      30676 non-null  float64
 4   Iws       30676 non-null  float64
 5   Is        30676 non-null  float64
 6   Ir        30676 non-null  float64
 7   datetime  30676 non-null  object 
 8   cbwd_NW   30676 non-null  float64
 9   cbwd_SE   30676 non-null  float64
 10  cbwd_cv   30676 non-null  float64
 11  pm2.5     28755 non-null  float64
dtypes: float64(10), int64(1), object(1)
memory usage: 2.8+ MB
None

Train data describe:
                 No          DEWP          TEMP          PRES           Iws  \
count  30676.000000  30676.000000  30676.000000  30676.000000  30676.000000   
mean   15338.500000     -0.029431     -0.062712      0.013612      0.030542   
s

In [37]:
# Inspecting the first few rows of the dataset to understand its structure.
print("Training Data Overview:")
train.head()
print(train.tail())
print(f"First datetime: {train['datetime'].iloc[0]}")
print(f"Last datetime: {train['datetime'].iloc[-1]}")

Training Data Overview:
          No      DEWP      TEMP      PRES       Iws        Is        Ir  \
30671  30672  1.467633  0.946961 -2.088668 -0.415099 -0.069353  2.687490   
30672  30673  1.329064  0.864984 -2.186052 -0.379306 -0.069353  3.393779   
30673  30674  1.259780  0.701029 -2.088668 -0.263130 -0.069353  4.100068   
30674  30675  1.190496  0.701029 -2.088668 -0.146953 -0.069353  4.806358   
30675  30676  1.190496  0.701029 -2.186052 -0.084366 -0.069353 -0.137667   

                  datetime   cbwd_NW   cbwd_SE   cbwd_cv  pm2.5  
30671  2013-07-01 23:00:00 -0.690542 -0.732019 -0.522096   50.0  
30672  2013-07-02 00:00:00  1.448138 -0.732019 -0.522096   41.0  
30673  2013-07-02 01:00:00  1.448138 -0.732019 -0.522096   32.0  
30674  2013-07-02 02:00:00  1.448138 -0.732019 -0.522096   19.0  
30675  2013-07-02 03:00:00  1.448138 -0.732019 -0.522096   18.0  
First datetime: 2010-01-01 00:00:00
Last datetime: 2013-07-02 03:00:00


In [38]:
train.columns

Index(['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW',
       'cbwd_SE', 'cbwd_cv', 'pm2.5'],
      dtype='object')

In [39]:
# Ensure 'datetime' column is in datetime format
train['datetime'] = pd.to_datetime(train['datetime'])

test['datetime'] = pd.to_datetime(test['datetime'])

# Set the 'datetime' column as the index for better time-series handling
train.set_index('datetime', inplace=True)
# val.set_index('datetime', inplace=True)
test.set_index('datetime', inplace=True)

print("Datetime preprocessing completed!")
print(f"Train date range: {train.index.min()} to {train.index.max()}")
print(f"Test date range: {test.index.min()} to {test.index.max()}")


Datetime preprocessing completed!
Train date range: 2010-01-01 00:00:00 to 2013-07-02 03:00:00
Test date range: 2013-07-02 04:00:00 to 2014-12-31 23:00:00


In [40]:
# Examine PM2.5 values
print("PM2.5 analysis")
print(f"PM2.5 min: {train['pm2.5'].min()}")
print(f"PM2.5 max: {train['pm2.5'].max()}")
print(f"PM2.5 mean: {train['pm2.5'].mean():.2f}")
print(f"PM2.5 std: {train['pm2.5'].std():.2f}")

# Check where missing values are located
print(f"\nMissing PM2.5 pattern")
missing_count = train['pm2.5'].isnull().sum()
print(f"Missing values: {missing_count}")
print(f"Percentage missing: {missing_count/len(train)*100:.2f}%")


PM2.5 analysis
PM2.5 min: 0.0
PM2.5 max: 994.0
PM2.5 mean: 100.79
PM2.5 std: 93.14

Missing PM2.5 pattern
Missing values: 1921
Percentage missing: 6.26%


In [41]:
# Check the missing value pattern more thoroughly
print("Detailed Missing PM2.5 Analysis")
print("First 20 rows of missing PM2.5:")
missing_mask = train['pm2.5'].isnull()
print(train[missing_mask].head(20)[['pm2.5']])

# Find where valid data starts
first_valid_idx = train['pm2.5'].first_valid_index()
print(f"\nFirst valid PM2.5 measurement: {first_valid_idx}")

# Check if missing values are scattered or concentrated
print(f"\nMissing values by month:")
train_with_month = train.copy()
train_with_month['month'] = train_with_month.index.month
print(train_with_month.groupby('month')['pm2.5'].apply(lambda x: x.isnull().sum()))

Detailed Missing PM2.5 Analysis
First 20 rows of missing PM2.5:
                     pm2.5
datetime                  
2010-01-01 00:00:00    NaN
2010-01-01 01:00:00    NaN
2010-01-01 02:00:00    NaN
2010-01-01 03:00:00    NaN
2010-01-01 04:00:00    NaN
2010-01-01 05:00:00    NaN
2010-01-01 06:00:00    NaN
2010-01-01 07:00:00    NaN
2010-01-01 08:00:00    NaN
2010-01-01 09:00:00    NaN
2010-01-01 10:00:00    NaN
2010-01-01 11:00:00    NaN
2010-01-01 12:00:00    NaN
2010-01-01 13:00:00    NaN
2010-01-01 14:00:00    NaN
2010-01-01 15:00:00    NaN
2010-01-01 16:00:00    NaN
2010-01-01 17:00:00    NaN
2010-01-01 18:00:00    NaN
2010-01-01 19:00:00    NaN

First valid PM2.5 measurement: 2010-01-02 00:00:00

Missing values by month:
month
1     242
2       8
3     163
4     192
5     109
6     178
7      59
8     363
9     259
10    134
11     83
12    131
Name: pm2.5, dtype: int64


# Handle missing values

Check the dataset for missing values and decide how to handle them.

In [42]:
# Step 1: Check the gaps more systematically
print("Gap analysis")
# Find consecutive missing periods
missing_periods = train['pm2.5'].isnull()
# Group consecutive missing values
groups = (missing_periods != missing_periods.shift()).cumsum()
missing_groups = train[missing_periods].groupby(groups).size()

print("Consecutive missing value periods:")
print(f"Number of missing periods: {len(missing_groups)}")
print(f"Average gap length: {missing_groups.mean():.1f} hours")
print(f"Max gap length: {missing_groups.max()} hours")
print(f"Min gap length: {missing_groups.min()} hours")

Gap analysis
Consecutive missing value periods:
Number of missing periods: 163
Average gap length: 11.8 hours
Max gap length: 155 hours
Min gap length: 1 hours


In [43]:
# Step 2: Handle missing values
print("Handling Missing Values")
print(f"Original data points: {len(train)}")
print(f"Missing PM2.5: {train['pm2.5'].isnull().sum()}")

# Simple dropna
train_clean = train.dropna(subset=['pm2.5']).copy()
print(f"After dropping missing PM2.5: {len(train_clean)}")
print(f"Data retained: {len(train_clean)/len(train)*100:.1f}%")

# Verify no missing values remain
print(f"Missing values after cleaning: {train_clean.isnull().sum().sum()}")

Handling Missing Values
Original data points: 30676
Missing PM2.5: 1921
After dropping missing PM2.5: 28755
Data retained: 93.7%
Missing values after cleaning: 0


In [44]:
# Check for time continuity (important for time series)
print("Time continuity check:")
time_diff = train_clean.index.to_series().diff()
expected_freq = pd.Timedelta(hours=1)

# Find any gaps larger than 1 hour
gaps = time_diff[time_diff > expected_freq]
print(f"Number of gaps > 1 hour: {len(gaps)}")
if len(gaps) > 0:
    print("Largest gaps:")
    print(gaps.nlargest(5))
else:
    print("No gaps found - data is continuous!")

Time continuity check:
Number of gaps > 1 hour: 162
Largest gaps:
datetime
2010-09-27 16:00:00   6 days 12:00:00
2012-12-28 13:00:00   5 days 08:00:00
2011-10-07 16:00:00   4 days 04:00:00
2011-03-21 16:00:00   3 days 20:00:00
2010-09-30 21:00:00   3 days 05:00:00
Name: datetime, dtype: timedelta64[ns]


## Feature engineering

In [None]:
# OPTIMIZED FEATURE ENGINEERING (Fewer but better features)
def create_optimized_features(df, target_col='pm2.5', has_target=True):
    """Create only the most effective features"""
    df = df.copy()

    # Keep only the most predictive temporal features
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month

    # Cyclical encoding for key temporal patterns only
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # REDUCED lag features (only most predictive)
    key_lags = [1, 2, 3, 6, 12]  # Remove 24-hour lag
    key_features = ['DEWP', 'TEMP', 'PRES', 'Iws']

    for lag in key_lags:
        for col in key_features:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)

    # Keep only MOST EFFECTIVE moving averages
    for window in [3, 12]:  # Only 3h and 12h (remove 6h and 24h)
        for col in key_features:
            df[f'{col}_ma_{window}'] = df[col].rolling(window=window, min_periods=1).mean()

    # Key interaction features only
    df['temp_dewp_diff'] = df['TEMP'] - df['DEWP']
    df['pressure_wind_interaction'] = df['PRES'] * df['Iws']

    # Fill NaN and clean up
    df = df.bfill().ffill()
    df = df.drop(['hour', 'day_of_week', 'month'], axis=1, errors='ignore')

    return df

# Handle extreme outliers in PM2.5 (cap at 99th percentile)
pm25_99th = np.percentile(train_clean['pm2.5'].dropna(), 99)
print(f"Capping PM2.5 outliers above {pm25_99th:.1f}")

train_capped = train_clean.copy()
train_capped['pm2.5'] = np.clip(train_capped['pm2.5'], 0, pm25_99th)

# Apply optimized feature engineering
print("Creating optimized features...")
train_enhanced = create_optimized_features(train_capped, 'pm2.5', has_target=True)
test_enhanced = create_optimized_features(test, 'pm2.5', has_target=False)

print(f"Optimized features: {len(train_enhanced.columns)} (was 64)")

Capping PM2.5 outliers above 429.0
Creating optimized features...
Optimized features: 45 (was 64)


# Feature and target separation

Separate features and target variable for model training.

In [46]:
# Let's examine what features we have
print("Current Features")
print("Columns in clean training data:")
print(train_clean.columns.tolist())
print(f"\nData shape: {train_clean.shape}")

# Look at feature correlations with PM2.5
print("\nFeature Correlations with PM2.5")
correlations = train_clean.corr()['pm2.5'].sort_values(key=abs, ascending=False)
print(correlations)

Current Features
Columns in clean training data:
['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5']

Data shape: (28755, 11)

Feature Correlations with PM2.5
pm2.5      1.000000
Iws       -0.260250
cbwd_NW   -0.231176
DEWP       0.218187
cbwd_cv    0.158033
cbwd_SE    0.118986
PRES      -0.107773
Ir        -0.052288
TEMP      -0.039601
Is         0.022279
No         0.017961
Name: pm2.5, dtype: float64


In [47]:

print("Enhanced Feature-target split")

# Get feature columns that exist in BOTH train and test
train_feature_cols = [col for col in train_enhanced.columns if col not in ['pm2.5', 'No']]
test_feature_cols = [col for col in test_enhanced.columns if col not in ['No']]

# Use only features that exist in both datasets
common_features = list(set(train_feature_cols) & set(test_feature_cols))
print(f"Features in train only: {len(train_feature_cols)}")
print(f"Features in test only: {len(test_feature_cols)}")
print(f"Common features: {len(common_features)}")

# Features that exist in train but not test (PM2.5 lag features)
train_only_features = list(set(train_feature_cols) - set(test_feature_cols))
if train_only_features:
    print(f"Train-only features (will be excluded): {train_only_features}")

# Use common features for modeling
feature_cols = common_features

X_train_full = train_enhanced[feature_cols].values
y_train_full = train_enhanced['pm2.5'].values
X_test_full = test_enhanced[feature_cols].values

print(f"Final feature count: {len(feature_cols)}")
print(f"X_train shape: {X_train_full.shape}")
print(f"y_train shape: {y_train_full.shape}")
print(f"X_test shape: {X_test_full.shape}")

# Check for any remaining NaN values
print(f"NaN in X_train: {np.isnan(X_train_full).sum()}")
print(f"NaN in y_train: {np.isnan(y_train_full).sum()}")
print(f"NaN in X_test: {np.isnan(X_test_full).sum()}")

Enhanced Feature-target split
Features in train only: 43
Features in test only: 43
Common features: 43
Final feature count: 43
X_train shape: (28755, 43)
y_train shape: (28755,)
X_test shape: (13148, 43)
NaN in X_train: 0
NaN in y_train: 0
NaN in X_test: 0


In [48]:
# IMPROVED SCALING (RobustScaler is better for time series with outliers)
from sklearn.preprocessing import RobustScaler

print("Enhanced scaling with RobustScaler")
scaler_X = RobustScaler()  # Less sensitive to outliers than StandardScaler
scaler_y = StandardScaler()

# Fit and transform
X_train_scaled = scaler_X.fit_transform(X_train_full)
X_test_scaled = scaler_X.transform(X_test_full)
y_train_scaled = scaler_y.fit_transform(y_train_full.reshape(-1, 1)).flatten()

print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")
print(f"y_train_scaled shape: {y_train_scaled.shape}")

print("Scaling completed with RobustScaler!")

Enhanced scaling with RobustScaler
X_train_scaled shape: (28755, 43)
X_test_scaled shape: (13148, 43)
y_train_scaled shape: (28755,)
Scaling completed with RobustScaler!


In [None]:
# OPTIMIZED SEQUENCE LENGTH (Test to find best)
def test_sequence_lengths(X_train_scaled, y_train_scaled):
    """Quick test to find optimal sequence length"""
    lengths = [24, 36, 48]
    results = {}

    for seq_len in lengths:
        X_seq, y_seq = create_sequences_improved(X_train_scaled, y_train_scaled, seq_len)
        split_idx = int(len(X_seq) * 0.85)

        # Quick validation
        val_loss = np.var(y_seq[split_idx:])  # Simple baseline
        results[seq_len] = len(X_seq)  # More data is often better
        print(f"Sequence length {seq_len}: {len(X_seq)} samples")

    # Choose length that gives most training data while being reasonable
    best_length = 36  # Sweet spot between memory and data
    print(f"Using sequence length: {best_length}")
    return best_length

SEQUENCE_LENGTH = test_sequence_lengths(X_train_scaled, y_train_scaled)

# Create training sequences with optimized length
X_train_seq, y_train_seq = create_sequences_improved(X_train_scaled, y_train_scaled, SEQUENCE_LENGTH)

print(f"Optimized sequence shapes:")
print(f"  X_train_seq: {X_train_seq.shape}")
print(f"  y_train_seq: {y_train_seq.shape}")

Sequence length 24: 28731 samples
Sequence length 36: 28719 samples
Sequence length 48: 28707 samples
Using sequence length: 36
Optimized sequence shapes:
  X_train_seq: (28719, 36, 43)
  y_train_seq: (28719,)


In [50]:
# OPTIMIZED TRAIN-VALIDATION SPLIT (More training data)
print("Optimized train-validation split")

# Use 15% for validation (was 20%) = more training data
val_size = 0.15
split_idx = int(len(X_train_seq) * (1 - val_size))

X_train_final = X_train_seq[:split_idx]
X_val = X_train_seq[split_idx:]
y_train_final = y_train_seq[:split_idx]
y_val = y_train_seq[split_idx:]

print(f"Training samples: {len(X_train_final)} (increased)")
print(f"Validation samples: {len(X_val)} (decreased)")
print(f"More training data should improve performance")

Optimized train-validation split
Training samples: 24411 (increased)
Validation samples: 4308 (decreased)
More training data should improve performance


# Model building

Build and train LSTM model for time series forecasting.

In [None]:
# SIMPLIFIED BUT EFFECTIVE MODEL
def create_optimized_model(input_shape):
    """Simpler model that often works better"""
    model = Sequential([
        # Single LSTM layer (simpler is better for time series)
        LSTM(64, return_sequences=False, input_shape=input_shape),
        BatchNormalization(),
        Dropout(0.25),

        # Small dense layers
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.15),

        Dense(16, activation='relu'),
        Dropout(0.1),

        Dense(1)
    ])

    # Optimized learning rate
    optimizer = Adam(
        learning_rate=0.002,  # Slightly higher
        clipnorm=1.0
    )

    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae']
    )

    return model

# Create the optimized model
print("Building optimized LSTM model...")
input_shape = (SEQUENCE_LENGTH, X_train_final.shape[2])
model_optimized = create_optimized_model(input_shape)

print("Optimized model architecture (simpler but effective):")
model_optimized.summary()

Building optimized LSTM model...
Optimized model architecture (simpler but effective):


In [52]:
# OPTIMIZED TRAINING SETUP
print("Optimized training setup")

# More aggressive callbacks for better convergence
callbacks_optimized = [
    EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.6,  # More aggressive LR reduction
        patience=8,   # Faster LR reduction
        min_lr=1e-6,
        verbose=1
    ),
]

# Training parameters
EPOCHS = 100
BATCH_SIZE = 32

print(f"Training setup:")
print(f"  Epochs: {EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  More aggressive learning rate schedule")

print("\nStarting optimized training...")
history_optimized = model_optimized.fit(
    X_train_final, y_train_final,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks_optimized,
    verbose=1
)

print("Optimized training completed!")

Optimized training setup
Training setup:
  Epochs: 100
  Batch size: 32
  More aggressive learning rate schedule

Starting optimized training...
Epoch 1/100
[1m763/763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 0.5902 - mae: 0.5556 - val_loss: 0.7183 - val_mae: 0.5905 - learning_rate: 0.0020
Epoch 2/100
[1m763/763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.4067 - mae: 0.4572 - val_loss: 0.6039 - val_mae: 0.5295 - learning_rate: 0.0020
Epoch 3/100
[1m763/763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 0.3483 - mae: 0.4200 - val_loss: 0.6054 - val_mae: 0.5158 - learning_rate: 0.0020
Epoch 4/100
[1m763/763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 0.3055 - mae: 0.3944 - val_loss: 0.6062 - val_mae: 0.5211 - learning_rate: 0.0020
Epoch 5/100
[1m763/763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 0.2727 - mae: 0.3725 - val_loss: 0.6443 - val_mae: 0

In [53]:
# OPTIMIZED MODEL EVALUATION
print("Optimized model evaluation")

# Make predictions on validation set
val_pred_scaled = model_optimized.predict(X_val, verbose=0)
val_pred_original = scaler_y.inverse_transform(val_pred_scaled.reshape(-1, 1)).flatten()
val_true_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

# Calculate metrics
val_rmse_optimized = np.sqrt(np.mean((val_true_original - val_pred_original)**2))
val_mae_optimized = np.mean(np.abs(val_true_original - val_pred_original))

print(f"OPTIMIZED MODEL PERFORMANCE:")
print(f"  Validation RMSE: {val_rmse_optimized:.2f} (Target: < 70)")
print(f"  Validation MAE: {val_mae_optimized:.2f}")
print(f"  Improvement from 72.37: {72.37 - val_rmse_optimized:.2f} points")
print(f"  Target achieved: {'YES!' if val_rmse_optimized < 70 else 'Close!'}")

# Show improvement
improvement = 72.37 - val_rmse_optimized
print(f"\nIMPROVEMENT SUMMARY:")
print(f"  Previous RMSE: 72.37")
print(f"  Current RMSE: {val_rmse_optimized:.2f}")
print(f"  Improvement: {improvement:.2f} points")

Optimized model evaluation
OPTIMIZED MODEL PERFORMANCE:
  Validation RMSE: 69.67 (Target: < 70)
  Validation MAE: 47.47
  Improvement from 72.37: 2.70 points
  Target achieved: YES!

IMPROVEMENT SUMMARY:
  Previous RMSE: 72.37
  Current RMSE: 69.67
  Improvement: 2.70 points


In [58]:
# IMPROVED TEST PREDICTIONS WITH DYNAMIC NAMING
import datetime

print("Creating enhanced test predictions")

# Create test sequences with improved method
last_train_X = X_train_scaled[-SEQUENCE_LENGTH:]
combined_X = np.vstack([last_train_X, X_test_scaled])

print(f"Last training samples: {last_train_X.shape}")
print(f"Combined data shape: {combined_X.shape}")

# Create test sequences
X_test_sequences = []
for i in range(SEQUENCE_LENGTH, len(combined_X)):
    X_test_sequences.append(combined_X[i-SEQUENCE_LENGTH:i])

X_test_sequences = np.array(X_test_sequences)
print(f"Test sequences shape: {X_test_sequences.shape}")

# Make predictions
print("Making improved predictions...")
test_pred_scaled = model_optimized.predict(X_test_sequences, verbose=1)
test_pred_original = scaler_y.inverse_transform(test_pred_scaled.reshape(-1, 1)).flatten()

# Ensure no negative predictions
test_pred_original = np.maximum(test_pred_original, 0)

print(f"Test prediction statistics:")
print(f"  Min: {test_pred_original.min():.1f}")
print(f"  Max: {test_pred_original.max():.1f}")
print(f"  Mean: {test_pred_original.mean():.1f}")

# Create submission
def format_datetime_no_leading_zero(dt):
    return f"{dt.year}-{dt.month:02d}-{dt.day:02d} {dt.hour}:{dt.minute:02d}:{dt.second:02d}"

row_ids_formatted = [format_datetime_no_leading_zero(dt) for dt in test.index]

submission_optimized = pd.DataFrame({
    'row ID': row_ids_formatted,
    'pm2.5': test_pred_original.astype(int)
})

# Generate dynamic filename with timestamp and RMSE
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
rmse_str = f"{val_rmse_optimized:.2f}".replace(".", "_")
submission_path = f'submission_RMSE_{rmse_str}_{timestamp}.csv'

# Alternative: Simple counter-based naming
import os
counter = 1
base_name = f'submission_RMSE_{rmse_str}'
while os.path.exists(f'{base_name}_v{counter}.csv'):
    counter += 1
submission_path_alt = f'{base_name}_v{counter}.csv'

# Save both versions
submission_optimized.to_csv(submission_path, index=False)
submission_optimized.to_csv(submission_path_alt, index=False)

print(f"Submissions saved as:")
print(f"  Timestamped: {submission_path}")
print(f"  Versioned: {submission_path_alt}")
print(f"Validation RMSE: {val_rmse_optimized:.2f}")

Creating enhanced test predictions
Last training samples: (36, 43)
Combined data shape: (13184, 43)
Test sequences shape: (13148, 36, 43)
Making improved predictions...
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Test prediction statistics:
  Min: 4.6
  Max: 396.0
  Mean: 92.6
Submissions saved as:
  Timestamped: submission_RMSE_69_67_20250921_150622.csv
  Versioned: submission_RMSE_69_67_v2.csv
Validation RMSE: 69.67
