# Beijing air quality forecasting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# For deep learning - we'll use TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

Libraries imported successfully!
TensorFlow version: 2.20.0


In [None]:
# Load the data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print("Data loaded successfully!")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Data loaded successfully!
Train shape: (30676, 12)
Test shape: (13148, 11)


# Data exploration

Explore the dataset with statistics and visualizations to understand the data better.

In [3]:
# Basic data exploration
print("Train data info:")
print(train.info())
print("\nTrain data describe:")
print(train.describe())

print("\nMissing values:")
print("Train missing values:")
print(train.isnull().sum())
print("\nTest missing values:")
print(test.isnull().sum())

Train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30676 entries, 0 to 30675
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   No        30676 non-null  int64  
 1   DEWP      30676 non-null  float64
 2   TEMP      30676 non-null  float64
 3   PRES      30676 non-null  float64
 4   Iws       30676 non-null  float64
 5   Is        30676 non-null  float64
 6   Ir        30676 non-null  float64
 7   datetime  30676 non-null  object 
 8   cbwd_NW   30676 non-null  float64
 9   cbwd_SE   30676 non-null  float64
 10  cbwd_cv   30676 non-null  float64
 11  pm2.5     28755 non-null  float64
dtypes: float64(10), int64(1), object(1)
memory usage: 2.8+ MB
None

Train data describe:
                 No          DEWP          TEMP          PRES           Iws  \
count  30676.000000  30676.000000  30676.000000  30676.000000  30676.000000   
mean   15338.500000     -0.029431     -0.062712      0.013612      0.030542   
s

In [5]:
# Inspecting the first few rows of the dataset to understand its structure.
print("Training Data Overview:")
train.head()
print(train.tail())
print(f"First datetime: {train['datetime'].iloc[0]}")
print(f"Last datetime: {train['datetime'].iloc[-1]}")

Training Data Overview:
          No      DEWP      TEMP      PRES       Iws        Is        Ir  \
30671  30672  1.467633  0.946961 -2.088668 -0.415099 -0.069353  2.687490   
30672  30673  1.329064  0.864984 -2.186052 -0.379306 -0.069353  3.393779   
30673  30674  1.259780  0.701029 -2.088668 -0.263130 -0.069353  4.100068   
30674  30675  1.190496  0.701029 -2.088668 -0.146953 -0.069353  4.806358   
30675  30676  1.190496  0.701029 -2.186052 -0.084366 -0.069353 -0.137667   

                  datetime   cbwd_NW   cbwd_SE   cbwd_cv  pm2.5  
30671  2013-07-01 23:00:00 -0.690542 -0.732019 -0.522096   50.0  
30672  2013-07-02 00:00:00  1.448138 -0.732019 -0.522096   41.0  
30673  2013-07-02 01:00:00  1.448138 -0.732019 -0.522096   32.0  
30674  2013-07-02 02:00:00  1.448138 -0.732019 -0.522096   19.0  
30675  2013-07-02 03:00:00  1.448138 -0.732019 -0.522096   18.0  
First datetime: 2010-01-01 00:00:00
Last datetime: 2013-07-02 03:00:00


In [6]:
train.columns

Index(['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW',
       'cbwd_SE', 'cbwd_cv', 'pm2.5'],
      dtype='object')

In [7]:
# Ensure 'datetime' column is in datetime format
train['datetime'] = pd.to_datetime(train['datetime'])

test['datetime'] = pd.to_datetime(test['datetime'])

# Set the 'datetime' column as the index for better time-series handling
train.set_index('datetime', inplace=True)
# val.set_index('datetime', inplace=True)
test.set_index('datetime', inplace=True)

print("Datetime preprocessing completed!")
print(f"Train date range: {train.index.min()} to {train.index.max()}")
print(f"Test date range: {test.index.min()} to {test.index.max()}")


Datetime preprocessing completed!
Train date range: 2010-01-01 00:00:00 to 2013-07-02 03:00:00
Test date range: 2013-07-02 04:00:00 to 2014-12-31 23:00:00


In [8]:
# Examine PM2.5 values
print("PM2.5 analysis")
print(f"PM2.5 min: {train['pm2.5'].min()}")
print(f"PM2.5 max: {train['pm2.5'].max()}")
print(f"PM2.5 mean: {train['pm2.5'].mean():.2f}")
print(f"PM2.5 std: {train['pm2.5'].std():.2f}")

# Check where missing values are located
print(f"\nMissing PM2.5 pattern")
missing_count = train['pm2.5'].isnull().sum()
print(f"Missing values: {missing_count}")
print(f"Percentage missing: {missing_count/len(train)*100:.2f}%")


PM2.5 analysis
PM2.5 min: 0.0
PM2.5 max: 994.0
PM2.5 mean: 100.79
PM2.5 std: 93.14

Missing PM2.5 pattern
Missing values: 1921
Percentage missing: 6.26%


In [9]:
# Check the missing value pattern more thoroughly
print("Detailed Missing PM2.5 Analysis")
print("First 20 rows of missing PM2.5:")
missing_mask = train['pm2.5'].isnull()
print(train[missing_mask].head(20)[['pm2.5']])

# Find where valid data starts
first_valid_idx = train['pm2.5'].first_valid_index()
print(f"\nFirst valid PM2.5 measurement: {first_valid_idx}")

# Check if missing values are scattered or concentrated
print(f"\nMissing values by month:")
train_with_month = train.copy()
train_with_month['month'] = train_with_month.index.month
print(train_with_month.groupby('month')['pm2.5'].apply(lambda x: x.isnull().sum()))

Detailed Missing PM2.5 Analysis
First 20 rows of missing PM2.5:
                     pm2.5
datetime                  
2010-01-01 00:00:00    NaN
2010-01-01 01:00:00    NaN
2010-01-01 02:00:00    NaN
2010-01-01 03:00:00    NaN
2010-01-01 04:00:00    NaN
2010-01-01 05:00:00    NaN
2010-01-01 06:00:00    NaN
2010-01-01 07:00:00    NaN
2010-01-01 08:00:00    NaN
2010-01-01 09:00:00    NaN
2010-01-01 10:00:00    NaN
2010-01-01 11:00:00    NaN
2010-01-01 12:00:00    NaN
2010-01-01 13:00:00    NaN
2010-01-01 14:00:00    NaN
2010-01-01 15:00:00    NaN
2010-01-01 16:00:00    NaN
2010-01-01 17:00:00    NaN
2010-01-01 18:00:00    NaN
2010-01-01 19:00:00    NaN

First valid PM2.5 measurement: 2010-01-02 00:00:00

Missing values by month:
month
1     242
2       8
3     163
4     192
5     109
6     178
7      59
8     363
9     259
10    134
11     83
12    131
Name: pm2.5, dtype: int64


# Handle missing values

Check the dataset for missing values and decide how to handle them.

In [10]:
# Step 1: Check the gaps more systematically
print("Gap analysis")
# Find consecutive missing periods
missing_periods = train['pm2.5'].isnull()
# Group consecutive missing values
groups = (missing_periods != missing_periods.shift()).cumsum()
missing_groups = train[missing_periods].groupby(groups).size()

print("Consecutive missing value periods:")
print(f"Number of missing periods: {len(missing_groups)}")
print(f"Average gap length: {missing_groups.mean():.1f} hours")
print(f"Max gap length: {missing_groups.max()} hours")
print(f"Min gap length: {missing_groups.min()} hours")

Gap analysis
Consecutive missing value periods:
Number of missing periods: 163
Average gap length: 11.8 hours
Max gap length: 155 hours
Min gap length: 1 hours


In [11]:
# Step 2: Handle missing values
print("Handling Missing Values")
print(f"Original data points: {len(train)}")
print(f"Missing PM2.5: {train['pm2.5'].isnull().sum()}")

# Simple dropna
train_clean = train.dropna(subset=['pm2.5']).copy()
print(f"After dropping missing PM2.5: {len(train_clean)}")
print(f"Data retained: {len(train_clean)/len(train)*100:.1f}%")

# Verify no missing values remain
print(f"Missing values after cleaning: {train_clean.isnull().sum().sum()}")

Handling Missing Values
Original data points: 30676
Missing PM2.5: 1921
After dropping missing PM2.5: 28755
Data retained: 93.7%
Missing values after cleaning: 0


In [12]:
# Check for time continuity (important for time series)
print("Time continuity check:")
time_diff = train_clean.index.to_series().diff()
expected_freq = pd.Timedelta(hours=1)

# Find any gaps larger than 1 hour
gaps = time_diff[time_diff > expected_freq]
print(f"Number of gaps > 1 hour: {len(gaps)}")
if len(gaps) > 0:
    print("Largest gaps:")
    print(gaps.nlargest(5))
else:
    print("No gaps found - data is continuous!")

Time continuity check:
Number of gaps > 1 hour: 162
Largest gaps:
datetime
2010-09-27 16:00:00   6 days 12:00:00
2012-12-28 13:00:00   5 days 08:00:00
2011-10-07 16:00:00   4 days 04:00:00
2011-03-21 16:00:00   3 days 20:00:00
2010-09-30 21:00:00   3 days 05:00:00
Name: datetime, dtype: timedelta64[ns]


## Feature engineering

In [13]:
# CORRECTED ENHANCED FEATURE ENGINEERING
def create_enhanced_features(df, target_col='pm2.5', has_target=True):
    """Create additional features for better prediction"""
    df = df.copy()

    # Temporal features with cyclical encoding
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month

    # Cyclical encoding (important for LSTM)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Weather interaction features
    df['temp_dewp_diff'] = df['TEMP'] - df['DEWP']
    df['wind_pressure_ratio'] = df['Iws'] / (df['PRES'] + 1e-8)  # Avoid division by zero

    # Moving averages (capture trends)
    for window in [3, 6, 12, 24]:
        for col in ['DEWP', 'TEMP', 'PRES', 'Iws']:
            df[f'{col}_ma_{window}'] = df[col].rolling(window=window, min_periods=1).mean()

    # Lag features (very important for time series)
    for lag in [1, 2, 3, 6, 12, 24]:
        for col in ['DEWP', 'TEMP', 'PRES', 'Iws']:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)

    # ONLY create target lag features if target exists (training data)
    if has_target and target_col in df.columns:
        for lag in [1, 2, 3, 6, 12]:
            df[f'{target_col}_lag_{lag}'] = df[target_col].shift(lag)

    # Fill NaN values created by lag features
    df = df.bfill().ffill()

    # Drop original categorical hour/day columns
    df = df.drop(['hour', 'day_of_week', 'month'], axis=1, errors='ignore')

    return df

# Apply enhanced feature engineering with correct flags
print("Creating enhanced features...")
train_enhanced = create_enhanced_features(train_clean, 'pm2.5', has_target=True)   # Has PM2.5
test_enhanced = create_enhanced_features(test, 'pm2.5', has_target=False)          # No PM2.5

print(f"Original train features: {len(train_clean.columns)}")
print(f"Enhanced train features: {len(train_enhanced.columns)}")
print(f"Enhanced test features: {len(test_enhanced.columns)}")

Creating enhanced features...
Original train features: 11
Enhanced train features: 64
Enhanced test features: 58


# Feature and target separation

Separate features and target variable for model training.

In [14]:
# Let's examine what features we have
print("Current Features")
print("Columns in clean training data:")
print(train_clean.columns.tolist())
print(f"\nData shape: {train_clean.shape}")

# Look at feature correlations with PM2.5
print("\nFeature Correlations with PM2.5")
correlations = train_clean.corr()['pm2.5'].sort_values(key=abs, ascending=False)
print(correlations)

Current Features
Columns in clean training data:
['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5']

Data shape: (28755, 11)

Feature Correlations with PM2.5
pm2.5      1.000000
Iws       -0.260250
cbwd_NW   -0.231176
DEWP       0.218187
cbwd_cv    0.158033
cbwd_SE    0.118986
PRES      -0.107773
Ir        -0.052288
TEMP      -0.039601
Is         0.022279
No         0.017961
Name: pm2.5, dtype: float64


In [16]:

print("Enhanced Feature-target split")

# Get feature columns that exist in BOTH train and test
train_feature_cols = [col for col in train_enhanced.columns if col not in ['pm2.5', 'No']]
test_feature_cols = [col for col in test_enhanced.columns if col not in ['No']]

# Use only features that exist in both datasets
common_features = list(set(train_feature_cols) & set(test_feature_cols))
print(f"Features in train only: {len(train_feature_cols)}")
print(f"Features in test only: {len(test_feature_cols)}")
print(f"Common features: {len(common_features)}")

# Features that exist in train but not test (PM2.5 lag features)
train_only_features = list(set(train_feature_cols) - set(test_feature_cols))
if train_only_features:
    print(f"Train-only features (will be excluded): {train_only_features}")

# Use common features for modeling
feature_cols = common_features

X_train_full = train_enhanced[feature_cols].values
y_train_full = train_enhanced['pm2.5'].values
X_test_full = test_enhanced[feature_cols].values

print(f"Final feature count: {len(feature_cols)}")
print(f"X_train shape: {X_train_full.shape}")
print(f"y_train shape: {y_train_full.shape}")
print(f"X_test shape: {X_test_full.shape}")

# Check for any remaining NaN values
print(f"NaN in X_train: {np.isnan(X_train_full).sum()}")
print(f"NaN in y_train: {np.isnan(y_train_full).sum()}")
print(f"NaN in X_test: {np.isnan(X_test_full).sum()}")

Enhanced Feature-target split
Features in train only: 62
Features in test only: 57
Common features: 57
Train-only features (will be excluded): ['pm2.5_lag_12', 'pm2.5_lag_2', 'pm2.5_lag_3', 'pm2.5_lag_1', 'pm2.5_lag_6']
Final feature count: 57
X_train shape: (28755, 57)
y_train shape: (28755,)
X_test shape: (13148, 57)
NaN in X_train: 0
NaN in y_train: 0
NaN in X_test: 0


In [17]:
# IMPROVED SCALING (RobustScaler is better for time series with outliers)
from sklearn.preprocessing import RobustScaler

print("Enhanced scaling with RobustScaler")
scaler_X = RobustScaler()  # Less sensitive to outliers than StandardScaler
scaler_y = StandardScaler()

# Fit and transform
X_train_scaled = scaler_X.fit_transform(X_train_full)
X_test_scaled = scaler_X.transform(X_test_full)
y_train_scaled = scaler_y.fit_transform(y_train_full.reshape(-1, 1)).flatten()

print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")
print(f"y_train_scaled shape: {y_train_scaled.shape}")

print("Scaling completed with RobustScaler!")

Enhanced scaling with RobustScaler
X_train_scaled shape: (28755, 57)
X_test_scaled shape: (13148, 57)
y_train_scaled shape: (28755,)
Scaling completed with RobustScaler!


In [18]:
# IMPROVED SEQUENCE CREATION (longer sequences capture more patterns)
SEQUENCE_LENGTH = 48  # Increased from 24 to 48 hours (captures daily + weekly patterns)

def create_sequences_improved(X, y, sequence_length):
    """Create sequences with better memory efficiency"""
    X_seq, y_seq = [], []

    for i in range(sequence_length, len(X)):
        X_seq.append(X[i-sequence_length:i])
        y_seq.append(y[i])

    return np.array(X_seq), np.array(y_seq)

print("Creating improved sequences...")
print(f"Using sequence length: {SEQUENCE_LENGTH} hours (was 24)")

# Create training sequences
X_train_seq, y_train_seq = create_sequences_improved(X_train_scaled, y_train_scaled, SEQUENCE_LENGTH)

print(f"Sequence shapes:")
print(f"  X_train_seq: {X_train_seq.shape}")
print(f"  y_train_seq: {y_train_seq.shape}")

print(f"Samples lost due to sequence creation: {len(X_train_scaled) - len(X_train_seq)}")
print(f"Remaining training samples: {len(X_train_seq)}")

Creating improved sequences...
Using sequence length: 48 hours (was 24)
Sequence shapes:
  X_train_seq: (28707, 48, 57)
  y_train_seq: (28707,)
Samples lost due to sequence creation: 48
Remaining training samples: 28707


In [19]:
# IMPROVED TRAIN-VALIDATION SPLIT (time-aware)
print("Enhanced train-validation split")

# Use 20% for validation (temporal split - last 20% of data)
val_size = 0.20
split_idx = int(len(X_train_seq) * (1 - val_size))

X_train_final = X_train_seq[:split_idx]
X_val = X_train_seq[split_idx:]
y_train_final = y_train_seq[:split_idx]
y_val = y_train_seq[split_idx:]

print(f"Training samples: {len(X_train_final)}")
print(f"Validation samples: {len(X_val)}")
print(f"Validation split: {len(X_val) / len(X_train_seq) * 100:.1f}%")

print(f"Final shapes for training:")
print(f"  X_train: {X_train_final.shape}")
print(f"  y_train: {y_train_final.shape}")
print(f"  X_val: {X_val.shape}")
print(f"  y_val: {y_val.shape}")

Enhanced train-validation split
Training samples: 22965
Validation samples: 5742
Validation split: 20.0%
Final shapes for training:
  X_train: (22965, 48, 57)
  y_train: (22965,)
  X_val: (5742, 48, 57)
  y_val: (5742,)


# Model building

Build and train LSTM model for time series forecasting.

In [20]:
# IMPROVED MODEL ARCHITECTURE
from tensorflow.keras.layers import BatchNormalization, GRU

def create_improved_lstm_model(input_shape):
    """Create enhanced LSTM model with better architecture"""
    model = Sequential([
        # First LSTM layer with more units
        LSTM(96, return_sequences=True, input_shape=input_shape),
        BatchNormalization(),  # Stabilizes training
        Dropout(0.3),

        # Second LSTM layer
        LSTM(64, return_sequences=True),
        BatchNormalization(),
        Dropout(0.3),

        # Third LSTM layer (final)
        LSTM(32, return_sequences=False),
        BatchNormalization(),
        Dropout(0.3),

        # Dense layers with batch normalization
        Dense(48, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(24, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(12, activation='relu'),
        Dropout(0.1),

        # Output layer
        Dense(1)
    ])

    # Enhanced optimizer with gradient clipping
    optimizer = Adam(
        learning_rate=0.001,
        clipnorm=1.0  # Prevents exploding gradients
    )

    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae', lambda y, y_pred: tf.sqrt(tf.reduce_mean(tf.square(y - y_pred)))]
    )

    return model

# Create the improved model
print("Building improved LSTM model...")
input_shape = (SEQUENCE_LENGTH, X_train_final.shape[2])
model_improved = create_improved_lstm_model(input_shape)

print("Enhanced model architecture:")
model_improved.summary()

Building improved LSTM model...
Enhanced model architecture:


In [21]:
# ENHANCED TRAINING SETUP
from tensorflow.keras.callbacks import ModelCheckpoint

print("Enhanced training setup")

# Better callbacks
callbacks_improved = [
    EarlyStopping(
        monitor='val_loss',
        patience=25,  # Increased patience
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=12,  # Reduce LR more aggressively
        min_lr=1e-7,
        verbose=1
    ),
]

# Training parameters
EPOCHS = 120  # More epochs with early stopping
BATCH_SIZE = 32

print(f"Training setup:")
print(f"  Epochs: {EPOCHS} (with early stopping)")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Training samples: {len(X_train_final)}")
print(f"  Validation samples: {len(X_val)}")

print("\nStarting enhanced training...")
history_improved = model_improved.fit(
    X_train_final, y_train_final,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks_improved,
    verbose=1
)

print("Enhanced training completed!")

Enhanced training setup
Training setup:
  Epochs: 120 (with early stopping)
  Batch size: 32
  Training samples: 22965
  Validation samples: 5742

Starting enhanced training...
Epoch 1/120
[1m718/718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 40ms/step - lambda: 1.1520 - loss: 0.9592 - mae: 0.7098 - val_lambda: 0.8405 - val_loss: 0.7591 - val_mae: 0.5747 - learning_rate: 0.0010
Epoch 2/120
[1m718/718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step - lambda: 1.1375 - loss: 0.5328 - mae: 0.5198 - val_lambda: 0.8408 - val_loss: 0.7229 - val_mae: 0.5597 - learning_rate: 0.0010
Epoch 3/120
[1m718/718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 39ms/step - lambda: 1.1728 - loss: 0.4394 - mae: 0.4631 - val_lambda: 0.8569 - val_loss: 0.7040 - val_mae: 0.5366 - learning_rate: 0.0010
Epoch 4/120
[1m718/718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 40ms/step - lambda: 1.1882 - loss: 0.3900 - mae: 0.4341 - val_lambda: 0.8517 - val_loss: 0.71

In [None]:
# ENHANCED MODEL EVALUATION
print("Enhanced model evaluation")

# Make predictions on validation set
val_pred_scaled = model_improved.predict(X_val, verbose=0)
val_pred_original = scaler_y.inverse_transform(val_pred_scaled.reshape(-1, 1)).flatten()
val_true_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

# Calculate metrics
val_rmse_improved = np.sqrt(np.mean((val_true_original - val_pred_original)**2))
val_mae_improved = np.mean(np.abs(val_true_original - val_pred_original))

print(f"IMPROVED MODEL PERFORMANCE:")
print(f"  Validation RMSE: {val_rmse_improved:.2f} (Target: < 70)")
print(f"  Validation MAE: {val_mae_improved:.2f}")
print(f"  Improvement target: {'ACHIEVED!' if val_rmse_improved < 70 else 'Need more work'}")

# Prediction statistics
print(f"\nPrediction statistics:")
print(f"  Min prediction: {val_pred_original.min():.1f}")
print(f"  Max prediction: {val_pred_original.max():.1f}")
print(f"  Mean prediction: {val_pred_original.mean():.1f}")
print(f"  Std prediction: {val_pred_original.std():.1f}")

print(f"\nActual statistics:")
print(f"  Min actual: {val_true_original.min():.1f}")
print(f"  Max actual: {val_true_original.max():.1f}")
print(f"  Mean actual: {val_true_original.mean():.1f}")
print(f"  Std actual: {val_true_original.std():.1f}")

Enhanced model evaluation
IMPROVED MODEL PERFORMANCE:
  Validation RMSE: 72.37 (Target: < 70)
  Validation MAE: 45.93
  Improvement target: Need more work

Prediction statistics:
  Min prediction: 20.3
  Max prediction: 469.2
  Mean prediction: 105.9
  Std prediction: 80.5

Actual statistics:
  Min actual: 4.0
  Max actual: 886.0
  Mean actual: 111.6
  Std actual: 106.9


In [23]:
# IMPROVED TEST PREDICTIONS
print("Creating enhanced test predictions")

# Create test sequences with improved method
last_train_X = X_train_scaled[-SEQUENCE_LENGTH:]
combined_X = np.vstack([last_train_X, X_test_scaled])

print(f"Last training samples: {last_train_X.shape}")
print(f"Combined data shape: {combined_X.shape}")

# Create test sequences
X_test_sequences = []
for i in range(SEQUENCE_LENGTH, len(combined_X)):
    X_test_sequences.append(combined_X[i-SEQUENCE_LENGTH:i])

X_test_sequences = np.array(X_test_sequences)
print(f"Test sequences shape: {X_test_sequences.shape}")

# Make predictions
print("Making improved predictions...")
test_pred_scaled = model_improved.predict(X_test_sequences, verbose=1)
test_pred_original = scaler_y.inverse_transform(test_pred_scaled.reshape(-1, 1)).flatten()

# Ensure no negative predictions
test_pred_original = np.maximum(test_pred_original, 0)

print(f"Test prediction statistics:")
print(f"  Min: {test_pred_original.min():.1f}")
print(f"  Max: {test_pred_original.max():.1f}")
print(f"  Mean: {test_pred_original.mean():.1f}")

# Create submission
def format_datetime_no_leading_zero(dt):
    return f"{dt.year}-{dt.month:02d}-{dt.day:02d} {dt.hour}:{dt.minute:02d}:{dt.second:02d}"

row_ids_formatted = [format_datetime_no_leading_zero(dt) for dt in test.index]

submission_improved = pd.DataFrame({
    'row ID': row_ids_formatted,
    'pm2.5': test_pred_original.astype(int)
})

# Save submission
submission_path = 'improved_submission.csv'
submission_improved.to_csv(submission_path, index=False)
print(f"Improved submission saved as: {submission_path}")
print(f"Expected RMSE improvement: {val_rmse_improved:.2f} (was ~76)")

Creating enhanced test predictions
Last training samples: (48, 57)
Combined data shape: (13196, 57)
Test sequences shape: (13148, 48, 57)
Making improved predictions...
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step
Test prediction statistics:
  Min: 19.0
  Max: 468.1
  Mean: 93.2
Improved submission saved as: improved_submission.csv
Expected RMSE improvement: 72.37 (was ~76)
