In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('./data/history_data.csv')

# Display the first few rows of the data
data.head()


Unnamed: 0,DateTime,AQI,CO,NO,NO2,O3,SO2,PM2.5,PM10,NH3
0,2020-11-25 01:00:00,5,2296.45,0.1,63.06,12.16,25.99,417.26,457.27,4.81
1,2020-11-25 02:00:00,5,2323.15,0.73,71.97,10.01,29.8,415.69,457.57,6.59
2,2020-11-25 03:00:00,5,2616.88,8.16,87.74,15.38,37.19,430.46,477.49,8.36
3,2020-11-25 04:00:00,5,2216.34,8.27,80.2,62.23,54.84,383.96,422.14,5.07
4,2020-11-25 05:00:00,5,1895.9,9.72,71.97,85.12,50.55,297.8,331.37,8.49


In [2]:
# Drop non-numeric columns (e.g., DateTime)
data = data.select_dtypes(include=[np.number])

# Fill missing values in numeric columns with the mean
data.fillna(data.mean(), inplace=True)

# List of pollutants
pollutants = ['PM10', 'PM2.5', 'CO', 'SO2', 'NO2', 'O3']

# Dictionary to store scaled data for each pollutant
scalers = {}
scaled_data = {}

# Scale the data for each pollutant individually
for pollutant in pollutants:
    scaler = MinMaxScaler()
    scaled_data[pollutant] = scaler.fit_transform(data[[pollutant]])
    scalers[pollutant] = scaler  # Save the scaler for inverse transform later

# Display scaled data for PM10
pd.DataFrame(scaled_data['PM10'], columns=['PM10']).head()

Unnamed: 0,PM10
0,0.95482
1,0.954847
2,0.956666
3,0.951612
4,0.943323


In [5]:
SEQ_LENGTH = 60  # Number of time steps for each sequence

# Function to create sequences for a specific pollutant
def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])  # The sequence of 60 time steps
        labels.append(data[i + seq_length])  # The value to predict at time step t+1
    return np.array(sequences), np.array(labels)

# Create sequences and labels for each pollutant
sequences = {}
labels = {}
for pollutant in pollutants:
    sequences[pollutant], labels[pollutant] = create_sequences(scaled_data[pollutant], SEQ_LENGTH)

# Verify shape of sequences and labels for PM10
print(sequences['PM10'].shape, labels['PM10'].shape)


(32599, 60, 1) (32599, 1)


In [6]:
from sklearn.model_selection import train_test_split

# Dictionary to store training and testing data for each pollutant
train_test_data = {}

# Split the data into training and testing sets for each pollutant
for pollutant in pollutants:
    X_train, X_test, y_train, y_test = train_test_split(sequences[pollutant], labels[pollutant], test_size=0.2, random_state=42)
    train_test_data[pollutant] = (X_train, X_test, y_train, y_test)

# Verify shape for PM10's training and test sets
X_train_pm10, X_test_pm10, y_train_pm10, y_test_pm10 = train_test_data['PM10']
print(X_train_pm10.shape, y_train_pm10.shape, X_test_pm10.shape, y_test_pm10.shape)


(26079, 60, 1) (26079, 1) (6520, 60, 1) (6520, 1)


In [7]:
# Function to create an LSTM model for each pollutant
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(128, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))  # Predicting a single pollutant value
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Create models for each pollutant
models = {}
for pollutant in pollutants:
    input_shape = (SEQ_LENGTH, 1)  # Each pollutant has one feature (its own values)
    models[pollutant] = create_lstm_model(input_shape)
    models[pollutant].summary()

# Train each model for 25 epochs
history = {}
for pollutant in pollutants:
    X_train, X_test, y_train, y_test = train_test_data[pollutant]
    history[pollutant] = models[pollutant].fit(X_train, y_train, epochs=25, batch_size=64, validation_data=(X_test, y_test))


  super().__init__(**kwargs)


Epoch 1/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 47ms/step - loss: 0.0372 - val_loss: 2.2933e-04
Epoch 2/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 47ms/step - loss: 0.0039 - val_loss: 8.7605e-05
Epoch 3/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 47ms/step - loss: 0.0033 - val_loss: 3.5699e-04
Epoch 4/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - loss: 0.0030 - val_loss: 1.1618e-04
Epoch 5/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 47ms/step - loss: 0.0026 - val_loss: 7.3610e-05
Epoch 6/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - loss: 0.0023 - val_loss: 9.0014e-05
Epoch 7/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 48ms/step - loss: 0.0019 - val_loss: 7.6567e-05
Epoch 8/25
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 49ms/step - loss: 0.0016 - val_loss: 1.0

In [9]:
# Function to predict the next 8 hours for each pollutant
def predict_next_8_hours(model, scaler, current_values, seq_length=SEQ_LENGTH, forecast_hours=8):
    predictions = []
    current_input = np.reshape(current_values, (1, seq_length, 1))  # Reshape directly

    for _ in range(forecast_hours):
        predicted_value = model.predict(current_input)
        predicted_value = scaler.inverse_transform(predicted_value)
        predictions.append(predicted_value[0][0])

        # Update input for the next prediction
        current_input = np.append(current_input[:, 1:, :], predicted_value.reshape(1, 1, 1), axis=1)

    return predictions

# Example current values (last 60 values for each pollutant)
current_values_pm10 = scaled_data['PM10'][-60:]
current_values_pm25 = scaled_data['PM2.5'][-60:]
current_values_co = scaled_data['CO'][-60:]
current_values_so2 = scaled_data['SO2'][-60:]
current_values_no2 = scaled_data['NO2'][-60:]
current_values_o3 = scaled_data['O3'][-60:]

# Predict next 8 hours for PM10
predictions_pm10 = predict_next_8_hours(models['PM10'], scalers['PM10'], current_values_pm10)
print(f"Predicted PM10 values for the next 8 hours: {predictions_pm10}")

# Predict for all other pollutants
predictions_pm25 = predict_next_8_hours(models['PM2.5'], scalers['PM2.5'], current_values_pm25)
predictions_co = predict_next_8_hours(models['CO'], scalers['CO'], current_values_co)
predictions_so2 = predict_next_8_hours(models['SO2'], scalers['SO2'], current_values_so2)
predictions_no2 = predict_next_8_hours(models['NO2'], scalers['NO2'], current_values_no2)
predictions_o3 = predict_next_8_hours(models['O3'], scalers['O3'], current_values_o3)

# Display predictions for all pollutants
print(f"Predicted PM2.5 values: {predictions_pm25}")
print(f"Predicted CO values: {predictions_co}")
print(f"Predicted SO2 values: {predictions_so2}")
print(f"Predicted NO2 values: {predictions_no2}")
print(f"Predicted O3 values: {predictions_o3}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Predicted PM10 values for the next 8 hours: [73.169815, 11355.776, 13673.027, 15223.809, 15907.783, 16305.9795, 16527.76, 16662.3]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1