In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

**I could not download the CPCB data, so I am implementing LSTM forecasting on this Kaggle dataset instead.**

In [None]:
# Read the training CSV and discard its 'date' column in one chained step.
df_train = (
    pd.read_csv("/kaggle/input/lstm-datasets-multivariate-univariate/LSTM-Multivariate_pollution.csv")
    .drop(columns=["date"])
)

# (rows, columns) of the training frame
print(df_train.shape)

df_train.head()

In [None]:
# Read the test CSV as-is (no columns are dropped for this split).
df_test = pd.read_csv(
    "/kaggle/input/lstm-datasets-multivariate-univariate/pollution_test_data1.csv"
)

# (rows, columns) of the test frame
print(df_test.shape)

df_test.head()

In [None]:
# Per-column count of missing values, train first and test second,
# separated by a divider line.
print(
    df_train.isna().sum(),
    "\n -------- \n",
    df_test.isna().sum(),
)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 4x2 grid with one titled subplot per variable
fig = make_subplots(
    rows=4,
    cols=2,
    subplot_titles=('Pollution', 'Dew', 'Temperature', 'Pressure', 'Wind Direction', 'Wind Speed', 'Snow', 'Rain'),
)

# Colour palette, one entry per subplot
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880']

# Columns in the order they fill the grid (left-to-right, top-to-bottom);
# this matches the order of the subplot titles above.
hist_columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']

for position, (column, color) in enumerate(zip(hist_columns, colors)):
    # Convert the flat position into 1-based (row, col) grid coordinates.
    grid_row, grid_col = divmod(position, 2)
    grid_row += 1
    grid_col += 1

    fig.add_trace(
        go.Histogram(x=df_train[column], nbinsx=20, marker_color=color),
        row=grid_row,
        col=grid_col,
    )
    fig.update_xaxes(title_text='Value', row=grid_row, col=grid_col)
    fig.update_yaxes(title_text='Frequency', row=grid_row, col=grid_col)

# Overall figure size and title; the per-trace legend is hidden
fig.update_layout(height=1000, width=1200, title_text='Distribution of Variables', showlegend=False)

# Font size of the subplot titles (they are stored as annotations)
fig.update_annotations(font_size=16)

# Render the figure
fig.show()

In [None]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
values = df_train.values

# Positional indices (into df_train.columns) of the columns to plot
groups = [1, 2, 3]

# One stacked axis per selected column. plt.subplots returns a bare Axes
# (not an array) when only one row is requested, so np.atleast_1d keeps the
# loop valid for a single-entry `groups` as well.
fig, axs = plt.subplots(len(groups), 1, figsize=(20, 14), facecolor='white')
for group, ax in zip(groups, np.atleast_1d(axs)):
    # Colour is picked from viridis according to the column index.
    ax.plot(values[:, group], color=cm.viridis(group/len(groups)))
    ax.set_title(df_train.columns[group], y=0.75, loc='right', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.grid(True)

plt.suptitle('Time Series Plot of Selected Variables', fontsize=24)
plt.tight_layout()
plt.show()

In [None]:
# Work on copies so the frames loaded earlier stay untouched.
df_train_scaled = df_train.copy()
df_test_scaled = df_test.copy()

# Integer code assigned to each wind-direction label.
# NOTE(review): this is an ordinal encoding of a categorical feature, so the
# numeric order is arbitrary — consider one-hot encoding; confirm intent.
mapping = {'NE': 0, 'SE': 1, 'NW': 2, 'cv': 3}


def _encode_wnd_dir(labels, codes):
    """Map wind-direction labels to their integer codes.

    Parameters
    ----------
    labels : pandas.Series
        Column holding the raw labels.
    codes : dict
        Label -> integer code.

    Returns
    -------
    pandas.Series
        `labels` with every value replaced by its code.

    Raises
    ------
    ValueError
        If a non-null label has no entry in `codes`. `Series.map` would
        otherwise turn it into NaN without any warning.
    """
    encoded = labels.map(codes)
    # Only values that were present before mapping and are missing afterwards
    # are a mapping failure; pre-existing nulls are left alone.
    unknown = labels[encoded.isna() & labels.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped wnd_dir values: {sorted(map(str, unknown))}")
    return encoded


# Replace the string values with numerical values
df_train_scaled['wnd_dir'] = _encode_wnd_dir(df_train_scaled['wnd_dir'], mapping)
df_test_scaled['wnd_dir'] = _encode_wnd_dir(df_test_scaled['wnd_dir'], mapping)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to the 0-1 range
columns = [
    'pollution', 'dew', 'temp', 'press',
    'wnd_spd', 'snow', 'rain', "wnd_dir",
]

# The scaler learns its min/max from the training frame only; the same fitted
# scaler is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(df_train_scaled[columns])

df_train_scaled[columns] = scaler.transform(df_train_scaled[columns])
df_test_scaled[columns] = scaler.transform(df_test_scaled[columns])


# Preview the scaled training data
df_train_scaled.head()

In [None]:
# Define the window size
window_size = 10


def make_windows(frame, window, feature_cols, target_col='pollution'):
    """Cut a frame into overlapping input windows and next-step targets.

    Sample ``k`` uses rows ``k .. k + window - 1`` of `feature_cols` as input
    and the value of `target_col` at row ``k + window`` as target.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source rows, already numeric.
    window : int
        Number of consecutive rows per input sample.
    feature_cols : list of str
        Columns (and their order) placed on the last axis of the inputs.
    target_col : str, default 'pollution'
        Column predicted one step after each window.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window, len(feature_cols))
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples`` is ``max(len(frame) - window, 0)``.
    """
    # Convert once up front; slicing a NumPy array is far cheaper than
    # calling DataFrame.iloc inside the loop.
    features = frame[feature_cols].to_numpy(dtype=np.float64)
    target = frame[target_col].to_numpy(dtype=np.float64)

    n_samples = max(len(frame) - window, 0)
    if n_samples == 0:
        # Keep the rank consistent even when the frame is too short.
        return np.empty((0, window, len(feature_cols))), np.empty((0,))

    X = np.stack([features[start:start + window] for start in range(n_samples)])
    y = target[window:]
    return X, y


# Both splits are read with the training column order so that feature k
# means the same thing in X_train and X_test, regardless of how the test
# file orders its columns.
feature_columns = list(df_train_scaled.columns)

#-------------------------TrainSet---------------------------------
X_train, y_train = make_windows(df_train_scaled, window_size, feature_columns)

#-------------------------TestSet---------------------------------
X_test, y_test = make_windows(df_test_scaled, window_size, feature_columns)

#-----------------------------------------------------------------
print("Train size : " , X_train.shape , y_train.shape,"\n ------- \n"
      "Test Size : ",X_test.shape , y_test.shape)

In [None]:
import torch
import torch.nn as nn

# Convert features and targets to float32 tensors; targets become column
# vectors of shape (n_samples, 1).
# torch.as_tensor accepts NumPy arrays and tensors alike, so re-running this
# cell after the names already hold tensors does not fail.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_test = torch.as_tensor(y_test, dtype=torch.float32).reshape(-1, 1)

In [None]:
# Define LSTM Model

class LSTM(nn.Module):
    """Stacked LSTM whose last time-step output feeds a linear layer.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Width of each LSTM layer's hidden state.
    num_layers : int
        Number of stacked LSTM layers.
    output_size : int
        Number of values produced per sample.
    """

    def __init__(self , input_size , hidden_size , num_layers , output_size):
        super().__init__()
        self.input = input_size
        self.output = output_size
        self.H = hidden_size
        self.L = num_layers

        # batch_first=True: inputs are laid out as (batch, seq_len, input_size)
        self.LSTM = nn.LSTM(input_size=self.input, hidden_size=self.H,
                            num_layers=self.L, batch_first=True)

        self.fc = nn.Linear(self.H, self.output)

    def forward(self , x):
        """Return predictions of shape (batch, output_size).

        `x` must be (batch, seq_len, input_size).
        """
        # No initial state is passed: nn.LSTM then starts from zero hidden and
        # cell states created alongside the input, so this method does not
        # depend on any module-level `device` variable.
        out, _ = self.LSTM(x)

        # Only the output at the final time step is fed to the linear layer.
        return self.fc(out[:, -1, :])

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device

In [None]:
num_layer = 16
hidden_size = 32
input_size = 10
output_size = 1

model = LSTM(input_size = input_size , hidden_size= hidden_size,
 num_layers = num_layer, output_size = output_size).to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

print(model)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Define batch size
batch_size = 128

# Define data loaders for training and testing data in batches
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)

test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)

In [None]:
num_epochs = 10

train_losses = []  # To store train losses
test_losses = []   # To store test losses

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    train_loss = 0.0
    
    for i, (inputs, targets) in enumerate(train_loader):
        # Forward pass
        inputs = inputs.reshape(-1, 8, 10).to(device)
        targets = targets.reshape(-1, 1).to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        # Accumulate train loss
        train_loss += loss.item()

    # Calculate and store the average train loss for this epoch
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}")

    # Validation (testing) after each epoch
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.reshape(-1, 8, 10).to(device)
            targets = targets.reshape(-1, 1).to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Accumulate test loss
            test_loss += loss.item()

    # Calculate and store the average test loss for this epoch
    avg_test_loss = test_loss / len(test_loader)
    test_losses.append(avg_test_loss)
    
    print(f"Epoch [{epoch+1}/{num_epochs}] - Test Loss: {avg_test_loss:.4f}")

In [None]:
with torch.no_grad():
    predicted_values = model(X_test.reshape(-1, 8, 10).to(device)).cpu().numpy()

# Create a scatter plot of true values vs. predicted values
plt.figure(figsize=(6, 6))
plt.plot(df_test_scaled["pollution"]  , label='True Values',color='blue')
plt.plot(predicted_values , label='Predicted Values', color='red')
