In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
df = pd.read_csv('hour.csv', parse_dates=True)

In [3]:
df.head()

Unnamed: 0,Datetime,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,01-01-2015 01:00,Ahmedabad,,,1.0,40.01,36.37,,1.0,122.07,,0.0,0.0,0.0,,
1,01-01-2015 02:00,Ahmedabad,,,0.02,27.75,19.73,,0.02,85.9,,0.0,0.0,0.0,,
2,01-01-2015 03:00,Ahmedabad,,,0.08,19.32,11.08,,0.08,52.83,,0.0,0.0,0.0,,
3,01-01-2015 04:00,Ahmedabad,,,0.3,16.45,9.2,,0.3,39.53,153.58,0.0,0.0,0.0,,
4,01-01-2015 05:00,Ahmedabad,,,0.12,14.9,7.85,,0.12,32.63,,0.0,0.0,0.0,,


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

In [5]:
# Load data
df = pd.read_csv('hour.csv', parse_dates=['Datetime'])

# Drop rows with NaN values
df_cleaned = df.dropna()

In [6]:
df_cleaned.head()

Unnamed: 0,Datetime,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
50888,25-11-2017 09:00,Amaravati,104.0,148.5,1.93,23.0,13.75,9.8,0.1,15.3,117.62,0.3,10.4,0.23,155.0,Moderate
50889,25-11-2017 10:00,Amaravati,94.5,142.0,1.33,16.25,9.75,9.65,0.1,17.0,136.23,0.28,7.1,0.15,159.0,Moderate
50890,25-11-2017 11:00,Amaravati,82.75,126.5,1.47,14.83,9.07,9.7,0.1,15.4,149.92,0.2,4.55,0.08,173.0,Moderate
50893,25-11-2017 14:00,Amaravati,68.5,117.0,1.35,13.6,8.35,7.4,0.1,21.8,161.7,0.1,2.3,0.0,191.0,Moderate
50894,25-11-2017 15:00,Amaravati,69.25,112.25,1.52,11.8,7.55,9.25,0.1,21.38,161.68,0.1,2.35,0.0,191.0,Moderate


In [7]:
# Split data into train and test sets
train_size = int(len(df_cleaned) * 0.8)
train, test = df_cleaned[:train_size], df_cleaned[train_size:]

In [8]:
# Scaling data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df_cleaned['AQI'].values.reshape(-1,1))
seq_length = 3

In [9]:
# Define function to create sequences
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)

X, y = create_sequences(scaled_data, seq_length)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [10]:
# Define LSTM model
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(seq_length, 1)))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')

In [11]:
# Training LSTM model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model_lstm.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, callbacks=[early_stopping])
predictions_lstm = model_lstm.predict(X_test)
predictions_lstm = scaler.inverse_transform(predictions_lstm).flatten()
y_test = scaler.inverse_transform(y_test).flatten()

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


In [12]:
# Calculate evaluation metrics
rmse_lstm = sqrt(mean_squared_error(y_test, predictions_lstm))
mse_lstm = mean_squared_error(y_test, predictions_lstm)
mae_lstm = mean_absolute_error(y_test, predictions_lstm)
r2_lstm = r2_score(y_test, predictions_lstm)

print('LSTM RMSE:', rmse_lstm)
print('LSTM MSE:', mse_lstm)
print('LSTM MAE:', mae_lstm)
print('LSTM R2 Score:', r2_lstm)

LSTM RMSE: 9.30046946759039
LSTM MSE: 86.49873231758107
LSTM MAE: 3.060911877549179
LSTM R2 Score: 0.9756904166238399


In [13]:
import pickle

# Save the trained LSTM model
with open('aqi_model1.pkl', 'wb') as f:
    pickle.dump(model_lstm, f)

In [14]:
from datetime import timedelta

# Get the last datetime in the dataset
last_datetime = df_cleaned['Datetime'].iloc[-1]

# Generate predictions for the next 100 time steps
future_predictions = []

# Take the last sequence from the test data as the starting point for predictions
current_sequence = X_test[-1]

In [15]:
print(last_datetime)
print(current_sequence)

30-06-2020 17:00
[[0.04447439]
 [0.04447439]
 [0.04447439]]


In [16]:
for i in range(100):
    # Reshape the current sequence for model prediction
    current_sequence_reshaped = current_sequence.reshape(1, seq_length, 1)
    
    # Predict the next AQI value
    next_prediction_scaled = model_lstm.predict(current_sequence_reshaped)[0][0]
    
    # Inverse transform the predicted value to the original scale
    next_prediction = scaler.inverse_transform([[next_prediction_scaled]])[0][0]
    
    # Append the prediction to the list
    future_predictions.append(next_prediction)
    
    # Update the current sequence by removing the first element and appending the predicted value
    current_sequence = np.append(current_sequence[1:], [[next_prediction_scaled]], axis=0)

# Convert future predictions to numpy array
future_predictions = np.array(future_predictions)

# Define AQI buckets
def get_aqi_bucket(aqi_value):
    if aqi_value <= 50:
        return 'Good'
    elif aqi_value <= 100:
        return 'Moderate'
    elif aqi_value <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif aqi_value <= 200:
        return 'Unhealthy'
    elif aqi_value <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

# Print datetime, AQI, and AQI bucket for the next 100 predictions
for i in range(100):
    next_datetime = last_datetime + timedelta(hours=i+1)
    next_aqi = future_predictions[i]
    aqi_bucket = get_aqi_bucket(next_aqi)
    print("Datetime:", next_datetime, "| AQI:", next_aqi, "| AQI Bucket:", aqi_bucket)





TypeError: can only concatenate str (not "datetime.timedelta") to str

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
import pickle

# Load the trained LSTM model
with open('aqi_model.pkl', 'rb') as f:
    model_lstm = pickle.load(f)

# Load data
df = pd.read_csv('hour.csv', parse_dates=['Datetime'])

# Drop rows with NaN values
df_cleaned = df.dropna()

# Scaling data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df_cleaned['AQI'].values.reshape(-1, 1))
seq_length = 10

# Function to create sequences
def create_sequences(data, seq_length):
    X = []
    for i in range(len(data) - seq_length + 1):
        X.append(data[i:i + seq_length])
    return np.array(X)

# Generate sequences for prediction
last_sequence = scaled_data[-seq_length:]
X_pred = create_sequences(last_sequence, seq_length)

# Make predictions
predictions_lstm = []
for _ in range(1000):
    pred = model_lstm.predict(X_pred.reshape(1, seq_length, 1))[0, 0]
    predictions_lstm.append(pred)
    X_pred = np.roll(X_pred, -1)
    X_pred[-1] = pred

# Inverse transform the predictions
predictions_lstm = scaler.inverse_transform(np.array(predictions_lstm).reshape(-1, 1))

# Create a DataFrame for predictions
pred_df = pd.DataFrame(predictions_lstm, columns=['Predicted_AQI'])

# Store predictions in a CSV file
pred_df.to_csv('predictions.csv', index=True)


