In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
import pandas as pd

# Location of the air-quality CSV inside the Kaggle input directory.
file_path = "/kaggle/input/weather-forcasting-dataset/air.csv"

# Parse the CSV at `file_path` into the working DataFrame used by every
# later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

In [None]:
# Data Preprocessing
# Parse the day-first date strings ('%d-%m-%Y') and promote them to the index.
# The guard keeps this cell re-runnable: once 'date' has become the index it
# is no longer a column, so an unguarded second run would raise KeyError.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
    df = df.set_index('date')

# Fill missing values by interpolating between neighbouring rows
# (pandas' default method is linear).
df = df.interpolate()
df.head()


In [None]:
# Print the frame's index, column dtypes and non-null counts.
df.info()

In [None]:
# Display summary statistics for the columns of df.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# One stacked panel per column of `df`, each drawn against the frame's index.
fig = plt.figure(figsize=(20, 14))
n_panels = len(df.columns)

for slot, series_name in enumerate(df.columns, start=1):
    ax = fig.add_subplot(n_panels, 1, slot)
    # Spread the line colours evenly across the plasma colormap.
    line_colour = cm.plasma((slot - 1) / n_panels)
    ax.plot(df.index, df[series_name], color=line_colour)
    ax.set_xlabel('Index')
    # Put the column name inside the panel, top right, to save vertical space.
    ax.set_title(series_name, y=0.75, loc='right', fontsize=15)

fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="darkgrid")

# (column, colour) pairs listed in row-major panel order of the 3x2 grid.
hist_specs = [
    ("pollution_today", "skyblue"),
    ("dew", "olive"),
    ("temp", "gold"),
    ("press", "teal"),
    ("wnd_spd", "steelblue"),
    ("rain", "goldenrod"),
]

fig, axs = plt.subplots(3, 2, figsize=(24, 14))

# Histogram with a KDE overlay for each listed column; `axs.flat` walks the
# grid left-to-right, top-to-bottom, matching the order of `hist_specs`.
for panel, (col_name, colour) in zip(axs.flat, hist_specs):
    sns.histplot(data=df, x=col_name, kde=True, color=colour, ax=panel)

plt.show()


In [None]:
# Show the first rows again as a reminder of the available columns.
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Target is today's pollution reading; every other column is a predictor.
y = df['pollution_today']
X = df.drop(columns='pollution_today')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
# NOTE(review): the split shuffles rows at random although df is indexed by
# date - confirm a random (rather than chronological) hold-out is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Learn the per-feature mean/std on the training rows only, then apply the
# same transformation to both partitions so no test statistics leak in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
import numpy as np

# The LSTM layers consume 3-D input, so insert a length-1 time axis:
# (samples, features) -> (samples, 1, features).
X_train_reshaped = np.expand_dims(X_train_scaled, axis=1)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=1)


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable across Restart & Run All.
set_random_seed(42)

# Inputs: X_train_reshaped (samples, 1, features) comes from the reshape cell
# above and y_train from the train/test split; they are not rebuilt here.

# Define the model: two stacked LSTM layers with dropout in between, followed
# by a single linear unit that outputs the pollution estimate.
model = Sequential()
model.add(LSTM(32, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(16, return_sequences=False))
model.add(Dense(1))

# Compile the model. The metric is passed as an instance of the imported
# class; its default name 'root_mean_squared_error' is the key that appears
# in history.history (and 'val_root_mean_squared_error' for validation).
model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=[RootMeanSquaredError()])

# Define callbacks: stop after 10 epochs without validation improvement
# (restoring the best weights) and keep the best model on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

# Train the model with callbacks.
# Validation data is carved out of the *training* set (the last 20% of rows)
# instead of reusing the test set. Early stopping and checkpointing select
# the model on the validation loss, so validating on X_test/y_test would make
# the later test-set score optimistically biased.
# y_train is converted to a NumPy array so Keras can slice it positionally.
history = model.fit(
    X_train_reshaped, np.asarray(y_train),
    epochs=100,  # Upper bound; early stopping usually ends training sooner
    validation_split=0.2,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)


In [None]:
# Score the trained network on the test arrays; evaluate() yields the loss
# followed by the compiled metric.
loss, rmse = model.evaluate(X_test_reshaped, y_test)
print(f'Test Loss: {loss}, Test RMSE: {rmse}')

# Learning curves, one panel per quantity tracked during fit().
# Each entry: (training key, validation key, short label used in the legend,
# axis label and title).
curve_specs = [
    ('loss', 'val_loss', 'Loss'),
    ('root_mean_squared_error', 'val_root_mean_squared_error', 'RMSE'),
]

fig = plt.figure(figsize=(12, 6))
for slot, (train_key, val_key, label) in enumerate(curve_specs, start=1):
    ax = fig.add_subplot(1, 2, slot)
    ax.plot(history.history[train_key], label=f'Training {label}')
    ax.plot(history.history[val_key], label=f'Validation {label}')
    ax.set_title(f'Model {label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_reshaped)

# Range of the actual values, computed once with the vectorised Series
# reductions (the built-in min()/max() would iterate the Series element by
# element in Python, and were each evaluated twice).
actual_lo, actual_hi = y_test.min(), y_test.max()

# Plot predicted vs actual pollution values
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
# Dashed y = x reference line: points on it are perfect predictions.
ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], '--', color='red', linewidth=2)
ax.set_title('Predicted vs Actual Pollution Values')
ax.set_xlabel('Actual Pollution')
ax.set_ylabel('Predicted Pollution')
plt.show()
