In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Input
import warnings

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
# Read data from CSV file
file_path = 'GlobalTemperatures.csv'
df = pd.read_csv(file_path)

# Impute LandAverageTemperature and LandAverageTemperatureUncertainty with mean
df['LandAverageTemperature'].fillna(df['LandAverageTemperature'].mean(), inplace=True)
df['LandAverageTemperatureUncertainty'].fillna(df['LandAverageTemperatureUncertainty'].mean(), inplace=True)

# For columns with 1200 missing values, drop those rows
cols_to_dropna = ['LandMaxTemperature', 'LandMaxTemperatureUncertainty', 'LandMinTemperature', 'LandMinTemperatureUncertainty', 'LandAndOceanAverageTemperature', 'LandAndOceanAverageTemperatureUncertainty']
df.dropna(subset=cols_to_dropna, inplace=True)

In [3]:
# Add Year and Month columns based on 'dt' column
df['Year'] = pd.to_datetime(df['dt']).dt.year
df['Month'] = pd.to_datetime(df['dt']).dt.month

# Prepare X (features) and y (target)
X = df.drop(['LandAverageTemperature', 'dt'], axis=1)
y = df['LandAverageTemperature']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale X and y using MinMaxScaler
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# Fit and transform the training data
X_train_scaled = scaler_x.fit_transform(X_train)
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

# Only transform the testing data
X_test_scaled = scaler_x.transform(X_test)
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

# Reshape X_train_scaled and X_test_scaled for RNN input
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Print the shapes to verify
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1593, 9, 1)
X_test_scaled shape: (399, 9, 1)
y_train_scaled shape: (1593, 1)
y_test_scaled shape: (399, 1)


In [4]:
# Define the RNN model
model = Sequential()
model.add(Input(shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])))
model.add(SimpleRNN(100, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(SimpleRNN(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

model.summary()

In [5]:

# Train the model
history = model.fit(X_train_scaled, y_train_scaled, epochs=100, batch_size=32, validation_split=0.2, verbose=2)


Epoch 1/100
40/40 - 2s - 52ms/step - loss: 0.0426 - val_loss: 9.7061e-04
Epoch 2/100
40/40 - 0s - 4ms/step - loss: 0.0093 - val_loss: 5.4814e-04
Epoch 3/100
40/40 - 0s - 4ms/step - loss: 0.0075 - val_loss: 0.0013
Epoch 4/100
40/40 - 0s - 4ms/step - loss: 0.0066 - val_loss: 3.6992e-04
Epoch 5/100
40/40 - 0s - 5ms/step - loss: 0.0057 - val_loss: 5.6041e-04
Epoch 6/100
40/40 - 0s - 5ms/step - loss: 0.0060 - val_loss: 7.9040e-04
Epoch 7/100
40/40 - 0s - 5ms/step - loss: 0.0059 - val_loss: 0.0015
Epoch 8/100
40/40 - 0s - 5ms/step - loss: 0.0049 - val_loss: 0.0040
Epoch 9/100
40/40 - 0s - 5ms/step - loss: 0.0051 - val_loss: 5.6402e-04
Epoch 10/100
40/40 - 0s - 5ms/step - loss: 0.0045 - val_loss: 8.2935e-04
Epoch 11/100
40/40 - 0s - 5ms/step - loss: 0.0048 - val_loss: 2.2765e-04
Epoch 12/100
40/40 - 0s - 5ms/step - loss: 0.0041 - val_loss: 0.0013
Epoch 13/100
40/40 - 0s - 5ms/step - loss: 0.0041 - val_loss: 0.0014
Epoch 14/100
40/40 - 0s - 5ms/step - loss: 0.0039 - val_loss: 4.3582e-04
Epoch 

In [6]:
# Make predictions
predictions = model.predict(X_test_scaled)
predictions = scaler_y.inverse_transform(predictions)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [7]:
# Compare predictions with actual values
actual = scaler_y.inverse_transform(y_test_scaled)
for i in range(len(predictions)):
    print(f"Actual: {actual[i][0]}, Predicted: {predictions[i][0]}")

Actual: 3.88, Predicted: 3.7857725620269775
Actual: 8.689, Predicted: 8.243406295776367
Actual: 13.622, Predicted: 13.289313316345215
Actual: 2.335, Predicted: 2.3843777179718018
Actual: 5.952999999999999, Predicted: 5.8288893699646
Actual: 4.869, Predicted: 4.867486953735352
Actual: 14.768, Predicted: 14.285703659057617
Actual: 7.423999999999999, Predicted: 7.027111053466797
Actual: 4.103, Predicted: 4.018638610839844
Actual: 4.519, Predicted: 4.443005084991455
Actual: 14.021, Predicted: 13.749194145202637
Actual: 14.034, Predicted: 13.50745964050293
Actual: 4.85, Predicted: 4.809767246246338
Actual: 9.453, Predicted: 9.00403118133545
Actual: 6.222, Predicted: 5.966836929321289
Actual: 14.445, Predicted: 14.094795227050781
Actual: 11.062, Predicted: 10.827873229980469
Actual: 2.455, Predicted: 2.234165906906128
Actual: 6.273, Predicted: 6.081531524658203
Actual: 4.303, Predicted: 4.237070083618164
Actual: 5.066, Predicted: 5.376184463500977
Actual: 10.941999999999998, Predicted: 10.56

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Calculate metrics
mae = mean_absolute_error(actual, predictions)
mse = mean_squared_error(actual, predictions)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.24827897800239998
Mean Squared Error (MSE): 0.09677026520194304
Root Mean Squared Error (RMSE): 0.31107919442152193
