In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Read the sensor export and keep only rows that have a target value:
# a reading without "Air Temperature" cannot be used for supervised training.
data = pd.read_csv('/weather-automated-sensors-dataset.csv').dropna(subset=["Air Temperature"])


In [10]:
# Parse the raw timestamp strings. errors='coerce' turns anything that cannot be
# parsed into NaT instead of raising.
# NOTE(review): the recorded output of this cell points at this statement, presumably
# a pandas warning about inferring the timestamp format -- consider passing an
# explicit format= once the file's timestamp layout has been confirmed.
parsed_stamps = pd.to_datetime(data["Measurement Timestamp"], errors='coerce')
data["Measurement Timestamp"] = parsed_stamps

# Temporary calendar components (removed again below).
data["Month"], data["Hour"] = parsed_stamps.dt.month, parsed_stamps.dt.hour

# Map each component onto a circle so that the ends of the cycle
# (e.g. hour 23 and hour 0) land next to each other in feature space.
month_angle = 2 * np.pi * data["Month"] / 12
hour_angle = 2 * np.pi * data["Hour"] / 24
data["Month_sin"], data["Month_cos"] = np.sin(month_angle), np.cos(month_angle)
data["Hour_sin"], data["Hour_cos"] = np.sin(hour_angle), np.cos(hour_angle)

# Identifier/label columns plus the temporary components are not used as features.
columns_to_discard = [
    "Station Name",
    "Measurement Timestamp",
    "Measurement Timestamp Label",
    "Measurement ID",
    "Month",
    "Hour",
]
data = data.drop(columns=columns_to_discard)


  data["Measurement Timestamp"] = pd.to_datetime(data["Measurement Timestamp"], errors='coerce')


In [11]:
# Mean-impute every remaining gap in the feature table.
#
# SimpleImputer discards columns that contain no observed value at all, which
# would leave the transformed array with fewer columns than `data.columns` and
# make the DataFrame constructor raise. Removing such columns first keeps the
# column labels aligned with the imputed values; when no column is entirely
# missing this is a no-op.
imputable = data.dropna(axis=1, how="all")

# NOTE(review): the column means are fitted on the full table, before the
# train/test split performed later in the notebook, so statistics from the
# held-out rows leak into the training features. Consider fitting the imputer
# on the training rows only (e.g. inside a Pipeline).
imputer = SimpleImputer(strategy="mean")
data_imputed = pd.DataFrame(
    imputer.fit_transform(imputable),
    columns=imputable.columns,
)


In [12]:
# Target vector and predictor matrix
y = data_imputed["Air Temperature"]
X = data_imputed.drop(columns=["Air Temperature"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Base learners: a default gradient-boosting model, a second gradient-boosting
# model with explicit hyper-parameters, and a random forest. All share one seed.
gbm = GradientBoostingRegressor(random_state=42)
gbr = GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Stack the base learners; a linear regression combines their predictions.
base_learners = [('gbm', gbm), ('gbr', gbr), ('rf', rf)]
stacking_regressor = StackingRegressor(
    estimators=base_learners,
    final_estimator=LinearRegression(),
    n_jobs=-1,
)


In [14]:
# Fit the stacked ensemble on the training split
stacking_regressor.fit(X_train, y_train)

# Predict the held-out rows
y_pred = stacking_regressor.predict(X_test)

# Regression metrics: RMSE (square root of the mean squared error) and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# "Accuracy" here is the percentage of test predictions whose absolute error
# does not exceed `tolerance`.
tolerance = 2.0  # Tolerance level in temperature units
accuracy = np.mean(np.abs(y_test - y_pred) <= tolerance) * 100

print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)
# Build the label from `tolerance` so the report stays correct if the threshold is changed.
print(f"Accuracy within tolerance ±{tolerance} units:", accuracy, "%")


Root Mean Squared Error (RMSE): 1.6062053753451515
R-squared (R2) Score: 0.9735088597189322
Accuracy within tolerance ±2.0 units: 87.02386998476383 %
