# Predicting Rainfall Timing using SKLearn Custom Script in Sagemaker

In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd
import numpy as np

# Use the same AWS profile as your CLI (usually "default")
boto_session = boto3.Session(profile_name="default")  # <-- Change "default" if you're using a different named profile

sess = sagemaker.Session(boto_session=boto_session)

region = sess.boto_session.region_name
bucket = 'alkoofisagemaker'  # Replace with your S3 bucket name
print("Using bucket " + bucket)
print("Using region:", region)

creds = boto_session.get_credentials().get_frozen_credentials()
print("Access Key ID:", creds.access_key)


sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\mohds\AppData\Local\sagemaker\sagemaker\config.yaml
Using bucket alkoofisagemaker
Using region: us-east-1
Access Key ID: ASIAS6LOH5LOS55CIVFR


In [2]:
df = pd.read_csv("bahrain_hourly_weather_unix.csv")

In [3]:
df.head()

Unnamed: 0,date,temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,rain,weather_code,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m
0,1262293200,19.077,76.14247,19.978504,0.0,0.0,2,4.693825,5.76,184.39862,180.0,12.599999
1,1262296800,18.927,76.85831,18.396755,0.0,0.0,1,14.512064,16.1198,293.38525,293.70264,21.24
2,1262300400,18.776999,76.83494,16.63012,0.0,0.0,2,25.212852,28.257132,313.26434,314.48392,31.319998
3,1262304000,18.577,77.05224,15.724358,0.0,0.0,2,29.671074,33.56312,324.3814,324.6051,38.16
4,1262307600,18.526999,74.83259,15.654724,0.0,0.0,2,28.555965,32.25166,326.3099,326.84207,39.6


In [4]:
df.columns

Index(['date', 'temperature_2m', 'relative_humidity_2m',
       'apparent_temperature', 'precipitation', 'rain', 'weather_code',
       'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m',
       'wind_direction_100m', 'wind_gusts_10m'],
      dtype='object')

In [5]:
df.shape

(134400, 12)

### Feature Engineering

In [6]:
# Check for missing values
df.isnull().mean() * 100

date                    0.0
temperature_2m          0.0
relative_humidity_2m    0.0
apparent_temperature    0.0
precipitation           0.0
rain                    0.0
weather_code            0.0
wind_speed_10m          0.0
wind_speed_100m         0.0
wind_direction_10m      0.0
wind_direction_100m     0.0
wind_gusts_10m          0.0
dtype: float64

In [7]:
# Create a target variable: time until next rain event (in hours)
print("\nCreating target variable: hours_until_rain...")

# Sort by date to ensure chronological order
df = df.sort_values('date')

# Initialize target column with a large value (e.g., 168 hours = 1 week)
df['hours_until_rain'] = 168.0  # Default to a week if no rain in the forecast period

# Find the next rain event for each row
for i in range(len(df) - 1):
    if df.iloc[i]['rain'] > 0:  # Current row has rain
        df.loc[df.index[i], 'hours_until_rain'] = 0  # It's raining now
    else:
        # Find the next time it rains and calculate the hours until then
        next_rain_indices = df.iloc[i+1:]['rain'] > 0
        if any(next_rain_indices):
            next_rain_idx = df.iloc[i+1:][next_rain_indices].index[0]
            time_diff = df.loc[next_rain_idx, 'date'] - df.iloc[i]['date']
            hours_diff = time_diff / 3600  # Convert seconds to hours
            if hours_diff <= 168:  # Only predict up to a week ahead
                df.loc[df.index[i], 'hours_until_rain'] = hours_diff

print("Target variable created. Distribution of hours_until_rain:")
print(df['hours_until_rain'].describe())


Creating target variable: hours_until_rain...
Target variable created. Distribution of hours_until_rain:
count    134400.000000
mean        142.005521
std          51.734771
min           0.000000
25%         168.000000
50%         168.000000
75%         168.000000
max         168.000000
Name: hours_until_rain, dtype: float64


In [8]:
# Get all features
features = list(df.columns)
print("All columns:", features)

All columns: ['date', 'temperature_2m', 'relative_humidity_2m', 'apparent_temperature', 'precipitation', 'rain', 'weather_code', 'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m', 'wind_direction_100m', 'wind_gusts_10m', 'hours_until_rain']


In [9]:
# Set the target column for prediction
target_column = 'hours_until_rain'

# Remove the target from features list
features.remove(target_column)
print(f"Target column: {target_column}")
print(f"Features to use for prediction: {features}")

Target column: hours_until_rain
Features to use for prediction: ['date', 'temperature_2m', 'relative_humidity_2m', 'apparent_temperature', 'precipitation', 'rain', 'weather_code', 'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m', 'wind_direction_100m', 'wind_gusts_10m']


In [10]:
# Create feature and target datasets
X = df[features]
y = df[target_column]

In [11]:
X.head()

Unnamed: 0,date,temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,rain,weather_code,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m
0,1262293200,19.077,76.14247,19.978504,0.0,0.0,2,4.693825,5.76,184.39862,180.0,12.599999
1,1262296800,18.927,76.85831,18.396755,0.0,0.0,1,14.512064,16.1198,293.38525,293.70264,21.24
2,1262300400,18.776999,76.83494,16.63012,0.0,0.0,2,25.212852,28.257132,313.26434,314.48392,31.319998
3,1262304000,18.577,77.05224,15.724358,0.0,0.0,2,29.671074,33.56312,324.3814,324.6051,38.16
4,1262307600,18.526999,74.83259,15.654724,0.0,0.0,2,28.555965,32.25166,326.3099,326.84207,39.6


In [12]:
y.head()

0    168.0
1    168.0
2    168.0
3    168.0
4    168.0
Name: hours_until_rain, dtype: float64

In [13]:
X.shape

(134400, 12)

In [14]:
# For regression, show basic statistics instead of value counts
y.describe()

count    134400.000000
mean        142.005521
std          51.734771
min           0.000000
25%         168.000000
50%         168.000000
75%         168.000000
max         168.000000
Name: hours_until_rain, dtype: float64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(114240, 12)
(20160, 12)
(114240,)
(20160,)


In [17]:
trainX = pd.DataFrame(X_train)
trainX[target_column] = y_train

testX = pd.DataFrame(X_test)
testX[target_column] = y_test

In [18]:
print(trainX.shape)
print(testX.shape)

(114240, 13)
(20160, 13)


In [19]:
trainX.head()

Unnamed: 0,date,temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,rain,weather_code,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,hours_until_rain
62073,1485756000,16.8095,45.48967,10.895726,0.0,0.0,0,32.142982,38.773182,344.40714,344.93146,48.96,68.0
87881,1578664800,18.2595,84.53886,18.560339,0.0,0.0,1,11.013882,15.03835,11.309895,11.04091,21.599998,5.0
21940,1341277200,28.276999,79.91031,33.07177,0.0,0.0,0,10.990322,9.220499,301.60745,321.34018,15.84,168.0
16910,1323169200,17.677,48.99551,13.653103,0.0,0.0,0,22.039528,24.344624,308.36752,308.3952,31.319998,168.0
118383,1688472000,39.5095,18.621447,35.206913,0.0,0.0,2,36.2082,43.037693,342.64587,342.47433,54.0,168.0


In [20]:
trainX.isnull().sum()

date                    0
temperature_2m          0
relative_humidity_2m    0
apparent_temperature    0
precipitation           0
rain                    0
weather_code            0
wind_speed_10m          0
wind_speed_100m         0
wind_direction_10m      0
wind_direction_100m     0
wind_gusts_10m          0
hours_until_rain        0
dtype: int64

In [21]:
testX.isnull().sum()

date                    0
temperature_2m          0
relative_humidity_2m    0
apparent_temperature    0
precipitation           0
rain                    0
weather_code            0
wind_speed_10m          0
wind_speed_100m         0
wind_direction_10m      0
wind_direction_100m     0
wind_gusts_10m          0
hours_until_rain        0
dtype: int64

In [22]:
trainX.to_csv("train-rain-v1.csv", index=False)
testX.to_csv("test-rain-v1.csv", index=False)

In [None]:
bucket

In [23]:
# Send data to S3. SageMaker will take the training data from S3

sk_prefix = "rain-prediction-model"
trainpath = sess.upload_data(
    path="train-rain-v1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-rain-v1.csv", bucket=bucket, key_prefix=sk_prefix
)

print(f"Train data uploaded to {trainpath}")
print(f"Test data uploaded to {testpath}")

Train data uploaded to s3://alkoofisagemaker/rain-prediction-model/train-rain-v1.csv
Test data uploaded to s3://alkoofisagemaker/rain-prediction-model/test-rain-v1.csv


## Building the Rainfall Prediction Model

In [24]:
%%writefile rain_prediction_script.py

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd

# Loading the model    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

# Function for making predictions
def predict_fn(input_data, model):
    return model.predict(input_data)

# Starting the execution    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)
    parser.add_argument("--max_depth", type=int, default=None)
    parser.add_argument("--min_samples_split", type=int, default=2)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-rain-v1.csv")
    parser.add_argument("--test-file", type=str, default="test-rain-v1.csv")

    args, _ = parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    features = list(train_df.columns)
    label = features.pop(-1)  # Assuming the target column is the last column

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Feature columns: ')
    print(features)
    print()

    print("Target column is: ", label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Regressor Model for Rainfall Prediction...")
    print()
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, 
        random_state=args.random_state, 
        max_depth=args.max_depth,
        min_samples_split=args.min_samples_split,
        verbose=3, 
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    print()

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    y_pred_test = model.predict(X_test)
    
    # Use regression metrics for evaluating the model
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_r2 = r2_score(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print(f'[TESTING] Root Mean Squared Error: {test_rmse:.4f} hours')
    print(f'[TESTING] Mean Absolute Error: {test_mae:.4f} hours')
    print(f'[TESTING] R² Score: {test_r2:.4f}')
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\n---- TOP 10 IMPORTANT FEATURES FOR RAINFALL PREDICTION ----")
    print(feature_importance.head(10))
    
    # Analyze predictions by time range
    print("\n---- PREDICTION ACCURACY BY TIME RANGE ----")
    # Group predictions by hour ranges
    bins = [0, 1, 3, 6, 12, 24, 48, np.inf]
    labels = ['0-1h', '1-3h', '3-6h', '6-12h', '12-24h', '24-48h', '48h+']
    y_test_binned = pd.cut(y_test, bins=bins, labels=labels)
    
    for time_range in labels:
        range_indices = y_test_binned == time_range
        if sum(range_indices) > 0:
            range_rmse = np.sqrt(mean_squared_error(
                y_test[range_indices], y_pred_test[range_indices]
            ))
            range_mae = mean_absolute_error(
                y_test[range_indices], y_pred_test[range_indices]
            )
            count = sum(range_indices)
            print(f"Time range {time_range}: {count} samples, RMSE: {range_rmse:.2f}h, MAE: {range_mae:.2f}h")


Writing rain_prediction_script.py


### Create an instance in SageMaker by assigning a machine

In [25]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="rain_prediction_script.py",
    role="arn:aws:iam::202631539421:role/SageMakerExecutionRole",  # Replace with your role ARN
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-rainfall-unix-time",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 42,
        "max_depth": 20,  # Optimized for time series prediction
        "min_samples_split": 5  # Helps prevent overfitting
    },
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600
)

### Launch the train job

In [26]:
# Launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-rainfall-unix-time-2025-05-07-13-33-59-173


2025-05-07 13:34:10 Starting - Starting the training job...
2025-05-07 13:34:26 Starting - Preparing the instances for training...
2025-05-07 13:34:50 Downloading - Downloading input data...
2025-05-07 13:35:20 Downloading - Downloading the training image...
2025-05-07 13:36:01 Training - Training image download completed. Training in progress..2025-05-07 13:36:05,164 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-05-07 13:36:05,169 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-07 13:36:05,218 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-05-07 13:36:05,419 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-07 13:36:05,436 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-07 13:36:05,452 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-07 13:

### View more info about the trained model

In [27]:
sm_boto3 = boto3.client("sagemaker")

sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2025-05-07 13:37:34 Starting - Preparing the instances for training
2025-05-07 13:37:34 Downloading - Downloading the training image
2025-05-07 13:37:34 Training - Training image download completed. Training in progress.
2025-05-07 13:37:34 Uploading - Uploading generated training model
2025-05-07 13:37:34 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-202631539421/RF-rainfall-unix-time-2025-05-07-13-33-59-173/output/model.tar.gz


#### View model output

In [28]:
artifact

's3://sagemaker-us-east-1-202631539421/RF-rainfall-unix-time-2025-05-07-13-33-59-173/output/model.tar.gz'

#### Create another folder location for model deployment

In [29]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Rainfall-Prediction-Model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::202631539421:role/SageMakerExecutionRole",  # Replace with your role ARN
    entry_point="rain_prediction_script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [30]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x15174e16b10>

In [31]:
model_name

'Rainfall-Prediction-Model-2025-05-07-13-38-11'

### Deploy model for rainfall predictions

In [32]:
# Endpoints deployment
endpoint_name = "Rainfall-Prediction-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

EndpointName=Rainfall-Prediction-2025-05-07-13-38-20


INFO:sagemaker:Creating model with name: Rainfall-Prediction-Model-2025-05-07-13-38-11
INFO:sagemaker:Creating endpoint-config with name Rainfall-Prediction-2025-05-07-13-38-20
INFO:sagemaker:Creating endpoint with name Rainfall-Prediction-2025-05-07-13-38-20


---------!

In [33]:
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x1517bf5e990>

In [34]:
endpoint_name

'Rainfall-Prediction-2025-05-07-13-38-20'

### Make predictions on test data

In [35]:
# Get the first few samples from test data
test_samples = testX[features][0:5].values.tolist()
test_samples

[[1472209200.0,
  34.777,
  64.24512,
  44.00996,
  0.0,
  0.0,
  0.0,
  4.379589,
  2.1897945,
  350.53775,
  279.46225,
  11.159999],
 [1576994400.0,
  17.6595,
  74.930954,
  15.880474,
  0.0,
  0.0,
  1.0,
  19.09358,
  23.950148,
  315.7638,
  317.43668,
  29.88],
 [1414526400.0,
  26.876999,
  77.81426,
  30.398685,
  0.0,
  0.0,
  0.0,
  12.229406,
  13.004922,
  47.385983,
  48.366554,
  19.44],
 [1533333600.0,
  31.3095,
  57.11801,
  34.40632,
  0.0,
  0.0,
  0.0,
  11.726277,
  20.966715,
  162.12122,
  164.05453,
  18.0],
 [1562576400.0,
  40.9095,
  28.52163,
  41.888184,
  0.0,
  0.0,
  0.0,
  28.036118,
  31.559088,
  347.3914,
  348.81827,
  46.079998]]

In [36]:
# Make predictions
predictions = predictor.predict(test_samples)
print("Predicted hours until rain:")
print(predictions)

# Compare with actual values
actual_values = testX[target_column][0:5].values
print("\nActual hours until rain:")
print(actual_values)

# Calculate differences
print("\nPrediction error (hours):")
for i, (pred, actual) in enumerate(zip(predictions, actual_values)):
    print(f"Sample {i+1}: Predicted {pred:.2f}, Actual {actual:.2f}, Error {pred-actual:.2f} hours")

Predicted hours until rain:
[168.         131.75715583 158.79451918 168.         167.99031123]

Actual hours until rain:
[168. 168. 168. 168. 168.]

Prediction error (hours):
Sample 1: Predicted 168.00, Actual 168.00, Error 0.00 hours
Sample 2: Predicted 131.76, Actual 168.00, Error -36.24 hours
Sample 3: Predicted 158.79, Actual 168.00, Error -9.21 hours
Sample 4: Predicted 168.00, Actual 168.00, Error 0.00 hours
Sample 5: Predicted 167.99, Actual 168.00, Error -0.01 hours


### Create a function to interpret the predictions

In [37]:
def interpret_rain_prediction(hours_until_rain):
    """Interpret the rainfall prediction in a human-readable format."""
    if hours_until_rain <= 0:
        return "It's currently raining or expected to rain very soon."
    elif hours_until_rain <= 1:
        return f"Rain expected within the next hour (in approximately {hours_until_rain:.1f} hours)."
    elif hours_until_rain <= 3:
        return f"Rain expected soon (in approximately {hours_until_rain:.1f} hours)."
    elif hours_until_rain <= 6:
        return f"Rain expected within the next few hours (in approximately {hours_until_rain:.1f} hours)."
    elif hours_until_rain <= 12:
        return f"Rain expected later today (in approximately {hours_until_rain:.1f} hours)."
    elif hours_until_rain <= 24:
        return f"Rain expected within a day (in approximately {hours_until_rain:.1f} hours)."
    else:
        return f"Rain not expected soon (expected in approximately {hours_until_rain:.1f} hours / {hours_until_rain/24:.1f} days)."

# Apply the interpretation function to the predictions
for i, pred in enumerate(predictions):
    print(f"Sample {i+1}: {interpret_rain_prediction(pred)}")

Sample 1: Rain not expected soon (expected in approximately 168.0 hours / 7.0 days).
Sample 2: Rain not expected soon (expected in approximately 131.8 hours / 5.5 days).
Sample 3: Rain not expected soon (expected in approximately 158.8 hours / 6.6 days).
Sample 4: Rain not expected soon (expected in approximately 168.0 hours / 7.0 days).
Sample 5: Rain not expected soon (expected in approximately 168.0 hours / 7.0 days).


### Test with real-time data

In [38]:
# Example of how to use the model with new data
# The order and number of features must match the training data (all columns except hours_until_rain)

# This example matches the Bahrain weather dataset structure:
# [date, temperature_2m, relative_humidity_2m, apparent_temperature, precipitation, 
#  rain, weather_code, wind_speed_10m, wind_speed_100m, wind_direction_10m, 
#  wind_direction_100m, wind_gusts_10m]

new_weather_data = [
    [1620000000, 35.2, 65.0, 38.4, 0.0, 0.0, 0, 12.5, 16.8, 180.0, 185.0, 14.2]
]

prediction = predictor.predict(new_weather_data)
print(f"Prediction for new data: {prediction[0]:.2f} hours until rain")
print(interpret_rain_prediction(prediction[0]))

Prediction for new data: 167.99 hours until rain
Rain not expected soon (expected in approximately 168.0 hours / 7.0 days).


### Delete endpoint to avoid running costs

In [None]:
# Always delete the endpoint when done to avoid ongoing charges
sm_boto3.delete_endpoint(EndpointName=endpoint_name)
print(f"Endpoint {endpoint_name} deleted successfully")

## Summary and Next Steps

In this notebook, we've created a machine learning model that predicts when it will rain based on weather data with Unix timestamps. The model:

1. Creates a target variable 'hours_until_rain' to predict time until next rainfall
2. Uses RandomForest regression for prediction
3. Keeps all original features including weather_code
4. Is trained and deployed using Amazon SageMaker

Potential improvements:
- Add cyclical time features (sine/cosine transformations of time) to capture daily and seasonal patterns
- Try different algorithms like XGBoost or deep learning models
- Experiment with hyperparameter tuning
- Incorporate additional weather data sources
- Create a simple API or web interface for real-time predictions
