In [1]:
#!/usr/bin/env python3

__author__ = "Gönül Aycı"
__email__ = "aycignl@gmail.com"
__license__ = "MIT"
__copyright__ = "Copyright 2024, https://github.com/aycignl/PracticalSnippets"


In [None]:
# Use Bike Sharing Dataset [1]
!pip install ucimlrepo


In [3]:
import pandas as pd
from typing import Tuple
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import mlflow
import mlflow.sklearn
from ucimlrepo import fetch_ucirepo


In [4]:
def preprocess_data(X: pd.DataFrame) -> pd.DataFrame:
    """Prepare the raw Bike Sharing features for modelling.

    Python port of the R preprocessing of the Bike Sharing Dataset by
    Christoph Molnar [2]. The function

    * drops ``dteday``, ``atemp`` and ``windspeed``, which are not used for
      training,
    * turns the integer-coded calendar and weather columns into ordered
      categoricals with readable labels,
    * maps ``yr`` (0/1) to the years 2011/2012,
    * denormalizes ``temp`` and ``hum``,
    * renames the abbreviated columns (``yr``, ``mnth``, ``hr``,
      ``weathersit``, ``hum``) to readable names.

    Parameters:
        X (pd.DataFrame): The input DataFrame to preprocess. It is not
            modified.

    Returns:
        pd.DataFrame: The preprocessed DataFrame.
    """
    # Work on a copy so the caller's frame stays untouched and no
    # SettingWithCopyWarning is raised when modifying
    X = X.copy()

    # Drop columns that are not used while training a model
    X = X.drop(columns=["dteday", "atemp", "windspeed"])

    # The hourly data also uses weather code 4 (heavy rain / thunderstorm,
    # see the dataset documentation [1]). Only codes 1-3 are declared as
    # categories below, so without this step those rows would silently turn
    # into NaN. Fold them into the most severe existing level instead.
    X["weathersit"] = X["weathersit"].replace({4: 3})

    # Define mappings for categories: column -> (integer codes, labels)
    mappings = {
        "weekday": (range(7), ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]),
        "holiday": ([0, 1], ["NO HOLIDAY", "HOLIDAY"]),
        "workingday": ([0, 1], ["NO WORKING DAY", "WORKING DAY"]),
        "season": (range(1, 5), ["WINTER", "SPRING", "SUMMER", "FALL"]),
        "weathersit": (range(1, 4), ["GOOD", "MISTY", "RAIN/SNOW/STORM"]),
        "mnth": (range(1, 13), ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]),
    }

    # Convert to ordered categoricals, then swap the integer codes for labels
    for col, (categories, labels) in mappings.items():
        X[col] = pd.Categorical(X[col], categories=categories, ordered=True).rename_categories(labels)

    # Replace the 0/1 year flag with the actual year
    X["yr"] = X["yr"].replace({0: 2011, 1: 2012}).astype("category")

    # Denormalize weather features: temp was min-max scaled with the bounds
    # below and hum was divided by 100 (per the dataset documentation [1])
    temp_min, temp_max = -8, 39
    X["temp"] = X["temp"] * (temp_max - temp_min) + temp_min
    X["hum"] = X["hum"] * 100

    # Rename columns to make them readable
    X = X.rename(columns={"yr": "year",
                          "mnth": "month",
                          "hr": "hour",
                          "weathersit": "weather",
                          "hum": "humidity"
                          })

    return X

def fetch_and_preprocess_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Fetches the bike-sharing dataset using the ucimlrepo package and
    preprocesses its features using the preprocess_data function.

    The targets are returned exactly as ucimlrepo provides them; no
    preprocessing is applied to them here.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: The preprocessed features and the
        untouched targets. ucimlrepo hands the targets back as a DataFrame
        (presumably a single column for this dataset -- verify), so flatten
        them, e.g. with ``np.ravel``, before fitting an estimator that
        expects a 1-D target.
    """
    # 275 is the id of the Bike Sharing dataset in the UCI repository
    bike_sharing_dataset = fetch_ucirepo(id=275)
    X, y = bike_sharing_dataset.data.features, bike_sharing_dataset.data.targets

    X_preprocessed = preprocess_data(X)

    return X_preprocessed, y


In [5]:
# Download the dataset and preprocess its features; y holds the targets
X_preprocessed, y = fetch_and_preprocess_data()
# Preview the first rows of the preprocessed features
X_preprocessed.head()


Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,humidity
0,WINTER,2011,JAN,0,NO HOLIDAY,SAT,NO WORKING DAY,GOOD,3.28,81.0
1,WINTER,2011,JAN,1,NO HOLIDAY,SAT,NO WORKING DAY,GOOD,2.34,80.0
2,WINTER,2011,JAN,2,NO HOLIDAY,SAT,NO WORKING DAY,GOOD,2.34,80.0
3,WINTER,2011,JAN,3,NO HOLIDAY,SAT,NO WORKING DAY,GOOD,3.28,75.0
4,WINTER,2011,JAN,4,NO HOLIDAY,SAT,NO WORKING DAY,GOOD,3.28,75.0


In [None]:
# Define categorical columns for one-hot encoding
categorical_columns = ['season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weather']

# Apply OneHotEncoder to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'  # Passthrough numerical columns as is
)

# Run settings, kept together so the split, the model and the MLflow record
# all use the same values
rng_seed = 33
num_estimator = 100
test_fraction = 0.2

# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=num_estimator, random_state=rng_seed))
])

# Flatten y to 1D array to avoid DataConversionWarning
y = np.ravel(y)

# Splitting the dataset into Train and Test; y is already 1D here, so the
# resulting y_train / y_test need no further flattening
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=test_fraction, random_state=rng_seed
)

# Start MLflow experiment
mlflow.set_experiment("Bike Sharing Rental Count Prediction")

with mlflow.start_run():
    # Fit the model
    model.fit(X_train, y_train)

    # Predictions on the Test set
    # The goal is to predict bike rental count hourly
    y_predictions = model.predict(X_test)

    # Evaluation of model performance using R_square
    test_r2 = r2_score(y_test, y_predictions)

    # Logging parameters, metrics, and model. The seed and the test fraction
    # are logged too, because the reported score depends on both.
    mlflow.log_params({
        "n_estimators": num_estimator,
        "random_state": rng_seed,
        "test_size": test_fraction,
    })
    mlflow.log_metrics({"Test R2": test_r2})

    # Log model
    mlflow.sklearn.log_model(model, "Random_Forest_Regressor")


In [7]:
# Report the R^2 score computed on the test split, rounded to two decimals
print(f"Test R^2: {test_r2:.2f}")


Test R^2: 0.94


# References <br>
1.   Fanaee-T, Hadi. (2013). Bike Sharing Dataset. UCI Machine Learning Repository. https://doi.org/10.24432/C5W894.
2.   Preprocessing in R: https://github.com/christophM/interpretable-ml-book/blob/master/R/get-bike-sharing-dataset.R
3.   MLflow: https://github.com/mlflow/mlflow