In [84]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [85]:
# Import data
data = pd.read_csv('training_data_vt2025.csv')

In [86]:
def pre_processing(data):
    # Make copy of dataset
    data_processed = data.copy()

    # Create new summertime feature
    data_processed['is_summer'] = ((data_processed['month'] >= 3) & (data_processed['month'] <= 11)).astype(int)

    # Normalize calendar data using cosine encoding
    num_decimals = 6
    data_processed['day_of_week'] = np.round(np.cos(2 * np.pi * data_processed['day_of_week'] / 7.0), num_decimals)
    data_processed['month'] = np.round(np.cos(2 * np.pi * data_processed['month'] / 12), num_decimals)
    data_processed['hour_of_day'] = np.round(np.cos(2 * np.pi * data_processed['hour_of_day'] / 24.0), num_decimals)

    # Give target feature numerical values
    data_processed['increase_stock'] = data_processed['increase_stock'].replace({'high_bike_demand': 1, 'low_bike_demand': 0})

    # Create binary category of features
    data_processed['is_raining'] = (data_processed['precip'] != 0).astype(int)
    data_processed['is_snowing'] = (data_processed['snowdepth'] != 0).astype(int)
    data_processed['is_visible'] = (data_processed['visibility'] != 16).astype(int)

    # Drop columns
    data_processed = data_processed.drop(columns=['holiday', 'snow', 'summertime', 'snowdepth', 'precip', 'visibility', 'summertime'])

    # Standardize temp, dew and windspeed
    cols_to_standardize = ['temp', 'dew', 'windspeed']
    scaler_std = StandardScaler()
    data_processed[cols_to_standardize] = np.round(scaler_std.fit_transform(data_processed[cols_to_standardize]), num_decimals)

    # Normalize humidity and cloudcover
    cols_to_normalize = ['humidity', 'cloudcover']
    scaler_nor = MinMaxScaler()
    data_processed[cols_to_normalize] = np.round(scaler_nor.fit_transform(data_processed[cols_to_normalize]), num_decimals)
    return data_processed

In [87]:
new_data = pre_processing(data)

  data_processed['increase_stock'] = data_processed['increase_stock'].replace({'high_bike_demand': 1, 'low_bike_demand': 0})


In [88]:
# Save the processed data
new_data.to_csv("preprocessed_data.csv", index=False)

print("Preprocessing complete. File saved as preprocessed_data.csv")

Preprocessing complete. File saved as preprocessed_data.csv


In [91]:
new_data.sample(10)

Unnamed: 0,hour_of_day,day_of_week,month,weekday,temp,dew,humidity,windspeed,cloudcover,increase_stock,is_summer,is_raining,is_snowing,is_visible
740,0.0,-0.222521,-0.866025,0,-0.82168,-0.284412,0.801166,-1.068127,0.793,0,1,0,0,0
501,-0.258819,0.62349,-0.866025,1,0.851844,1.062447,0.704545,0.531001,0.813,0,1,0,0,0
1234,-0.5,1.0,0.5,1,-0.141474,-0.294388,0.454902,0.169907,0.964,1,0,0,0,0
1433,-0.5,-0.900969,0.0,1,0.160841,-1.63127,0.012018,0.685755,0.446,1,1,0,0,0
1017,-0.258819,-0.900969,0.866025,1,0.096059,-0.274435,0.377439,0.389143,0.244,1,1,0,0,0
803,0.5,0.62349,0.866025,0,-0.098286,0.50375,0.891599,0.028049,0.875,0,1,0,0,1
175,0.707107,1.0,-0.5,1,-0.346615,-0.982783,0.256426,1.652969,0.464,0,1,0,0,0
1221,0.965926,0.62349,-0.866025,0,-0.605742,-0.483946,0.561875,-0.191186,0.244,0,1,0,0,0
1056,-0.0,1.0,1.0,1,-1.296745,-1.651223,0.299262,-0.281459,0.244,0,0,0,0,0
508,-0.866025,-0.222521,-1.0,1,1.305316,1.551307,0.75,0.440727,0.889,0,1,0,0,1
