In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Import data
data = pd.read_csv('training_data_vt2025.csv')

In [3]:
def pre_processing(data):
    # Make copy of dataset
    data_processed = data.copy()

    # Create new summertime feature
    data_processed['is_summer'] = ((data_processed['month'] >= 3) & (data_processed['month'] <= 11)).astype(int)

    # Normalize calendar data using cosine encoding
    num_decimals = 6
    data_processed['day_of_week'] = np.round(np.cos(2 * np.pi * data_processed['day_of_week'] / 7.0), num_decimals)
    data_processed['month'] = np.round(np.cos(2 * np.pi * data_processed['month'] / 12), num_decimals)
    data_processed['hour_of_day'] = np.round(np.cos(2 * np.pi * data_processed['hour_of_day'] / 24.0), num_decimals)

    # Give target feature numerical values
    data_processed['increase_stock'] = data_processed['increase_stock'].replace({'high_bike_demand': 1, 'low_bike_demand': 0})

    # Create binary category of features
    data_processed['is_raining'] = (data_processed['precip'] != 0).astype(int)
    data_processed['is_snowing'] = (data_processed['snowdepth'] != 0).astype(int)
    data_processed['is_visible'] = (data_processed['visibility'] != 16).astype(int)

    # Drop columns
    data_processed = data_processed.drop(columns=['holiday', 'snow', 'snowdepth', 'precip', 'visibility', 'summertime'])

    # Standardize temp, dew and windspeed
    cols_to_standardize = ['temp', 'dew', 'windspeed']
    scaler_std = StandardScaler()
    data_processed[cols_to_standardize] = np.round(scaler_std.fit_transform(data_processed[cols_to_standardize]), num_decimals)

    # Normalize humidity and cloudcover
    cols_to_normalize = ['humidity', 'cloudcover']
    scaler_nor = MinMaxScaler()
    data_processed[cols_to_normalize] = np.round(scaler_nor.fit_transform(data_processed[cols_to_normalize]), num_decimals)
    return data_processed

In [4]:
new_data = pre_processing(data)

  data_processed['increase_stock'] = data_processed['increase_stock'].replace({'high_bike_demand': 1, 'low_bike_demand': 0})


In [5]:
# Save the processed data
new_data.to_csv("preprocessed_data.csv", index=False)

print("Preprocessing complete. File saved as preprocessed_data.csv")

Preprocessing complete. File saved as preprocessed_data.csv


In [6]:
new_data.sample(10)

Unnamed: 0,hour_of_day,day_of_week,month,weekday,temp,dew,humidity,windspeed,cloudcover,increase_stock,is_summer,is_raining,is_snowing,is_visible
900,-0.258819,-0.900969,-0.0,1,0.927423,1.371725,0.852094,-0.397525,0.946,0,1,1,0,1
1068,-0.866025,1.0,-0.866025,1,0.700687,0.114658,0.333889,0.376246,0.829,1,1,0,0,0
859,-0.258819,-0.222521,-0.5,0,0.873438,1.441563,0.928248,-0.294356,0.941,0,1,0,0,0
120,-1.0,0.62349,-1.0,0,1.478066,1.202121,0.499524,1.137122,0.466,1,1,0,0,0
586,0.707107,-0.222521,0.866025,1,0.527936,1.232051,1.0,-0.487798,1.0,0,1,1,0,1
889,0.0,0.62349,-0.5,0,-1.167182,-0.693457,0.732746,-0.823099,0.446,0,1,0,0,0
1022,0.866025,-0.900969,-0.866025,1,-0.217052,0.394006,0.887197,-0.99075,0.968,0,1,1,0,0
670,-0.5,1.0,0.0,1,-0.82168,-0.992759,0.406592,-0.294356,0.0,1,1,0,0,0
898,0.258819,-0.900969,-0.866025,1,1.100174,1.501423,0.830081,0.556793,0.513,0,1,0,0,0
1435,-0.707107,0.62349,-1.0,1,1.305316,1.501423,0.721204,0.376246,0.244,0,1,0,0,0
