# Notebook: Individual Models

In [1]:
import os
from datetime import datetime as dt

import holidays
import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, SplineTransformer

Defining all the classes we will need

In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns used downstream and give the rows a fresh 0..n-1 index.

    The index matters: DateFormatter later copies it into 'track_id', which is
    used to restore the original row order after the rows have been sorted,
    merged and split per site.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a new DataFrame with a reset index and the four selected columns.

        Raises
        ------
        KeyError
            If any of the selected columns is missing from ``X``.
        """
        # reset_index returns a new frame; the result must be kept, otherwise
        # the original (possibly non-contiguous) index silently survives.
        X_copy = X.reset_index(drop=True)
        return X_copy[['counter_id','date', 'site_id','log_bike_count']]

class DateFormatter(BaseEstimator, TransformerMixin):
    """Derive calendar features from 'date' and sort the rows chronologically.

    Adds 'year', 'month', 'weekday' (1 = Monday ... 7 = Sunday), a cyclical
    encoding of the hour ('hr_sin', 'hr_cos') and 'track_id', which holds the
    index each row had before sorting.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a date-sorted copy of ``X`` with the derived columns appended."""
        frame = X.copy()
        frame['date'] = pd.to_datetime(frame['date'])

        clock = frame['date'].dt
        frame['year'] = clock.year
        frame['month'] = clock.month
        # dayofweek is 0-based (Monday = 0); shift it to 1..7.
        frame['weekday'] = clock.dayofweek + 1

        # Project the hour onto the unit circle so 23h and 0h end up adjacent.
        frame['hr'] = clock.hour
        hour_angle = frame['hr'] * (2. * np.pi / 24)
        frame['hr_sin'] = np.sin(hour_angle)
        frame['hr_cos'] = np.cos(hour_angle)
        frame = frame.drop('hr', axis=1)

        # Sort by date, then record the pre-sort index so the original row
        # order can be recovered later.
        frame = frame.sort_values('date')
        frame['track_id'] = frame.index
        return frame


class AddRestrictionLevel(BaseEstimator, TransformerMixin):
    """Add a 'restriction_level' column based on fixed date periods.

    Rows whose 'date' falls in none of the periods get level 0.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the 'restriction_level' column added."""
        frame = X.copy()

        # (start, end, level) with dates written day-first; a level applies
        # on the half-open interval start <= date < end.
        periods = [
            ('16/10/2020', '17/10/2020', 3),
            ('17/10/2020', '28/11/2020', 5),
            ('28/11/2020', '15/12/2020', 4),
            ('15/12/2020', '16/01/2021', 2),
            ('16/01/2021', '19/03/2021', 1),
            ('19/03/2021', '03/05/2021', 5),
            ('03/05/2021', '09/06/2021', 4),
            ('09/06/2021', '20/06/2021', 2),
            ('20/06/2021', '30/06/2021', 1),
        ]

        # Everything outside the listed periods keeps the default level.
        frame['restriction_level'] = 0
        for start, end, level in periods:
            lower = pd.to_datetime(start, dayfirst=True)
            upper = pd.to_datetime(end, dayfirst=True)
            in_period = (frame['date'] >= lower) & (frame['date'] < upper)
            frame.loc[in_period, 'restriction_level'] = level

        return frame

class HolidaysFR(BaseEstimator, TransformerMixin):
    """Add public-holiday, weekend and school-holiday indicator columns.

    Expects a datetime 'date' column and a 'weekday' column numbered
    1 (Monday) to 7 (Sunday), as produced by DateFormatter.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'is_Holiday', 'is_Weekend' and
        'is_School_Holiday' added as 0/1 integer columns."""
        # Build the calendar once instead of once per row.
        fr_holidays = holidays.FR()

        # School holiday periods as (first day, last day), both inclusive.
        school_hols = [
            # Autumn 2020: the end year was 2023, which flagged every date
            # from 2020-10-18 to 2023-11-01 as a school holiday.
            (dt(2020, 10, 18), dt(2020, 11, 1)),
            (dt(2020, 12, 20), dt(2021, 1, 3)),    # Christmas 2020
            (dt(2021, 2, 14), dt(2021, 2, 28)),    # Winter 2021
            (dt(2021, 4, 11), dt(2021, 4, 25)),    # Spring 2021
            (dt(2021, 7, 7), dt(2021, 9, 7)),      # Summer 2021
        ]

        X_copy = X.copy()
        X_copy['is_Holiday'] = X_copy['date'].apply(lambda date: 1 if date in fr_holidays else 0)
        X_copy['is_Weekend'] = X_copy['weekday'].isin([6, 7]).astype(int)

        in_school_holiday = pd.Series(False, index=X_copy.index)
        for start, end in school_hols:
            # between() is inclusive on both ends, matching start <= date <= end.
            in_school_holiday |= X_copy['date'].between(start, end)
        X_copy['is_School_Holiday'] = in_school_holiday.astype(int)
        return X_copy

class MergeWeatherCovid(BaseEstimator, TransformerMixin):
    """Attach the columns of ``weather_data_cleaned.csv`` to each row by date.

    The join is ``pd.merge_asof`` on 'date' with its default settings, so both
    sides must already be sorted by 'date'.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Read the weather CSV and return ``X`` as-of merged with it on 'date'."""
        weather_path = os.path.join("..", "Datasets", "weather_data_cleaned.csv")
        weather = pd.read_csv(weather_path)
        # merge_asof needs identical key dtypes on both sides.
        weather['date'] = pd.to_datetime(weather['date']).astype('datetime64[us]')
        return pd.merge_asof(X, weather, on='date')

class MergeMultimodal(BaseEstimator, TransformerMixin):
    """Attach a standardised multimodal traffic count to each row by date.

    The counts in ``multimodal_data.csv`` are summed per timestamp,
    standardised with ``StandardScaler`` and as-of merged on 'date'. The
    resulting column is named 'average_multimodal_count', although the
    aggregation is a sum rather than a mean.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with 'average_multimodal_count' merged in."""
        frame = X.copy()

        # Load the multimodal data and align its key dtype for merge_asof.
        multimodal_path = os.path.join("..", "Datasets", "multimodal_data.csv")
        multimodal = pd.read_csv(multimodal_path)
        multimodal['date'] = pd.to_datetime(multimodal['date']).astype('datetime64[us]')

        # One row per timestamp: total count over all records sharing it.
        totals = multimodal.groupby(['date'])['count'].sum().reset_index()

        # Standardise every numeric column of the aggregated frame.
        numeric_cols = totals.select_dtypes(include='number').columns
        totals[numeric_cols] = StandardScaler().fit_transform(totals[numeric_cols])

        merged = pd.merge_asof(frame, totals, on='date')
        return merged.rename(columns={'count': 'average_multimodal_count'})
    
class SplitBySite(BaseEstimator, TransformerMixin):
    """Split one DataFrame into a list of per-site DataFrames."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return one independent copy of the rows per distinct 'site_id'.

        Sites appear in the order in which they first occur in ``X``.
        """
        frame = X.copy()
        site_ids = frame['site_id'].unique()
        return [frame[frame['site_id'] == site_id].copy() for site_id in site_ids]
    

class EncodeCounter(BaseEstimator, TransformerMixin):
    """One-hot encode 'counter_id' separately within each per-site DataFrame.

    Takes and returns a list of DataFrames. The first dummy level is dropped
    (``drop_first=True``), so a site with a single counter gets no dummy
    column at all.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a new list of encoded frames; the input frames are left untouched."""
        encoded_dataframes = []
        for site_df in X:
            # Copying the list alone is shallow, so writing into its frames
            # would change the caller's data. assign() builds a new frame.
            # Casting to object restricts the dummies to the counters
            # actually present in this site's rows.
            as_object = site_df.assign(counter_id=site_df['counter_id'].astype('object'))
            encoded_df = pd.get_dummies(as_object, columns=['counter_id'], dtype=int, drop_first=True)
            encoded_dataframes.append(encoded_df)
        return encoded_dataframes
    

class DropOutliers(BaseEstimator, TransformerMixin):
    """Remove rows whose 'log_bike_count' lies far from the per-site mean.

    Parameters
    ----------
    threshold : float, default=3
        A row is dropped when its 'log_bike_count' deviates from the mean of
        its DataFrame by more than ``threshold`` standard deviations.
    """

    def __init__(self, threshold=3):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a new list with the outlier rows removed from each DataFrame."""
        cleaned_dataframes = []
        for site_df in X:
            counts = site_df['log_bike_count']
            deviation = (counts - counts.mean()).abs()
            # Use the configured threshold; it was previously overridden by a
            # hard-coded 3, which made the constructor argument a no-op.
            is_outlier = deviation > self.threshold * counts.std()
            # Explicit copy so later column assignments on the result do not
            # trigger SettingWithCopyWarning.
            cleaned_dataframes.append(site_df[~is_outlier].copy())
        return cleaned_dataframes

class ModelGen(BaseEstimator, TransformerMixin):
    """Train a regressor with 10-fold selection and persist the best fold's model.

    ``fit`` accepts two input shapes:

    * a list/tuple of per-site DataFrames (the output of the ``spliter``
      pipeline). One model is trained per site and saved as
      ``site_id_<site_id>_model_catboost.joblib``, the file name that
      ``add_prediction_column`` loads.
    * a feature DataFrame ``X`` with a target Series ``y``. A single model is
      trained and saved as ``model_catboost.joblib``.

    Parameters
    ----------
    model : estimator
        Template estimator. It is cloned for every fold and never fitted
        itself, so sharing the default instance between ModelGen objects is
        safe. It must accept a ``random_seed`` parameter and ``verbose`` in
        ``fit`` (as CatBoost does).
    random_state : int, default=42
        Seed for the fold shuffling and for the estimator.
    save_path : str
        Directory in which the fitted models are stored.

    Attributes
    ----------
    best_model : estimator or None
        Model with the lowest validation MSE found by the last ``fit`` call
        (over all sites when a list was given).
    best_performance : float
        Validation MSE of ``best_model``.
    models_ : dict
        Per-site best models keyed by site id (empty for the single-frame form).
    """

    # Columns that identify or label a row and must not be used as features.
    _NON_FEATURE_COLUMNS = ['log_bike_count', 'site_id', 'date', 'track_id']
    _TARGET_COLUMN = 'log_bike_count'

    def __init__(self, model=CatBoostRegressor(iterations=100, depth=5, learning_rate=0.1),
                 random_state=42, save_path=(os.path.join("..", "Trained_Models"))):
        self.model = model
        self.random_state = random_state
        self.best_model = None
        self.best_performance = float('inf')  # lower MSE is better
        self.save_path = save_path

    def _fit_best_fold(self, X, y):
        """Fit one fresh clone per fold and return ``(model, mse)`` of the best fold."""
        kf = KFold(n_splits=10, shuffle=True, random_state=self.random_state)
        best_model, best_performance = None, float('inf')

        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # A new clone per fold: refitting one shared object would make
            # every "best model" reference point at the last fold's fit.
            model = clone(self.model)
            model.set_params(random_seed=self.random_state)
            model.fit(X_train, y_train, verbose=False)

            performance = mean_squared_error(y_test, model.predict(X_test))
            if performance < best_performance:
                best_model, best_performance = model, performance

        return best_model, best_performance

    def _save(self, model, filename):
        """Dump ``model`` into the save directory, creating the directory if needed."""
        directory = os.path.expanduser(self.save_path)
        os.makedirs(directory, exist_ok=True)
        joblib.dump(model, os.path.join(directory, filename))

    def fit(self, X, y=None):
        """Train and persist the model(s); see the class docstring for input forms.

        Raises
        ------
        ValueError
            If ``X`` is a single DataFrame and ``y`` is not given.
        """
        # Reset all fitted state so repeated fit() calls are independent.
        self.best_model = None
        self.best_performance = float('inf')
        self.models_ = {}

        if isinstance(X, (list, tuple)):
            for site_df in X:
                if len(site_df) == 0:
                    continue  # nothing to train on for this site
                site_id = site_df['site_id'].iloc[0]
                features = site_df.drop(columns=self._NON_FEATURE_COLUMNS)
                target = site_df[self._TARGET_COLUMN]

                model, performance = self._fit_best_fold(features, target)
                self._save(model, f"site_id_{site_id}_model_catboost.joblib")
                self.models_[site_id] = model

                if performance < self.best_performance:
                    self.best_model = model
                    self.best_performance = performance
            return self

        if y is None:
            raise ValueError("y is required when X is a single DataFrame")

        self.best_model, self.best_performance = self._fit_best_fold(X, y)
        self._save(self.best_model, "model_catboost.joblib")
        return self

def add_prediction_column(X):
    """Score each per-site DataFrame with its saved model and stack the results.

    NOTE: this function is defined a second time in a later cell of the
    notebook; when the cells run in order, the later definition is the one
    in effect.

    Parameters
    ----------
    X : list of pandas.DataFrame
        One DataFrame per site, each with 'site_id', 'date', 'track_id',
        'log_bike_count' and the feature columns the model was trained on.

    Returns
    -------
    pandas.DataFrame
        All rows concatenated with a fresh index, with a 'prediction' column
        and without 'log_bike_count'. Rows of a site whose model file is
        missing get NaN predictions. The input frames are not modified.
    """
    scored_frames = []
    for df in X:
        if len(df) == 0:
            continue  # no rows to score and no site_id to read

        # All rows of a frame belong to the same site.
        site_id_value = df['site_id'].iloc[0]
        model_path = os.path.join("..", "Trained_Models", f"site_id_{site_id_value}_model_catboost.joblib")

        if os.path.exists(model_path):
            # joblib.load unpickles the file: only load models you trained yourself.
            model = joblib.load(model_path)
            features = df.drop(columns=['log_bike_count', 'site_id', 'date', 'track_id'])
            scored = df.assign(prediction=model.predict(features))
        else:
            print(f"Model file not found for site_id {site_id_value}")
            # Keep the column present so downstream code sees explicit NaNs
            # rather than a missing column.
            scored = df.assign(prediction=np.nan)
        scored_frames.append(scored)

    out = pd.concat(scored_frames, ignore_index=True)
    return out.drop(columns='log_bike_count')


# class ModelGenGridSearch(BaseEstimator, TransformerMixin):
#     def __init__(self, model=RandomForestRegressor(n_estimators=200, n_jobs=-1), random_state=42, save_path=(os.path.join("..", "Trained_Models"))):
#         self.model = model
#         self.random_state = random_state
#         self.models = []
#         self.save_path = save_path

#     def fit(self, X, y=None):
#         self.models = []  # Clear previous models

#         for idx, df in enumerate(X):
#             X_train = df.drop(columns=['log_bike_count', 'site_id', 'date', 'track_id'], axis=1) 
#             y_train = df['log_bike_count']

#             # Set random state if provided
#             self.model.set_params(random_state=self.random_state)
#             print('kfold_')

#             # Use 5-fold cross-validation
#             kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)

#             # Hyperparameter tuning using GridSearchCV
#             param_grid = {
#                 'max_depth': [None, 10, 20, 30],
#                 'min_samples_split': [2, 5, 10],
#                 # 'min_samples_leaf': [1, 2, 4],
#                 # 'max_features': ['auto', 'sqrt', 'log2']
#             }

#             grid_search = GridSearchCV(self.model, param_grid, cv=kf, scoring='neg_mean_squared_error')
#             grid_search.fit(X_train, y_train)

#             # Get the best model from GridSearchCV
#             best_model = grid_search.best_estimator_

#             # Save the trained model to the desktop
#             model_filename = f"site_ID_{df['site_id'].iloc[0]}_model.joblib"
#             model_path = os.path.join(os.path.expanduser(self.save_path), model_filename)
#             joblib.dump(best_model, model_path)

#             self.models.append(best_model)

#         return self
    


Creating our pipeline with the following steps:

In [3]:
# Stage 1 - feature engineering on the single, full DataFrame.
preprocessing_steps = [
    ('column_selector', ColumnSelector()),
    ('date_formatter', DateFormatter()),
    ('add_restriction_level', AddRestrictionLevel()),
    ('holidays_fr', HolidaysFR()),
    ('MergeWeatherCovid', MergeWeatherCovid()),
    ('MergeMultimodal', MergeMultimodal()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

# Stage 2 - split into one DataFrame per site, then encode and clean each one.
splitting_steps = [
    ('SplitBySite', SplitBySite()),
    ('EncodeCounter', EncodeCounter()),
    ('DropOutliers',DropOutliers()),
]
spliter = Pipeline(steps=splitting_steps)

# Full chain: DataFrame in, list of per-site DataFrames out.
combined_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('spliter', spliter),
])

Now we train our models and store them

In [4]:
# Load the labelled training data.
train_path = os.path.join("..", "Datasets", "train.parquet")
df = pd.read_parquet(train_path)

# Run the preprocessing chain; the result is a list of per-site DataFrames.
A = combined_pipeline.fit_transform(df)

# Fit the models and write them to disk.
model_gen = ModelGen()
model_gen.fit(A)

Making a function to add predictions

In [5]:
def add_prediction_column(X):
    """Score each per-site DataFrame with its saved model and stack the results.

    NOTE: a function with the same name is also defined in the earlier
    class-definitions cell; when the cells run in order, this later
    definition is the one in effect.

    Parameters
    ----------
    X : list of pandas.DataFrame
        One DataFrame per site, each with 'site_id', 'date', 'track_id',
        'log_bike_count' and the feature columns the model was trained on.

    Returns
    -------
    pandas.DataFrame
        All rows concatenated with a fresh index, with a 'prediction' column
        and without 'log_bike_count'. Rows of a site whose model file is
        missing get NaN predictions. The input frames are not modified.
    """
    scored_frames = []
    for df in X:
        if len(df) == 0:
            continue  # no rows to score and no site_id to read

        # All rows of a frame belong to the same site.
        site_id_value = df['site_id'].iloc[0]
        model_path = os.path.join("..", "Trained_Models", f"site_id_{site_id_value}_model_catboost.joblib")

        if os.path.exists(model_path):
            # joblib.load unpickles the file: only load models you trained yourself.
            model = joblib.load(model_path)
            features = df.drop(columns=['log_bike_count', 'site_id', 'date', 'track_id'])
            scored = df.assign(prediction=model.predict(features))
        else:
            print(f"Model file not found for site_id {site_id_value}")
            # Keep the column present so downstream code sees explicit NaNs
            # rather than a missing column.
            scored = df.assign(prediction=np.nan)
        scored_frames.append(scored)

    out = pd.concat(scored_frames, ignore_index=True)
    return out.drop(columns='log_bike_count')

Now we run the pipeline on the full test set and write the submission file

In [6]:
# Load the unlabelled test set. ColumnSelector requires a 'log_bike_count'
# column, so a zero-filled placeholder is added; add_prediction_column drops
# it again after scoring.
df_test = pd.read_parquet(os.path.join("..", "Datasets", "final_test.parquet"))
df_test['log_bike_count'] = 0

# Preprocess into per-site frames, then score them with the saved models.
site_frames = combined_pipeline.fit_transform(df_test)
df_test_preprocessed = add_prediction_column(site_frames)

# Restore the original row order and switch to the submission column names.
submission_names = {'track_id': 'Id', 'prediction': 'log_bike_count'}
df_sorted = df_test_preprocessed.sort_values(by='track_id').rename(columns=submission_names)

# Keep only the two submission columns.
selected_columns = ['Id', 'log_bike_count']
result_df = df_sorted[selected_columns]

# Write the submission file.
submission_path = os.path.join("..", "Code", "submission52.csv")
result_df.to_csv(submission_path, index=False)