In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas.api.types as ptypes
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import optuna
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# activity key, value pair

activitis_dict = {"Indoor climbing" : 1, "Run" : 2, "Strength training" : 3, "Swim" : 4, "Bike" : 5, "Dancing" : 6, 
             "Stairclimber" : 7, "Spinning" : 8, "Walking" : 9, "HIIT" : 10, "Outdoor Bike" : 11, "Walk" : 12, "Aerobic Workout" : 13,
             "Tennis" : 14, "Workout" : 15, "Hike" : 16, "Zumba": 17, "Sport" : 18, "Yoga" : 19, "Swimming" : 20, "Weights" : 21,
             "Running" : 22, "Cycling" : 23, "CoreTraining" : 24}

# Dataset Processing and some explotation

In [3]:
#%%writefile dataset_preprocessing.py
class Explore_Dataset_and_Post_Processing:
    def __init__(self, train_csv_path):
        assert train_csv_path is not None, "train_csv_path cannot be none"
        self.train_csv_path = train_csv_path
        self.train_df = pd.read_csv(self.train_csv_path)
    
    # some general functions
    def get_shape(self):
        return self.train_df.shape
    
    def get_df_description(self):
        return self.train_df.describe()
    
    def get_column_wise_null(self):
        """
        return the number of values that are null
        """
        return self.train_df.isnull().sum()
    
    def get_percentage_of_missing_values(self): 
        """
        returns the pecentage of missing values
        """
        # getting the percentage of missing values
        missing_values = self.train_df.isnull().sum()
        total_rows = self.train_df.shape[0]
        missing_percentage = (missing_values / total_rows) * 100

        results_df = pd.DataFrame({'Column': missing_values.index,
                                  'Missing Values': missing_values.values,
                                  'Missing Percentage (%)': missing_percentage.values})

        results_df = results_df.sort_values(by='Missing Percentage (%)', ascending=False)
        return results_df
    
    def pp_convert_datetime(self):
        """
        converts the time column to date and time
        """
        assert ptypes.is_datetime64_any_dtype(self.train_df["time"]) is not True, "DateTime is already converted" 
        self.train_df.time = pd.to_datetime(self.train_df.time, format='%H:%M:%S')
        return self
        
    def pp_replace_null_values(self):
        """
        replace all null values with 0
        """
        # check if nan values are present
        has_nan = self.train_df.isnull().values.any()
        if not has_nan:
            print("No existing null values in the dataframe.")
        else:
            self.train_df = self.train_df.fillna(0)
        return self
            
    def pp_drop_id_pnum_column(self):
        self.train_df = self.train_df.drop(["id", "p_num", "time"], axis=1)
        return self
        
    def pp_map_activities_to_number(self, activity_dict):
        """
        Fuction to map the activity names to integers
        """
        # get all the columns with name activity
        try:
            activity_columns = list(self.train_df.filter(like='activity').columns)
            self.train_df[activity_columns] = self.train_df[activity_columns].replace(activitis_dict)
        except AssertionError as e:
            print(f"AssertionError caught: {e}")
        return self
            
    def pp_replace_nan_with_mean(self):
        """
        Function to replace the nan values with mean
        """
        # get the numerical columns in the dataset
        numerical_columns = self.train_df.select_dtypes(include = ['number']).columns
        
        # now replace for every column with nan values the  mean of that column
        self.train_df[numerical_columns] = self.train_df[numerical_columns].apply(lambda col: col.fillna(col.mean()))
        return self
        
    def pp_normalize_numerical_values(self):
        """
        Normlaize the numerical values. Here we do min ma normalization
        """
        # get the numerical columns in the dataset
        numerical_columns = list(self.train_df.select_dtypes(include = ['number']).columns)
        for column in numerical_columns:
            self.train_df[column] = (self.train_df[column] - self.train_df[column].min()) / (self.train_df[column].max() - self.train_df[column].min())
            
        return self
        
    def pp_impute_numeical_columns(self):
        numerical_columns = list(self.train_df.select_dtypes(include = ['number']).columns)
        try:
            numerical_columns.remove('bg+1:00')
        except:
            pass
        imputer = SimpleImputer(strategy='mean')
        self.train_df[numerical_columns] = imputer.fit_transform(self.train_df[numerical_columns])
        scaler = StandardScaler()
        self.train_df[numerical_columns] = scaler.fit_transform(self.train_df[numerical_columns])
        return self
        
    def pp_encode_categorical_columns(self):
        categorical_cols = [col for col in self.train_df.columns if 'activity' in col]
        for col in categorical_cols:
            self.train_df[col] = self.train_df[col].fillna('None')
            le = LabelEncoder()
            self.train_df[col] = le.fit_transform(self.train_df[col])
        return self

    def pp_drop_columns_with_missing_values(self, missing_df,pecentage):
        """
        missing_value_df : o/p of the function obtained from "get_percentage_of_missing_values"
        percentage : column with missing values >= pecentage is dropped for training and testing
        """
        # list of columns with missing values  > 90
        filtered_columns = missing_df[missing_df['Missing Percentage (%)'] > pecentage]['Column'].tolist()
        # now drop all these columns
        self.train_df = self.train_df.drop(filtered_columns, axis = 1)
        return self

    def get_final_df(self):
        return self.train_df
            

# Unit Tests if needed 

In [None]:
class Unit_Tests:
    def __init__(self, df):
        self.df = df
        
    def check_nan_replaced_with_mean(self):
        # first check no Nan values
        numerical_columns = self.df.select_dtypes(include = ['number']).columns
        for col in numerical_columns:
            if self.df[col].isna().sum() == 0:
                return True
            else:
                return False

        
    def check_values_are_normalized(self):
        # check if values are normalized
        numerical_columns = self.df.select_dtypes(include = ['number']).columns
        for col in numerical_columns:
            has_greater_than_one = (self.df[col] > 1).any()
            if has_greater_than_one:
                return False
            
        return True

In [None]:
## This part will be the Unit tests column
train_csv_path = "/kaggle/input/brist1d/train.csv"
obj_ex = Explore_Dataset_and_Post_Processing(train_csv_path)
obj_ex.pp_normalize_numerical_values()

obj_utest = Unit_Tests(df = obj_ex.train_df)
obj_utest.check_values_are_normalized()

# Dataset Creation

In [4]:
class DatasetCreation:
    def __init__(self, train_df, activities_dict, train_ratio, test_ratio):
        self.actvities_dict = activities_dict
        """
        few assertions and checks needed to be done
        """
        # check if all nun values are non eistent
        if train_df.isnull().values.any():
            raise ValueError("There should not be NaN in training data")
        
        # assert that id and p_num is dropped
        columns = list(train_df.columns)
        assert "id" not in columns, "There should not be the id column"
        assert "p_num" not in columns, "There should not be the p_num column"
        
        # check that all the activities column have dtype float64
        activity_columns = list(train_df.filter(like='activity').columns)
        for col in activity_columns:
            for value in train_df[col]:
                if value == 0:
                    pass
                elif value not in self.actvities_dict.values():
                    raise AssertionError(f"Value {value} in column {col} was not correctly mapped!")
        
        # check if train and test ratio is float
        assert isinstance(train_ratio, float), f"Expected float, but got {type(x).__name__}"
        assert isinstance(test_ratio, float), f"Expected float, but got {type(x).__name__}"
        
        self.train_df = train_df
        self.train_ratio = train_ratio
        self.test_ratio = test_ratio
        
    def return_dataset(self):
        train_X = self.train_df.drop(['bg+1:00'], axis = 1)
        train_Y = self.train_df['bg+1:00']
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(train_X, train_Y, test_size=self.test_ratio, random_state=42)
        
        return X_train_split, X_val_split, y_train_split, y_val_split

In [5]:
train_csv_path = "/kaggle/input/brist1d/train.csv"
obj_data_pipeline = Explore_Dataset_and_Post_Processing(train_csv_path)

# Simple training Data creation pipeline
"""
Here the order of preprocessing is very important
"""
# get the percentage of missing values df
missing_values_df = obj_data_pipeline.get_percentage_of_missing_values()
print(missing_values_df.head())
# preporcessing data
processed_train_df = (obj_data_pipeline
                     .pp_drop_id_pnum_column()
                     .pp_drop_columns_with_missing_values(missing_values_df, 90)
                     .pp_replace_null_values()
                     .pp_map_activities_to_number(activitis_dict)
                     .get_final_df())

print("[!] Number of columns in processed df : ", len(processed_train_df.columns))
obj_data_create = DatasetCreation(processed_train_df, activities_dict=activitis_dict, train_ratio=0.8, test_ratio=0.2)
x_train, x_val, y_train, y_val  = obj_data_create.return_dataset()

print("Type of X_train : ", type(x_train))

  self.train_df = pd.read_csv(self.train_csv_path)


         Column  Missing Values  Missing Percentage (%)
164  carbs-4:30          174496               98.571945
170  carbs-4:00          174492               98.569685
161  carbs-4:45          174491               98.569121
155  carbs-5:15          174490               98.568556
152  carbs-5:30          174490               98.568556
[!] Number of columns in processed df :  361
Type of X_train :  <class 'pandas.core.frame.DataFrame'>


# XGBOOST 

## Using Optuna to do hyper parameter Tuning

In [6]:
"""
Using optuna for hyperparameter tuning
## Refrence : https://www.kaggle.com/code/danishyousuf19/blood-glucose-prediction
"""
def objective(trial):
    # Suggest hyperparameters for tuning
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1500),
        'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
        'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 3, 10),
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
        'tree_method': 'hist',  
        'device':'cuda'
    }
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_val, label=y_val)
    
    # Train the model
    model = xgb.train(params, dtrain, evals=[(dvalid, 'validation')], num_boost_round=1500, early_stopping_rounds=35, verbose_eval=False)
    
    # Predict on the validation set
    y_pred_valid = model.predict(dvalid)
    
    # Calculate RMSE on the validation set
    rmse = mean_squared_error(y_val, y_pred_valid, squared=False)
    print("RMSE = ",rmse)
    return rmse

In [7]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")



[I 2024-10-24 19:17:00,671] A new study created in memory with name: no-name-3cd6e28c-9b7d-43dc-9d79-4b5a308d4790
  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:17:15,907] Trial 0 finished with value: 2.024354083763068 and parameters: {'n_estimators': 280, 'eta': 0.005610999581349067, 'max_depth': 4, 'min_child_weight': 6.732421978084352, 'subsample': 0.4579535446609093, 'colsample_bytree': 0.41276034452813515, 'lambda': 9.417673580906056, 'alpha': 0.27857016032567916}. Best is trial 0 with value: 2.024354083763068.


RMSE =  2.024354083763068


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:17:37,548] Trial 1 finished with value: 1.6716266417045211 and parameters: {'n_estimators': 446, 'eta': 0.03284739698780513, 'max_depth': 7, 'min_child_weight': 0.0001808867136084749, 'subsample': 0.5697140988912217, 'colsample_bytree': 0.8340459026945859, 'lambda': 7.026470309076461, 'alpha': 1.8887328970170574}. Best is trial 1 with value: 1.6716266417045211.


RMSE =  1.6716266417045211


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:17:48,166] Trial 2 finished with value: 1.8436881899825033 and parameters: {'n_estimators': 1033, 'eta': 0.28342236736710313, 'max_depth': 4, 'min_child_weight': 0.00135781165057396, 'subsample': 0.3818438830799879, 'colsample_bytree': 0.5724945333212526, 'lambda': 3.436898210845372, 'alpha': 2.2136278072369326}. Best is trial 1 with value: 1.6716266417045211.


RMSE =  1.8436881899825033


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:20:22,286] Trial 3 finished with value: 1.4067348589586193 and parameters: {'n_estimators': 365, 'eta': 0.020093639162481303, 'max_depth': 14, 'min_child_weight': 0.005971759564101484, 'subsample': 0.6261651700121292, 'colsample_bytree': 0.11624704554396792, 'lambda': 8.533782245808666, 'alpha': 1.6511090194768308}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  1.4067348589586193


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:20:35,477] Trial 4 finished with value: 2.0022552477262283 and parameters: {'n_estimators': 988, 'eta': 0.01295490318417191, 'max_depth': 3, 'min_child_weight': 0.00018446308517487808, 'subsample': 0.4480767077763906, 'colsample_bytree': 0.6866538725961736, 'lambda': 8.863206496010884, 'alpha': 5.5788871107512605}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  2.0022552477262283


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:20:50,858] Trial 5 finished with value: 2.0264543336791365 and parameters: {'n_estimators': 1083, 'eta': 0.003947190375589293, 'max_depth': 4, 'min_child_weight': 0.00048382670999554155, 'subsample': 0.4811782113913951, 'colsample_bytree': 0.8961236366203157, 'lambda': 7.153323175179169, 'alpha': 8.851277392582915}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  2.0264543336791365


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:22:37,648] Trial 6 finished with value: 1.4565191486048044 and parameters: {'n_estimators': 1270, 'eta': 0.017691951967627063, 'max_depth': 12, 'min_child_weight': 0.12860010578413308, 'subsample': 0.7447449303522637, 'colsample_bytree': 0.48382313198666227, 'lambda': 6.999764989480865, 'alpha': 0.0002651965500755721}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  1.4565191486048044


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:25:39,472] Trial 7 finished with value: 1.4374974943251968 and parameters: {'n_estimators': 680, 'eta': 0.06504394026646809, 'max_depth': 13, 'min_child_weight': 0.3277154899522797, 'subsample': 0.8054254468059143, 'colsample_bytree': 0.8860938697617259, 'lambda': 9.752164556268536, 'alpha': 0.0027953917422771407}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  1.4374974943251968


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:26:04,664] Trial 8 finished with value: 2.046590236099163 and parameters: {'n_estimators': 1183, 'eta': 0.0011970903909900858, 'max_depth': 7, 'min_child_weight': 1.9859239752361009, 'subsample': 0.808280760095446, 'colsample_bytree': 0.9232109769518929, 'lambda': 9.144816350715685, 'alpha': 0.001761489458836818}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  2.046590236099163


  'eta': trial.suggest_loguniform('eta', 0.0005, 0.3),  # learning rate
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 10),
  'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 3, 10),
  'alpha': trial.suggest_loguniform('alpha', 1e-4, 10),
Parameters: { "n_estimators" } are not used.

[I 2024-10-24 19:26:22,804] Trial 9 finished with value: 1.5965784107519423 and parameters: {'n_estimators': 1089, 'eta': 0.10564276341237655, 'max_depth': 6, 'min_child_weight': 0.011599105526639494, 'subsample': 0.6426160592229997, 'colsample_bytree': 0.7377528865624307, 'lambda': 3.4815114640226796, 'alpha': 0.011377044509344326}. Best is trial 3 with value: 1.4067348589586193.


RMSE =  1.5965784107519423
Best hyperparameters: {'n_estimators': 365, 'eta': 0.020093639162481303, 'max_depth': 14, 'min_child_weight': 0.005971759564101484, 'subsample': 0.6261651700121292, 'colsample_bytree': 0.11624704554396792, 'lambda': 8.533782245808666, 'alpha': 1.6511090194768308}


In [8]:
## Defining the best params obtained from the previous step
best_params = {
    'n_estimators': 365,
    'eta': 0.020093639162481303,
    'max_depth': 14,
    'min_child_weight': 0.005971759564101484,
    'subsample': 0.62616517001,
    'colsample_bytree': 0.11624704554396792,
    'lambda': 8.533782245808666, 
    'alpha': 1.6511090194768308
}

## Class creating the XGBoost Model
### Depending on the best params received above

In [9]:
class Model_XGBoost:
    def __init__(self, params, X, Y, random_state,
                 objective = 'reg:squarederror', 
                 eval_metric = 'rmse', 
                 device = 'cuda', 
                 tree_method = 'hist',
                 booster = 'gbtree'):
        """
        X : training data X
        Y : training data Y
        params : dictionary of paramteres that were predicted above
        """
        # Train an XGBoost model
        params["objective"] = objective
        params["eval_metric"] = eval_metric
        params["device"] = device
        params["tree_method"] = tree_method
        params["booster"] = booster
        self.params = params
        self.objective = objective
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.X = X
        self.Y = Y
    
    def get_xgb_model(self):
        """
        Using the XGBoost model directly with paramteres we got from optuna
        """
        dtrain = xgb.DMatrix(self.X, label=self.Y)
        model = xgb.train(self.params, dtrain, num_boost_round=1500)
        return model
        
    def get_Regressor_model(self):
        xgb_model = XGBRegressor(objective = self.objective,
                                      n_estimators = self.n_estimators, 
                                      random_state = self.random_state)
        return xgb_model
    
    def fit(self):
        self.xgb_model.fit(self.X, self.Y)

    def xgb_predictions(self, model, x_val = None):
        assert x_val is not None, "Validation data cannot be None"
        xgb_predictions = model.predict(x_val)
        return xgb_predictions
    
    def xgb_rmse(self, y_val, predictions):
        xgb_rmse = np.sqrt(mean_squared_error(y_val, predictions))
        return xgb_rmse

In [10]:
"""
Class to implement cross validation incase it is needed
"""
from sklearn.model_selection import cross_val_score, KFold
class CrossValidation:
    def __init__(self, num_of_splits):
        self.kf = KFold(n_splits=num_of_splits, shuffle=True, random_state=42)
    def get_cv(self):
        return self.kf

## Training and prediction

In [12]:
# here is the experiment to see how number of estimators effect the RMSE
n_estimators = [100]
rmse = []
time_taken = []
implement_cross_validation = False

if implement_cross_validation == False:
    print("[*] Going for Normal XGBOOST")
    #for i in n_estimators:
    start_time = datetime.now()
    obj_xgb_model = Model_XGBoost(best_params,
                             random_state= 42,
                             X = x_train,
                             Y = y_train)

    #xgb_model.fit()
    model = obj_xgb_model.get_xgb_model()
    d_val = xgb.DMatrix(x_val)
    predictions = model.predict(d_val)
    #rmse.append(obj_xgb_model.xgb_rmse(y_val, predictions))
    rmse = obj_xgb_model.xgb_rmse(y_val, predictions)
    print("RMSE : ", rmse)
    end_time = datetime.now()
    #time_taken.append(end_time - start_time)
    print("Total time taken : ", end_time - start_time)
else:
    print("[*] Going fo Cross validation")
    
    start_time = datetime.now()
    obj_xg = Model_XGBoost(objective= 'reg:squarederror',
                             n_estimators= i,
                             random_state= 42,
                             X = x_train,
                             Y = x_train)

    boost_model = obj_xg.get_model()

    obj_cv = CrossValidation(num_of_splits=5)
    kf = obj_cv.get_cv()
    
    for train_index, validation_index in kf.split(x_train):
        X_train, X_val = x_train.iloc[train_index], x_train.iloc[validation_index]
        Y_train, Y_val = y_train.iloc[train_index], y_train.iloc[validation_index]
        boost_model.fit(X_train, Y_train)
        models.append(boost_model)
        predictions = boost_model.predict(x_val)
        rmse = obj_xg.xgb_rmse(y_val, predictions)
        val_predictions_rmse.append(rmse)
        print("Prediction : ", rmse)
    end_time = datetime.now()
    time_taken.append(end_time - start_time)
    

[*] Going for Normal XGBOOST


Parameters: { "n_estimators" } are not used.



RMSE :  1.4067348589586193
Total time taken :  0:02:30.511217


In [13]:
# saveing the model
model.save_model("/kaggle/working/xgb_best_model_24_10_2024.bin")



In [None]:
# some utility stuff for plotting
import matplotlib.pyplot as plt
# Plotting RMSE vs n_estimators
plt.figure(figsize=(14, 6))

# First subplot for RMSE vs n_estimators
plt.subplot(1, 2, 1)
plt.plot(n_estimators, rmse, marker='o', color='b')
plt.title('RMSE vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('RMSE')
plt.grid(True)

# Second subplot for Time taken vs n_estimators
time_taken_in_seconds = [t.total_seconds() for t in time_taken]
plt.subplot(1, 2, 2)
plt.plot(n_estimators, time_taken_in_seconds, marker='o', color='r')
plt.title('Time Taken vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Time Taken (seconds)')
plt.grid(True)

# Show the plots
plt.tight_layout()
plt.show()

### From the above graph we can observe the following
1. RMSE decreases non-linearly with the increase in the number of estimators
2. Time increases linearly with the number of estimators

# Test And Submission 

Here this is just for my test and finalizing which model will I be using.

I am using a separate notebook for submission

In [None]:
final_xgb_model = Model_XGBoost(objective= 'reg:squarederror',
                             n_estimators= 700,
                             random_state= 42,
                             X = x_train,
                             Y = y_train)
final_xgb_model.fit()
final_xgb_model.xgb_model.save_model("/kaggle/working/xgb_est_700_20_10_2024.bin")

In [None]:
# load_submission_df
submission_df = pd.read_csv("/kaggle/input/brist1d/sample_submission.csv")

# load test_csv
test_csv_path = "/kaggle/input/brist1d/test.csv"
obj_ex_test = Explore_Dataset_and_Post_Processing(test_csv_path)

# Simple training Data creation pipeline
obj_ex_test.pp_convert_datetime()
obj_ex_test.pp_drop_id_pnum_column()
#obj_ex.pp_replace_nan_with_mean()
#obj_ex.pp_normalize_numerical_values()
#obj_ex.pp_replace_null_values()
#obj_ex.pp_map_activities_to_number(activitis_dict)
obj_ex_test.pp_impute_numeical_columns()
obj_ex_test.pp_encode_categorical_columns()
obj_ex_test.pp_replace_null_values()

# Choose the best model based on validation performance and make predictions on the test set

test_predictions = xgb_model.predict(obj_ex_test.train_df)

# Create a submission DataFrame
submission_df['bg+1:00'] = test_predictions

# Save the submission file
submission_df.to_csv('submission.csv', index=False)



## The 700 model  is not helping at all