In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error


import warnings
# Silence DeprecationWarning messages for everything executed after this cell.
warnings.filterwarnings("ignore", category=DeprecationWarning)




In [2]:
# Make the parent of the current working directory importable so the
# project-local `utils` package can be found from this notebook.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guarded so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# NOTE(review): star imports hide where names come from; later cells use
# `get_absolute_path` and `column_data_types`, presumably provided by these
# modules — consider importing them explicitly once confirmed.
from utils.utils import *
from utils.constants import *

# Data

To make a valid comparison across different methods, we split the original `stack_train` into new training and validation sets.

In [3]:
# Read the four CSV inputs: the targets (y_*) and the stacked feature frames
# (stack_*). Paths are resolved through `get_absolute_path`, which is defined
# outside this cell (presumably in the star-imported utils modules).
y_train, y_test, stack_train, stack_test = (
    pd.read_csv(get_absolute_path(csv_name, 'data'))
    for csv_name in (
        'y_train.csv',
        'y_test.csv',
        'stacked_X_tr.csv',
        'stacked_X_te.csv',
    )
)



In [5]:
# Display the column labels of the training feature frame.
stack_train.columns

Index(['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
       'pH, water, unfiltered, field, standard units (Maximum)',
       'pH, water, unfiltered, field, standard units (Minimum)',
       'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
       'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
       'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
       'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
       'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
       'Temperature, water, degrees Celsius (Mean)',
       'Temperature, water, degrees Celsius (Minimum)',
       'Temperature, water, degrees Celsius (Maximum)', 'Date', 'Location_ID'],
      dtype='object')

In [7]:
# Cast both feature frames with the same `column_data_types` spec, which is
# defined outside this cell (presumably in utils.constants), so that train
# and test share identical dtypes.
stack_train, stack_test = (
    frame.astype(column_data_types) for frame in (stack_train, stack_test)
)

In [8]:
# Print a summary of the training frame: index range, columns, non-null counts and dtypes.
stack_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15651 entries, 0 to 15650
Data columns (total 13 columns):
 #   Column                                                                                                Non-Null Count  Dtype  
---  ------                                                                                                --------------  -----  
 0   Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)  15651 non-null  float64
 1   pH, water, unfiltered, field, standard units (Maximum)                                                15651 non-null  float64
 2   pH, water, unfiltered, field, standard units (Minimum)                                                15651 non-null  float64
 3   Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)  15651 non-null  float64
 4   Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)   

# Feature Engineering

In [10]:
# Print a summary of the test frame: index range, columns, non-null counts and dtypes.
stack_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10434 entries, 0 to 10433
Data columns (total 13 columns):
 #   Column                                                                                                Non-Null Count  Dtype  
---  ------                                                                                                --------------  -----  
 0   Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)  10434 non-null  float64
 1   pH, water, unfiltered, field, standard units (Maximum)                                                10434 non-null  float64
 2   pH, water, unfiltered, field, standard units (Minimum)                                                10434 non-null  float64
 3   Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)  10434 non-null  float64
 4   Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)   

## NOTE: 

For now, pretend that we already have the best model.

**TODO:** this part is still a placeholder and remains to be done.

In [14]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Column groups ----------------------------------------------------------
# Numeric features: every float64 column of the training frame.
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
# Categorical features: only 'Location_ID' for now ('Date' is currently excluded).
categorical_columns = ['Location_ID']

# --- Per-group transformers -------------------------------------------------
# Standardise the numeric columns; one-hot encode the categorical ones,
# dropping the first level of each feature.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Route each column group to its transformer. Columns listed in neither group
# are dropped by ColumnTransformer's default `remainder='drop'`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

# Learn the scaling statistics and category levels on the training frame only,
# then apply the fitted transformation to the test frame.
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed = preprocessor.transform(stack_test)

# Wrap features and targets in XGBoost's DMatrix container.
# Note that `dval` is built from the *test* features and targets.
dtrain = xgb.DMatrix(data=X_train_preprocessed, label=y_train)
dval = xgb.DMatrix(data=X_test_preprocessed, label=y_test)

In [15]:
# Booster parameters for the native `xgb.train` API.
# `n_estimators` is intentionally not set here: `xgb.train` does not use it
# (it only produced a 'Parameters: { "n_estimators" } are not used.' warning).
# The number of boosting rounds is controlled by `num_boost_round` below.
params = {
    'objective': 'reg:squarederror',
    'max_depth': 7,
    'learning_rate': 0.1,
}

# Train for at most 100 rounds, stopping early if the monitored RMSE has not
# improved for 10 consecutive rounds.
# NOTE(review): `dval` is built from the test set, so early stopping and the
# RMSE reported below use the same data — confirm this is intended, or switch
# to a separate validation split.
model_xgb = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dval, 'validation')],
    early_stopping_rounds=10,
)

# Predict on the same DMatrix that was monitored during training.
y_pred_xgb = model_xgb.predict(dval)

# RMSE as sqrt(MSE). The `squared=False` argument of `mean_squared_error` is
# deprecated and removed in newer scikit-learn releases; taking the square
# root explicitly gives the same value for a single-output target and works
# across versions.
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("XGBoost RMSE:", rmse_xgb)

Parameters: { "n_estimators" } are not used.

[0]	validation-rmse:0.14855
[1]	validation-rmse:0.13389
[2]	validation-rmse:0.12068
[3]	validation-rmse:0.10880
[4]	validation-rmse:0.09815
[5]	validation-rmse:0.08857
[6]	validation-rmse:0.07997
[7]	validation-rmse:0.07222
[8]	validation-rmse:0.06529
[9]	validation-rmse:0.05908
[10]	validation-rmse:0.05351
[11]	validation-rmse:0.04849
[12]	validation-rmse:0.04402
[13]	validation-rmse:0.04004
[14]	validation-rmse:0.03647
[15]	validation-rmse:0.03330
[16]	validation-rmse:0.03047
[17]	validation-rmse:0.02797
[18]	validation-rmse:0.02573
[19]	validation-rmse:0.02378
[20]	validation-rmse:0.02206
[21]	validation-rmse:0.02057
[22]	validation-rmse:0.01927
[23]	validation-rmse:0.01814
[24]	validation-rmse:0.01716
[25]	validation-rmse:0.01629
[26]	validation-rmse:0.01554
[27]	validation-rmse:0.01491
[28]	validation-rmse:0.01436
[29]	validation-rmse:0.01387
[30]	validation-rmse:0.01349
[31]	validation-rmse:0.01315
[32]	validation-rmse:0.01287
[33]	va

<xgboost.core.Booster at 0x13599e1d0>