# Catboost


In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [2]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import pandas as pd

df_all = pd.read_csv('combined_ema.csv')

columns_to_drop = [
    'stressor_partner', 'stressor_fam', 'stressor_breakdown',
    'stressor_money', 'stressor_selfcare', 'stressor_health',
    'stressor_otherhealth', 'stressor_household', 'stressor_child',
    'stressor_discrimination', 'stressor_none', 'moststressful',
    'moststressful_time', 'work_location', 'attend_fidam', 'attend_fidpm',
    'attend_hasp', 'attend_pgy1did', 'attend_pgy2did', 'attend_pgy3did',
    'attend_none', 'work_start', 'work_end', 'jobperformance_best',
    'jobsatisfaction', "jobperformance2","date","Sleep1BeginTimestamp",
    'Sleep1EndTimestamp', 'Sleep2BeginTimestamp', 'Sleep2EndTimestamp', 'Sleep3BeginTimestamp', 'Sleep3EndTimestamp',


]

aud = [f"audio_{i}" for i in range(87)]

columns_to_drop.extend(aud)

df_all.drop(columns=columns_to_drop, inplace=True, errors='ignore')


import numpy as np

df_all['jobperformance'] = df_all['jobperformance'].replace(' ', np.nan)

df_all.dropna(subset=['jobperformance'], inplace=True)

df_all['jobperformance'] = df_all['jobperformance'].astype(float)

Y = df_all['jobperformance'].astype(float)  # Ensure it is numeric
X = df_all.drop(['jobperformance'], axis=1)

for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)
for col in X_train.select_dtypes(include=['category']).columns:
    X_train[col] = X_train[col].astype(str).fillna('missing')
    X_val[col] = X_val[col].astype(str).fillna('missing')

model = CatBoostRegressor(verbose=False)
model.fit(
    X_train, Y_train,
    cat_features=[X_train.columns.get_loc(c) for c in X_train.select_dtypes(['object']).columns]  # Now it expects object type since all are converted to string
)

predictions = model.predict(X_val)



In [3]:
from sklearn.metrics import mean_squared_error, r2_score


import numpy as np

def concordance_correlation_coefficient(y_true, y_pred):
    """
    Compute the Concordance Correlation Coefficient for two arrays.
    """
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)

    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    covariance = np.mean((y_pred - mean_pred) * (y_true - mean_true))

    ccc = (2 * covariance) / (var_pred + var_true + (mean_pred - mean_true) ** 2)

    return ccc


train_predictions = model.predict(X_train)

val_predictions = model.predict(X_val)

train_mse = mean_squared_error(Y_train, train_predictions)
train_r2 = r2_score(Y_train, train_predictions)
print("Training Data - Mean Squared Error: {:.2f}".format(train_mse))
print("Training Data - R^2 Score: {:.2f}".format(train_r2))

val_mse = mean_squared_error(Y_val, val_predictions)
val_r2 = r2_score(Y_val, val_predictions)
print("Validation Data - Mean Squared Error: {:.2f}".format(val_mse))
print("Validation Data - R^2 Score: {:.2f}".format(val_r2))



train_ccc = concordance_correlation_coefficient(Y_train, train_predictions)
val_ccc = concordance_correlation_coefficient(Y_val, val_predictions)

print("Training Data - CCC: {:.2f}".format(train_ccc))
print("Validation Data - CCC: {:.2f}".format(val_ccc))


Training Data - Mean Squared Error: 0.27
Training Data - R^2 Score: 0.79
Validation Data - Mean Squared Error: 0.81
Validation Data - R^2 Score: 0.43
Training Data - CCC: 0.87
Validation Data - CCC: 0.64


In [4]:
X_val.keys()

Index(['id', 'completed_ts', 'Cardio_caloriesOut', 'Cardio_max', 'Cardio_min',
       'Cardio_minutes', 'Fat Burn_caloriesOut', 'Fat Burn_max',
       'Fat Burn_min', 'Fat Burn_minutes', 'NumberSteps',
       'Out of Range_caloriesOut', 'Out of Range_max', 'Out of Range_min',
       'Out of Range_minutes', 'Peak_caloriesOut', 'Peak_max', 'Peak_min',
       'Peak_minutes', 'RestingHeartRate', 'Sleep1Efficiency',
       'Sleep1MinutesAwake', 'Sleep1MinutesStageDeep',
       'Sleep1MinutesStageLight', 'Sleep1MinutesStageRem',
       'Sleep1MinutesStageWake', 'Sleep2Efficiency', 'Sleep2MinutesAwake',
       'Sleep2MinutesStageDeep', 'Sleep2MinutesStageLight',
       'Sleep2MinutesStageRem', 'Sleep2MinutesStageWake', 'Sleep3Efficiency',
       'Sleep3MinutesAwake', 'Sleep3MinutesStageDeep',
       'Sleep3MinutesStageLight', 'Sleep3MinutesStageRem',
       'Sleep3MinutesStageWake', 'SleepMinutesAsleep', 'SleepMinutesInBed',
       'SleepPerDay', 'survey_type', 'delivered_ts', 'started_ts', '

# XGBoost

In [5]:
X_train_xg = X_train.copy()
X_val_xg = X_val.copy()
Y_train_xg = Y_train.copy()
Y_val_xg = Y_val.copy()

In [6]:
X_train_xg = X_train_xg.astype('category')
X_val_xg = X_val.astype('category')

In [7]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
for column in X_train_xg.columns:
    if X_train_xg[column].dtype == 'object':
        X_train_xg[column].fillna('missing', inplace=True)
        X_val_xg[column].fillna('missing', inplace=True)

        X_train_xg[column] = label_enc.fit_transform(X_train_xg[column])
        X_val_xg[column] = label_enc.transform(X_val_xg[column])


In [8]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.6, learning_rate = 0.1,
                             max_depth = 5, alpha = 10, n_estimators = 100,
                                 enable_categorical=True,)

xgb_model.fit(X_train_xg, Y_train_xg)

xgb_train_predictions = xgb_model.predict(X_train_xg)

xgb_val_predictions = xgb_model.predict(X_val_xg)

xgb_train_mse = mean_squared_error(Y_train_xg, xgb_train_predictions)
xgb_train_r2 = r2_score(Y_train_xg, xgb_train_predictions)

xgb_val_mse = mean_squared_error(Y_val_xg, xgb_val_predictions)
xgb_val_r2 = r2_score(Y_val_xg, xgb_val_predictions)

print("XGBoost Training Data - Mean Squared Error: {:.2f}".format(xgb_train_mse))
print("XGBoost Training Data - R² Score: {:.2f}".format(xgb_train_r2))
print("XGBoost Validation Data - Mean Squared Error: {:.2f}".format(xgb_val_mse))
print("XGBoost Validation Data - R² Score: {:.2f}".format(xgb_val_r2))

def concordance_correlation_coefficient(y_true, y_pred):
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    covariance = np.mean((y_pred - mean_pred) * (y_true - mean_true))
    ccc = (2 * covariance) / (var_pred + var_true + (mean_pred - mean_true) ** 2)
    return ccc

xgb_train_ccc = concordance_correlation_coefficient(Y_train_xg, xgb_train_predictions)
xgb_val_ccc = concordance_correlation_coefficient(Y_val_xg, xgb_val_predictions)

print("XGBoost Training Data - CCC: {:.2f}".format(xgb_train_ccc))
print("XGBoost Validation Data - CCC: {:.2f}".format(xgb_val_ccc))


XGBoost Training Data - Mean Squared Error: 0.44
XGBoost Training Data - R² Score: 0.66
XGBoost Validation Data - Mean Squared Error: 1.12
XGBoost Validation Data - R² Score: 0.22
XGBoost Training Data - CCC: 0.75
XGBoost Validation Data - CCC: 0.34


# LightGBM

In [9]:
Xt_backup = X_train.copy()
Xv_backup = X_val.copy()
Yt_backup = Y_train.copy()
Yv_backup = Y_val.copy()

In [10]:
X_train_l = X_train.copy()
X_val_l = X_val.copy()
Y_train_l = Y_train.copy()
Y_val_l = Y_val.copy()

In [12]:
categorical_features = []
for column in X_train.columns:
    if X_train[column].dtype == 'object':
        X_train[column] = X_train[column].astype('category')
        X_val[column] = X_val[column].astype('category')
        categorical_features.append(column)


In [13]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Initialize LightGBM
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    categorical_feature=categorical_features  # Ensure categorical features are specified
)

# Fit the model
lgb_model.fit(
    X_train, Y_train,
    eval_set=[(X_val, Y_val)],
    eval_metric='mse',
)

# Predictions
lgb_train_predictions = lgb_model.predict(X_train)
lgb_val_predictions = lgb_model.predict(X_val)



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 542, number of used features: 37
[LightGBM] [Info] Start training from score 2.966790


In [14]:
# Metrics
print("LightGBM")
train_mse = mean_squared_error(Y_train, lgb_train_predictions)
train_r2 = r2_score(Y_train, lgb_train_predictions)
val_mse = mean_squared_error(Y_val, lgb_val_predictions)
val_r2 = r2_score(Y_val, lgb_val_predictions)

print("Training MSE: {:.3f}, R2: {:.2f}".format(train_mse, train_r2))
print("Validation MSE: {:.3f}, R2: {:.2f}".format(val_mse, val_r2))

train_ccc = concordance_correlation_coefficient(Y_train, lgb_train_predictions)
val_ccc = concordance_correlation_coefficient(Y_val_l, lgb_val_predictions)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))


LightGBM
Training MSE: 0.269, R2: 0.79
Validation MSE: 0.863, R2: 0.39
Training CCC: 0.867
Validation CCC: 0.600


# Sk-learn

In [15]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = SimpleImputer(strategy='median')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train_prepared = preprocessor.fit_transform(X_train)
X_val_prepared = preprocessor.transform(X_val)


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_prepared, Y_train)
rf_train_preds = rf_model.predict(X_train_prepared)
rf_val_preds = rf_model.predict(X_val_prepared)

# Support Vector Machine
svm_model = SVR()
svm_model.fit(X_train_prepared, Y_train)
svm_train_preds = svm_model.predict(X_train_prepared)
svm_val_preds = svm_model.predict(X_val_prepared)

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_prepared, Y_train)
lr_train_preds = lr_model.predict(X_train_prepared)
lr_val_preds = lr_model.predict(X_val_prepared)

# Lasso Regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_prepared, Y_train)
lasso_train_preds = lasso_model.predict(X_train_prepared)
lasso_val_preds = lasso_model.predict(X_val_prepared)

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(name, train_actuals, train_preds, val_actuals, val_preds):
    train_mse = mean_squared_error(train_actuals, train_preds)
    train_r2 = r2_score(train_actuals, train_preds)
    val_mse = mean_squared_error(val_actuals, val_preds)
    val_r2 = r2_score(val_actuals, val_preds)
    print(f"{name} - Training MSE: {train_mse:.3f}, R²: {train_r2:.2f}")
    print(f"{name} - Validation MSE: {val_mse:.3f}, R²: {val_r2:.2f}")

evaluate_model('Random Forest', Y_train, rf_train_preds, Y_val, rf_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, rf_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, rf_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))

evaluate_model('SVM', Y_train, svm_train_preds, Y_val, svm_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, svm_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, svm_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))
evaluate_model('Linear Regression', Y_train, lr_train_preds, Y_val, lr_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, lr_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, lr_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))
evaluate_model('Lasso', Y_train, lasso_train_preds, Y_val, lasso_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, lasso_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, lasso_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))




Random Forest - Training MSE: 0.112, R²: 0.91
Random Forest - Validation MSE: 0.793, R²: 0.44
Training CCC: 0.946
Validation CCC: 0.596
SVM - Training MSE: 1.278, R²: 0.00
SVM - Validation MSE: 1.426, R²: -0.00
Training CCC: 0.009
Validation CCC: 0.007
Linear Regression - Training MSE: 0.018, R²: 0.99
Linear Regression - Validation MSE: 0.932, R²: 0.35
Training CCC: 0.993
Validation CCC: 0.623
Lasso - Training MSE: 1.251, R²: 0.02
Lasso - Validation MSE: 1.458, R²: -0.02
Training CCC: 0.043
Validation CCC: 0.015
