# Catboost


In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [11]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import pandas as pd

df_all = pd.read_csv('combined_ema.csv')

columns_to_drop = [
    'stressor_partner', 'stressor_fam', 'stressor_breakdown',
    'stressor_money', 'stressor_selfcare', 'stressor_health',
    'stressor_otherhealth', 'stressor_household', 'stressor_child',
    'stressor_discrimination', 'stressor_none', 'moststressful',
    'moststressful_time', 'work_location', 'attend_fidam', 'attend_fidpm',
    'attend_hasp', 'attend_pgy1did', 'attend_pgy2did', 'attend_pgy3did',
    'attend_none', 'work_start', 'work_end', 'jobperformance_best',
    'jobsatisfaction', "jobperformance2","date","Sleep1BeginTimestamp",
    'Sleep1EndTimestamp', 'Sleep2BeginTimestamp', 'Sleep2EndTimestamp', 'Sleep3BeginTimestamp', 'Sleep3EndTimestamp'
]

df_all.drop(columns=columns_to_drop, inplace=True, errors='ignore')


import numpy as np

df_all['jobperformance'] = df_all['jobperformance'].replace(' ', np.nan)

df_all.dropna(subset=['jobperformance'], inplace=True)

df_all['jobperformance'] = df_all['jobperformance'].astype(float)

Y = df_all['jobperformance'].astype(float)  # Ensure it is numeric
X = df_all.drop(['jobperformance'], axis=1)

for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)
for col in X_train.select_dtypes(include=['category']).columns:
    X_train[col] = X_train[col].astype(str).fillna('missing')
    X_val[col] = X_val[col].astype(str).fillna('missing')

model = CatBoostRegressor(verbose=False)
model.fit(
    X_train, Y_train,
    cat_features=[X_train.columns.get_loc(c) for c in X_train.select_dtypes(['object']).columns]  # Now it expects object type since all are converted to string
)

predictions = model.predict(X_val)



In [12]:
from sklearn.metrics import mean_squared_error, r2_score


import numpy as np

def concordance_correlation_coefficient(y_true, y_pred):
    """
    Compute the Concordance Correlation Coefficient for two arrays.
    """
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)

    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    covariance = np.mean((y_pred - mean_pred) * (y_true - mean_true))

    ccc = (2 * covariance) / (var_pred + var_true + (mean_pred - mean_true) ** 2)

    return ccc


train_predictions = model.predict(X_train)

val_predictions = model.predict(X_val)

train_mse = mean_squared_error(Y_train, train_predictions)
train_r2 = r2_score(Y_train, train_predictions)
print("Training Data - Mean Squared Error: {:.2f}".format(train_mse))
print("Training Data - R^2 Score: {:.2f}".format(train_r2))

val_mse = mean_squared_error(Y_val, val_predictions)
val_r2 = r2_score(Y_val, val_predictions)
print("Validation Data - Mean Squared Error: {:.2f}".format(val_mse))
print("Validation Data - R^2 Score: {:.2f}".format(val_r2))



train_ccc = concordance_correlation_coefficient(Y_train, train_predictions)
val_ccc = concordance_correlation_coefficient(Y_val, val_predictions)

print("Training Data - CCC: {:.2f}".format(train_ccc))
print("Validation Data - CCC: {:.2f}".format(val_ccc))


Training Data - Mean Squared Error: 0.22
Training Data - R^2 Score: 0.83
Validation Data - Mean Squared Error: 0.73
Validation Data - R^2 Score: 0.48
Training Data - CCC: 0.90
Validation Data - CCC: 0.67


In [13]:
X_val.keys()

Index(['id', 'completed_ts', 'Cardio_caloriesOut', 'Cardio_max', 'Cardio_min',
       'Cardio_minutes', 'Fat Burn_caloriesOut', 'Fat Burn_max',
       'Fat Burn_min', 'Fat Burn_minutes',
       ...
       'audio_86', 'survey_type', 'delivered_ts', 'started_ts', 'activity',
       'location', 'atypical', 'stress', 'sleepquant', 'sleepqual'],
      dtype='object', length=137)

# XGBoost

In [14]:
X_train_xg = X_train.copy()
X_val_xg = X_val.copy()
Y_train_xg = Y_train.copy()
Y_val_xg = Y_val.copy()

In [15]:
X_train_xg = X_train_xg.astype('category')
X_val_xg = X_val.astype('category')

In [16]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
for column in X_train_xg.columns:
    if X_train_xg[column].dtype == 'object':
        X_train_xg[column].fillna('missing', inplace=True)
        X_val_xg[column].fillna('missing', inplace=True)

        X_train_xg[column] = label_enc.fit_transform(X_train_xg[column])
        X_val_xg[column] = label_enc.transform(X_val_xg[column])


In [17]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.6, learning_rate = 0.1,
                             max_depth = 7, alpha = 10, n_estimators = 500,
                                 enable_categorical=True,)

xgb_model.fit(X_train_xg, Y_train_xg)

xgb_train_predictions = xgb_model.predict(X_train_xg)

xgb_val_predictions = xgb_model.predict(X_val_xg)

xgb_train_mse = mean_squared_error(Y_train_xg, xgb_train_predictions)
xgb_train_r2 = r2_score(Y_train_xg, xgb_train_predictions)

xgb_val_mse = mean_squared_error(Y_val_xg, xgb_val_predictions)
xgb_val_r2 = r2_score(Y_val_xg, xgb_val_predictions)

print("XGBoost Training Data - Mean Squared Error: {:.2f}".format(xgb_train_mse))
print("XGBoost Training Data - R² Score: {:.2f}".format(xgb_train_r2))
print("XGBoost Validation Data - Mean Squared Error: {:.2f}".format(xgb_val_mse))
print("XGBoost Validation Data - R² Score: {:.2f}".format(xgb_val_r2))

def concordance_correlation_coefficient(y_true, y_pred):
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    covariance = np.mean((y_pred - mean_pred) * (y_true - mean_true))
    ccc = (2 * covariance) / (var_pred + var_true + (mean_pred - mean_true) ** 2)
    return ccc

xgb_train_ccc = concordance_correlation_coefficient(Y_train_xg, xgb_train_predictions)
xgb_val_ccc = concordance_correlation_coefficient(Y_val_xg, xgb_val_predictions)

print("XGBoost Training Data - CCC: {:.2f}".format(xgb_train_ccc))
print("XGBoost Validation Data - CCC: {:.2f}".format(xgb_val_ccc))


XGBoost Training Data - Mean Squared Error: 0.38
XGBoost Training Data - R² Score: 0.70
XGBoost Validation Data - Mean Squared Error: 1.12
XGBoost Validation Data - R² Score: 0.22
XGBoost Training Data - CCC: 0.79
XGBoost Validation Data - CCC: 0.33


# LightGBM

In [18]:
Xt_backup = X_train.copy()
Xv_backup = X_val.copy()
Yt_backup = Y_train.copy()
Yv_backup = Y_val.copy()

In [19]:
X_train_l = X_train.copy()
X_val_l = X_val.copy()
Y_train_l = Y_train.copy()
Y_val_l = Y_val.copy()

In [20]:
def convert_to_datetime(df, column):
    df[column] = pd.to_datetime(df[column], errors='coerce')

    if df[column].dtype == '<M8[ns]':  # Check if column is datetime
        df[column + '_hour'] = df[column].dt.hour
        df[column + '_weekday'] = df[column].dt.weekday
    else:
        df[column + '_hour'] = pd.NA
        df[column + '_weekday'] = pd.NA

columns_to_convert = ['delivered_ts', 'started_ts', 'completed_ts']
for col in columns_to_convert:
    convert_to_datetime(X_train, col)
    convert_to_datetime(X_val, col)

X_train.drop(columns_to_convert, axis=1, inplace=True)
X_val.drop(columns_to_convert, axis=1, inplace=True)


In [21]:
categorical_features = []
for column in X_train.columns:
    if X_train[column].dtype == 'object':
        X_train[column] = X_train[column].astype('category')
        X_val[column] = X_val[column].astype('category')
        categorical_features.append(column)


In [22]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Initialize LightGBM
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    categorical_feature=categorical_features  # Ensure categorical features are specified
)

# Fit the model
lgb_model.fit(
    X_train, Y_train,
    eval_set=[(X_val, Y_val)],
    eval_metric='mse',
)

# Predictions
lgb_train_predictions = lgb_model.predict(X_train)
lgb_val_predictions = lgb_model.predict(X_val)



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8891
[LightGBM] [Info] Number of data points in the train set: 542, number of used features: 121
[LightGBM] [Info] Start training from score 2.966790


In [23]:
# Metrics
print("LightGBM")
train_mse = mean_squared_error(Y_train, lgb_train_predictions)
train_r2 = r2_score(Y_train, lgb_train_predictions)
val_mse = mean_squared_error(Y_val, lgb_val_predictions)
val_r2 = r2_score(Y_val, lgb_val_predictions)

print("Training MSE: {:.3f}, R2: {:.2f}".format(train_mse, train_r2))
print("Validation MSE: {:.3f}, R2: {:.2f}".format(val_mse, val_r2))

train_ccc = concordance_correlation_coefficient(Y_train, lgb_train_predictions)
val_ccc = concordance_correlation_coefficient(Y_val_l, lgb_val_predictions)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))


LightGBM
Training MSE: 0.219, R2: 0.83
Validation MSE: 0.784, R2: 0.45
Training CCC: 0.894
Validation CCC: 0.633


# Sk-learn

In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = SimpleImputer(strategy='median')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train_prepared = preprocessor.fit_transform(X_train)
X_val_prepared = preprocessor.transform(X_val)


In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_prepared, Y_train)
rf_train_preds = rf_model.predict(X_train_prepared)
rf_val_preds = rf_model.predict(X_val_prepared)

# Support Vector Machine
svm_model = SVR()
svm_model.fit(X_train_prepared, Y_train)
svm_train_preds = svm_model.predict(X_train_prepared)
svm_val_preds = svm_model.predict(X_val_prepared)

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_prepared, Y_train)
lr_train_preds = lr_model.predict(X_train_prepared)
lr_val_preds = lr_model.predict(X_val_prepared)

# Lasso Regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_prepared, Y_train)
lasso_train_preds = lasso_model.predict(X_train_prepared)
lasso_val_preds = lasso_model.predict(X_val_prepared)

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(name, train_actuals, train_preds, val_actuals, val_preds):
    train_mse = mean_squared_error(train_actuals, train_preds)
    train_r2 = r2_score(train_actuals, train_preds)
    val_mse = mean_squared_error(val_actuals, val_preds)
    val_r2 = r2_score(val_actuals, val_preds)
    print(f"{name} - Training MSE: {train_mse:.3f}, R²: {train_r2:.2f}")
    print(f"{name} - Validation MSE: {val_mse:.3f}, R²: {val_r2:.2f}")

evaluate_model('Random Forest', Y_train, rf_train_preds, Y_val, rf_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, rf_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, rf_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))

evaluate_model('SVM', Y_train, svm_train_preds, Y_val, svm_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, svm_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, svm_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))
evaluate_model('Linear Regression', Y_train, lr_train_preds, Y_val, lr_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, lr_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, lr_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))
evaluate_model('Lasso', Y_train, lasso_train_preds, Y_val, lasso_val_preds)
train_ccc = concordance_correlation_coefficient(Y_train, lasso_train_preds)
val_ccc = concordance_correlation_coefficient(Y_val, lasso_val_preds)

print("Training CCC: {:.3f}".format(train_ccc))
print("Validation CCC: {:.3f}".format(val_ccc))




Random Forest - Training MSE: 0.126, R²: 0.90
Random Forest - Validation MSE: 0.767, R²: 0.46
Training CCC: 0.938
Validation CCC: 0.585
SVM - Training MSE: 1.285, R²: -0.00
SVM - Validation MSE: 1.381, R²: 0.03
Training CCC: 0.009
Validation CCC: 0.038
Linear Regression - Training MSE: 0.282, R²: 0.78
Linear Regression - Validation MSE: 762325.861, R²: -535082.27
Training CCC: 0.876
Validation CCC: 0.000
Lasso - Training MSE: 1.092, R²: 0.15
Lasso - Validation MSE: 1.341, R²: 0.06
Training CCC: 0.239
Validation CCC: 0.190


  model = cd_fast.enet_coordinate_descent(


# Torch

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

class TabularDataset(Dataset):
    def __init__(self, X, y):
        scaler = StandardScaler()
        self.X = torch.tensor(scaler.fit_transform(X), dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TabularDataset(X_train_prepared, Y_train)
val_dataset = TabularDataset(X_val_prepared, Y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class FeedForwardNet(nn.Module):
    def __init__(self, num_features):
        super(FeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(num_features, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = FeedForwardNet(X_train_prepared.shape[1])


In [None]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        for data, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                outputs = model(data)
                val_loss += criterion(outputs, targets).item()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}")

train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)


Epoch 1, Loss: 7.4012, Validation Loss: 8.0982
Epoch 2, Loss: 2.5971, Validation Loss: 4.2371
Epoch 3, Loss: 1.5607, Validation Loss: 2.3721
Epoch 4, Loss: 1.6898, Validation Loss: 1.6664
Epoch 5, Loss: 0.4629, Validation Loss: 1.4562
Epoch 6, Loss: 0.3970, Validation Loss: 1.4188
Epoch 7, Loss: 0.4185, Validation Loss: 1.3557
Epoch 8, Loss: 0.3510, Validation Loss: 1.3434
Epoch 9, Loss: 0.4413, Validation Loss: 1.2889
Epoch 10, Loss: 0.2391, Validation Loss: 1.2890
Epoch 11, Loss: 0.4684, Validation Loss: 1.2581
Epoch 12, Loss: 0.2840, Validation Loss: 1.3051
Epoch 13, Loss: 0.1758, Validation Loss: 1.2673
Epoch 14, Loss: 0.3084, Validation Loss: 1.2957
Epoch 15, Loss: 0.1823, Validation Loss: 1.2743
Epoch 16, Loss: 0.1814, Validation Loss: 1.2643
Epoch 17, Loss: 0.2694, Validation Loss: 1.2476
Epoch 18, Loss: 0.1311, Validation Loss: 1.2143
Epoch 19, Loss: 0.0943, Validation Loss: 1.2775
Epoch 20, Loss: 0.2977, Validation Loss: 1.2373


In [None]:
def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        loss_lst = []
        for data, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            loss_lst += [loss]
        loss = sum(loss_lst) / len(loss_lst)

        model.eval()
        actuals = []
        predictions = []
        with torch.no_grad():
            for data, targets in val_loader:
                outputs = model(data)
                actuals.extend(targets.view(-1).tolist())
                predictions.extend(outputs.view(-1).tolist())

        mse = mean_squared_error(actuals, predictions)
        r2 = r2_score(actuals, predictions)
        ccc = concordance_correlation_coefficient(np.array(actuals), np.array(predictions))

        print(f"Epoch {epoch+1}: Val MSE: {mse:.4f}, Val R²: {r2:.4f}, Val CCC: {ccc:.4f}; Train loss : {loss}")

train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, epochs=20)
def evaluate_model(model, loader):
    model.eval()
    actuals = []
    predictions = []
    with torch.no_grad():
        for data, targets in loader:
            outputs = model(data)
            actuals.extend(targets.view(-1).tolist())
            predictions.extend(outputs.view(-1).tolist())

    mse = mean_squared_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    ccc = concordance_correlation_coefficient(np.array(actuals), np.array(predictions))
    return mse, r2, ccc

final_mse, final_r2, final_ccc = evaluate_model(model, val_loader)
print(f"Final Validation MSE: {final_mse:.4f}, Final Validation R²: {final_r2:.4f}, Final Validation CCC: {final_ccc:.4f}")


Epoch 1: Val MSE: 1.4212, Val R²: 0.0024, Val CCC: 0.4926; Train loss : 0.10040497034788132
Epoch 2: Val MSE: 1.3746, Val R²: 0.0352, Val CCC: 0.4969; Train loss : 0.09729180485010147
Epoch 3: Val MSE: 1.4550, Val R²: -0.0213, Val CCC: 0.4966; Train loss : 0.09762842953205109
Epoch 4: Val MSE: 1.4259, Val R²: -0.0009, Val CCC: 0.4957; Train loss : 0.09813300520181656
Epoch 5: Val MSE: 1.4047, Val R²: 0.0140, Val CCC: 0.5007; Train loss : 0.08900494128465652
Epoch 6: Val MSE: 1.4448, Val R²: -0.0142, Val CCC: 0.4887; Train loss : 0.07794305682182312
Epoch 7: Val MSE: 1.5161, Val R²: -0.0642, Val CCC: 0.4906; Train loss : 0.09237392246723175
Epoch 8: Val MSE: 1.4308, Val R²: -0.0043, Val CCC: 0.4910; Train loss : 0.0813034400343895
Epoch 9: Val MSE: 1.4379, Val R²: -0.0093, Val CCC: 0.4964; Train loss : 0.07742319256067276
Epoch 10: Val MSE: 1.3928, Val R²: 0.0224, Val CCC: 0.4961; Train loss : 0.07485932856798172
Epoch 11: Val MSE: 1.4894, Val R²: -0.0454, Val CCC: 0.4882; Train loss : 