## XGBoost Embeddings

XGBoost_embeddings.ipynb

This script helps examine predictive power of MPNet vector embeddings for predicting
depression symptom severity (PHQ-8 scores) using XGBoost regression. It includes 
preprocessing steps, dimensionality reduction, and hyperparameter tuning.
Developed as part of a study on multilingual lexical markers and depression severity.


**Usage**:
- Place your input CSV file in the desired directory.
- Make sure the file includes columns starting with 'MPNet' containing MPNet embeddings.

**Author**: Anastasiia Tokareva


### Models tested:
1. Full embeddings
2. Embeddings + TSVD dimensionality reduction

In [1]:
## Load libraries

# general
import pandas as pd
import numpy as np

# pre-processing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer

# ML pipeline
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GroupKFold, cross_validate, GridSearchCV 
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

# regressor
!pip install xgboost
from xgboost import XGBRegressor   


Defaulting to user installation because normal site-packages is not writeable


In [2]:
## 1. Clean data
data = pd.read_csv("C:/Users/your/file/name/here.csv")  
data_cleaned = data.dropna(axis=0) 

## 2. Binarise COVID data
data_cleaned['Recording_Date'] = pd.to_datetime(data_cleaned['Recording_Date'])

# define COVID lockdown start and end dates (dates based on Leightley et al. (2021), https://pubmed.ncbi.nlm.nih.gov/34488697/)
covid_start = pd.to_datetime('2020-03-23')
covid_end = pd.to_datetime('2021-05-11')

data_cleaned['COVID'] = ((data_cleaned['Recording_Date'] >= covid_start) & (data_cleaned['Recording_Date'] <= covid_end)).astype(int)
# COVID now added as the last column (0/1)
data_cleaned.head(n=5)

Unnamed: 0,Site,participant_ID,Age,Gender,Education_Years,Height,Recording_Date,Task,PHQ8,Roberta_1,...,Roberta_1016,Roberta_1017,Roberta_1018,Roberta_1019,Roberta_1020,Roberta_1021,Roberta_1022,Roberta_1023,Roberta_1024,COVID
0,RADAR-MDD-KCL-s1,71a74929-ce52-494f-9d41-d08cbcf53707,56,1,14.0,165.0,2020-05-27,Unscripted,1.0,-0.023733,...,-0.030548,-0.014701,0.04196,0.024152,0.018132,0.044252,0.074237,-0.028201,-0.01721,1
1,RADAR-MDD-KCL-s1,71a74929-ce52-494f-9d41-d08cbcf53707,56,1,14.0,165.0,2020-01-08,Unscripted,2.0,-0.05012,...,0.002845,-0.007813,0.015595,-0.002314,-0.01538,-0.013355,0.05379,0.002994,-0.02968,0
2,RADAR-MDD-KCL-s1,71a74929-ce52-494f-9d41-d08cbcf53707,56,1,14.0,165.0,2020-02-19,Unscripted,1.0,-0.01949,...,0.003455,0.0143,-0.002872,0.001538,0.018539,-0.002485,0.063232,-0.051373,-0.001917,0
3,RADAR-MDD-KCL-s1,71a74929-ce52-494f-9d41-d08cbcf53707,56,1,14.0,165.0,2019-11-28,Unscripted,0.0,-0.002144,...,0.029639,0.00651,0.0388,-0.006526,-0.031539,-0.03064,0.057877,-0.022176,0.01012,0
4,RADAR-MDD-KCL-s1,71a74929-ce52-494f-9d41-d08cbcf53707,56,1,14.0,165.0,2019-12-11,Unscripted,1.0,0.010335,...,-0.00774,0.04384,0.041672,-0.003667,-0.034407,-0.007963,0.037842,-0.015746,-0.036453,0


#### 1. Full Embeddings

In [3]:
# Define MPNet columns 
mpnet = [col for col in data_cleaned.columns if col.startswith('Mpnet_')]  # extract MPNet column names

# Define column transformer (StandardScaler for numerical, FunctionTransformer for raw features)
preprocessor = ColumnTransformer([
    ('num_scaler', StandardScaler(), ['Age', 'Education_Years'] + mpnet),               # scale numerical features + embeddings
    ('num_raw', FunctionTransformer(lambda x: x, validate=False), ['Gender', 'COVID'])  # keeps dummy variables unscaled
])

# Define the pipeline
pipeline_1 = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', XGBRegressor())  
])


#### Grid search parameters

In [4]:
param_grid = {
    'regressor__max_depth': [3, 5, 7, 9],
    'regressor__learning_rate': [0.1, 0.01, 0.001],
    'regressor__subsample': [0.5, 0.7, 0.9],        # lowering the value prevents overfitting
    'regressor__alpha': [0, 0.01, 0.1, 0.5, 1],     # L1 regularisation (default = 0)
    'regressor__lambda': [0, 0.01, 0.1, 0.5, 1]     # L2 regularisation (default = 1)                      
}

#### 2. Embeddings + TSVD dimensionality reduction

In [5]:
# Define column transformer (StandardScaler for numerical, and FunctionTransformer for raw features)
preprocessor = ColumnTransformer([
    ('num_scaler', StandardScaler(), ['Age', 'Education_Years'] + mpnet),               # scale numerical features + embeddings
    ('num_raw', FunctionTransformer(lambda x: x, validate=False), ['Gender', 'COVID'])  # keeps dummy variables unscaled
])

# Define the pipeline
pipeline_2 = Pipeline([
    ('preprocessing', preprocessor),  
    ('pca', TruncatedSVD(n_components=100)), 
    ('regressor', XGBRegressor()) 
])


### Custom RMSE

In [6]:
# Custom RMSE scorer
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Define the scorers dictionary
scorers = {
    'rmse': make_scorer(rmse), 
    'r2': make_scorer(r2_score)
}

### Set up nested CFV

In [7]:
# demographic variables and all embeddings
X = data_cleaned[['Age','Education_Years','Gender', 'COVID'] + mpnet]

# y = data_cleaned[['PHQ8']]
y = data_cleaned['PHQ8']
groups = data_cleaned['participant_ID']

In [8]:
# Define the outer cross-validation strategy (GroupKFold)
inner_cv = GroupKFold(n_splits=5)
outer_cv = GroupKFold(n_splits=5)

#### 1. Full embeddings

In [9]:
# Inner Loop
Inner_Grid = GridSearchCV(pipeline_1,
                          param_grid,
                          verbose = 1,
                          cv=inner_cv,
                          refit='rmse',
                          return_train_score=True  
                         )

# Outer Loop
nested_results = cross_validate(Inner_Grid, X, y, 
                                cv=outer_cv,
                                groups=groups,
                                params={'groups': groups},  # pass group information to inner split 
                                scoring=scorers,
                                return_train_score=True)    # optionally return train scores

print(f"Average Inner RMSE: {np.mean(nested_results['train_rmse']):.2f}")
print(f"Average Inner R²: {np.mean(nested_results['train_r2']):.2f}")
print(f"Average Outer RMSE: {np.mean(nested_results['test_rmse']):.2f}")
print(f"Average Outer R²: {np.mean(nested_results['test_r2']):.2f}")

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Average Inner RMSE: 4.72
Average Inner R²: 0.37
Average Outer RMSE: 5.73
Average Outer R²: 0.04


#### 2. Embeddings + TSVD dimensionality reduction

In [None]:
# Inner Loop
Inner_Grid = GridSearchCV(pipeline_2,
                          param_grid,
                          verbose = 1,
                          cv=inner_cv,
                          refit='rmse',
                          return_train_score=True  
                         )

# Outer Loop
nested_results = cross_validate(Inner_Grid, X, y, 
                                cv=outer_cv,
                                groups=groups,
                                params={'groups': groups},   # pass group information to inner split 
                                scoring=scorers,
                                return_train_score=True)     # optionally return train scores

print(f"Average Inner RMSE: {np.mean(nested_results['train_rmse']):.2f}")
print(f"Average Inner R²: {np.mean(nested_results['train_r2']):.2f}")
print(f"Average Outer RMSE: {np.mean(nested_results['test_rmse']):.2f}")
print(f"Average Outer R²: {np.mean(nested_results['test_r2']):.2f}")