# Machine Learning Integration

In [1]:
import cuml
dir(cuml.ensemble)


['RandomForestClassifier',
 'RandomForestRegressor',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_check_fil_parameter_validity',
 '_obtain_fil_model',
 'randomforest_common',
 'randomforest_shared',
 'randomforestclassifier',
 'randomforestregressor']

In [2]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.ensemble import GradientBoostingRegressor as cuGBR
from cuml.svm import SVR as cuSVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from tqdm import tqdm

**importing datasets**

In [3]:
dataset1 = "/kaggle/input/brain-activity/processed data/combined_brain_activity.csv"
dataset2 = "/kaggle/input/brain-activity/processed data/processed_participent_data.csv"

In [4]:
brain_activity_data = pd.read_csv(dataset1)
participant_data = pd.read_csv(dataset2)

In [5]:
brain_activity_data.head()

Unnamed: 0,participant_number,row_id,oxy_channel_1,deoxy_channel_1,oxy_channel_2,deoxy_channel_2,oxy_channel_3,deoxy_channel_3,oxy_channel_4,deoxy_channel_4,...,deoxy_channel_18,oxy_channel_19,deoxy_channel_19,oxy_channel_20,deoxy_channel_20,oxy_channel_21,deoxy_channel_21,task,avg_oxy,avg_deoxy
0,301,23,4.289045,0.459794,0.657899,-0.894504,1.455266,0.028672,1.073885,-0.067536,...,0.014017,-0.936025,-0.028506,0.80031,-0.203953,0.289717,-0.750553,one_back,0.271544,-0.119245
1,301,24,5.216971,-3.337158,-2.768425,2.102182,1.581249,0.151063,0.648402,-0.232901,...,0.015815,-0.77073,0.074967,0.435788,0.039757,-1.701826,1.56774,one_back,0.141732,-0.291624
2,301,25,2.623103,-0.293645,-0.989687,0.968344,1.058958,0.110978,-0.239769,0.076019,...,0.072948,-0.252961,-0.189784,-0.243315,0.020621,2.407094,-0.793055,one_back,0.13894,-0.025194
3,301,28,1.624431,-0.885932,0.59011,-1.165932,0.617608,0.204075,-0.333913,-0.364125,...,0.031854,-0.865719,0.048385,0.09506,-0.254102,-0.407511,-0.61536,one_back,-0.084068,0.191113
4,301,39,3.26997,-1.588405,-0.276325,0.085944,-0.739847,0.735898,0.22775,-0.034164,...,0.036143,-0.756596,0.162255,-0.155472,-0.082326,-0.625893,0.629442,one_back,-0.007263,-0.354348


In [6]:
brain_activity_data.shape

(1052524, 47)

In [7]:
participant_data.head()

Unnamed: 0,participant_number,participant_group,participant_age,participant_sex,participant_moca,participant_rbans,participant_criq
0,301,YA,21,M,28.0,0.66,120.0
1,302,YA,22,M,28.0,0.66,120.0
2,303,YA,21,F,28.0,0.66,120.0
3,304,YA,23,F,28.0,0.66,120.0
4,305,YA,21,F,28.0,0.66,120.0


In [8]:
participant_data.shape

(58, 7)

**Merging Datasets**

In [9]:
merged_data = pd.merge(brain_activity_data, participant_data, on='participant_number')
merged_data = merged_data.dropna()

In [10]:
merged_data.shape

(1046216, 53)

In [11]:
merged_data.head()

Unnamed: 0,participant_number,row_id,oxy_channel_1,deoxy_channel_1,oxy_channel_2,deoxy_channel_2,oxy_channel_3,deoxy_channel_3,oxy_channel_4,deoxy_channel_4,...,deoxy_channel_21,task,avg_oxy,avg_deoxy,participant_group,participant_age,participant_sex,participant_moca,participant_rbans,participant_criq
0,301,23,4.289045,0.459794,0.657899,-0.894504,1.455266,0.028672,1.073885,-0.067536,...,-0.750553,one_back,0.271544,-0.119245,YA,21,M,28.0,0.66,120.0
1,301,24,5.216971,-3.337158,-2.768425,2.102182,1.581249,0.151063,0.648402,-0.232901,...,1.56774,one_back,0.141732,-0.291624,YA,21,M,28.0,0.66,120.0
2,301,25,2.623103,-0.293645,-0.989687,0.968344,1.058958,0.110978,-0.239769,0.076019,...,-0.793055,one_back,0.13894,-0.025194,YA,21,M,28.0,0.66,120.0
3,301,28,1.624431,-0.885932,0.59011,-1.165932,0.617608,0.204075,-0.333913,-0.364125,...,-0.61536,one_back,-0.084068,0.191113,YA,21,M,28.0,0.66,120.0
4,301,39,3.26997,-1.588405,-0.276325,0.085944,-0.739847,0.735898,0.22775,-0.034164,...,0.629442,one_back,-0.007263,-0.354348,YA,21,M,28.0,0.66,120.0


**Feature Selection**

In [12]:
oxy_columns = [col for col in merged_data.columns if 'oxy_channel' in col]
deoxy_columns = [col for col in merged_data.columns if 'deoxy_channel' in col]
additional_features = ['avg_oxy', 'avg_deoxy',  'participant_moca', 
                       'participant_rbans', 'participant_criq']


In [13]:
features = oxy_columns + deoxy_columns + additional_features

X = merged_data[features]
y = merged_data['participant_age']


**Normalizing the data**

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Spllitting the data into test and train

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


**Implementing Linear regression**

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score


In [17]:
# Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [18]:
gpu_lr_pred = linear_model.predict(X_test)
gpu_lr_mae = mean_absolute_error(y_test, gpu_lr_pred)
gpu_lr_r2 = r2_score(y_test, gpu_lr_pred)

**implementing Random Forest**

In [19]:
gpu_rf_model = cuRF(n_estimators=100, random_state=42)
with tqdm(total=1, desc="Training GPU Random Forest") as pbar:
    gpu_rf_model.fit(X_train, y_train)
    pbar.update(1)
gpu_rf_pred = gpu_rf_model.predict(X_test)
gpu_rf_mae = mean_absolute_error(y_test, gpu_rf_pred)
gpu_rf_r2 = r2_score(y_test, gpu_rf_pred)

  return func(**kwargs)
  ret = func(*args, **kwargs)
Training GPU Random Forest: 100%|██████████| 1/1 [00:10<00:00, 10.13s/it]


**Implementing XGBoost Regressor**

In [20]:
xgb_model = XGBRegressor(n_estimators=100, random_state=42, verbosity=1,
                         tree_method='gpu_hist', predictor='gpu_predictor')
with tqdm(total=1, desc="Training XGBoost") as pbar:
    xgb_model.fit(X_train, y_train)
    pbar.update(1)
xgb_pred = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.

Training XGBoost: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]

    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




**Evaluating and choosing the best model**

In [30]:
print("Linear Model Result:")
print("Mean Absolute Error: ", gpu_lr_mae)
print("R-squared: ", gpu_lr_r2)

Linear Model Result:
Mean Absolute Error:  18.13617986515348
R-squared:  0.37066653258884885


In [31]:
print("Random Forest Result:")
print("Mean Absolute Error: ", gpu_rf_mae)
print("R-squared: ", gpu_rf_r2)

Random Forest Result:
Mean Absolute Error:  2.7928106544745708
R-squared:  0.964456545044557


In [34]:
print("XGBoost Result:")
print("Mean Absolute Error: ", xgb_mae)
print("R-squared: ", xgb_r2)

XGBoost Result:
Mean Absolute Error:  0.34297200718309967
R-squared:  0.9995763355322441


In [27]:
models = {
    'Linear Regression Model': (linear_model, gpu_lr_mae, gpu_lr_r2),
    'Random Forest': (gpu_rf_model, gpu_rf_mae, gpu_rf_r2),
    'XGBoost': (xgb_model, xgb_mae, xgb_r2)
}


In [23]:
best_model_name = min(models, key=lambda k: models[k][1])  # Based on lowest MAE
best_model = models[best_model_name][0]

In [24]:
print(f"Best model: {best_model_name}")
print(f"MAE: {models[best_model_name][1]}")
print(f"R²: {models[best_model_name][2]}")


Best model: XGBoost
MAE: 0.34297200718309967
R²: 0.9995763355322441


**Saving the best model**

In [28]:
# Save the best model using joblib
joblib.dump(best_model, 'best_model.joblib')

['best_model.joblib']