In [1]:
import os
import pandas as pd
# Expose only GPU index 6 to this process. The variable is assigned before the
# cuml imports below, presumably so it is in place before CUDA is initialized
# (TODO confirm no earlier import touches the GPU).
os.environ["CUDA_VISIBLE_DEVICES"]="6"
from cuml.naive_bayes import GaussianNB
from cuml.common.device_selection import set_global_device_type, get_global_device_type
# NOTE(review): wildcard import. `train_model`, called later in this notebook,
# presumably comes from model.tuning or model.testing; prefer explicit imports
# once the providing module is confirmed.
from model.tuning import *

# Switch cuML's global device type to GPU and echo the value now in effect.
set_global_device_type('gpu')
print('new device type:', get_global_device_type())

# NOTE(review): second wildcard import; names here may shadow those pulled in
# from model.tuning above.
from model.testing import *

new device type: DeviceType.device


In [2]:
# Read the train/validation and test CSV splits into DataFrames.
train_val_path, test_path = (
    "./data/fallreports_2023-9-21_train.csv",
    "./data/fallreports_2023-9-21_test.csv",
)
train_val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_val_path, test_path)
)

In [3]:
# Settings forwarded to train_model by the cells below.
target_feature, vector_type = 'fog_q_class', 'tf-idf'
ngram_list = [1, 2]
n_features = 250
# Every augmentation flag starts out enabled.
augment_dict = dict.fromkeys(
    ('token_length', 'patient_demographics', 'mds_updrs', 'moca'),
    True,
)

In [4]:
# Run cuML's Gaussian naive Bayes through train_model with the shared settings;
# verbose=True produces the metric printout shown in the cell output.
model = GaussianNB()
gnb_eval_obj = train_model(
    model, train_val_df, test_df,
    target_feature=target_feature, ngram_list=ngram_list,
    n_features=n_features, vector_type=vector_type,
    augment_dict=augment_dict, verbose=True,
)


train metrics: ------------------------------

Accuracy: 0.8696
Precision: 0.9817
Recall: 0.7431
F1-Score: 0.8458

test metrics: ------------------------------

Accuracy: 0.6056
Precision: 0.6400
Recall: 0.4571
F1-Score: 0.5333


In [5]:
from sklearn.ensemble import VotingClassifier
from cuml.ensemble import RandomForestClassifier
# from cuml import SGD
from cuml import LogisticRegression

In [6]:
# Keyword arguments for ensemble_classifier, taken from the best result of the
# hyperparameter optimization.
rfc_lr_dict = dict(
    max_depth=25.518664071950102,
    n_estimators=306.22402602953844,
    C=95.34740492076249,
    l1_ratio=0.34681920586406,
    penalty='l2',
    rfc_weight=0.6,
)

def ensemble_classifier(max_depth, n_estimators, C, l1_ratio, penalty, rfc_weight):
    """Build an unfitted soft-voting ensemble of a random forest and a logistic regression.

    Parameters
    ----------
    max_depth : int or float
        Maximum tree depth for the random forest. Non-integer values (such as
        those produced by a continuous hyperparameter search) are truncated
        toward zero with ``int()``.
    n_estimators : int or float
        Number of trees in the random forest; truncated with ``int()`` as well.
    C : float
        Forwarded unchanged to ``LogisticRegression``.
    l1_ratio : float
        Forwarded unchanged to ``LogisticRegression``.
    penalty : str
        Forwarded unchanged to ``LogisticRegression``.
    rfc_weight : float
        Voting weight given to the random forest; the logistic regression
        receives ``1.0 - rfc_weight``.

    Returns
    -------
    VotingClassifier
        The configured, not-yet-fitted ensemble with estimators named
        ``'rfc'`` and ``'lr'``.
    """
    # Depth and tree count are integer hyperparameters. Cast explicitly so the
    # estimator never receives raw floats and does not depend on whatever
    # implicit conversion the backend happens to perform.
    rfc = RandomForestClassifier(
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
    )
    lr = LogisticRegression(C=C, l1_ratio=l1_ratio, penalty=penalty)

    # Soft voting combines the two models' predicted probabilities using the
    # weights below.
    voting_model = VotingClassifier(
        estimators=[
            ('rfc', rfc),
            ('lr', lr),
        ],
        voting='soft',
        weights=[rfc_weight, 1.0 - rfc_weight],
    )
    return voting_model

In [7]:
# Run the voting ensemble configured by `rfc_lr_dict` through train_model with
# the shared settings; verbose=True produces the metric printout below.
voting_eval_obj = train_model(
    ensemble_classifier(**rfc_lr_dict), train_val_df, test_df,
    target_feature=target_feature, ngram_list=ngram_list,
    n_features=n_features, vector_type=vector_type,
    augment_dict=augment_dict, verbose=True,
)


train metrics: ------------------------------

Accuracy: 0.9967
Precision: 0.9931
Recall: 1.0000
F1-Score: 0.9965

test metrics: ------------------------------

Accuracy: 0.9014
Precision: 0.8889
Recall: 0.9143
F1-Score: 0.9014


In [14]:
# Second hyperparameter set for ensemble_classifier (elastic-net penalty).
ensemble_best_param_dict = dict(
    C=2045.7891312130546,
    l1_ratio=0.049833394439413,
    max_depth=48.12718850106587,
    n_estimators=97.06033604146855,
    penalty='elasticnet',
    rfc_weight=0.855137554079295,
)

# Rebind augment_dict with every augmentation flag enabled.
augment_dict = {
    flag: True
    for flag in ('token_length', 'patient_demographics', 'mds_updrs', 'moca')
}

# Run the voting ensemble configured by `ensemble_best_param_dict` through
# train_model; note this rebinds `voting_eval_obj`.
voting_eval_obj = train_model(
    ensemble_classifier(**ensemble_best_param_dict), train_val_df, test_df,
    target_feature=target_feature, ngram_list=ngram_list,
    n_features=n_features, vector_type=vector_type,
    augment_dict=augment_dict, verbose=True,
)

[W] [18:12:00.368848] QWL-QN: max iterations reached
[W] [18:12:00.368938] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.

train metrics: ------------------------------

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000

test metrics: ------------------------------

Accuracy: 0.8873
Precision: 0.8857
Recall: 0.8857
F1-Score: 0.8857


In [15]:
# Rebind augment_dict: switch off token_length and patient_demographics,
# leave mds_updrs and moca enabled.
augment_dict = dict(
    token_length=False,
    patient_demographics=False,
    mds_updrs=True,
    moca=True,
)

# Re-run the `ensemble_best_param_dict` ensemble with the current augment_dict;
# note this rebinds `voting_eval_obj` once more.
voting_eval_obj = train_model(
    ensemble_classifier(**ensemble_best_param_dict), train_val_df, test_df,
    target_feature=target_feature, ngram_list=ngram_list,
    n_features=n_features, vector_type=vector_type,
    augment_dict=augment_dict, verbose=True,
)


train metrics: ------------------------------

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000

test metrics: ------------------------------

Accuracy: 0.8028
Precision: 0.8000
Recall: 0.8000
F1-Score: 0.8000
