# Best Results for Each Task

This notebook shows the evaluation process. Extracted features are utilized as the input of SVM, and different metrics are reported to showcase the effectiveness of our model.

In [11]:
## load data
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.externals import joblib #import joblib #if externals does not exist
from sklearn import decomposition
from sklearn.metrics import (
    make_scorer,
    recall_score,
    roc_auc_score,
    f1_score,
    precision_score,
    accuracy_score,
)
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
import random
import warnings

warnings.filterwarnings("ignore")


def RandomUnderSampler(np_data, np_label): 
    """downsample the majority class according to the given labels.

    :param np_data: extracted features as a array
    :type np_data: numpy.ndarray
    :param np_label: correspoinds labes as a vector
    :type np_data: numpy.ndarray
    :return: feature vectors and labels for balanced samples
    :rtype: numpy.ndarray
    """
    label = list(set(np_label))
    
    # perform a sanity check
    if len(label) < 2:
        raise ValueError("Less than two classed input")
        
    # seperate two class
    number_c0 = np.sum(np_label == label[0])
    number_c1 = np.sum(np_label == label[1])
    x_c0 = np_data[np_label == label[0], :]
    x_c1 = np_data[np_label == label[1], :]
    y_c0 = np_label[np_label == label[0]]
    y_c1 = np_label[np_label == label[1]]
    
    # downsample the majority class
    random.seed(0)
    if number_c0 < number_c1:
        index = random.sample(range(0, number_c1), number_c0)
        x_c1 = x_c1[index, :]
        y_c1 = y_c1[index]

    else:
        index = random.sample(range(0, number_c0), number_c1)
        x_c0 = x_c0[index, :]
        y_c0 = y_c0[index]
        
    new_data = np.concatenate((x_c0, x_c1), axis=0)
    new_label = np.concatenate((y_c0, y_c1), axis=0)
    
    #return the balanced class
    return new_data, new_label


## Task1

### Feature: Type 3, (A) denotes that we use VGGish-based feature plus duration, tempo, onset, and period.
### Parameter: Cough+Breath with PCA =0.95

In [12]:
for features in [4]:  # different feature combination
    print("========feature group=======")
    Number = features + 256  # the feature dimension for one modal
    print("features dimension:", Number)
    x_data_all_1 = np.load("x_data_handcraft.npy", allow_pickle=True)
    x_data_all_2 = np.load("x_data_vgg.npy", allow_pickle=True)
    x_data_all_2 = np.squeeze(x_data_all_2)  # breath all, cough all, demo
    # subset from handcraft for breath + vgg for breath + subset from handcraft for cough + vgg for cough
    x_data_all = np.concatenate(
        (
            x_data_all_1[:, :features],
            x_data_all_2[:, :256],
            x_data_all_1[:, 477 : 477 + features],
            x_data_all_2[:, 256:],
        ),
        axis=1,
    )

    y_label_all = np.load(
        "y_label_handcraft.npy", allow_pickle=True
    )  # labels are the same, eigher handcraft or vgg is correct
    y_uid_all = np.load("y_uid_handcraft.npy", allow_pickle=True)

    # split the data for different tasks
    x_data_all_1 = x_data_all[y_label_all == 1]  #covidandroidnocough
    x_data_all_2 = x_data_all[y_label_all == 2]  #covidandroidwithcough
    x_data_all_3 = x_data_all[y_label_all == 3]  #covidwebnocough
    x_data_all_4 = x_data_all[y_label_all == 4]  #covidwebwithcough
    x_data_all_6 = x_data_all[y_label_all == 6]  #asthmaandroidwithcough
    x_data_all_8 = x_data_all[y_label_all == 8]  #asthmawebwithcough
    x_data_all_m1 = x_data_all[y_label_all == -1] #healthyandroidnosymp
    x_data_all_m2 = x_data_all[y_label_all == -2] #healthyandroidwithcough
    x_data_all_m3 = x_data_all[y_label_all == -3] #healthywebnosymp
    x_data_all_m4 = x_data_all[y_label_all == -4] #healthywebwithcough

    y_label_all_1 = y_label_all[y_label_all == 1]
    y_label_all_2 = y_label_all[y_label_all == 2]
    y_label_all_3 = y_label_all[y_label_all == 3]
    y_label_all_4 = y_label_all[y_label_all == 4]
    y_label_all_6 = y_label_all[y_label_all == 6]
    y_label_all_8 = y_label_all[y_label_all == 8]
    y_label_all_m1 = y_label_all[y_label_all == -1]
    y_label_all_m2 = y_label_all[y_label_all == -2]
    y_label_all_m3 = y_label_all[y_label_all == -3]
    y_label_all_m4 = y_label_all[y_label_all == -4]

    y_uid_1 = y_uid_all[y_label_all == 1]
    y_uid_2 = y_uid_all[y_label_all == 2]
    y_uid_3 = y_uid_all[y_label_all == 3]
    y_uid_4 = y_uid_all[y_label_all == 4]
    y_uid_6 = y_uid_all[y_label_all == 6]
    y_uid_8 = y_uid_all[y_label_all == 8]
    y_uid_m1 = y_uid_all[y_label_all == -1]
    y_uid_m2 = y_uid_all[y_label_all == -2]
    y_uid_m3 = y_uid_all[y_label_all == -3]
    y_uid_m4 = y_uid_all[y_label_all == -4]

    head = [
        "Tasks",
        "Train",
        "Test",  
        "BreathingCough_PCA",
        "BreathingCough_AUC",
        "BreathingCough_ACC",
        "BreathingCough_Pre",
        "BreathingCough_Rec",
    ]

    for i0 in ["balanced"]:  # downsample to balance for both train and test
        for n in [0.95]:  # the variance remained after PCA
            print("PCA:", n)
            for i1 in ["task1"]:
                print("Conduct", i1)
                line = [i0 + str(n) + i1]
                if i1 == "task1":
                    x_data_all_task = np.concatenate(
                        (
                            x_data_all_1,
                            x_data_all_2,
                            x_data_all_3,
                            x_data_all_4,
                            x_data_all_m1,
                            x_data_all_m3,
                        ),
                        axis=0,
                    )
                    y_label_all_task = np.concatenate(
                        (
                            y_label_all_1,
                            y_label_all_2,
                            y_label_all_3,
                            y_label_all_4,
                            y_label_all_m1,
                            y_label_all_m3,
                        ),
                        axis=0,
                    )
                    y_uid_all_task = np.concatenate(
                        (y_uid_1, y_uid_2, y_uid_3, y_uid_4, y_uid_m1, y_uid_m3), axis=0
                    )

                    y_label_all_task[y_label_all_task > 0] = 1  # covid positive
                    y_label_all_task[y_label_all_task < 0] = 0

                for i2 in ["breath_cough"]:  # multi-modal
                    x_data_all_this = x_data_all_task[:, : Number * 2]

                    dpca = []
                    acc = []
                    pre = []
                    rec = []
                    auc = []
                    prauc = []
                    train_ratio = []
                    test_ratio = []

                    for seed in [1, 2, 5, 10, 100, 200, 500, 1000, 2000, 5000]:

                        gss = GroupShuffleSplit(
                            n_splits=1, test_size=0.2, random_state=seed
                        )
                        idx1, idx2 = next(
                            gss.split(x_data_all_this, groups=y_uid_all_task)
                        )

                        # Get the split DataFrames.
                        train_x, test_x = x_data_all_this[idx1], x_data_all_this[idx2]
                        y_train, y_test = y_label_all_task[idx1], y_label_all_task[idx2]
                        uid_train, uid_test = y_uid_all_task[idx1], y_uid_all_task[idx2]

                        # merge training samples
                        if i1 != "task1":
                            train_users = set(uid_train)

                        train_x, y_train = RandomUnderSampler(train_x, y_train)
                        test_x, y_test = RandomUnderSampler(test_x, y_test)

                        # train_ratio.append(1.0*np.sum(y_train==1)/np.sum(y_train==0))
                        train_ratio.append(train_x.shape[0])
                        # test_ratio.append(1.0*np.sum(y_test==1)/np.sum(y_test==0))
                        test_ratio.append(test_x.shape[0])

                        scaler = preprocessing.StandardScaler().fit(train_x)
                        x_train_n = scaler.transform(train_x)
                        x_test_n = scaler.transform(test_x)

                        pca = decomposition.PCA(n)
                        pca.fit(x_train_n)
                        x_train_n_pca = pca.fit_transform(x_train_n)
                        dpca.append(x_train_n_pca.shape[1])
                        x_train_n_pca = pca.fit_transform(x_train_n)
                        x_test_n_pca = pca.transform(x_test_n)

                        # for SVM
                        param_grid = [
                            {
                                "C": [10, 100, 1000],
                                "gamma": [0.1, 0.01, 0.001, 0.0001],
                                "kernel": ["rbf"],
                                "class_weight": ["balanced"],
                            }
                        ]

                        clf = SVC(probability=True)
                        gs = GridSearchCV(
                            clf,
                            param_grid,
                            scoring=make_scorer(roc_auc_score),
                            n_jobs=-1,
                            cv=5,
                        )

                        gs = gs.fit(x_train_n_pca, y_train)
                        joblib.dump(gs.best_estimator_, "best_model_android.pkl")

                        clf = joblib.load("best_model_android.pkl")
                        predicted = clf.predict(x_test_n_pca)
                        probs = clf.predict_proba(x_test_n_pca)
                        pre.append(metrics.precision_score(y_test, predicted))
                        acc.append(metrics.accuracy_score(y_test, predicted))
                        auc.append(metrics.roc_auc_score(y_test, probs[:, 1]))
                        precision, recall, _ = precision_recall_curve(
                            y_test, probs[:, 1]
                        )
                        prauc.append(metrics.auc(recall, precision))
                        rec.append(metrics.recall_score(y_test, predicted))

                    line.append(
                        "{:.2f}".format(np.mean(train_ratio)) + "("
                        "{:.2f}".format(np.std(train_ratio)) + ")"
                    )
                    line.append(
                        "{:.2f}".format(np.mean(test_ratio)) + "("
                        "{:.2f}".format(np.std(test_ratio)) + ")"
                    )

                    line.append(
                        "{:.2f}".format(np.mean(dpca)) + "("
                        "{:.2f}".format(np.std(dpca)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(auc)) + "("
                        "{:.4f}".format(np.std(auc)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(acc)) + "("
                        "{:.4f}".format(np.std(acc)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(pre)) + "("
                        "{:.4f}".format(np.std(pre)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(rec)) + "("
                        "{:.4f}".format(np.std(rec)) + ")"
                    )

    for i in range(len(line)):
        print(head[i], line[i])

features dimension: 260
PCA: 0.95
Conduct task1
Tasks balanced0.95task1
Train 219.80(22.72)
Test 60.80(20.28)
BreathingCough_PCA 118.20(6.84)
BreathingCough_AUC 0.7971(0.0688)
BreathingCough_ACC 0.7059(0.0588)
BreathingCough_Pre 0.7163(0.0612)
BreathingCough_Rec 0.6864(0.1128)


## Task2
### Feature: Type 3, (A) denotes that we use VGGish-based feature plus duration, tempo, onset, and period.
### Parameter: Cough with PCA =0.9


In [14]:
for features in [4]:  # different feature combination
    print("========feature group=======")
    Number = features + 256  # the feature dimension for one modal
    print("features dimension:", Number)
    x_data_all_1 = np.load("x_data_handcraft.npy", allow_pickle=True)
    x_data_all_2 = np.load("x_data_vgg.npy", allow_pickle=True)
    x_data_all_2 = np.squeeze(x_data_all_2)  # breath all, cough all, demo
    # subset from handcraft for breath + vgg for breath + subset from handcraft for cough + vgg for cough
    x_data_all = np.concatenate(
        (
            x_data_all_1[:, :features],
            x_data_all_2[:, :256],
            x_data_all_1[:, 477 : 477 + features],
            x_data_all_2[:, 256:],
        ),
        axis=1,
    )

    y_label_all = np.load(
        "y_label_handcraft.npy", allow_pickle=True
    )  # labels are the same, eigher handcraft or vgg is correct
    y_uid_all = np.load("y_uid_handcraft.npy", allow_pickle=True)

    # split the data for different tasks
    x_data_all_1 = x_data_all[y_label_all == 1]  #covidandroidnocough
    x_data_all_2 = x_data_all[y_label_all == 2]  #covidandroidwithcough
    x_data_all_3 = x_data_all[y_label_all == 3]  #covidwebnocough
    x_data_all_4 = x_data_all[y_label_all == 4]  #covidwebwithcough
    x_data_all_6 = x_data_all[y_label_all == 6]  #asthmaandroidwithcough
    x_data_all_8 = x_data_all[y_label_all == 8]  #asthmawebwithcough
    x_data_all_m1 = x_data_all[y_label_all == -1] #healthyandroidnosymp
    x_data_all_m2 = x_data_all[y_label_all == -2] #healthyandroidwithcough
    x_data_all_m3 = x_data_all[y_label_all == -3] #healthywebnosymp
    x_data_all_m4 = x_data_all[y_label_all == -4] #healthywebwithcough

    y_label_all_1 = y_label_all[y_label_all == 1]
    y_label_all_2 = y_label_all[y_label_all == 2]
    y_label_all_3 = y_label_all[y_label_all == 3]
    y_label_all_4 = y_label_all[y_label_all == 4]
    y_label_all_6 = y_label_all[y_label_all == 6]
    y_label_all_8 = y_label_all[y_label_all == 8]
    y_label_all_m1 = y_label_all[y_label_all == -1]
    y_label_all_m2 = y_label_all[y_label_all == -2]
    y_label_all_m3 = y_label_all[y_label_all == -3]
    y_label_all_m4 = y_label_all[y_label_all == -4]

    y_uid_1 = y_uid_all[y_label_all == 1]
    y_uid_2 = y_uid_all[y_label_all == 2]
    y_uid_3 = y_uid_all[y_label_all == 3]
    y_uid_4 = y_uid_all[y_label_all == 4]
    y_uid_6 = y_uid_all[y_label_all == 6]
    y_uid_8 = y_uid_all[y_label_all == 8]
    y_uid_m1 = y_uid_all[y_label_all == -1]
    y_uid_m2 = y_uid_all[y_label_all == -2]
    y_uid_m3 = y_uid_all[y_label_all == -3]
    y_uid_m4 = y_uid_all[y_label_all == -4]

    head = [
        "Tasks",
        "Train",
        "Test", 
        "Cough_PCA",
        "Cough_AUC",
        "Cough_ACC",
        "Cough_Pre",
        "Cough_Rec",
    ]

    for i0 in ["balanced"]:  # downsample to balance for both train and test
        for n in [0.9]:  # the variance remained after PCA
            print("PCA:", n)
            for i1 in ["task2"]:
                print("Conduct", i1)
                line = [i0 + str(n) + i1]
                if i1 == "task2":
                    x_data_all_task = np.concatenate(
                        (x_data_all_2, x_data_all_4, x_data_all_m2, x_data_all_m4),
                        axis=0,
                    )
                    y_label_all_task = np.concatenate(
                        (y_label_all_2, y_label_all_4, y_label_all_m2, y_label_all_m4),
                        axis=0,
                    )
                    y_uid_all_task = np.concatenate(
                        (y_uid_2, y_uid_4, y_uid_m2, y_uid_m4), axis=0
                    )

                    y_label_all_task[y_label_all_task > 0] = 1  # covid positive
                    y_label_all_task[y_label_all_task < 0] = 0

                for i2 in ["cough"]:  # multi-modal
                    x_data_all_this = x_data_all_task[:, Number : Number * 2]

                    dpca = []
                    acc = []
                    pre = []
                    rec = []
                    auc = []
                    prauc = []
                    train_ratio = []
                    test_ratio = []

                    for seed in [1, 2, 5, 10, 100, 200, 500, 1000, 2000, 5000]:

                        gss = GroupShuffleSplit(
                            n_splits=1, test_size=0.2, random_state=seed
                        )
                        idx1, idx2 = next(
                            gss.split(x_data_all_this, groups=y_uid_all_task)
                        )

                        # Get the split DataFrames.
                        train_x, test_x = x_data_all_this[idx1], x_data_all_this[idx2]
                        y_train, y_test = y_label_all_task[idx1], y_label_all_task[idx2]
                        uid_train, uid_test = y_uid_all_task[idx1], y_uid_all_task[idx2]

                        # merge training samples
                        if i1 != "task1":
                            train_users = set(uid_train)

                        train_x, y_train = RandomUnderSampler(train_x, y_train)
                        test_x, y_test = RandomUnderSampler(test_x, y_test)

                        # train_ratio.append(1.0*np.sum(y_train==1)/np.sum(y_train==0))
                        train_ratio.append(train_x.shape[0])
                        # test_ratio.append(1.0*np.sum(y_test==1)/np.sum(y_test==0))
                        test_ratio.append(test_x.shape[0])

                        scaler = preprocessing.StandardScaler().fit(train_x)
                        x_train_n = scaler.transform(train_x)
                        x_test_n = scaler.transform(test_x)

                        pca = decomposition.PCA(n)
                        pca.fit(x_train_n)
                        x_train_n_pca = pca.fit_transform(x_train_n)
                        dpca.append(x_train_n_pca.shape[1])
                        x_train_n_pca = pca.fit_transform(x_train_n)
                        x_test_n_pca = pca.transform(x_test_n)

                        # for SVM
                        param_grid = [
                            {
                                "C": [10, 100, 1000],
                                "gamma": [0.1, 0.01, 0.001, 0.0001],
                                "kernel": ["rbf"],
                                "class_weight": ["balanced"],
                            }
                        ]

                        clf = SVC(probability=True)
                        gs = GridSearchCV(
                            clf,
                            param_grid,
                            scoring=make_scorer(roc_auc_score),
                            n_jobs=-1,
                            cv=5,
                        )

                        gs = gs.fit(x_train_n_pca, y_train)
                        joblib.dump(gs.best_estimator_, "best_model_android.pkl")

                        clf = joblib.load("best_model_android.pkl")
                        predicted = clf.predict(x_test_n_pca)
                        probs = clf.predict_proba(x_test_n_pca)
                        pre.append(metrics.precision_score(y_test, predicted))
                        acc.append(metrics.accuracy_score(y_test, predicted))
                        auc.append(metrics.roc_auc_score(y_test, probs[:, 1]))
                        precision, recall, _ = precision_recall_curve(
                            y_test, probs[:, 1]
                        )
                        prauc.append(metrics.auc(recall, precision))
                        rec.append(metrics.recall_score(y_test, predicted))

                    line.append(
                        "{:.2f}".format(np.mean(train_ratio)) + "("
                        "{:.2f}".format(np.std(train_ratio)) + ")"
                    )
                    line.append(
                        "{:.2f}".format(np.mean(test_ratio)) + "("
                        "{:.2f}".format(np.std(test_ratio)) + ")"
                    )

                    line.append(
                        "{:.2f}".format(np.mean(dpca)) + "("
                        "{:.2f}".format(np.std(dpca)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(auc)) + "("
                        "{:.4f}".format(np.std(auc)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(acc)) + "("
                        "{:.4f}".format(np.std(acc)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(pre)) + "("
                        "{:.4f}".format(np.std(pre)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(rec)) + "("
                        "{:.4f}".format(np.std(rec)) + ")"
                    )

    for i in range(len(line)):
        print(head[i], line[i])

features dimension: 260
PCA: 0.9
Conduct task2
Tasks balanced0.9task2
Train 49.40(4.98)
Test 10.00(4.73)
Cough_PCA 29.10(2.02)
Cough_AUC 0.8273(0.1775)
Cough_ACC 0.7751(0.1622)
Cough_Pre 0.8017(0.1578)
Cough_Rec 0.7196(0.2347)



## Task3
### Feature: Type 3,(B) for all features except Δ-MFCCs and Δ2-MFCCs,
### Parameter: Breath with PCA =0.7


In [15]:
for features in [191]:  # different feature combination
    print("========feature group=======")
    Number = features + 256  # the feature dimension for one modal
    print("features dimension:", Number)
    x_data_all_1 = np.load("x_data_handcraft.npy", allow_pickle=True)
    x_data_all_2 = np.load("x_data_vgg.npy", allow_pickle=True)
    x_data_all_2 = np.squeeze(x_data_all_2)  # breath all, cough all, demo
    # subset from handcraft for breath + vgg for breath + subset from handcraft for cough + vgg for cough
    x_data_all = np.concatenate(
        (
            x_data_all_1[:, :features],
            x_data_all_2[:, :256],
            x_data_all_1[:, 477 : 477 + features],
            x_data_all_2[:, 256:],
        ),
        axis=1,
    )

    y_label_all = np.load(
        "y_label_handcraft.npy", allow_pickle=True
    )  # labels are the same, eigher handcraft or vgg is correct
    y_uid_all = np.load("y_uid_handcraft.npy", allow_pickle=True)

    # split the data for different tasks
    x_data_all_1 = x_data_all[y_label_all == 1]  #covidandroidnocough
    x_data_all_2 = x_data_all[y_label_all == 2]  #covidandroidwithcough
    x_data_all_3 = x_data_all[y_label_all == 3]  #covidwebnocough
    x_data_all_4 = x_data_all[y_label_all == 4]  #covidwebwithcough
    x_data_all_6 = x_data_all[y_label_all == 6]  #asthmaandroidwithcough
    x_data_all_8 = x_data_all[y_label_all == 8]  #asthmawebwithcough
    x_data_all_m1 = x_data_all[y_label_all == -1] #healthyandroidnosymp
    x_data_all_m2 = x_data_all[y_label_all == -2] #healthyandroidwithcough
    x_data_all_m3 = x_data_all[y_label_all == -3] #healthywebnosymp
    x_data_all_m4 = x_data_all[y_label_all == -4] #healthywebwithcough

    y_label_all_1 = y_label_all[y_label_all == 1]
    y_label_all_2 = y_label_all[y_label_all == 2]
    y_label_all_3 = y_label_all[y_label_all == 3]
    y_label_all_4 = y_label_all[y_label_all == 4]
    y_label_all_6 = y_label_all[y_label_all == 6]
    y_label_all_8 = y_label_all[y_label_all == 8]
    y_label_all_m1 = y_label_all[y_label_all == -1]
    y_label_all_m2 = y_label_all[y_label_all == -2]
    y_label_all_m3 = y_label_all[y_label_all == -3]
    y_label_all_m4 = y_label_all[y_label_all == -4]

    y_uid_1 = y_uid_all[y_label_all == 1]
    y_uid_2 = y_uid_all[y_label_all == 2]
    y_uid_3 = y_uid_all[y_label_all == 3]
    y_uid_4 = y_uid_all[y_label_all == 4]
    y_uid_6 = y_uid_all[y_label_all == 6]
    y_uid_8 = y_uid_all[y_label_all == 8]
    y_uid_m1 = y_uid_all[y_label_all == -1]
    y_uid_m2 = y_uid_all[y_label_all == -2]
    y_uid_m3 = y_uid_all[y_label_all == -3]
    y_uid_m4 = y_uid_all[y_label_all == -4]

    head = [
        "Tasks",
        "Train",
        "Test",
        "Breathing_PCA",
        "Breathing_AUC",
        "Breathing_ACC",
        "Breathing_Pre",
        "Breathing_Rec",
        #'Cough_PCA','Cough_AUC','Cough_ACC','Cough_Pre','Cough_Rec',
        #'BreathingCough_PCA','BreathingCough_AUC','BreathingCough_ACC','BreathingCough_Pre','BreathingCough_Rec'
    ]

    for i0 in ["balanced"]:  # downsample to balance for both train and test
        for n in [0.7]:  # the variance remained after PCA
            print("PCA:", n)
            for i1 in ["task3"]:
                print("Conduct", i1)
                line = [i0 + str(n) + i1]
                if i1 == "task3":
                    x_data_all_task = np.concatenate(
                        (x_data_all_2, x_data_all_4, x_data_all_6, x_data_all_8), axis=0
                    )
                    y_label_all_task = np.concatenate(
                        (y_label_all_2, y_label_all_4, y_label_all_6, y_label_all_8),
                        axis=0,
                    )
                    y_uid_all_task = np.concatenate(
                        (y_uid_2, y_uid_4, y_uid_6, y_uid_8), axis=0
                    )

                    y_label_all_task[y_label_all_task < 5] = 1  # covid positive
                    y_label_all_task[y_label_all_task > 4] = 0

                for i2 in ["breath"]:  # multi-modal
                    x_data_all_this = x_data_all_task[:, 0:Number]

                    dpca = []
                    acc = []
                    pre = []
                    rec = []
                    auc = []
                    prauc = []
                    train_ratio = []
                    test_ratio = []

                    for seed in [1, 2, 5, 10, 100, 200, 500, 1000, 2000, 5000]:

                        gss = GroupShuffleSplit(
                            n_splits=1, test_size=0.2, random_state=seed
                        )
                        idx1, idx2 = next(
                            gss.split(x_data_all_this, groups=y_uid_all_task)
                        )

                        # Get the split DataFrames.
                        train_x, test_x = x_data_all_this[idx1], x_data_all_this[idx2]
                        y_train, y_test = y_label_all_task[idx1], y_label_all_task[idx2]
                        uid_train, uid_test = y_uid_all_task[idx1], y_uid_all_task[idx2]

                        # merge training samples
                        if i1 != "task1":
                            train_users = set(uid_train)

                        train_x, y_train = RandomUnderSampler(train_x, y_train)
                        test_x, y_test = RandomUnderSampler(test_x, y_test)

                        # train_ratio.append(1.0*np.sum(y_train==1)/np.sum(y_train==0))
                        train_ratio.append(train_x.shape[0])
                        # test_ratio.append(1.0*np.sum(y_test==1)/np.sum(y_test==0))
                        test_ratio.append(test_x.shape[0])

                        scaler = preprocessing.StandardScaler().fit(train_x)
                        x_train_n = scaler.transform(train_x)
                        x_test_n = scaler.transform(test_x)

                        pca = decomposition.PCA(n)
                        pca.fit(x_train_n)
                        x_train_n_pca = pca.fit_transform(x_train_n)
                        dpca.append(x_train_n_pca.shape[1])
                        x_train_n_pca = pca.fit_transform(x_train_n)
                        x_test_n_pca = pca.transform(x_test_n)

                        # for SVM
                        param_grid = [
                            {
                                "C": [10, 100, 1000],
                                "gamma": [0.1, 0.01, 0.001, 0.0001],
                                "kernel": ["rbf"],
                                "class_weight": ["balanced"],
                            }
                        ]

                        clf = SVC(probability=True)
                        gs = GridSearchCV(
                            clf,
                            param_grid,
                            scoring=make_scorer(roc_auc_score),
                            n_jobs=-1,
                            cv=5,
                        )

                        gs = gs.fit(x_train_n_pca, y_train)
                        joblib.dump(gs.best_estimator_, "best_model_android.pkl")

                        clf = joblib.load("best_model_android.pkl")
                        predicted = clf.predict(x_test_n_pca)
                        probs = clf.predict_proba(x_test_n_pca)
                        pre.append(metrics.precision_score(y_test, predicted))
                        acc.append(metrics.accuracy_score(y_test, predicted))
                        auc.append(metrics.roc_auc_score(y_test, probs[:, 1]))
                        precision, recall, _ = precision_recall_curve(
                            y_test, probs[:, 1]
                        )
                        prauc.append(metrics.auc(recall, precision))
                        rec.append(metrics.recall_score(y_test, predicted))

                    line.append(
                        "{:.2f}".format(np.mean(train_ratio)) + "("
                        "{:.2f}".format(np.std(train_ratio)) + ")"
                    )
                    line.append(
                        "{:.2f}".format(np.mean(test_ratio)) + "("
                        "{:.2f}".format(np.std(test_ratio)) + ")"
                    )

                    line.append(
                        "{:.2f}".format(np.mean(dpca)) + "("
                        "{:.2f}".format(np.std(dpca)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(auc)) + "("
                        "{:.4f}".format(np.std(auc)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(acc)) + "("
                        "{:.4f}".format(np.std(acc)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(pre)) + "("
                        "{:.4f}".format(np.std(pre)) + ")"
                    )
                    line.append(
                        "{:.4f}".format(np.mean(rec)) + "("
                        "{:.4f}".format(np.std(rec)) + ")"
                    )

    for i in range(len(line)):
        print(head[i], line[i])

features dimension: 447
PCA: 0.7
Conduct task3
Tasks balanced0.7task3
Train 32.00(3.46)
Test 8.00(3.46)
Breathing_PCA 9.60(0.66)
Breathing_AUC 0.8048(0.1434)
Breathing_ACC 0.7029(0.2151)
Breathing_Pre 0.6939(0.2094)
Breathing_Rec 0.6933(0.2604)
