In [1]:
import pickle

import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split

In [3]:
sim = np.load("./hm_feedback_sim.npy")
bio = np.load("./hm_feedback_bio.npy")

sim.shape, bio.shape

((48, 4), (80, 4))

# train fusion LR

In [28]:
ratio = 4

X_train_mlp = np.load("collaboration_with_ting_zhang/result/preds_sim_train_0.{}.npy".format(ratio))
X_train_rf = np.load("collaboration_with_ting_zhang/result/rf_pred_train_0.{}.npy".format(ratio))
y_train = np.load("collaboration_with_ting_zhang/result/sim_training_label_0.{}.npy".format(ratio))

X_test_mlp = np.load("collaboration_with_ting_zhang/result/preds_sim_test_0.{}.npy".format(10-ratio))
X_test_rf = np.load("collaboration_with_ting_zhang/result/rf_pred_test_0.{}.npy".format(10-ratio))
y_test = np.load("collaboration_with_ting_zhang/result/sim_testing_label_0.{}.npy".format(10-ratio))

X_train = np.hstack([X_train_mlp.reshape(-1,1), X_train_rf.reshape(-1,1)])
X_test = np.hstack([X_test_mlp.reshape(-1,1), X_test_rf.reshape(-1,1)])

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((921, 2), (921, 1), (1382, 2), (1382, 1))

In [29]:
print("train - mlp only:", accuracy_score(y_true=y_train, y_pred=X_train_mlp.round()))
print("train - rf only: ", accuracy_score(y_true=y_train, y_pred=X_train_rf.round()))
print("test - mlp only:", accuracy_score(y_true=y_test, y_pred=X_test_mlp.round()))
print("test - rf only: ", accuracy_score(y_true=y_test, y_pred=X_test_rf.round()))

train - mlp only: 0.997828447339848
train - rf only:  1.0
test - mlp only: 0.88205499276411
test - rf only:  0.9761215629522432


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV

fusion = LogisticRegression()
# fusion = RandomForestClassifier()
# fusion = SVC(kernel='rbf')
# fusion = GaussianProcessClassifier()
# fusion = LGBMClassifier()

fusion.fit(X_train, y_train.reshape(-1))

train_acc = accuracy_score(y_true=y_train, y_pred=fusion.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=fusion.predict(X_test))

print("train acc: ", train_acc)
print("test acc", test_acc)

train acc:  1.0
test acc 0.9319826338639653


In [31]:
pickle.dump(fusion, open("collaboration_with_ting_zhang/result/fusion_0.{}.pkl".format(ratio), "wb"))

### error correction module

In [None]:
# fusion_test_pred = fusion.predict(X_test)

# for i in range(len(X_test)):
#     if fusion_test_pred[i] != sim_test_label[i]:
#         print(sim_test_data[i], sim_test_label[i])

In [79]:
import torch

torch.save(torch.from_numpy(X_train), "collaboration_with_ting_zhang/result/fusion_train_X.pth")
torch.save(torch.from_numpy(y_train), "collaboration_with_ting_zhang/result/fusion_train_y.pth")
torch.save(torch.from_numpy(X_test), "collaboration_with_ting_zhang/result/fusion_test_X.pth")
torch.save(torch.from_numpy(y_test), "collaboration_with_ting_zhang/result/fusion_test_y.pth")

In [80]:
error_train_X = torch.hstack([
    torch.from_numpy(sim_train_data), 
    torch.from_numpy(X_train), 
    torch.from_numpy(fusion.predict_proba(X_train)[:,1]).view(-1,1)])
error_train_y = torch.from_numpy(y_train) - torch.from_numpy(fusion.predict_proba(X_train)[:,1]).view(-1,1)

error_test_X = torch.hstack([
    torch.from_numpy(sim_test_data), 
    torch.from_numpy(X_test), 
    torch.from_numpy(fusion.predict_proba(X_test)[:,1]).view(-1,1)])
error_test_y = torch.from_numpy(y_test) - torch.from_numpy(fusion.predict_proba(X_test)[:,1]).view(-1,1)

print("error train: ", error_train_X.shape, error_train_y.shape)
print("error test: ", error_test_X.shape, error_test_y.shape)

torch.save(error_train_X, "collaboration_with_ting_zhang/result/error_train_X.pth")
torch.save(error_train_y, "collaboration_with_ting_zhang/result/error_train_y.pth")
torch.save(error_test_X, "collaboration_with_ting_zhang/result/error_test_X.pth")
torch.save(error_test_y, "collaboration_with_ting_zhang/result/error_test_y.pth")

error train:  torch.Size([460, 9]) torch.Size([460, 1])
error test:  torch.Size([1843, 9]) torch.Size([1843, 1])


In [83]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.gaussian_process import GaussianProcessRegressor

# error_corr = LGBMRegressor()
error_corr = GaussianProcessRegressor()

error_corr.fit(error_train_X, error_train_y.reshape(-1))

train_mse = mean_squared_error(y_true=error_train_y, y_pred=error_corr.predict(error_train_X))
test_mse = mean_squared_error(y_true=error_test_y, y_pred=error_corr.predict(error_test_X))

print("train mse: ", train_mse)
print("test mse", train_mse)

new_pred_train = error_corr.predict(error_train_X) + error_train_X[:, -1].numpy().reshape(-1)
new_pred_test = error_corr.predict(error_test_X) + error_test_X[:, -1].numpy().reshape(-1)

train_acc = accuracy_score(y_true=y_train.reshape(-1), y_pred=new_pred_train.round())
test_acc = accuracy_score(y_true=y_test.reshape(-1), y_pred=new_pred_test.round())

print("train acc: ", train_acc)
print("test acc: ", test_acc)

train mse:  7.16483782871675e-24
test mse 7.16483782871675e-24
train acc:  1.0
test acc:  0.893651654910472


### pipeline inference

In [81]:
def aug_features(x1, x2, mode=0):

    # only original feats
    if mode==0:
        feats = [x1[0], x1[1], x1[2], x2[0], x2[1], x2[2]]

    # original feats + cross feats
    elif mode==1:
        feats = [
            x1[0], x1[1], x1[2], 
            x2[0], x2[1], x2[2], 
            x1[0]-x2[0], 
            x1[1]-x2[1], 
            x1[2]-x2[2], 
            x1[0]/x2[0], 
            x1[1]/x2[1], 
            x1[2]/x2[2], 
            x1[0]+x1[0]*x1[1]-x1[0]*x1[2], 
            x2[0]+x2[0]*x2[1]-x2[0]*x2[2], 
            (x1[0]+x1[0]*x1[1])/x1[0]*x1[2], 
            (x2[0]+x2[0]*x2[1])/x2[0]*x2[2], 
            x1[0]*x1[1], 
            x2[0]*x2[1], 
            x1[0]*x1[2], 
            x2[0]*x2[1]
        ]

    # only cross feats
    elif mode==2:
        feats = [
            # x1[0], x1[1], x1[2], 
            # x2[0], x2[1], x2[2], 
            x1[0]-x2[0], 
            x1[1]-x2[1], 
            x1[2]-x2[2], 
            x1[0]/x2[0], 
            x1[1]/x2[1], 
            x1[2]/x2[2], 
            x1[0]+x1[0]*x1[1]-x1[0]*x1[2], 
            x2[0]+x2[0]*x2[1]-x2[0]*x2[2], 
            (x1[0]+x1[0]*x1[1])/x1[0]*x1[2], 
            (x2[0]+x2[0]*x2[1])/x2[0]*x2[2], 
            x1[0]*x1[1], 
            x2[0]*x2[1], 
            x1[0]*x1[2], 
            x2[0]*x2[1]
        ]

    else:
        print("wrong mode!")

    return feats

def surrogate_inference(models, data, data_maxmin, label_maxmin):
    """
    models: {'mlp': MLP, 'rf': RF, 'fusion': fusion}
    data: [N, 6], two sets of parameters, original (non-normalised)
    data_maxmin: [2, 6], first row max, second row min
    label_maxmin: [2, 1]
    : return: [N, 1]
    """
    N = data.shape[0]

    # get p1
    with torch.no_grad():
        models['mlp'].eval()

        data_max, data_min = data_maxmin[0, :], data_maxmin[1, :]
        label_max, label_min = label_maxmin[0, :], label_maxmin[1, :]

        data_input = (data - data_min) / (data_max - data_min)

        p1 = models['mlp'](torch.from_numpy(data_input).float().cuda())
        p1 = p1.detach().cpu().numpy() * (label_max - label_min) + label_min
        p1 = p1.reshape(-1, 1)

    # get p2
    if type(data) != type(np.ones(1)):
        data = data.detach().cpu().numpy()
    data_aug = list(map(lambda x:aug_features(x[:3], x[3:], mode=1), data))
    data_aug = np.array(data_aug).reshape(N, -1)
    p2 = models['rf'].predict_proba(data_aug)[:, 1]
    p2 = p2.reshape(-1, 1)

    # get fusion
    data_fusion = np.hstack([p1, p2])
    p_fusion = models['fusion'].predict_proba(data_fusion)[:, 1]
    p_fusion = p_fusion.reshape(-1, 1)

    return p_fusion

In [82]:
from collaboration_with_ting_zhang.network_repo import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

mlp = MLP_forward_embed([128,64,32,16,1], embedding_dim=int(128/2), input_dim=6).cuda()
mlp.load_state_dict(torch.load("collaboration_with_ting_zhang/result/20241109_mlp_sim_0.2.pth"))
rf = pickle.load(open("collaboration_with_ting_zhang/result/rf.pkl", "rb"))
lr = pickle.load(open("collaboration_with_ting_zhang/result/fusion.pkl", "rb"))

data_maxmin = np.load("collaboration_with_ting_zhang/result/sim_training_data_maxmin0.2.npy")
label_maxmin = np.load("collaboration_with_ting_zhang/result/sim_training_label_maxmin0.2.npy")

models = {
    'mlp': mlp,
    'rf': rf,
    'fusion': lr
}

train_pred = surrogate_inference(models, sim_train_data, data_maxmin, label_maxmin)
test_pred = surrogate_inference(models, sim_test_data, data_maxmin, label_maxmin)

print("train acc", accuracy_score(y_true=sim_train_label, y_pred=train_pred.round()))
print("test acc", accuracy_score(y_true=sim_test_label, y_pred=test_pred.round()))

train acc 1.0
test acc 0.893651654910472


# train RF 

##### on [original + cross feats] with [2:8 train-test]

In [22]:
ratio = 4

sim_train_data = np.load("collaboration_with_ting_zhang/result/sim_training_data_0.{}.npy".format(ratio))
sim_train_label = np.load("collaboration_with_ting_zhang/result/sim_training_label_0.{}.npy".format(ratio))

sim_test_data = np.load("collaboration_with_ting_zhang/result/sim_testing_data_0.{}.npy".format(10-ratio))
sim_test_label = np.load("collaboration_with_ting_zhang/result/sim_testing_label_0.{}.npy".format(10-ratio))

print("train: ", sim_train_data.shape, sim_train_label.shape)
print("test: ", sim_test_data.shape, sim_test_label.shape)

train:  (921, 6) (921, 1)
test:  (1382, 6) (1382, 1)


In [23]:
sim_training_data_maxmin = np.load(
    "collaboration_with_ting_zhang/result/sim_training_data_maxmin0.{}.npy".format(ratio))
sim_training_label_maxmin = np.load(
    "collaboration_with_ting_zhang/result/sim_training_label_maxmin0.{}.npy".format(ratio))

# sim_testing_data_maxmin = np.load("collaboration_with_ting_zhang/result/sim_testing_data_maxmin0.8.npy")
# sim_testing_label_maxmin = np.load("collaboration_with_ting_zhang/result/sim_testing_data_maxmin0.8.npy")

sim_train_data = sim_train_data * (sim_training_data_maxmin[0, :] - sim_training_data_maxmin[1, :]) \
    + sim_training_data_maxmin[1, :]
sim_train_label = sim_train_label * (sim_training_label_maxmin[0, :] - sim_training_label_maxmin[1, :]) \
    + sim_training_label_maxmin[1, :]

sim_test_data = sim_test_data * (sim_training_data_maxmin[0, :] - sim_training_data_maxmin[1, :]) \
    + sim_training_data_maxmin[1, :]
sim_test_label = sim_test_label * (sim_training_label_maxmin[0, :] - sim_training_label_maxmin[1, :]) \
    + sim_training_label_maxmin[1, :]

print("train: ", sim_train_data.shape, sim_train_label.shape)
print("test: ", sim_test_data.shape, sim_test_label.shape)

train:  (921, 6) (921, 1)
test:  (1382, 6) (1382, 1)


In [24]:
sim_train_data

array([[50. ,  1. ,  2. , 10. ,  0.5,  5. ],
       [20. , 10. ,  5. , 50. ,  5. ,  2. ],
       [50. ,  5. ,  5. , 50. ,  1. ,  5. ],
       ...,
       [10. ,  5. ,  2. , 10. ,  1. ,  1. ],
       [50. ,  1. ,  1. , 20. , 10. ,  1. ],
       [10. ,  0.5,  1. , 10. ,  1. ,  2. ]])

In [25]:
sim_test_data

array([[50., 10.,  1., 15., 10.,  5.],
       [15.,  1.,  5., 10.,  5.,  1.],
       [50.,  1.,  5., 15.,  5.,  2.],
       ...,
       [15., 10.,  2., 20.,  1.,  1.],
       [20., 10.,  2., 20.,  1.,  5.],
       [20.,  5.,  1., 50.,  5.,  2.]])

In [9]:
cnt = 0
for i in range(len(sim_train_data)):
    for j in range(len(sim_test_data)):
        if np.array_equal(sim_train_data[i], sim_test_data[j]):
            print(i, j)
            cnt += 1

print(cnt)

0 12
2 32
4 360
5 1187
6 1051
8 543
10 628
12 281
14 1188
15 760
16 22
17 47
18 764
21 726
22 112
24 1085
25 254
26 630
31 208
33 661
34 240
35 1201
38 339
39 880
40 315
41 514
47 291
48 262
49 584
50 1323
51 969
52 111
53 1299
55 15
62 1080
63 930
65 974
66 131
67 844
68 1256
69 1121
71 186
74 1312
75 9
78 538
79 832
80 1340
82 181
83 539
84 1349
85 1213
88 257
89 1038
90 74
91 412
92 1013
94 174
97 1123
100 384
102 843
103 1238
104 1023
105 1218
106 961
108 587
109 1132
110 265
111 85
112 727
114 1093
117 755
118 467
119 997
121 644
122 372
123 687
124 858
126 598
131 1309
132 76
134 234
135 278
137 1344
138 1300
139 474
140 579
141 903
143 686
144 447
146 498
147 193
148 348
150 1005
153 1282
155 820
156 916
157 1381
161 624
162 94
166 523
167 162
168 177
169 90
172 295
174 92
176 1360
178 899
181 954
182 943
183 388
184 400
185 164
188 1222
189 1075
190 1084
191 280
192 314
195 1287
197 887
198 464
199 1356
202 1010
203 897
204 1235
206 413
208 1303
209 1334
210 207
211 1185
212 11

In [26]:
train = []
for i in range(len(sim_train_data)):
    feat = calc_theta_features(x1=sim_train_data[i, :3], x2=sim_train_data[i, 3:], y=sim_train_label.reshape(-1)[i], mode=1)
    train.append(np.array(feat))

test = []
for i in range(len(sim_test_data)):
    feat = calc_theta_features(x1=sim_test_data[i, :3], x2=sim_test_data[i, 3:], y=sim_test_label.reshape(-1)[i], mode=1)
    test.append(np.array(feat))

train = np.array(train)
test = np.array(test)

X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

print("train: ", X_train.shape, y_train.shape)
print("test: ", X_test.shape, y_test.shape)

train:  (921, 20) (921,)
test:  (1382, 20) (1382,)


In [27]:
best = 0.0
best_param = []

ne_list=[50,100,200]
md_list=[8,12,16,32,None]
mss_list=[2,5,10,20]

for ne in ne_list:
    for md in md_list:
        for mss in mss_list:
                
            rf = RandomForestClassifier(n_estimators=ne, max_depth=md, min_samples_split=mss)

            rf.fit(X_train, y_train.reshape(-1))
            train_acc = accuracy_score(y_true=y_train, y_pred=rf.predict(X_train))
            test_acc = accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))
            
            if test_acc > best:
                best = test_acc
                best_param = [ne, md, mss]
                print("best param: ", ne, md, mss)
                print("best train acc: ", train_acc)
                print("best test acc: ", test_acc)
                print("save to collaboration_with_ting_zhang/result/rf.pkl\n")
                pickle.dump(rf, open("collaboration_with_ting_zhang/result/rf_0.{}.pkl".format(ratio), "wb"))
                np.save("collaboration_with_ting_zhang/result/rf_pred_train_0.{}.npy".format(ratio), rf.predict_proba(X_train)[:, 1])
                np.save("collaboration_with_ting_zhang/result/rf_pred_test_0.{}.npy".format(10-ratio), rf.predict_proba(X_test)[:, 1])

best param:  50 8 2
best train acc:  0.996742671009772
best test acc:  0.9667149059334298
save to collaboration_with_ting_zhang/result/rf.pkl

best param:  50 8 5
best train acc:  0.990228013029316
best test acc:  0.9688856729377714
save to collaboration_with_ting_zhang/result/rf.pkl

best param:  50 12 5
best train acc:  0.995656894679696
best test acc:  0.9717800289435601
save to collaboration_with_ting_zhang/result/rf.pkl

best param:  50 16 2
best train acc:  1.0
best test acc:  0.9761215629522432
save to collaboration_with_ting_zhang/result/rf.pkl



# prepare data

In [3]:
# create pair (x1, x2, y)
# x1 & x2 are parameters, y=1 if x1 is preferred over x2

def calc_theta_features(x1, x2, y, mode=0):

    # only original feats
    if mode==0:
        feats = [x1[0], x1[1], x1[2], x2[0], x2[1], x2[2], y]

    # original feats + cross feats
    elif mode==1:
        feats = [
            x1[0], x1[1], x1[2], 
            x2[0], x2[1], x2[2], 
            x1[0]-x2[0], 
            x1[1]-x2[1], 
            x1[2]-x2[2], 
            x1[0]/x2[0], 
            x1[1]/x2[1], 
            x1[2]/x2[2], 
            x1[0]+x1[0]*x1[1]-x1[0]*x1[2], 
            x2[0]+x2[0]*x2[1]-x2[0]*x2[2], 
            (x1[0]+x1[0]*x1[1])/x1[0]*x1[2], 
            (x2[0]+x2[0]*x2[1])/x2[0]*x2[2], 
            x1[0]*x1[1], 
            x2[0]*x2[1], 
            x1[0]*x1[2], 
            x2[0]*x2[1], 
            y
        ]

    # only cross feats
    elif mode==2:
        feats = [
            # x1[0], x1[1], x1[2], 
            # x2[0], x2[1], x2[2], 
            x1[0]-x2[0], 
            x1[1]-x2[1], 
            x1[2]-x2[2], 
            x1[0]/x2[0], 
            x1[1]/x2[1], 
            x1[2]/x2[2], 
            x1[0]+x1[0]*x1[1]-x1[0]*x1[2], 
            x2[0]+x2[0]*x2[1]-x2[0]*x2[2], 
            (x1[0]+x1[0]*x1[1])/x1[0]*x1[2], 
            (x2[0]+x2[0]*x2[1])/x2[0]*x2[2], 
            x1[0]*x1[1], 
            x2[0]*x2[1], 
            x1[0]*x1[2], 
            x2[0]*x2[1], 
            y
        ]

    else:
        print("wrong mode!")

    return feats

def create_pref(data, mode=0):
    pairs = []
    for i in range(len(data)):
        for j in range(len(data)):
            y = 1 if data[i][-1] > data[j][-1] else 0
            pairs.append(calc_theta_features(data[i][:3], data[j][:3], y, mode=mode))
    pairs = np.array(pairs)
    print(pairs.shape)
    return pairs

In [4]:
sim_data0 = create_pref(sim, mode=0)
bio_data0 = create_pref(bio, mode=0)
sim_data1 = create_pref(sim, mode=1)
bio_data1 = create_pref(bio, mode=1)
sim_data2 = create_pref(sim, mode=2)
bio_data2 = create_pref(bio, mode=2)

NameError: name 'sim' is not defined

# Tree

In [84]:
def cross_val_rf(ne_list, md_list, mss_list, X, y, cv=5):

    best = [0.0, 0.0]
    best_param = []

    for ne in ne_list:
        for md in md_list:
            for mss in mss_list:
                    
                    rf = RandomForestClassifier(n_estimators=ne, max_depth=md, min_samples_split=mss)

                    scores = cross_val_score(rf, X, y, cv=cv)
                    if scores.mean() > best[0]:
                        best = [scores.mean(), scores.std()]
                        best_param = [ne, md, mss]
                        print("best param: ", ne, md, mss)
                        print("best acc: ", best[0], best[1])

In [41]:
X, y = sim_data0[:, :-1], sim_data0[:, -1]

cross_val_rf(ne_list=[50,100,200], md_list=[8,12,16,32,None], mss_list=[2,5,10,20], X=X, y=y, cv=5)

best param:  50 8 2
best acc:  0.9218956898990852 0.02865741467929385
best param:  50 8 5
best acc:  0.9236310478166556 0.025175969011542627
best param:  50 8 10
best acc:  0.9249306799962275 0.030736953884597025
best param:  50 12 2
best acc:  0.925804017730831 0.033036644254497524
best param:  50 12 5
best acc:  0.9292728473073659 0.0322930152890416
best param:  50 16 2
best acc:  0.9323097236631142 0.02480482307886147
best param:  100 32 2
best acc:  0.9336112421012921 0.023845984313836176


In [42]:
X, y = sim_data1[:, :-1], sim_data1[:, -1]

cross_val_rf(ne_list=[50,100,200], md_list=[8,12,16,32,None], mss_list=[2,5,10,20], X=X, y=y, cv=5)

best param:  50 8 2
best acc:  0.9366518909742526 0.03668526283071027
best param:  50 12 2
best acc:  0.9500971423182119 0.02711803247544511
best param:  100 32 2
best acc:  0.9514014901442988 0.022389563171431728


In [43]:
X, y = sim_data2[:, :-1], sim_data2[:, -1]

cross_val_rf(ne_list=[50,100,200], md_list=[8,12,16,32,None], mss_list=[2,5,10,20], X=X, y=y, cv=5)

best param:  50 8 2
best acc:  0.9288371215693673 0.036986646263492455
best param:  50 8 5
best acc:  0.9297048005281525 0.038119665014149526
best param:  50 12 2
best acc:  0.9388163727247004 0.028414689358960674


In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    sim_data2[:, :-1], sim_data2[:, -1], test_size=0.8, random_state=114514)

rf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=1)

rf.fit(X_train, y_train.reshape(-1))
print("train acc: ", accuracy_score(y_true=y_train, y_pred=rf.predict(X_train)))
print("test acc: ", accuracy_score(y_true=y_test, y_pred=rf.predict(X_test)))

train acc:  1.0
test acc:  0.952819956616052


In [11]:
X, y = bio_data0[:, :-1], bio_data0[:, -1]

cross_val_rf(ne_list=[50,100,200], md_list=[8,12,16,32,None], mss_list=[2,5,10,20], X=X, y=y, cv=5)

best param:  50 8 2
best acc:  0.72046875 0.014785246552729525
best param:  50 8 20
best acc:  0.72265625 0.026773062488254885
best param:  100 8 10
best acc:  0.7254687500000001 0.021950761661044944
best param:  200 8 10
best acc:  0.72609375 0.023147159990482603
best param:  200 8 20
best acc:  0.7284375 0.025129547163150388


In [12]:
X, y = bio_data1[:, :-1], bio_data1[:, -1]

cross_val_rf(ne_list=[50,100,200], md_list=[8,12,16,32,None], mss_list=[2,5,10,20], X=X, y=y, cv=5)

best param:  50 8 2
best acc:  0.714375 0.06656249999999998
best param:  50 8 5
best acc:  0.71671875 0.07061670305246488
best param:  50 8 20
best acc:  0.7215625 0.06061332039556155
best param:  100 8 10
best acc:  0.7225 0.06354094091115585


In [11]:
X, y = bio_data2[:, :-1], bio_data2[:, -1]

cross_val_rf(ne_list=[50,100,200], md_list=[8,12,16,32,None], mss_list=[2,5,10,20], X=X, y=y, cv=5)

best param:  50 8 2
best acc:  0.7120312500000001 0.061292236712735496
best param:  50 8 5
best acc:  0.724375 0.06192982104265277
best param:  50 8 20
best acc:  0.7259375 0.05306384347533636


In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    bio_data2[:, :-1], bio_data2[:, -1], test_size=0.8, random_state=114514)

rf = RandomForestClassifier(n_estimators=200, max_depth=8, min_samples_split=10)

rf.fit(X_train, y_train.reshape(-1))
print("train acc: ", accuracy_score(y_true=y_train, y_pred=rf.predict(X_train)))
print("test acc: ", accuracy_score(y_true=y_test, y_pred=rf.predict(X_test)))

train acc:  0.8875
test acc:  0.8123046875


In [18]:
scores = cross_val_score(rf, X=bio_data2[:, :-1], y=bio_data2[:, -1], cv=5)
scores, scores.mean(), scores.std()

(array([0.746875  , 0.7390625 , 0.78203125, 0.59375   , 0.703125  ]),
 0.7129687499999999,
 0.06466862647373917)

# GP

In [50]:
def cross_val_gp(kernel_list, X, y, cv=5):

    best = [0.0, 0.0]
    best_param = []

    for kernel in kernel_list:
                
        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)

        scores = cross_val_score(gpc, X, y, cv=cv)
        if scores.mean() > best[0]:
            best = [scores.mean(), scores.std()]
            print("best kernel: ", kernel)
            print("best acc: ", best[0], best[1])

In [53]:
kernel_list = [
    1.0 * RBF(1.0), 
    1.0 * Matern(1.0), 
    1.0 * RationalQuadratic(1.0), 
    1.0 * ExpSineSquared(1.0), 
    1.0 * DotProduct(1.0)
]

In [54]:
X, y = sim_data0[:, :-1], sim_data0[:, -1]

cross_val_gp(kernel_list=kernel_list, X=X, y=y, cv=5)

best kernel:  1**2 * RBF(length_scale=1)
best acc:  0.6141497689333207 0.0005338111855135441


In [55]:
X, y = sim_data1[:, :-1], sim_data1[:, -1]

cross_val_gp(kernel_list=kernel_list, X=X, y=y, cv=5)

best kernel:  1**2 * RBF(length_scale=1)
best acc:  0.6141497689333207 0.0005338111855135441


In [56]:
X, y = bio_data0[:, :-1], bio_data0[:, -1]

cross_val_gp(kernel_list=kernel_list, X=X, y=y, cv=5)

best kernel:  1**2 * RBF(length_scale=1)
best acc:  0.6884375 0.0003125000000000267


In [None]:
X, y = bio_data1[:, :-1], bio_data1[:, -1]

cross_val_gp(kernel_list=kernel_list, X=X, y=y, cv=5)