# 3assembleDataWithProb.py
#
# Builds a new training HDF5 file from the selected validation/training
# samples plus the complete test sets, using the contents of the
# *_softmax.csv files as labels for the test samples.
import h5py
import numpy as np
from chooseTestData import readCSV
# --- Input data --------------------------------------------------------------
# Every source file is opened explicitly read-only. Without a mode argument,
# older h5py releases open files in append/create mode, which would silently
# create an empty file for a mistyped path and leave the inputs writable.
# The handles are kept open for the rest of the script because the
# `sen1` / `sen2` objects below are h5py objects backed by these files.
trainingfile = h5py.File("data/training.h5", "r")
validationfile = h5py.File("data/validation.h5", "r")
testfile_a = h5py.File("data/round1_test_a_20181109.h5", "r")
testfile_b = h5py.File("data/round1_test_b_20190104.h5", "r")
testfile_c = h5py.File("data/round2_test_a_20190121.h5", "r")
testfile_d = h5py.File("data/round2_test_b_20190211.h5", "r")

# "sen1" entries of each file.
s1_train = trainingfile["sen1"]
s1_val = validationfile["sen1"]
s1_test_a = testfile_a["sen1"]
s1_test_b = testfile_b["sen1"]
s1_test_c = testfile_c["sen1"]
s1_test_d = testfile_d["sen1"]

# "sen2" entries of each file.
s2_train = trainingfile["sen2"]
s2_val = validationfile["sen2"]
s2_test_a = testfile_a["sen2"]
s2_test_b = testfile_b["sen2"]
s2_test_c = testfile_c["sen2"]
s2_test_d = testfile_d["sen2"]

# Labels exist only for the training and validation files; they are read
# fully into memory as NumPy arrays.
labels_train = np.asarray(trainingfile["label"])
labels_val = np.asarray(validationfile["label"])

# The test files have no "label" entry read here; the values loaded from the
# softmax CSVs stand in for labels when the new training set is assembled.
# NOTE(review): readCSV comes from chooseTestData; presumably it returns a
# 2-D array with one row per test sample -- confirm against that module.
prob_test_a = readCSV("data/integrate_testa_softmax.csv")
prob_test_b = readCSV("data/integrate_testb_softmax.csv")
prob_test_c = readCSV("data/jc0203_softmax.csv")
prob_test_d = readCSV("data/jc0212_temp_softmax.csv")
import sys

# --- Sample selection ---------------------------------------------------------
# The first command-line argument selects which pre-computed index set to use;
# it is interpolated into the input and output file names.
if len(sys.argv) < 2:
    sys.exit("usage: python 3assembleDataWithProb.py <set_id>")
set_id = int(sys.argv[1])

# The .npy file stores a Python dict wrapped in a 0-d object array, hence the
# `.item()` call. NumPy >= 1.16.3 refuses to unpickle object arrays unless
# allow_pickle=True is passed explicitly.
# NOTE: unpickling executes arbitrary code -- only load index files that were
# produced locally by a trusted script.
idx_selected = np.load(
    "data/idx_selected_0204_%d.npy" % set_id, allow_pickle=True
).item()

# Flatten the per-class index collections into one index array per source.
num_classes = 17
print("Processing")
idx_from_val = np.concatenate(
    [idx_selected["val"][i] for i in range(num_classes)], axis=0
)
idx_from_train = np.concatenate(
    [idx_selected["train"][i] for i in range(num_classes)], axis=0
)

# Sort in place so the selected samples keep their original file order.
idx_from_val.sort()
idx_from_train.sort()
# --- Assemble and save the new training set -----------------------------------
def _stack_samples(val_ds, test_dss, train_ds, val_idx, train_idx):
    """Concatenate selected validation rows, all test rows, and selected training rows.

    Args:
        val_ds: validation dataset; only the rows in ``val_idx`` are kept.
        test_dss: iterable of test datasets; every row of each is kept, in order.
        train_ds: training dataset; only the rows in ``train_idx`` are kept.
        val_idx: integer index array into the first axis of ``val_ds``.
        train_idx: integer index array into the first axis of ``train_ds``.

    Returns:
        A single array stacked along axis 0 in the order
        validation, test sets (as given), training.
    """
    # Each source is materialised with np.asarray before indexing, so the
    # index arrays may be used as ordinary NumPy fancy indices.
    parts = [np.asarray(val_ds)[val_idx]]
    parts.extend(np.asarray(ds) for ds in test_dss)
    parts.append(np.asarray(train_ds)[train_idx])
    # One concatenation instead of two avoids an intermediate full-size copy.
    return np.concatenate(parts, axis=0)


s1_set_new = _stack_samples(
    s1_val,
    (s1_test_a, s1_test_b, s1_test_c, s1_test_d),
    s1_train,
    idx_from_val,
    idx_from_train,
)
s2_set_new = _stack_samples(
    s2_val,
    (s2_test_a, s2_test_b, s2_test_c, s2_test_d),
    s2_train,
    idx_from_val,
    idx_from_train,
)

# Labels follow exactly the same row order as the sample arrays above:
# validation labels, then the values read from the softmax CSVs for test sets
# a-d, then training labels. The val/train labels are cast to float64 before
# being stacked with the CSV values.
labels_set_new = np.concatenate(
    [
        labels_val[idx_from_val, :].astype(np.float64),
        prob_test_a,
        prob_test_b,
        prob_test_c,
        prob_test_d,
        labels_train[idx_from_train, :].astype(np.float64),
    ],
    axis=0,
)

# Saving new training dataset to H5 file. The context manager guarantees the
# file is closed even if one of the writes fails part-way.
with h5py.File("data/training_0211_test_with_prob_%d.h5" % set_id, "w") as f_train_new:
    f_train_new.create_dataset("sen1", data=s1_set_new)
    f_train_new.create_dataset("sen2", data=s2_set_new)
    f_train_new.create_dataset("label", data=labels_set_new)

# Training set distribution (class index, count), as recorded by the
# original author:
#    0  3000
#    1  3000
#    2  3000
#    3  3000
#    4  3000
#    5  3000
#    6  3000
#    7  5000
#    8  3000
#    9  3000
#   10  3000
#   11  3000
#   12  3000
#   13  3000
#   14  2392
#   15  3000
#   16  3000