-
Notifications
You must be signed in to change notification settings - Fork 269
/
test_textregression.py
147 lines (116 loc) · 5.47 KB
/
test_textregression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
Tests of ktrain text regression
"""
import testenv
import IPython
from unittest import TestCase, main, skip
import numpy as np
import ktrain
from ktrain import text as txt
from ktrain.imports import ACC_NAME, VAL_ACC_NAME
# Sample wine review used as a smoke-test input for trained predictors;
# the tests below assert that the predicted price for this text exceeds
# a model-specific threshold.
TEST_DOC = """A wine that has created its own universe. It has a unique, special softness
that allies with the total purity that comes from a small, enclosed single vineyard.
The fruit is almost irrelevant here, because it comes as part of a much deeper complexity.
This is a great wine, at the summit of Champagne, a sublime, unforgettable experience.
"""
class TestTextRegression(TestCase):
    """
    End-to-end tests of ktrain text regression on the wine-price dataset.

    Requires ./text_data/wines.csv, downloaded from
    https://github.com/floydhub/regression-template and prepared as
    described in the wide-deep.ipynb notebook.
    """

    def setUp(self):
        """Load the wine dataset and build a deterministic 80/20 split.

        Stores ``self.trn`` and ``self.val`` as ``(texts, prices)`` tuples
        of numpy arrays.
        """
        import pandas as pd

        # wine price dataset should be downloaded
        # from: https://github.com/floydhub/regression-template
        # and prepared as described in the wide-deep.ipynb notebook
        path = './text_data/wines.csv'
        data = pd.read_csv(path)
        # shuffle with a fixed seed so the split is reproducible across runs
        data = data.sample(frac=1., random_state=42)

        # Split data into train and test (80/20)
        train_size = int(len(data) * .8)
        print("Train size: %d" % train_size)
        print("Test size: %d" % (len(data) - train_size))

        # features are the free-text reviews; labels are the wine prices
        x_train = data['description'][:train_size].values
        y_train = data['price'][:train_size].values
        x_test = data['description'][train_size:].values
        y_test = data['price'][train_size:].values
        self.trn = (x_train, y_train)
        self.val = (x_test, y_test)

    # @skip('temporarily disabled')
    def test_linreg(self):
        """Train the 'linreg' model and exercise the full ktrain workflow:
        training, top-losses, weight decay, save/load, and prediction.
        """
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='standard',
                                                 ngram_range=3,
                                                 maxlen=200,
                                                 max_features=35000)
        model = txt.text_regression_model('linreg', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=256)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results: LR schedule peaked at lr, and validation
        # MAE dropped below the expected threshold for this model
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 12)

        # test top losses: the reported index must be a valid validation index
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay (assertIsNone is the idiomatic None check)
        self.assertIsNone(learner.get_weight_decay())
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        # NOTE(review): hard-coded /tmp paths; consider tempfile.mkdtemp()
        # as test_distilbert does — confirm save format is unaffected first.
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test predictor round-trip: predictions survive save/load
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        # explain() is asserted to return None for this regression predictor
        self.assertIsNone(p.explain(TEST_DOC))

    # @skip('temporarily disabled')
    def test_distilbert(self):
        """Train the 'distilbert' model and exercise the full ktrain workflow:
        training, top-losses, weight decay, save/load, and prediction.
        """
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='distilbert',
                                                 maxlen=75)
        model = txt.text_regression_model('distilbert', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=100)
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results: LR schedule peaked at lr, and validation
        # MAE dropped below the (looser, single-epoch) threshold
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 16)

        # test top losses: the reported index must be a valid validation index
        # (transformer datasets expose texts via .x, unlike the array val above)
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay (assertIsNone is the idiomatic None check)
        self.assertIsNone(learner.get_weight_decay())
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model (transformer models save into a folder
        # and need the preproc to reload)
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder, preproc=preproc)

        # test predictor round-trip: predictions survive save/load
        p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        # explain() is asserted to return None for this regression predictor
        self.assertIsNone(p.explain(TEST_DOC))
# Allow running this test module directly (delegates to unittest.main).
if __name__ == "__main__":
    main()