/
test_textregression.py
150 lines (124 loc) · 5.12 KB
/
test_textregression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
"""
Tests of ktrain text regression
"""
from unittest import TestCase, main, skip
import IPython
import numpy as np
import testenv
import ktrain
from ktrain import text as txt
from ktrain.imports import ACC_NAME, VAL_ACC_NAME
# Out-of-sample review used to sanity-check predictor output after save/load.
# It is a glowing Champagne review, so a trained model should predict a high price
# (the tests below assert the prediction exceeds a threshold).
TEST_DOC = """A wine that has created its own universe. It has a unique, special softness
that allies with the total purity that comes from a small, enclosed single vineyard.
The fruit is almost irrelevant here, because it comes as part of a much deeper complexity.
This is a great wine, at the summit of Champagne, a sublime, unforgettable experience.
"""
class TestTextRegression(TestCase):
    """Regression tests for ktrain text regression on the wine-price dataset.

    Each test trains a model to predict wine price from its free-text
    description, then exercises the full learner/predictor life cycle:
    fit, top-losses inspection, weight decay, model save/load, predictor
    save/load, prediction, and ``explain``.
    """

    def setUp(self):
        """Load the wine reviews CSV and build an 80/20 train/test split.

        Populates ``self.trn`` and ``self.val`` as ``(x, y)`` tuples of
        numpy arrays (description strings, float prices).
        """
        import pandas as pd

        # wine price dataset should be downloaded
        # from: https://github.com/floydhub/regression-template
        # and prepared as described in the wide-deep.ipynb notebook
        path = "./resources/text_data/wines.csv"
        data = pd.read_csv(path)
        # shuffle with a fixed seed so the split is reproducible across runs
        data = data.sample(frac=1.0, random_state=42)

        # Split data into train and test
        train_size = int(len(data) * 0.8)
        print("Train size: %d" % train_size)
        print("Test size: %d" % (len(data) - train_size))

        # Features are the free-text descriptions; labels are continuous prices.
        description_train = data["description"][:train_size]
        labels_train = data["price"][:train_size]
        description_test = data["description"][train_size:]
        labels_test = data["price"][train_size:]

        self.trn = (description_train.values, labels_train.values)
        self.val = (description_test.values, labels_test.values)

    # @skip('temporarily disabled')
    def test_linreg(self):
        """Train a bag-of-ngrams linear regression and check the round trip."""
        import os
        import tempfile

        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            preprocess_mode="standard",
            ngram_range=3,
            maxlen=200,
            max_features=35000,
        )
        model = txt.text_regression_model("linreg", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(
            model, train_data=trn, val_data=val, batch_size=256
        )
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results: onecycle must have reached the peak lr,
        # and validation MAE must drop below a loose sanity threshold
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertLess(min(hist.history["val_mae"]), 12)

        # test top losses: returned observation id must index into the val set
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay: unset by default, then settable
        self.assertIsNone(learner.get_weight_decay())
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        # use a fresh temp dir instead of hard-coded /tmp paths: portable,
        # collision-free, and consistent with test_distilbert below
        tmp_dir = tempfile.mkdtemp()
        model_path = os.path.join(tmp_dir, "test_model")
        learner.save_model(model_path)
        learner.load_model(model_path)

        # test predictor: a glowing review should predict an expensive wine
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        predictor_path = os.path.join(tmp_dir, "test_predictor")
        p.save(predictor_path)
        p = ktrain.load_predictor(predictor_path)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        # explain() is expected to return None for regression models
        self.assertIsNone(p.explain(TEST_DOC))

    # @skip('temporarily disabled')
    def test_distilbert(self):
        """Fine-tune DistilBERT for one epoch and check the round trip."""
        import tempfile

        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            preprocess_mode="distilbert",
            maxlen=75,
        )
        model = txt.text_regression_model("distilbert", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(
            model, train_data=trn, val_data=val, batch_size=100
        )
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results (looser MAE bound: only one epoch of training)
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertLess(min(hist.history["val_mae"]), 16)

        # test top losses: transformer datasets expose examples via .x
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay: unset by default, then settable
        self.assertIsNone(learner.get_weight_decay())
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model (transformer models save into a folder)
        tmp_folder = tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder, preproc=preproc)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        tmp_folder = tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        # explain() is expected to return None for regression models
        self.assertIsNone(p.explain(TEST_DOC))
# Run the full suite when this file is executed directly (python test_textregression.py).
if __name__ == "__main__":
    main()