/
lgbm_model.py
304 lines (210 loc) · 11 KB
/
lgbm_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 12 13:54:26 2018
@author: berend
"""
### train on tfidf vectors of arxiv model:
import sys
sys.path.append('/Users/berend/Documents/Coding/ML-projects/ArxivData/')
import numpy as np
#import matplotlib.pyplot as plt
import data_preprocessing as dp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
import pickle
import lightgbm as lgb
import pandas as pd
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
class lgbmTextClassifier():
    """Tf-idf + LightGBM multi-label text classifier.

    Because the target categories are not mutually exclusive and LightGBM
    has no native multi-label objective, one binary GBM is trained per
    category (see train_models).
    """

    def __init__(self, train_data, test_data, savename = None, run_transform = True,
                 ylabels = None, train_split = 0.66, random_seed = 0, tfidf_params = None,
                 lgbm_params = None):
        """lgbm classifier
        Args:
            train_data: tuple containing (data_X,data_y)
                data_X: list of untokenized text samples
                data_y: numpy array of one-hot encoded classes
            test_data: same as train_data, for test data.
            savename: save base path, used to save savename_svd.pickle, etc.
                If None, nothing is pickled to disk.
            run_transform: bool, if True, fit the tf-idf vectorizer and
                transform the train/val/test sets immediately.
            train_split: split training into a train and validation set
            ylabels: label names for the y categories
            random_seed: seed to pass to loading function, used to
                randomize training data before splitting into train/val set
                can be used for x-validation
            tfidf_params: dictionary of parameters to pass to tfidfvectorizer
                (None means "no overrides")
            lgbm_params: dictionary of parameters to pass lgbm
                (None means "no overrides")"""
        self.savename = savename
        self.ylabels = ylabels
        self.random_seed = random_seed
        if random_seed is not None:
            np.random.seed(random_seed)

        # Shuffle, then split off the validation set from the training data,
        # so cross-validation works by simply varying random_seed.
        p = np.random.permutation(len(train_data[0]))
        split_n = int(train_split*len(train_data[0]))
        self.train_X = [train_data[0][i] for i in p[:split_n]]
        self.train_y = np.array([train_data[1][i] for i in p[:split_n]])
        self.val_X = [train_data[0][i] for i in p[split_n:]]
        self.val_y = np.array([train_data[1][i] for i in p[split_n:]])
        self.test_X = test_data[0]
        self.test_y = test_data[1]

        tfidf_std_params = {'stop_words' : None,
                            'min_df' : 5,
                            'max_features' : None,
                            'use_idf' : True,
                            'tokenizer' : None,
                            'ngram_range' : (1,2)}
        lgbm_std_params = {'boosting_type': 'gbdt',
                           'objective': 'binary',
                           'metric': 'xentropy',
                           'num_leaves': 20,
                           'learning_rate': 0.1,
                           'feature_fraction': 0.9,
                           'bagging_fraction': 0.8,
                           'bagging_freq': 1,
                           'verbose': 0,
                           'num_boost_round': 500,
                           'early_stopping_rounds': 5,}
        # None defaults (instead of mutable {} defaults) avoid the shared
        # mutable-default-argument pitfall; behavior is identical for callers.
        self.tfidf_params = dict(tfidf_std_params, **(tfidf_params or {}))
        self.lgbm_params = dict(lgbm_std_params, **(lgbm_params or {}))

        # Honor run_transform (previously ignored): skip fitting when False,
        # e.g. when a pickled vectorizer will be loaded externally.
        if run_transform:
            self.train_word_vectors(self.train_X)
            self.transform_word_vectors()

    def train_word_vectors(self,docs):
        """Train the tfidf vectorizer
        Args:
            docs: list of input strings
        Returns:
            None"""
        #may need to remove interpunction too?
        print('Building tfidf vectorizer')
        self.tfidf = TfidfVectorizer(**self.tfidf_params)
        self.tfidf.fit(docs)
        if self.savename is not None:
            with open(self.savename + '_tfidf.obj','wb') as f:
                pickle.dump(self.tfidf,f)
        print('Done training tfidf vectorizer')

    def get_word_vectors(self, docs):
        """Get the tfidf vectors corresponding to text
        Args:
            docs: list of docs to be transformed using tfidf
        Returns:
            sparse array containing word vectors according to the trained transformer"""
        return self.tfidf.transform(docs)

    def transform_word_vectors(self):
        """Transform the train, val and test data and save if savename is given"""
        print('Transforming word vectors')
        self.train_X_tfidfvec = self.get_word_vectors(self.train_X)
        self.val_X_tfidfvec = self.get_word_vectors(self.val_X)
        self.test_X_tfidfvec = self.get_word_vectors(self.test_X)
        if self.savename is not None:
            with open(self.savename + '_X_tfidfvec.obj','wb') as f:
                pickle.dump((self.train_X_tfidfvec,self.val_X_tfidfvec,self.test_X_tfidfvec),f)
        print('Done transforming word vectors')

    def train_models(self, savepath = None):
        """Build and train one binary lgbm model per category.

        Note: since the categories are not mutually exclusive, and lgbm does not
        support multiple binary classes, train an lgbm model for every class.

        Args:
            savepath: base path for the pickled models; falls back to
                self.savename. Nothing is saved when both are None.
                (Previously this parameter was accepted but ignored.)"""
        self.gbms = []
        self.accs = []
        for i in range(self.train_y.shape[1]):
            # Fall back to the column index when no label names were given,
            # instead of crashing on self.ylabels[i].
            label = self.ylabels[i] if self.ylabels is not None else str(i)
            print('Training GBM for {}'.format(label))
            lgb_train = lgb.Dataset(self.train_X_tfidfvec, self.train_y[:,i].flatten())
            lgb_eval = lgb.Dataset(self.val_X_tfidfvec, self.val_y[:,i].flatten(), reference=lgb_train)
            gbm = lgb.train(self.lgbm_params,
                            lgb_train,
                            num_boost_round=self.lgbm_params['num_boost_round'],
                            valid_sets=lgb_eval,
                            early_stopping_rounds=self.lgbm_params['early_stopping_rounds'])
            self.gbms.append(gbm)
            # Validation accuracy at the best iteration, thresholding at 0.5.
            y_pred = gbm.predict(self.val_X_tfidfvec, num_iteration=gbm.best_iteration)
            y_pred_cls = np.round(y_pred)
            self.accs.append(np.mean(y_pred_cls == self.val_y[:,i].flatten()))
        for i in range(len(self.accs)):
            label = self.ylabels[i] if self.ylabels is not None else str(i)
            print('Validation acc for {} is {:.2f}'.format(label,self.accs[i]))
        # Guard against self.savename being None (previously crashed here).
        save_base = savepath if savepath is not None else self.savename
        if save_base is not None:
            with open(save_base + '_models.obj','wb') as f:
                pickle.dump((self.ylabels,self.gbms),f)

    def predict(self, docs):
        """Predict the labels of docs
        Args:
            docs: list of documents
        Returns:
            (n_docs, n_classes) array of per-class probabilities"""
        tf_idf_vecs = self.tfidf.transform(docs)
        y_pred = np.zeros((tf_idf_vecs.shape[0], len(self.gbms)))
        for i,gbm in enumerate(self.gbms):
            y_pred[:,i] = gbm.predict(tf_idf_vecs)
        return y_pred
if __name__ == "__main__":
    # Load pre-split train/test data from the ArxivData project.
    trainpath = '/users/berend/Documents/Coding/ML-projects/ArxivData/train_data/train_data.json'
    testpath = '/users/berend/Documents/Coding/ML-projects/ArxivData/test_data/test_data.json'
    traindata,testdata = dp.loadfile(trainpath),dp.loadfile(testpath)
    inc_categories =    ['cond-mat.mes-hall',
                         'cond-mat.mtrl-sci',
                         'cond-mat.stat-mech',
                         'cond-mat.str-el',
                         'cond-mat.supr-con',
                         'cond-mat.soft',
                         'quant-ph',
                         'cond-mat.dis-nn',
                         'cond-mat.quant-gas',
                         'hep-th']
    train_X,train_y = dp.generate_Xy_data_categories(traindata, inc_categories, ignore_others = True,
                                                     shuffle_seed = 0, ydatatype = 'onehot',
                                                     clean_x = True, keep_latex_tags = True)
    test_X,test_y = dp.generate_Xy_data_categories(testdata, inc_categories, ignore_others = True,
                                                   shuffle_seed = 0, ydatatype = 'onehot',
                                                   clean_x = True, keep_latex_tags = True)
    #load stopwords inferred from correlations:
    # NOTE(review): inferred_stop_words is loaded but never passed to
    # tfidf_params ('stop_words' stays None) — confirm whether this was
    # intended; wiring it in would change the trained model.
    with open('save/inferred_stop_words.obj','rb') as f:
        inferred_stop_words = pickle.load(f)

    tfidf_params = {'stop_words' : None,
                    'min_df' : 5,
                    'ngram_range' : (1,1)}
    lgbm_params = {'boosting_type': 'gbdt',
                   'objective': 'binary',
                   'metric': 'xentropy',
                   'num_leaves': 20,
                   'learning_rate': 0.1,
                   'feature_fraction': 0.9,
                   'bagging_fraction': 0.8,
                   'bagging_freq': 1,
                   'verbose': 0,
                   'num_boost_round': 500,
                   'early_stopping_rounds': 5,}
    savename = 'save/lm_save'
    lm = lgbmTextClassifier((train_X,train_y), (test_X,test_y),ylabels = inc_categories,
                            savename = savename, train_split = 0.7,
                            random_seed = 0,run_transform = True,tfidf_params = tfidf_params,
                            lgbm_params = lgbm_params)
    lm.train_models()

    ## post analysis on val set:
    # Per-class confusion-matrix rates, averaged over validation samples.
    y_true = lm.val_y
    y_pred = np.round(lm.predict(lm.val_X))
    true_pos = np.mean(np.logical_and((y_pred == 1.), (y_true == 1.)), axis = 0)
    true_neg = np.mean(np.logical_and((y_pred == 0.), (y_true == 0.)), axis = 0)
    false_neg = np.mean(np.logical_and((y_pred == 0.), (y_true == 1.)), axis = 0)
    false_pos = np.mean(np.logical_and((y_pred == 1.), (y_true == 0.)), axis = 0)
    accs = true_pos + true_neg
    # NOTE(review): these divide by zero (-> nan) for a class with no positive
    # true or predicted samples in the val set.
    sens = true_pos / (true_pos + false_neg)
    prec = true_pos / (true_pos + false_pos)
    for i,cat in enumerate(inc_categories):
        print('\n\nFor category {}'.format(cat))
        print('True positive rate is {:.3f}'.format(true_pos[i]))
        print('True negative rate is {:.3f}'.format(true_neg[i]))
        print('False positive rate is {:.3f}'.format(false_pos[i]))
        print('False negative rate is {:.3f}'.format(false_neg[i]))
        print('Accuracy is {:.3f}'.format(accs[i]))
        print('Sensitivity is {:.3f}'.format(sens[i]))
        print('Precision is {:.3f}'.format(prec[i]))
    print('\n\nCategory             sens   prec   acc')
    for i,cat in enumerate(inc_categories):
        print('{:<20} {:.3f}  {:.3f}  {:.3f}'.format(cat, sens[i], prec[i], accs[i]))
    print('\n\nAverage accuracy: {:.3f}'.format(np.mean(accs)))
    # Fixed typo in output: 'precission' -> 'precision'.
    print('Average precision: {:.3f}'.format(np.mean(prec)))
    print('Average sensitivity: {:.3f}'.format(np.mean(sens)))