# Text Classification with Hugging Face Transformers in ktrain

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
from sklearn.datasets import fetch_20newsgroups
import ktrain
from ktrain import text

# Enumerate GPUs in PCI bus order and make only device 0 visible to this kernel.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

## STEP 0: Load Data Into Arrays


In [2]:
# Restrict the corpus to four newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Both splits are fetched with identical category/shuffle/seed settings.
fetch_kwargs = dict(categories=categories, shuffle=True, random_state=42)
train_b = fetch_20newsgroups(subset='train', **fetch_kwargs)
test_b = fetch_20newsgroups(subset='test', **fetch_kwargs)

# Documents and labels for each split; the 'test' split is reported (and
# later used) as the validation set.
x_train, y_train = train_b.data, train_b.target
x_test, y_test = test_b.data, test_b.target

print('size of training set: %s' % len(x_train))
print('size of validation set: %s' % len(x_test))
print('classes: %s' % (train_b.target_names))

size of training set: 2257
size of validation set: 1502
classes: ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


## STEP 1: Preprocess Data and Build a Transformer Model

In [3]:
# Tunable settings: pretrained checkpoint, maximum sequence length handed to
# text.Transformer, and the batch size handed to the learner.
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 500
BATCH_SIZE = 6

# Preprocessor wrapper; `t` is reused by later cells for evaluation and prediction.
t = text.Transformer(
    MODEL_NAME,
    maxlen=MAX_LEN,
    class_names=train_b.target_names,
)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)

# Classifier built by the preprocessor, wrapped in a ktrain learner.
model = t.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=BATCH_SIZE,
)

preprocessing train...
language: en
train sequence lengths:
	mean : 308
	95percentile : 837
	99percentile : 1938


preprocessing test...
language: en
test sequence lengths:
	mean : 343
	95percentile : 979
	99percentile : 2562


## STEP 2 [Optional]: Estimate a Good Learning Rate

In [4]:
# Optional step, disabled by default: uncomment to run the learner's
# learning-rate finder (with a plot) before choosing the rate used for training.
#learner.lr_find(show_plot=True, max_epochs=2)

## STEP 3: Train Model

In [5]:
# One-cycle training: maximum learning rate and number of epochs.
MAX_LR = 5e-5
N_EPOCHS = 4
learner.fit_onecycle(MAX_LR, N_EPOCHS)



begin training using onecycle policy with max lr of 5e-05...
Train for 377 steps, validate for 47 steps
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f19c5d8ba10>

## STEP 4: Evaluate/Inspect Model

In [6]:
# Evaluate the learner, labelling the report with the preprocessor's class names.
label_names = t.get_classes()
learner.validate(class_names=label_names)

                        precision    recall  f1-score   support

           alt.atheism       0.93      0.95      0.94       319
         comp.graphics       0.97      0.98      0.98       389
               sci.med       0.98      0.96      0.97       396
soc.religion.christian       0.98      0.96      0.97       398

              accuracy                           0.97      1502
             macro avg       0.97      0.97      0.97      1502
          weighted avg       0.97      0.97      0.97      1502



array([[304,   3,   5,   7],
       [  4, 383,   2,   0],
       [  6,   7, 382,   1],
       [ 12,   3,   0, 383]])

In [7]:
# Show the single example the model got most wrong (highest loss).
learner.view_top_losses(preproc=t, n=1)

----------
id:756 | loss:7.47 | true:soc.religion.christian | pred:comp.graphics)



In [8]:
# Disabled: print one raw test document for manual inspection.
# NOTE(review): index 1393 does not match the id (756) reported by
# view_top_losses in the saved output above — likely left over from an
# earlier run; confirm the index before uncommenting.
#print(x_test[1393])

## STEP 5: Make Predictions on New Data in Deployment

In [9]:
# Bundle the trained model with its preprocessor so raw text can be classified.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc=t)

In [10]:
# Classify a single raw sentence; the predicted class label is displayed.
sample_text = 'Jesus Christ is the central figure of Christianity.'
predictor.predict(sample_text)

'soc.religion.christian'

In [11]:
# Explain the prediction for the same sentence: the saved output is a table of
# per-feature contributions (single words and word pairs).
predictor.explain('Jesus Christ is the central figure of Christianity.')

Contribution?,Feature
2.335,jesus
2.201,christ
2.029,christianity
0.407,central
0.404,figure
0.341,christ is
0.289,of christianity
0.135,the
0.107,figure of
0.076,the central


In [12]:
# Location the predictor is saved to (relative to the current working directory).
predictor_path = 'fastai-project/homework/models/20newsgroups_model'
predictor.save(predictor_path)

In [13]:
# Disabled: reload the predictor from the same path it was saved to above.
#reloaded_predictor = ktrain.load_predictor('fastai-project/homework/models/20newsgroups_model')

In [14]:
# Disabled: list the class labels of the reloaded predictor. Only meaningful
# once the load_predictor line in the previous cell has been uncommented and run.
#reloaded_predictor.get_classes()

In [18]:
# NOTE(review): scratch cell that appears unrelated to the tutorial; its
# execution count (18) is out of sequence with the cells above. Consider removing.
print(1)

1


In [None]:
# TODO(review): this cell previously contained only the bare name `unfreeze`,
# which is undefined in this notebook and raises NameError under
# Restart & Run All. It is kept as a disabled reminder. If the intent was to
# unfreeze model layers for further fine-tuning, call it on the learner
# explicitly (presumably `learner.unfreeze()` — confirm against the ktrain API):
# learner.unfreeze()