In [1]:
import pandas as pd
import ktrain

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the annotated messages; the path is relative to the notebook's folder.
annotated_csv_path = '../data/annotated.csv'
data = pd.read_csv(annotated_csv_path)

In [3]:
# Preview the first rows to check which columns were loaded.
# NOTE(review): the 'Unnamed: 0.1' / 'Unnamed: 0' columns look like index
# columns left over from earlier CSV round-trips — confirm they are not needed.
data.head()

Unnamed: 0.1,Unnamed: 0,idx,msg_txt,annotation
0,0,0,"In Kyiv, fragments of a downed rocket damage...",1
1,1,1,"Kyiv region is attacked by drones again, air...",1
2,2,2,Explosions are also heard in the capital. Ai...,1
3,3,4,Kyiv region is attacked by drones - Kuleba ...,1
4,5,5,The rocket that fell in Shevchenkivskyi dist...,0


In [4]:
# Split the annotated rows by label: 1 goes to `combat`, 0 to `not_combat`.
# Rows whose annotation is neither value (e.g. missing) land in neither frame.
is_combat = data['annotation'].eq(1.0)
is_not_combat = data['annotation'].eq(0.0)

combat = data.loc[is_combat]
not_combat = data.loc[is_not_combat]

In [5]:
# Stack the two labelled subsets (combat rows first, then non-combat rows);
# only rows annotated 0 or 1 remain in the modelling sample.
labelled_frames = [combat, not_combat]
data_sample = pd.concat(labelled_frames)

In [6]:
# Summary statistics for the numeric columns of the labelled sample; the mean
# of `annotation` is the share of rows labelled 1, i.e. the class balance.
data_sample.describe()

Unnamed: 0.1,Unnamed: 0,idx,annotation
count,953.0,953.0,953.0
mean,724.318993,500.455404,0.295908
std,819.59721,294.900703,0.456689
min,0.0,0.0,0.0
25%,246.0,244.0,0.0
50%,501.0,490.0,0.0
75%,766.0,754.0,1.0
max,3000.0,1027.0,1.0


## Modeling

Next, I'll load a pre-trained transformer model and fine-tune it on the annotated messages. For now, I will use DistilBERT (ktrain's `'distilbert'` preprocessing mode).

In [6]:
# Preprocess the messages for DistilBERT and hold out 10% of the rows for
# validation (val_df is None, so the split is drawn from data_sample itself).
# random_state pins that split so a Restart & Run All evaluates on the same
# validation rows; without it the split changes on every run.
# NOTE(review): the sequence-length stats printed by this call reported a
# 95th percentile of 170 for the training texts, above maxlen = 128, so the
# longest messages are presumably truncated — consider raising maxlen.
train, val, preprocess = ktrain.text.texts_from_df(
    data_sample,
    text_column = 'msg_txt',
    label_columns = ['annotation'],
    val_df = None,
    max_features = 20000,
    maxlen = 128,
    val_pct = 0.1,
    ngram_range = 1,
    preprocess_mode = 'distilbert',
    random_state = 42,
    verbose = 1
)

['not_annotation', 'annotation']
     not_annotation  annotation
333             1.0         0.0
599             0.0         1.0
524             0.0         1.0
587             0.0         1.0
674             1.0         0.0
['not_annotation', 'annotation']
     not_annotation  annotation
790             1.0         0.0
701             1.0         0.0
93              1.0         0.0
651             0.0         1.0
526             0.0         1.0
preprocessing train...
language: en
train sequence lengths:
	mean : 54
	95percentile : 170
	99percentile : 248


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 48
	95percentile : 136
	99percentile : 178


In [7]:
# Build the classifier from the preprocessor, then wrap it together with the
# train/validation sets in a ktrain learner that trains in batches of 32.
model = preprocess.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=train,
    val_data=val,
    batch_size=32,
)

In [8]:
# Fine-tune for up to 10 epochs with a maximum learning rate of 1e-4.
# `early_stopping` is a patience measured in epochs, not an on/off flag: the
# previous value `True` was silently treated as 1, so it is spelled out here.
# Training stops after one epoch without validation improvement; raise the
# value to tolerate noisier validation loss.
history = learner.autofit(
    lr=1e-4,
    epochs=10,
    early_stopping=1,
)



begin training using triangular learning rate policy with max lr of 0.0001...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping
Weights from best epoch have been loaded into model.


In [9]:
# Bundle the trained model with its preprocessor so raw text can be scored.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc=preprocess)

In [10]:
# Evaluate on the held-out validation set and print the per-class report.
validation = learner.validate(
    val_data=val,
    print_report=True,
)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        66
           1       0.80      0.80      0.80        30

    accuracy                           0.88        96
   macro avg       0.85      0.85      0.85        96
weighted avg       0.88      0.88      0.88        96



In [11]:
# Persist the predictor under 'distilbert', relative to the working directory.
predictor_dir = 'distilbert'
predictor.save(predictor_dir)