## 1. Construct datasets in a loop (including ablation-study variants)

In [1]:
from ner_deberta_multi.dataset_construction import construct_semeval_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
disco_depth = 2

# can be modified
# Each row is one feature configuration, in the order
# (use_nucsat, use_rels, use_paths, use_start_end).
options = [
    [True, True, True, True],
    [True, True, False, True],
    [True, False, False, True],
    [False, False, False, True],
]

# Build every split once per feature configuration.
for part in ('train', 'dev', 'test'):
    for use_nucsat, use_rels, use_paths, use_start_end in options:
        feature_flags = {
            'use_rels': use_rels,
            'use_nucsat': use_nucsat,
            'use_start_end': use_start_end,
            'use_paths': use_paths,
        }
        construct_semeval_dataset(part=part, disco_depth=disco_depth, **feature_flags)

In [None]:
# Add a no-features version of each split to train the base model:
# same records, with the 'feature' key removed, saved next to the source
# file under a '_nofeat.json' suffix.
import json

for part in ['train', 'dev', 'test']:
    source_path = f'datasets/deberta_propaganda_full/{part}_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi.json'
    target_path = source_path.replace('.json', '_nofeat.json')

    with open(source_path, 'r') as source_file:
        records = json.load(source_file)

    # Strip the extra-feature field from every record (KeyError if it is absent).
    for record in records:
        del record['feature']

    with open(target_path, 'w') as target_file:
        json.dump(records, target_file)

## 2. Save pos weights

In [1]:
import json
import pickle

In [2]:
# Load the training split; its labels are used by the weight computation in the next cell.
# NOTE(review): this path is under 'datasets/deberta_propaganda/' (lvl3), while the other
# cells in this notebook use 'datasets/deberta_propaganda_full/' (lvl2) — confirm it is intended.
train_json_path = 'datasets/deberta_propaganda/train_binary_customfeats_v1_lvl3_multi.json'
with open(train_json_path, 'r') as train_file:
    data_train = json.load(train_file)

In [3]:
# Compute one weight per label column: (#negatives / #positives), capped at 70.
#
# Every item['label'] is iterated as a sequence of label vectors and position
# `label_idx` of each vector is collected across the whole training set
# (presumably one vector per token, with 0/1 entries — TODO confirm against
# the dataset construction code). The number of label columns is taken from
# the first vector of the first item, so an empty `data_train` raises IndexError.
#
# The resulting list is presumably consumed as a per-class `pos_weight` by the
# training script — verify against the loader of pos_weights.pkl.
weights = []
num_labels = len(data_train[0]['label'][0])
for label_idx in range(num_labels):
    label_vals = [
        label_vector[label_idx]
        for item in data_train
        for label_vector in item['label']
    ]
    num_pos = sum(label_vals)
    # Count negatives from the real number of positives. Only the denominator
    # is clamped to 1 (to avoid division by zero); clamping num_pos itself
    # would under-count the negatives by one for labels with no positives.
    num_neg = len(label_vals) - num_pos
    weights.append(min(70, num_neg / max(num_pos, 1)))

In [4]:
# Persist the per-label weights list as a pickle inside the model package directory.
with open('ner_deberta_multi/pos_weights.pkl', 'wb') as weights_file:
    pickle.dump(weights, weights_file)

## 3. Train Model

In [5]:
# List the generated dataset files; the training and inference cells reference these paths.
!ls datasets/deberta_propaganda_full

'dev_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi.json'
'dev_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi_nofeat.json'
'dev_binary_custom_feats=42_lvl2--use_rels=True--use_nucsat=True--use_paths=False--use_start_end=True_multi.json'
'dev_binary_custom_feats=4_lvl2--use_rels=False--use_nucsat=True--use_paths=False--use_start_end=True_multi.json'
'dev_binary_custom_feats=50_lvl2--use_rels=True--use_nucsat=True--use_paths=True--use_start_end=True_multi.json'
'test_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi.json'
'test_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi_nofeat.json'
'test_binary_custom_feats=42_lvl2--use_rels=True--use_nucsat=True--use_paths=False--use_start_end=True_multi.json'
'test_binary_custom_feats=4_lvl2--use_rels=False--use_nucsat

In [None]:
# RUN BASE
# Trains microsoft/deberta-v3-base via run_ner_no_trainer_custom.py on the
# '_nofeat' train/dev files (the dataset variants with the 'feature' key removed).
# Author's scratch note of alternative settings, kept verbatim (meaning not verified):
#--label_all_tokens    checkpointing_steps 'epoch' 10000; 2 -- extra_feature_size=42 2e-5

!CUDA_VISIBLE_DEVICES=0 python ner_deberta_multi/run_ner_no_trainer_custom.py \
  --model_name_or_path "microsoft/deberta-v3-base" \
  --train_file 'datasets/deberta_propaganda_full/train_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi_nofeat.json' \
  --validation_file 'datasets/deberta_propaganda_full/dev_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi_nofeat.json' \
  --text_column_name 'token' \
  --label_column_name 'label' \
  --max_length 256 \
  --pad_to_max_length \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --learning_rate 3e-5 \
  --num_train_epochs 45 \
  --checkpointing_steps 10000 \
  --output_dir "checkpoint/deberta_ner_binary_noo_test_lr3e-5-8-45ep_w100_1lin__base" \
  --with_tracking

### Run custom model training in a loop

In [3]:
from ner_deberta_multi.train_loop import run_train

In [4]:
# Feature configurations to train, one model per row, in the order
# (use_nucsat, use_rels, use_paths, use_start_end).
options = [
    [True, True, True, True],
    [True, True, False, True],
    [True, False, False, True],
    [False, False, False, True],
]

# Train one model per configuration with shared hyper-parameters.
for use_nucsat, use_rels, use_paths, use_start_end in options:
    feature_flags = {
        'use_rels': use_rels,
        'use_nucsat': use_nucsat,
        'use_paths': use_paths,
        'use_start_end': use_start_end,
    }
    # Optional, currently disabled: save_eval_metric='macro_f1'
    run_train(level=2, lr=3e-5, bs=8, n_epochs=45, device='cuda:0', **feature_flags)

## 4. Run inference

In [15]:
import os
import sys
sys.path.insert(0, 'ner_deberta_multi/')

In [16]:
from inference_with_eval import run_inference

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint directory to evaluate (presumably the baseline model, judging by
# the 'base' in its name — confirm).
model_name_or_path =  "checkpoint/deberta_ner_v1-3e-5-4_binary_multi_45ep_w100_base_dev_cp/best_macro_f1/"
# NOTE: despite the variable name, dev_json points at the *test* split here
# (the '_nofeat' variant of the dataset file).
dev_json = 'datasets/deberta_propaganda_full/test_binary_custom_feats=2_lvl2--use_rels=False--use_nucsat=False--use_paths=False--use_start_end=True_multi_nofeat.json'

In [26]:
model_name_or_path = 'checkpoint/deberta_ner_binary_lr3e-05-16-45ep_w100_1lin__lvl2--use_rels=True--use_nucsat=True--use_paths=True--use_start_end=True_multi'

# Select the test-split dataset whose file name ends with the same feature-flag
# suffix as the checkpoint name (everything after the first '--'), so the model
# is evaluated on data built with the flags it was trained with.
dataset_dir = 'datasets/deberta_propaganda_full/'
# rstrip('/') keeps the match working when the checkpoint path is written with
# a trailing slash.
flags_suffix = model_name_or_path.rstrip('/').split('--', 1)[1] + '.json'
matching_files = sorted(
    fn for fn in os.listdir(dataset_dir)
    if fn.startswith('test') and fn.endswith(flags_suffix)
)

# Fail loudly instead of silently keeping a stale `dev_json` from an earlier
# cell (no match) or picking a file in arbitrary directory order (several matches).
if not matching_files:
    raise FileNotFoundError(
        f"No test dataset in {dataset_dir!r} ends with {flags_suffix!r}"
    )
if len(matching_files) > 1:
    raise ValueError(
        f"Ambiguous test dataset for suffix {flags_suffix!r}: {matching_files}"
    )

# NOTE: despite the variable name, this is the *test* split.
dev_json = dataset_dir + matching_files[0]

In [27]:
# Evaluate the selected checkpoint on the selected dataset file. The result is
# unpacked into `metrics` and `preds_df` (presumably a metrics mapping and a
# predictions DataFrame — confirm in inference_with_eval.run_inference).
# thresh=0.5 is presumably the decision threshold on predicted scores; this
# call also passes compute_global=True.
metrics, preds_df = run_inference(model_name_or_path, dev_json, thresh=0.5, compute_global=True)

Found cached dataset json (/home/alexander.chernyavsk/.cache/huggingface/datasets/json/default-e8548153b097c645/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 206.00it/s]
Loading cached processed dataset at /home/alexander.chernyavsk/.cache/huggingface/datasets/json/default-e8548153b097c645/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-9eb3399e4380a096.arrow
100%|█████████████████████████████████████████| 211/211 [04:23<00:00,  1.25s/it]


In [6]:
# Evaluate the 40-epoch, batch-size-8 checkpoint trained with all feature flags
# enabled (per its directory name) on the test file built with the same flags.
model_name_or_path =  "checkpoint/deberta_ner_binary_lr3e-05-8-40ep_w100_1lin__lvl2--use_rels=True--use_nucsat=True--use_paths=True--use_start_end=True_multi/"
# NOTE: despite the variable name, dev_json points at the *test* split.
dev_json = 'datasets/deberta_propaganda_full/test_binary_custom_feats=50_lvl2--use_rels=True--use_nucsat=True--use_paths=True--use_start_end=True_multi.json'
# compute_global is not passed here, so run_inference uses its default for it.
metrics, preds_df = run_inference(model_name_or_path, dev_json, thresh=0.5)

Downloading and preparing dataset json/default to /home/alexander.chernyavsk/.cache/huggingface/datasets/json/default-e8548153b097c645/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files: 100%|████████████████████| 1/1 [00:00<00:00, 851.81it/s]
Extracting data files: 100%|█████████████████████| 1/1 [00:00<00:00, 246.87it/s]
                                                             

Dataset json downloaded and prepared to /home/alexander.chernyavsk/.cache/huggingface/datasets/json/default-e8548153b097c645/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 106.38it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|█████████████████████████████████████████| 211/211 [04:32<00:00,  1.29s/it]
