In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
Colle

In [None]:
# Clone the webis-de shared-task code into the current working directory.
# NOTE(review): none of the cells visible in this notebook read from this checkout —
# confirm whether it is still needed.
!git clone https://github.com/webis-de/argmining-21-keypoint-analysis-sharedtask-code

Cloning into 'argmining-21-keypoint-analysis-sharedtask-code'...
remote: Enumerating objects: 491, done.[K
remote: Counting objects: 100% (491/491), done.[K
remote: Compressing objects: 100% (234/234), done.[K
remote: Total 491 (delta 258), reused 479 (delta 247), pack-reused 0[K
Receiving objects: 100% (491/491), 4.10 MiB | 12.54 MiB/s, done.
Resolving deltas: 100% (258/258), done.


In [None]:
from collections import defaultdict
from statistics import mean

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import BertTokenizer, RobertaTokenizer
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, AdamW

# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
# The chosen device is printed so the run log records where inference happened.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
# Load a BERT sequence classifier with a two-class head and move it to `device`.
# NOTE(review): "Path_to_BERT" looks like a placeholder — presumably it has to be replaced
# with the directory (or hub id) of the fine-tuned coverage checkpoint; confirm before running.
# The bare `.to(device)` expression ends the cell, so the full module repr is displayed.
cov_model = BertForSequenceClassification.from_pretrained("Path_to_BERT", num_labels=2)
cov_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Build two AdamW parameter groups: regular weights get weight decay, while biases and
# LayerNorm parameters are excluded from it.
#
# Two fixes compared with the earlier version of this cell:
#   * the per-group key must be 'weight_decay'; 'weight_decay_rate' is not an option the
#     optimizer reads, so the intended 0.01 decay was silently never applied;
#   * the LayerNorm parameters of this model are named 'LayerNorm.weight' / 'LayerNorm.bias'
#     (see the printed module tree), so the old 'gamma' / 'beta' patterns never matched.
#     'bias' already covers 'LayerNorm.bias'.
param_optimizer = list(cov_model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-6, correct_bias=False)



In [None]:
# Tokenizer used by get_test_ds to turn each sentence of a pair into token ids.
# NOTE(review): it is loaded from 'bert-base-uncased' while the model is loaded from a
# separate path — this assumes both share the same vocabulary; confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def get_test_ds(pairs, max_len=512):
  """Encode (premise, hypothesis) text pairs into a padded TensorDataset.

  Each pair becomes `[CLS] premise [SEP] hypothesis [SEP]`, encoded with the
  module-level `tokenizer`. Segment ids are 0 for `[CLS] premise [SEP]` and 1 for
  `hypothesis [SEP]`; the attention mask is 1 on real tokens. All three tensors
  are right-padded with zeros to the longest sequence in `pairs`.

  Args:
    pairs: iterable of 2-item sequences `(premise_text, hypothesis_text)`.
    max_len: maximum sequence length including the three special tokens.
      Pairs that would exceed it are truncated, removing tokens from the end of
      whichever side is currently longer. The default of 512 matches the size of
      the position-embedding table of the BERT model loaded in this notebook.

  Returns:
    TensorDataset yielding `(token_ids, attention_mask, segment_ids)` rows. For
    empty `pairs` an empty dataset (zero rows) is returned.

  Raises:
    ValueError: if `max_len` leaves no room for the three special tokens.
  """
  if max_len < 3:
    raise ValueError("max_len must be at least 3 to fit [CLS] and two [SEP] tokens")
  token_budget = max_len - 3  # room left once [CLS] and both [SEP] are accounted for

  token_ids = []
  mask_ids = []
  seg_ids = []

  for prem, hyp in pairs:
    premise_id = list(tokenizer.encode(prem, add_special_tokens=False))
    hypothesis_id = list(tokenizer.encode(hyp, add_special_tokens=False))

    # Without truncation an over-long pair would index past the model's position table.
    overflow = len(premise_id) + len(hypothesis_id) - token_budget
    while overflow > 0:
      if len(premise_id) >= len(hypothesis_id):
        premise_id.pop()
      else:
        hypothesis_id.pop()
      overflow -= 1

    pair_token_ids = ([tokenizer.cls_token_id] + premise_id + [tokenizer.sep_token_id]
                      + hypothesis_id + [tokenizer.sep_token_id])
    premise_len = len(premise_id)
    hypothesis_len = len(hypothesis_id)

    token_ids.append(torch.tensor(pair_token_ids))
    seg_ids.append(torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1)))
    mask_ids.append(torch.tensor([1] * (premise_len + hypothesis_len + 3)))

  if not token_ids:
    # pad_sequence rejects an empty list; hand back a dataset with zero rows instead.
    empty = torch.empty((0, 0), dtype=torch.long)
    return TensorDataset(empty, empty.clone(), empty.clone())

  token_ids = pad_sequence(token_ids, batch_first=True)
  mask_ids = pad_sequence(mask_ids, batch_first=True)
  seg_ids = pad_sequence(seg_ids, batch_first=True)
  return TensorDataset(token_ids, mask_ids, seg_ids)

In [None]:
def get_predictions(test_loader, model):
  """Run `model` over every batch of `test_loader` and collect its scores.

  The model is switched to evaluation mode for the duration of the call (so
  dropout cannot perturb the scores) and its previous training flag is restored
  afterwards. Batches are moved to the device the model's parameters live on,
  so the function does not depend on any module-level device or optimizer.

  Args:
    test_loader: iterable of `(token_ids, attention_mask, segment_ids)` batches,
      e.g. a DataLoader over the dataset built by get_test_ds.
    model: callable module taking those three tensors positionally; the first
      element of its output is used as the per-example scores.

  Returns:
    `(predictions, soft_predictions)` where `predictions` is a list with one
    numpy score vector per example and `soft_predictions` is the row-wise
    softmax of those vectors as a 2-D numpy array. When the loader yields no
    batches, an empty list and an empty `(0, 0)` array are returned.
  """
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  predictions = []
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for pair_token_ids, mask_ids, seg_ids in test_loader:
        pair_token_ids = pair_token_ids.to(target_device)
        mask_ids = mask_ids.to(target_device)
        seg_ids = seg_ids.to(target_device)

        outputs = model(pair_token_ids, mask_ids, seg_ids)
        predictions.extend(outputs[0].cpu().numpy())
  finally:
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)

  if not predictions:
    # softmax(..., axis=1) cannot be applied to an empty list.
    return predictions, np.empty((0, 0))

  soft_predictions = softmax(predictions, axis=1)
  return predictions, soft_predictions

In [None]:
def get_coverage_softmax_one_max(preds, u, pairs, thresh = 0):
  """Fraction of key points that are the best match of at least one candidate.

  `preds[i]` scores `pairs[i] = [candidate, key_point]`. A pair counts as a match
  when its positive score `preds[i][1]` exceeds both the negative score
  `preds[i][0]` and `thresh`. Every candidate with at least one match selects the
  single key point it matches with the highest positive score (the earliest pair
  wins ties). Coverage is the number of distinct selected key points divided by
  the number of items in `u`.

  Args:
    preds: per-pair scores, indexable as `preds[i][0]` / `preds[i][1]`. The
      function reads this argument only, not any module-level variable.
    u: iterable of all key points for the topic (iterating a dict yields its keys).
    pairs: sequence of `[candidate, key_point]`, aligned with `preds`.
    thresh: minimum positive score for a pair to count as a match.

  Returns:
    Coverage as a float in [0, 1]; 0.0 when `u` is empty.
  """
  all_kps = list(u)
  if not all_kps:
    # No key points to cover; avoid dividing by zero.
    return 0.0

  best_match = {}  # candidate -> (positive score, key point) of its strongest match
  for i in range(len(preds)):
    negative_score, positive_score = preds[i][0], preds[i][1]
    if positive_score > negative_score and positive_score > thresh:
      candidate, key_point = pairs[i][0], pairs[i][1]
      # Strict '>' keeps the first pair on ties, like max() over the matches.
      if candidate not in best_match or positive_score > best_match[candidate][0]:
        best_match[candidate] = (positive_score, key_point)

  selected_kps = {key_point for _, key_point in best_match.values()}
  return len(selected_kps) / len(all_kps)

In [None]:
# Clone the IBM KPA 2021 shared-task repository into the current working directory;
# the next cell reads the test CSV files from its `test_data` folder.
!git clone https://github.com/IBM/KPA_2021_shared_task

Cloning into 'KPA_2021_shared_task'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 44 (delta 14), reused 27 (delta 4), pack-reused 0[K
Receiving objects: 100% (44/44), 352.46 KiB | 2.94 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [None]:
# Test split of the KPA 2021 shared task, read from the repository cloned above.
# The directory is relative to the working directory — the same place `git clone`
# wrote to — so the notebook no longer assumes it runs from an absolute `/content` path.
KPA_TEST_DIR = 'KPA_2021_shared_task/test_data'

test_args = pd.read_csv(f'{KPA_TEST_DIR}/arguments_test.csv')   # read later via 'arg_id', 'argument'
test_kps = pd.read_csv(f'{KPA_TEST_DIR}/key_points_test.csv')   # read later via 'topic', 'key_point_id', 'key_point'
test_lbl = pd.read_csv(f'{KPA_TEST_DIR}/labels_test.csv')       # read later via 'arg_id', 'key_point_id', 'label'

test_topic_id = test_kps['topic'].unique()        # distinct topics, in file order
test_kp_id = test_lbl['key_point_id'].unique()    # distinct key point ids that appear in the labels

In [None]:
from collections import defaultdict

# topic -> {key_point_id: [arg_id, ...]} holding, for every key point of the topic,
# the ids of the arguments whose label for that key point is 1.
# Topic order follows `test_topic_id`; key point order follows `test_kps`.
args_for_kp_by_topic = defaultdict(dict)

# The label filter does not depend on the topic or key point, so apply it once.
positive_lbl = test_lbl.loc[test_lbl['label'] == 1]

for topic in test_topic_id:
  topic_kp_ids = test_kps.loc[test_kps['topic'] == topic, 'key_point_id'].unique()
  args_for_kp = defaultdict(list)
  # `kp_id` rather than `id`: a loop variable called `id` would keep shadowing the
  # builtin for every later cell of the kernel session.
  for kp_id in topic_kp_ids:
    args_for_kp[kp_id] = list(positive_lbl.loc[positive_lbl['key_point_id'] == kp_id, 'arg_id'])

  args_for_kp_by_topic[topic] = args_for_kp

In [None]:
# topic -> {key point text: [argument text, ...]}: the id-based mapping in
# `args_for_kp_by_topic` translated into the actual sentences.
args_kp_by_topic = defaultdict(dict)

# Build id -> text lookups once instead of scanning the whole frame for every id.
# drop_duplicates keeps the first row per id, i.e. the row `.values[0]` would pick.
argument_by_id = test_args.drop_duplicates('arg_id').set_index('arg_id')['argument'].to_dict()
key_point_by_id = test_kps.drop_duplicates('key_point_id').set_index('key_point_id')['key_point'].to_dict()

for topic, kp_to_arg_ids in args_for_kp_by_topic.items():
  args_kp = defaultdict(list)
  for kp_id, arg_ids in kp_to_arg_ids.items():
    # An id missing from the CSVs raises KeyError naming that id.
    args_kp[key_point_by_id[kp_id]] = [argument_by_id[arg_id] for arg_id in arg_ids]

  args_kp_by_topic[topic] = args_kp

In [None]:
# One list of key point texts per topic, in the iteration order of `args_kp_by_topic`.
key_points_by_topic = [list(kp_to_args.keys()) for kp_to_args in args_kp_by_topic.values()]

In [None]:
# NOTE(review): three values (9, 10, 14) that no cell visible in this notebook reads.
# Presumably one limit per test topic, each written as a sum of two counts — the meaning
# of the summands is not documented here; confirm or remove.
argkp_limits = [4+5, 5+5, 7+7]

## Sample Output

In [None]:
# Candidate sentences to score: one inner list per topic. get_pairs combines output[i]
# with key_points_by_topic[i], so the inner lists must follow the same topic order.
output = [['People all around the world vaccinate their children to protect them from any life threatening disease.'],
 ['social platforms must be regulated by governments to avoid hate crimes as well as political disinformation'],
 [" The poorest in society don't have access to either good health care or an adequate benefits system."]]



## Get Coverage

In [None]:
def get_pairs(output, key_points=None):
  """Pair every candidate sentence of a topic with every key point of that topic.

  Args:
    output: sequence with one list of candidate sentences per topic.
    key_points: sequence with one list of key points per topic, aligned with
      `output` by index. Defaults to the module-level `key_points_by_topic`,
      which is what the single-argument call has always used.

  Returns:
    A list with one entry per topic in `output`; each entry is a list of
    `[sentence, key_point]` pairs, grouped by sentence in input order.

  Raises:
    IndexError: if `output` has more topics than `key_points`.
  """
  if key_points is None:
    key_points = key_points_by_topic

  if len(output) > len(key_points):
    raise IndexError(
      "output has %d topics but only %d key point lists are available"
      % (len(output), len(key_points)))

  return [
    [[sentence, key_point] for sentence in sentences for key_point in key_points[topic_idx]]
    for topic_idx, sentences in enumerate(output)
  ]

In [None]:
# Score each topic: build its sentence/key-point pairs, run the coverage model over
# them and turn the softmax scores into a coverage value for that topic.
method_pairs = get_pairs(output)

topics = list(args_kp_by_topic.keys())
coverage_scores_by_topic = []
for i, topic in enumerate(topics):
  topic_pairs = method_pairs[i]
  test_ds = get_test_ds(topic_pairs)
  test_loader = DataLoader(test_ds, shuffle=False, batch_size=32)
  predictions, soft_predictions = get_predictions(test_loader, cov_model)
  topic_coverage = get_coverage_softmax_one_max(soft_predictions, args_kp_by_topic[topic], topic_pairs)
  coverage_scores_by_topic.append(topic_coverage)

# Per-topic coverage first, then the average over topics.
print(coverage_scores_by_topic)
print(mean(coverage_scores_by_topic))