In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
from google.colab import drive

# Mount Google Drive so the model checkpoint and data files can be read from it.
mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [None]:
import joblib
import numpy as np
import pandas as pd

In [None]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [None]:
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load checkpoints that come from a trusted source.
# The path is relative, so it is resolved against the current working directory.
model_path = 'gdrive/My Drive/final_model_1.sav'
model = joblib.load(model_path)

In [None]:
# Load the tab-separated test release and preview its first rows.
test_tsv_path = 'gdrive/My Drive/test_set_final_release.tsv'
tdf = pd.read_csv(test_tsv_path, sep='\t')
tdf.head()

Unnamed: 0,id,EventSnippet
0,1,The US warplanes pounded the village of Kashka...
1,2,After a five-week drop of Covid-19 cases in Pu...
2,3,Officers brutally attacked and injured a peace...
3,4,"On December 13, ISIS fighters recaptured the v..."
4,5,Vice-President Mike Pence said Thursday that t...


In [None]:
# Lower-casing tokenizer for the uncased BERT base vocabulary.
# NOTE(review): presumably the same vocabulary the loaded classifier was
# fine-tuned with -- confirm against the training notebook.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name, do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Text to classify and the matching snippet ids, taken from the test frame.
sents, ids = tdf['EventSnippet'], tdf['id']
print(sents)

0       The US warplanes pounded the village of Kashka...
1       After a five-week drop of Covid-19 cases in Pu...
2       Officers brutally attacked and injured a peace...
3       On December 13, ISIS fighters recaptured the v...
4       Vice-President Mike Pence said Thursday that t...
                              ...                        
1018    The Chinese foreign minister has has urged his...
1019    People of different walks of life, including r...
1020    Heavily armed al-Shabab mujahideen fighters ha...
1021    Monitoring Desk. FARAH. The People's Peace Mov...
1022    Field sources affiliated to the militants back...
Name: EventSnippet, Length: 1023, dtype: object


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Tokenize every snippet to exactly 140 token ids.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation that was previously applied
# only as an implicit default (with a warning). The resulting tensors are the same.
encoded_data_val = tokenizer.batch_encode_plus(
    list(sents),  # plain list of strings rather than a pandas Series
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=140,
    return_tensors='pt'
)
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

# Each dataset item is an (input_ids, attention_mask) pair; there are no labels.
dataset_val = TensorDataset(input_ids_val, attention_masks_val)

# SequentialSampler keeps the batches in the same order as `sents`, so the
# position of a prediction identifies the snippet it belongs to.
batch_size = 1
valid_dataloader = DataLoader(dataset_val,
                              sampler=SequentialSampler(dataset_val),
                              batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Move the model to the GPU when one is present and fall back to the CPU
# otherwise; an unconditional `.cuda()` raises on a CPU-only runtime.
# `.to()` returns the module, so the cell still displays the architecture.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
import random
import numpy as np

seed_val = 17

# Pick the compute device and make sure the model lives on it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Seed every random number generator in use: Python, NumPy, torch CPU, torch CUDA.
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Run the classifier over the whole test set and collect one predicted class
# index per snippet, in dataloader order.
model.eval()

# These accumulators are never updated in this cell (there are no labels to
# score against here); they are only kept so the names stay defined.
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions, true_labels = [], []

for batch in valid_dataloader:
    # Each batch is an (input_ids, attention_mask) pair; move both to the model's device.
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # No gradients are needed for inference. No labels are passed, so the
    # output is read for its logits only.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits

    # Arg-max over the class dimension for every row of the batch. Extending
    # with the whole batch (instead of reading row 0 only) keeps the result
    # correct for any batch_size, not just 1.
    predictions.extend(logits.argmax(dim=-1).tolist())

print(predictions)

[1, 18, 2, 14, 18, 1, 8, 0, 14, 14, 8, 11, 7, 0, 18, 15, 10, 8, 0, 8, 10, 5, 7, 4, 3, 15, 7, 18, 7, 10, 4, 5, 8, 4, 3, 15, 18, 18, 17, 15, 17, 18, 3, 4, 9, 12, 12, 4, 12, 12, 8, 4, 8, 6, 10, 7, 8, 12, 11, 14, 5, 12, 17, 18, 4, 12, 18, 12, 11, 11, 14, 0, 7, 8, 2, 7, 12, 18, 18, 11, 4, 12, 5, 2, 2, 18, 4, 6, 8, 7, 7, 10, 0, 1, 15, 7, 14, 14, 0, 10, 18, 15, 15, 0, 3, 7, 4, 5, 2, 0, 7, 8, 18, 7, 11, 8, 3, 2, 8, 10, 0, 0, 14, 10, 15, 4, 4, 3, 0, 8, 6, 0, 7, 10, 2, 11, 4, 14, 15, 11, 14, 14, 4, 1, 2, 7, 4, 7, 0, 12, 10, 2, 4, 11, 18, 0, 4, 18, 3, 4, 5, 11, 12, 18, 18, 4, 4, 4, 12, 18, 0, 5, 7, 3, 1, 11, 3, 11, 0, 1, 18, 11, 4, 11, 18, 14, 10, 18, 18, 11, 14, 14, 8, 11, 18, 7, 9, 18, 8, 11, 12, 4, 12, 17, 14, 1, 8, 12, 8, 5, 14, 18, 18, 0, 10, 14, 0, 17, 3, 12, 8, 7, 12, 12, 0, 17, 6, 15, 12, 4, 9, 17, 18, 7, 12, 7, 12, 18, 3, 9, 3, 2, 7, 12, 10, 7, 18, 4, 0, 4, 7, 7, 12, 10, 12, 5, 3, 4, 3, 18, 4, 18, 4, 10, 0, 4, 4, 10, 10, 10, 5, 0, 7, 0, 0, 0, 0, 4, 4, 4, 4, 0, 11, 5, 14, 7, 8, 0, 18, 18,

In [None]:
# Number of predictions collected by the inference loop.
print(len(predictions))

1023


In [None]:
# Event dataset whose `sub_event_type` column supplies the label vocabulary.
events_csv_path = 'gdrive/My Drive/2018-09-17-2021-09-27-Middle_East-South_Asia.csv'
df = pd.read_csv(events_csv_path)

In [None]:
# Give every distinct sub_event_type an integer id, numbered in the order in
# which the values are returned by `unique()`.
# NOTE(review): these ids decode the model's outputs correctly only if the model
# was trained with this same mapping -- confirm against the training notebook.
possible_labels = df.sub_event_type.unique()
label_dict = {label: idx for idx, label in enumerate(possible_labels)}
label_dict

{'Abduction/forced disappearance': 11,
 'Agreement': 16,
 'Air/drone strike': 1,
 'Armed clash': 4,
 'Arrests': 9,
 'Attack': 7,
 'Change to group/activity': 18,
 'Chemical weapon': 24,
 'Disrupted weapons use': 8,
 'Excessive force against protesters': 13,
 'Government regains territory': 14,
 'Grenade': 15,
 'Headquarters or base established': 21,
 'Looting/property destruction': 6,
 'Mob violence': 5,
 'Non-state actor overtakes territory': 23,
 'Non-violent transfer of territory': 19,
 'Other': 17,
 'Peaceful protest': 0,
 'Protest with intervention': 10,
 'Remote explosive/landmine/IED': 12,
 'Sexual violence': 22,
 'Shelling/artillery/missile attack': 3,
 'Suicide bomb': 20,
 'Violent demonstration': 2}

In [None]:
# Parallel lists of label names and their integer ids (same order as label_dict).
key_list = [*label_dict]
val_list = [*label_dict.values()]

In [None]:
# Translate each predicted class index back into its label name by locating the
# index in val_list and reading the name at the same position in key_list.
pred_labels = [key_list[val_list.index(class_idx)] for class_idx in predictions]

In [None]:
# Show the decoded label name for every test snippet.
print(pred_labels)

['Air/drone strike', 'Change to group/activity', 'Violent demonstration', 'Government regains territory', 'Change to group/activity', 'Air/drone strike', 'Disrupted weapons use', 'Peaceful protest', 'Government regains territory', 'Government regains territory', 'Disrupted weapons use', 'Abduction/forced disappearance', 'Attack', 'Peaceful protest', 'Change to group/activity', 'Grenade', 'Protest with intervention', 'Disrupted weapons use', 'Peaceful protest', 'Disrupted weapons use', 'Protest with intervention', 'Mob violence', 'Attack', 'Armed clash', 'Shelling/artillery/missile attack', 'Grenade', 'Attack', 'Change to group/activity', 'Attack', 'Protest with intervention', 'Armed clash', 'Mob violence', 'Disrupted weapons use', 'Armed clash', 'Shelling/artillery/missile attack', 'Grenade', 'Change to group/activity', 'Change to group/activity', 'Other', 'Grenade', 'Other', 'Change to group/activity', 'Shelling/artillery/missile attack', 'Armed clash', 'Arrests', 'Remote explosive/la

In [None]:
# Load the labelled version of the test release (adds a SubType column) and preview it.
labelled_tsv_path = 'gdrive/My Drive/test_set_final_release_with_labels.tsv'
labeldf = pd.read_csv(labelled_tsv_path, sep='\t')
labeldf.head()



Unnamed: 0,id,EventSnippet,SubType
0,1,The US warplanes pounded the village of Kashka...,AIR_STRIKE
1,2,After a five-week drop of Covid-19 cases in Pu...,NATURAL_DISASTER
2,3,Officers brutally attacked and injured a peace...,FORCE_AGAINST_PROTEST
3,4,"On December 13, ISIS fighters recaptured the v...",NON_STATE_ACTOR_OVERTAKES_TER
4,5,Vice-President Mike Pence said Thursday that t...,AGREEMENT


In [None]:
# Columns of the labelled release. `given_ids` is converted to a plain Python
# list of ids so it can be indexed and searched directly.
correct_labels_inwords = labeldf['SubType']
given_sents = labeldf['EventSnippet']
given_ids = labeldf['id'].tolist()

# Report every id in 1..1023 that has no row in the labelled file.
print("ids of missing sentences are:\n")
for missing_id in sorted(set(range(1, 1024)) - set(given_ids)):
    print(missing_id)
  
    

ids of missing sentences are:

142
205
737
957


In [None]:
# Distinct SubType codes that occur in the labelled test file.
possible_test_labels = labeldf.SubType.unique()

# Hand-written mapping from SubType code to integer class id.
label_test_dict = {
    'AIR_STRIKE': 1,
    'NATURAL_DISASTER': 25,
    'FORCE_AGAINST_PROTEST': 13,
    'NON_STATE_ACTOR_OVERTAKES_TER': 23,
    'AGREEMENT': 16,
    'CHEM_WEAP': 24,
    'PEACE_PROTEST': 0,
    'GOV_REGAINS_TERIT': 14,
    'DISR_WEAP': 8,
    'PROPERTY_DISTRUCT': 6,
    'OTHER': 17,
    'CHANGE_TO_GROUP_ACT': 18,
    'GRENADE': 15,
    'VIOL_DEMONSTR': 2,
    'MAN_MADE_DISASTER': 26,
    'ATTRIB': 27,
    'MOB_VIOL': 5,
    'ATTACK': 7,
    'ARMED_CLASH': 4,
    'ART_MISS_ATTACK': 3,
    'NON_VIOL_TERRIT_TRANSFER': 19,
    'PROTEST_WITH_INTER': 10,
    'DIPLO': 28,
    'SUIC_BOMB': 20,
    'ARREST': 9,
    'REM_EXPLOS': 12,
    'SEX_VIOL': 22,
    'ORG_CRIME': 29,
    'ABDUCT_DISSAP': 11,
    'HQ_ESTABLISHED': 21,
}
label_test_dict


{'ABDUCT_DISSAP': 11,
 'AGREEMENT': 16,
 'AIR_STRIKE': 1,
 'ARMED_CLASH': 4,
 'ARREST': 9,
 'ART_MISS_ATTACK': 3,
 'ATTACK': 7,
 'ATTRIB': 27,
 'CHANGE_TO_GROUP_ACT': 18,
 'CHEM_WEAP': 24,
 'DIPLO': 28,
 'DISR_WEAP': 8,
 'FORCE_AGAINST_PROTEST': 13,
 'GOV_REGAINS_TERIT': 14,
 'GRENADE': 15,
 'HQ_ESTABLISHED': 21,
 'MAN_MADE_DISASTER': 26,
 'MOB_VIOL': 5,
 'NATURAL_DISASTER': 25,
 'NON_STATE_ACTOR_OVERTAKES_TER': 23,
 'NON_VIOL_TERRIT_TRANSFER': 19,
 'ORG_CRIME': 29,
 'OTHER': 17,
 'PEACE_PROTEST': 0,
 'PROPERTY_DISTRUCT': 6,
 'PROTEST_WITH_INTER': 10,
 'REM_EXPLOS': 12,
 'SEX_VIOL': 22,
 'SUIC_BOMB': 20,
 'VIOL_DEMONSTR': 2}

In [None]:
# Parallel lists of SubType codes and their class ids (same order as label_test_dict).
key_list_new = [*label_test_dict]
val_list_new = [*label_test_dict.values()]

In [None]:
# Gold class id per snippet, stored at the position given by the snippet id
# (slot 0 is unused because the loop below only writes at ids from given_ids).
# NOTE: slots for ids that are absent from given_ids keep the initial 0, and 0
# is also the class id of 'PEACE_PROTEST' in label_test_dict, so consumers must
# consult given_ids to tell "no label" apart from that class.
correct_list = [0] * 1024
for sample_id, subtype in zip(given_ids, correct_labels_inwords):
    correct_list[sample_id] = val_list_new[key_list_new.index(subtype)]
print(correct_list)



[0, 1, 25, 13, 23, 16, 24, 24, 0, 14, 14, 8, 6, 17, 0, 18, 15, 2, 26, 27, 8, 13, 5, 7, 4, 3, 15, 7, 19, 5, 10, 8, 5, 8, 4, 3, 15, 18, 16, 28, 15, 28, 18, 20, 16, 9, 27, 12, 8, 20, 8, 8, 4, 19, 26, 13, 22, 24, 26, 29, 14, 29, 26, 24, 28, 4, 12, 19, 26, 27, 9, 14, 22, 24, 8, 9, 24, 26, 27, 28, 11, 4, 25, 6, 25, 2, 28, 4, 26, 8, 12, 22, 2, 0, 24, 15, 22, 14, 23, 0, 2, 21, 15, 15, 0, 25, 7, 4, 2, 2, 0, 24, 8, 25, 22, 11, 24, 1, 2, 8, 5, 0, 0, 4, 2, 15, 16, 29, 3, 0, 8, 25, 0, 7, 10, 2, 9, 3, 11, 15, 1, 14, 0, 14, 14, 13, 26, 16, 2, 5, 8, 13, 2, 3, 11, 18, 0, 3, 21, 3, 27, 26, 11, 26, 18, 16, 4, 3, 9, 12, 18, 0, 5, 5, 3, 1, 29, 3, 8, 0, 20, 16, 28, 4, 11, 16, 16, 9, 18, 21, 11, 14, 14, 8, 29, 18, 7, 29, 28, 8, 11, 26, 3, 12, 17, 0, 1, 8, 12, 8, 4, 23, 28, 21, 0, 0, 14, 16, 28, 25, 12, 8, 7, 12, 12, 2, 28, 25, 15, 20, 4, 9, 15, 17, 24, 26, 20, 25, 23, 25, 29, 26, 6, 20, 26, 13, 15, 18, 4, 28, 8, 20, 20, 12, 2, 12, 22, 24, 4, 25, 28, 15, 28, 17, 2, 2, 23, 4, 9, 13, 13, 22, 28, 7, 0, 28, 2, 0,

In [None]:
# Per-class tallies, one slot for each of the 30 class ids:
# correct_class counts hits, total_in_class counts gold samples.
correct_class = [0 for _ in range(30)]
total_in_class = [0 for _ in range(30)]

In [None]:
# Score the predictions against the gold labels.
#
# Only ids that actually have a gold label are scored. Ids missing from the
# labelled file keep the placeholder 0 in correct_list, which is also the class
# id of 'PEACE_PROTEST'; scoring them would inflate that class's total and
# could count spurious hits.
labeled_ids = set(given_ids)

# Reset the tallies here so that re-running this cell does not double count.
num_classes = max(label_test_dict.values()) + 1
correct_class = [0] * num_classes
total_in_class = [0] * num_classes

score = 0
for sample_id in range(1, len(correct_list)):
    if sample_id not in labeled_ids:
        continue
    # assumes the prediction at position sample_id - 1 belongs to snippet
    # `sample_id`, i.e. the test file is ordered by id starting at 1 -- TODO confirm
    predicted = predictions[sample_id - 1]
    expected = correct_list[sample_id]
    total_in_class[expected] += 1
    if predicted == expected:
        score += 1
        correct_class[expected] += 1
print("Total correctly predicted labels =", score)


Total correctly predicted labels = 446


In [None]:
# Per-class report: hits / gold samples = ratio, one line per class id.
print("Accuracy in each class:\n")

# One-off reverse lookup (class id -> SubType code) instead of scanning
# val_list_new for every class.
class_names = dict(zip(val_list_new, key_list_new))

for class_idx, (n_correct, n_total) in enumerate(zip(correct_class, total_in_class)):
    # A class with no gold samples has no defined accuracy; report NaN rather
    # than raising ZeroDivisionError and aborting the report.
    ratio = n_correct / n_total if n_total else float("nan")
    print(class_names[class_idx], ":", n_correct, "/", n_total, "=", ratio)

Accuracy in each class:

PEACE_PROTEST : 53 / 61 = 0.8688524590163934
AIR_STRIKE : 31 / 36 = 0.8611111111111112
VIOL_DEMONSTR : 26 / 53 = 0.49056603773584906
ART_MISS_ATTACK : 27 / 36 = 0.75
ARMED_CLASH : 60 / 66 = 0.9090909090909091
MOB_VIOL : 8 / 17 = 0.47058823529411764
PROPERTY_DISTRUCT : 4 / 21 = 0.19047619047619047
ATTACK : 21 / 27 = 0.7777777777777778
DISR_WEAP : 45 / 58 = 0.7758620689655172
ARREST : 12 / 34 = 0.35294117647058826
PROTEST_WITH_INTER : 19 / 22 = 0.8636363636363636
ABDUCT_DISSAP : 16 / 20 = 0.8
REM_EXPLOS : 35 / 36 = 0.9722222222222222
FORCE_AGAINST_PROTEST : 0 / 23 = 0.0
GOV_REGAINS_TERIT : 28 / 38 = 0.7368421052631579
GRENADE : 32 / 48 = 0.6666666666666666
AGREEMENT : 0 / 31 = 0.0
OTHER : 2 / 8 = 0.25
CHANGE_TO_GROUP_ACT : 27 / 30 = 0.9
NON_VIOL_TERRIT_TRANSFER : 0 / 21 = 0.0
SUIC_BOMB : 0 / 41 = 0.0
HQ_ESTABLISHED : 0 / 22 = 0.0
SEX_VIOL : 0 / 23 = 0.0
NON_STATE_ACTOR_OVERTAKES_TER : 0 / 24 = 0.0
CHEM_WEAP : 0 / 37 = 0.0
NATURAL_DISASTER : 0 / 37 = 0.0
MAN_MADE_