<a href="https://colab.research.google.com/github/amirsartipi13/NSURL/blob/main/NSURL_Build_KG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
import re
import pandas as pd
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

In [34]:
train_path = './drive/MyDrive/data_sets/PERLEX/train.txt'
test_path = './drive/MyDrive/data_sets/PERLEX/test.txt'

# train_path = './drive/MyDrive/dataset/perlex/train.txt'
# test_path = './drive/MyDrive/dataset/perlex/test.txt'

In [35]:
def get_e1(text):
  text = re.sub('<e2>', '', text)
  text = re.sub('</e2>', '', text)
  e = re.findall("<e1>(.*?)</e1>", text)
  pre_process = set()
  for ent in e:
    if ent != ' ' and len(ent.strip())>1:
      pre_process.add(ent.strip())
  return pre_process

In [36]:
def get_e2(text):
  text = re.sub('<e1>', '', text)
  text = re.sub('</e1>', '', text)
  e = re.findall("<e2>(.*?)</e2>", text)
  pre_process = set()
  for ent in e:
    if ent != ' ' and len(ent.strip())>1:
      pre_process.add(ent.strip())
  return pre_process

In [37]:
SEMEVAL_RELATION_LABELS = ['Other', 'Message-Topic(e1,e2)', 'Message-Topic(e2,e1)',
                   'Product-Producer(e1,e2)', 'Product-Producer(e2,e1)',
                   'Instrument-Agency(e1,e2)', 'Instrument-Agency(e2,e1)',
                   'Entity-Destination(e1,e2)', 'Entity-Destination(e2,e1)',
                   'Cause-Effect(e1,e2)', 'Cause-Effect(e2,e1)',
                   'Component-Whole(e1,e2)', 'Component-Whole(e2,e1)',
                   'Entity-Origin(e1,e2)', 'Entity-Origin(e2,e1)',
                   'Member-Collection(e1,e2)', 'Member-Collection(e2,e1)',
                   'Content-Container(e1,e2)', 'Content-Container(e2,e1)']

In [38]:
indx2label = dict(enumerate(SEMEVAL_RELATION_LABELS))
indx2label

{0: 'Other',
 1: 'Message-Topic(e1,e2)',
 2: 'Message-Topic(e2,e1)',
 3: 'Product-Producer(e1,e2)',
 4: 'Product-Producer(e2,e1)',
 5: 'Instrument-Agency(e1,e2)',
 6: 'Instrument-Agency(e2,e1)',
 7: 'Entity-Destination(e1,e2)',
 8: 'Entity-Destination(e2,e1)',
 9: 'Cause-Effect(e1,e2)',
 10: 'Cause-Effect(e2,e1)',
 11: 'Component-Whole(e1,e2)',
 12: 'Component-Whole(e2,e1)',
 13: 'Entity-Origin(e1,e2)',
 14: 'Entity-Origin(e2,e1)',
 15: 'Member-Collection(e1,e2)',
 16: 'Member-Collection(e2,e1)',
 17: 'Content-Container(e1,e2)',
 18: 'Content-Container(e2,e1)'}

In [39]:
label2index = {v:k for k,v in indx2label.items()}
label2index

{'Cause-Effect(e1,e2)': 9,
 'Cause-Effect(e2,e1)': 10,
 'Component-Whole(e1,e2)': 11,
 'Component-Whole(e2,e1)': 12,
 'Content-Container(e1,e2)': 17,
 'Content-Container(e2,e1)': 18,
 'Entity-Destination(e1,e2)': 7,
 'Entity-Destination(e2,e1)': 8,
 'Entity-Origin(e1,e2)': 13,
 'Entity-Origin(e2,e1)': 14,
 'Instrument-Agency(e1,e2)': 5,
 'Instrument-Agency(e2,e1)': 6,
 'Member-Collection(e1,e2)': 15,
 'Member-Collection(e2,e1)': 16,
 'Message-Topic(e1,e2)': 1,
 'Message-Topic(e2,e1)': 2,
 'Other': 0,
 'Product-Producer(e1,e2)': 3,
 'Product-Producer(e2,e1)': 4}

In [40]:
label2index['Other']

0

In [41]:
def label_spliter(label):
  # component-whole(e2,e1)
  # return [whole , component ]
  order = re.findall("\((.*?)\)", label)[0].split(',')
  label_names = label.split("(")[0].split("-")
  if order[0] == "e1":
    return label_names
  else:
    return list(reversed(label_names))

def make_dataframe_row(sentence,label):
  
  if label == "Other":
    labels =[label, label]
  else:
    labels = label_spliter(label=label)

  result = []
  e1s = get_e1(sentence)  
  e2s = get_e2(sentence)
  for e1 in e1s:
    for e2 in e2s:

      result.append( {
          "e1":e1,
          "e2":e2,
          "e1_label":labels[0],
          "e2_label":labels[1],
          "label":label,
          "nlabel": label2index[label],
          "sentence":sentence
      })

  return result
    
def make_dataframe(path):
    f = open(path, 'r')
    data = [x.rstrip() for x in f] 
    data_set_rows = []
    for i in range(0, len(data)-4, 4):
      item = data[i].split('\t')
      sentence = re.sub('[!@#$،.]', '', item[1])

      label = data[i+1]
      rows = make_dataframe_row(sentence , label)
      data_set_rows += rows

    return pd.DataFrame(data_set_rows)

In [42]:
df = make_dataframe(train_path)
df_test = make_dataframe(test_path)
df_test

Unnamed: 0,e1,e2,e1_label,e2_label,label,nlabel,sentence
0,حسابرسی‌ها,ضایعات,Message,Topic,"Message-Topic(e1,e2)",1,"""معمول ترین <e1>حسابرسی‌ها </e1>مربوط به <e2>ض..."
1,شرکت,صندلی‌های,Producer,Product,"Product-Producer(e2,e1)",4,"""این <e1>شرکت <e2></e1>صندلی‌های </e2>پلاستیکی..."
2,استاد,چوب,Agency,Instrument,"Instrument-Agency(e2,e1)",6,"""<e1>استاد </e1>مدرسه با یک <e2>چوب </e2>درس م..."
3,بدن,آب‌انبار,Entity,Destination,"Entity-Destination(e1,e2)",7,"""مظنون <e1>بدن </e1>مرده را به یک <e2>آب‌انبار..."
4,آنفولانزای,ویروس,Effect,Cause,"Cause-Effect(e2,e1)",10,"""<e1>آنفولانزای </e1>مرغی یک بیماری عفونی پرند..."
...,...,...,...,...,...,...,...
2745,بقایای,طوفان,Entity,Origin,"Entity-Origin(e1,e2)",13,"""هوا دیروز بادی و سرد بود و هنوز <e1>بقایای </..."
2746,پادشاه,جارو می کشد,Agency,Instrument,"Instrument-Agency(e2,e1)",6,"""پس از جاگذاری تمام بتها که خود ساعت‌ها طول م..."
2747,مصالح,صنایع,Product,Producer,"Product-Producer(e1,e2)",3,"""وزیر تولید کندِ <e1>مصالح </e1>توسط <e2>صنایع..."
2748,چتر,قاب,Whole,Component,"Component-Whole(e2,e1)",12,"""<e2>قاب <e1></e2>چتر </e1>دارای یك گیره متحرك..."


In [43]:
possible_labels = df.nlabel.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{0: 1,
 1: 7,
 2: 13,
 3: 14,
 4: 8,
 5: 17,
 6: 2,
 7: 5,
 8: 18,
 9: 11,
 10: 4,
 11: 12,
 12: 0,
 13: 10,
 14: 15,
 15: 3,
 16: 9,
 17: 6,
 18: 16}

# Bert Model

In [44]:
X_train = df.sentence.tolist()
X_val =  df_test.sentence.tolist()
y_train = df.nlabel.tolist()
y_val =  df_test.nlabel.tolist()

In [45]:
!pip install transformers



In [46]:
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [47]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")  
device

device(type='cuda')

In [48]:
model_name = 'bert-base-multilingual-cased'
# model_name = 'xlm-roberta-base'
# model_name = 'HooshvareLab/bert-fa-zwnj-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model  = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2index))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [49]:
# device = torch.device('cpu')

In [50]:
model.to(device)

RuntimeError: ignored

In [51]:
encoded_data_train = tokenizer.batch_encode_plus(
    X_train, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [52]:
encoded_data_val = tokenizer.batch_encode_plus(
    X_val, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)




In [53]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_val)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [54]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 16
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [55]:
from transformers import AdamW, get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
epochs = 5
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [56]:
from sklearn.metrics import f1_score
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    tt = 0
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print("Acc with percent:", len(y_preds[y_preds==label])/len(y_true))
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
        tt += len(y_preds[y_preds==label])/len(y_true)
    return tt


In [57]:
import random
import numpy as np

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }  
        
        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    # torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    

    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tt_accuracy = accuracy_per_class(predictions, true_vals)
    
    tqdm.write(f'accuracy: {tt_accuracy}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    training_stats.append(
        {
            'epoch': epoch,
            'Training Loss': loss_train_avg,
            'Valid. Loss': val_loss,
            'Valid. Accur.': tt_accuracy/3
        }
    )


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=507.0, style=ProgressStyle(description_widt…

RuntimeError: ignored

In [None]:
torch.save(model.state_dict(), f'/content/drive/MyDrive/finetuned_HooshvareLab_epoch_{epoch}.model')

In [None]:
import pandas as pd
pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats[]
df_stats = df_stats.set_index('epoch')
df_stats

# R-BERT


In [14]:
! git clone https://github.com/mickeystroller/R-BERT.git

fatal: destination path 'R-BERT' already exists and is not an empty directory.


In [20]:
! cp -a ./R-BERT/* ./


In [None]:
!pip install transformers

In [25]:
# import utils
import r_bert

In [26]:
! CUDA_VISIBLE_DEVICES=0 python r_bert.py --config config.ini


2021-08-05 08:44:30.776047: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
08/05/2021 08:44:32 - INFO - filelock -   Lock 139799525361232 acquired on /root/.cache/huggingface/transformers/1cf090f220f9674b67b3434decfe4d40a6532d7849653eac435ff94d31a4904c.1d03e5e4fa2db2532c517b2cd98290d8444b237619bd3d2039850a6d5e86473d.lock
Downloading: 100% 571/571 [00:00<00:00, 564kB/s]
08/05/2021 08:44:32 - INFO - filelock -   Lock 139799525361232 released on /root/.cache/huggingface/transformers/1cf090f220f9674b67b3434decfe4d40a6532d7849653eac435ff94d31a4904c.1d03e5e4fa2db2532c517b2cd98290d8444b237619bd3d2039850a6d5e86473d.lock
08/05/2021 08:44:33 - INFO - filelock -   Lock 139799525361360 acquired on /root/.cache/huggingface/transformers/e12f02d630da91a0982ce6db1ad595231d155a2b725ab106971898276d842ecc.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
Downloading: 100% 232k/232k [00:00<00:00, 1.71MB/s]
08/05/202