### **The Following Code Snippets are a documentation of the Github Repository "https://github.com/BNLNLP/PPI-Relation-Extraction".**

### **The code was copied and pasted here so that I can add my interpretation of it. The model was actually debugged on a local machine, using Python 3.11, in Visual Studio Code.**

### class **transformers.PreTrainedTokenizer**

Parameters:


* **model_max_length** (int, optional) — The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is loaded with from_pretrained(), this will be set to the value stored for the associated model in max_model_input_sizes (see above). If no value is provided, will default to VERY_LARGE_INTEGER (int(1e30)).
* **padding_side** (str, optional) — The side on which the model should have padding applied. Should be selected between [‘right’, ‘left’]. Default value is picked from the class attribute of the same name.
* **truncation_side** (str, optional) — The side on which the model should have truncation applied. Should be selected between [‘right’, ‘left’]. Default value is picked from the class attribute of the same name.
* **chat_template** (str, optional) — A Jinja template string that will be used to format lists of chat messages. See https://huggingface.co/docs/transformers/chat_templating for a full description.
* **model_input_names** (List[string], optional) — The list of inputs accepted by the forward pass of the model (like "token_type_ids" or "attention_mask"). Default value is picked from the class attribute of the same name.
* **bos_token** (str or tokenizers.AddedToken, optional) — A special token representing the beginning of a sentence. Will be associated to self.bos_token and self.bos_token_id.
* **eos_token** (str or tokenizers.AddedToken, optional) — A special token representing the end of a sentence. Will be associated to self.eos_token and self.eos_token_id.
* **unk_token** (str or tokenizers.AddedToken, optional) — A special token representing an out-of-vocabulary token. Will be associated to self.unk_token and self.unk_token_id.
* **sep_token** (str or tokenizers.AddedToken, optional) — A special token separating two different sentences in the same input (used by BERT for instance). Will be associated to self.sep_token and self.sep_token_id.
* **pad_token** (str or tokenizers.AddedToken, optional) — A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by attention mechanisms or loss computation. Will be associated to self.pad_token and self.pad_token_id.
* **cls_token** (str or tokenizers.AddedToken, optional) — A special token representing the class of the input (used by BERT for instance). Will be associated to self.cls_token and self.cls_token_id.
* **mask_token** (str or tokenizers.AddedToken, optional) — A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). Will be associated to self.mask_token and self.mask_token_id.
* **additional_special_tokens** (tuple or list of str or tokenizers.AddedToken, optional) — A tuple or a list of additional special tokens. Add them here to ensure they are skipped when decoding with skip_special_tokens is set to True. If they are not part of the vocabulary, they will be added at the end of the vocabulary.
* **clean_up_tokenization_spaces** (bool, optional, defaults to True) — Whether or not the model should cleanup the spaces that were added when splitting the input text during the tokenization process.
* **split_special_tokens** (bool, optional, defaults to False) — Whether or not the special tokens should be split during the tokenization process. The default behavior is to not split special tokens. This means that if <s> is the bos_token, then tokenizer.tokenize("<s>") = ['<s>']. Otherwise, if split_special_tokens=True, then tokenizer.tokenize("<s>") will give ['<', 's', '>']. This argument is only supported for slow tokenizers for the moment.




In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from transformers import PreTrainedTokenizerBase

@dataclass
class DataCollatorForRelationClassification:
  """Collate relation-classification features into a batch of int64 tensors.

  The tokenizer pads the model inputs. This collator additionally pads the
  per-example ``label``/``labels``, ``relations``, ``tokens_seq`` and
  ``tokens_to_ignore`` entries, because their lengths can differ between
  examples and ragged lists cannot be converted to tensors.
  """

  # Tokenizer providing ``pad`` and ``padding_side``. The annotation is quoted
  # so that it is never evaluated at class-creation time.
  tokenizer: "PreTrainedTokenizerBase"
  # Forwarded unchanged to ``tokenizer.pad``.
  padding: Union[bool, str] = True
  # Forwarded unchanged to ``tokenizer.pad``; ``None`` means "no explicit limit".
  max_length: Optional[int] = None
  # Forwarded unchanged to ``tokenizer.pad``.
  pad_to_multiple_of: Optional[int] = None
  # Value used to fill every sequence padded by this collator.
  label_pad_token_id: int = -100
  # Tensor type requested from the tokenizer when there are no labels to pad.
  return_tensors: str = 'pt'

  def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Pad ``features`` and return the batch.

    Args:
      features: One dict per example. The label is read from the ``"label"``
        key when the first example has it, otherwise from ``"labels"``.

    Returns:
      When the examples carry no labels, whatever ``tokenizer.pad`` returns.
      Otherwise a dict mapping every batch key to a ``torch.int64`` tensor.
    """
    import torch

    label_name = "label" if "label" in features[0] else "labels"
    labels = [feature[label_name] for feature in features] if label_name in features[0] else None

    batch = self.tokenizer.pad(
        features,
        padding=self.padding,
        max_length=self.max_length,
        pad_to_multiple_of=self.pad_to_multiple_of,
        # Conversion to tensors would fail while the labels still have
        # different lengths, so tensors are only requested when there is
        # nothing left for this collator to pad.
        return_tensors=self.return_tensors if labels is None else None,
    )

    if labels is None:
      return batch

    pad_id = self.label_pad_token_id
    pad_left = self.tokenizer.padding_side == 'left'

    def pad_sequences(sequences):
      """Pad flat sequences to their common maximum length on the tokenizer's side."""
      width = max(map(len, sequences), default=0)
      padded = []
      for sequence in sequences:
        filler = [pad_id] * (width - len(sequence))
        padded.append(filler + list(sequence) if pad_left else list(sequence) + filler)
      return padded

    batch[label_name] = pad_sequences(labels)

    # [GP] - padding 'relations' for relation classification.
    # Each example holds a list of span lists (entity 1 spans, entity 2 spans);
    # every span list is extended with [pad, pad] pairs to the longest one.
    # Padding pairs are always appended, whatever the tokenizer's padding side.
    # NOTE(review): the span indices themselves are not shifted for left
    # padding - confirm before using a left-padding tokenizer.
    # TODO: examples with a different *number* of relations are still ragged.
    if "relations" in batch:
      span_width = max(
          (len(spans) for relation in batch["relations"] for spans in relation),
          default=0,
      )
      batch["relations"] = [
          [list(spans) + [[pad_id, pad_id]] * (span_width - len(spans)) for spans in relation]
          for relation in batch["relations"]
      ]

    # [GP] - padding 'tokens_seq' / 'tokens_to_ignore' for relation classification.
    for key in ("tokens_seq", "tokens_to_ignore"):
      if key in batch:
        batch[key] = pad_sequences(batch[key])

    return {key: torch.tensor(value, dtype=torch.int64) for key, value in batch.items()}

In [None]:
import os
import sys
import re
import pickle
import json
import pandas as pd
import numpy as np
from datasets import ClassLabel, load_dataset, load_metric, Dataset, DatasetDict, concatenate_datasets
from transformers import BertTokenizerFast, RobertaTokenizerFast
import logging

# ``level`` must be a logging level such as ``logging.INFO``; ``logging.info``
# is the module-level logging *function* and is rejected as a level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def read_dataset(dataset_num=0, task_name=None, data_args=None):
  """Load the train/test (and, if present, validation) splits of a dataset.

  When ``data_args`` names both a train file and a test file, those are used.
  Otherwise the files ``train_<dataset_num>.json`` and ``test_<dataset_num>.json``
  under ``<dataset_dir>/<dataset_name>`` are used, plus ``dev_<dataset_num>.json``
  as the validation split when that file exists. ``task_name`` is not used.

  Returns:
    The result of ``load_dataset`` called with the train file's extension as
    the loader name.
  """
  if data_args.train_file is not None and data_args.test_file is not None:
    data_files = {"train": data_args.train_file, "test": data_args.test_file}
  else:
    data_dir = os.path.join(data_args.dataset_dir, data_args.dataset_name)

    def split_path(split):
      # e.g. <data_dir>/train_0.json
      return os.path.join(data_dir, f'{split}_{dataset_num}.json')

    data_files = {"train": split_path('train'), "test": split_path('test')}
    dev_path = split_path('dev')
    if os.path.isfile(dev_path):
      data_files["validation"] = dev_path

  # The text after the last dot of the train file selects the loader.
  extension = data_files["train"].rsplit(".", 1)[-1]

  return load_dataset(extension, data_files=data_files)


def _char_to_token_idx(tokenized_inputs, batch_idx, text_length, char_idx):
  """Return the index of the token covering ``char_idx`` in example ``batch_idx``.

  Characters without a token (``char_to_token`` returns ``None``, e.g. for
  whitespace) are skipped forward. An offset at or past the end of the text
  maps to the last token of the example.

  ref: https://www.lighttag.io/blog/sequence-labeling-with-transformers/example
  ref: https://github.com/huggingface/transformers/issues/9326
  """
  while char_idx < text_length:
    token_idx = tokenized_inputs.char_to_token(batch_or_char_index=batch_idx, char_index=char_idx)
    if token_idx is not None:
      return token_idx
    char_idx += 1
  return len(tokenized_inputs[batch_idx]) - 1


def _entity_token_spans(tokenized_inputs, batch_idx, text_length, char_spans):
  """Convert character spans of one entity into ``(start_token, end_token)`` tuples."""
  # A flat [start, end] pair describes a single mention; wrap it so that
  # single and multi-span entities are handled the same way.
  if np.asarray(char_spans).ndim == 1:
    char_spans = [char_spans]
  return [
      (
          _char_to_token_idx(tokenized_inputs, batch_idx, text_length, span_start),
          _char_to_token_idx(tokenized_inputs, batch_idx, text_length, span_end),
      )
      for span_start, span_end in char_spans
  ]


def tokenize_and_set_relation_labels(examples, tokenizer, padding, max_seq_length, relation_representation, use_context):
  """Tokenize a batch of examples and attach relation labels and entity spans.

  Args:
    examples: Batch dict. Must contain ``'relation'`` (one list of relation
      dicts per example) and either raw text (``'text'``) or pre-split words
      (``'tokens'``). When ``relation_representation`` starts with ``'EM'``
      the entity-marker variants ``'text_with_entity_marker'`` /
      ``'tokens_with_marker'`` are tokenized instead.
    tokenizer: Callable tokenizer; its result must support ``char_to_token``
      and, for attention-based context, ``tokens``.
    padding: Forwarded to the tokenizer.
    max_seq_length: Forwarded to the tokenizer as ``max_length``.
    relation_representation: Name of the relation representation.
    use_context: ``'attn_based'`` additionally produces ``'tokens_seq'`` and
      ``'tokens_to_ignore'``.

  Returns:
    The tokenizer output extended with ``'labels'`` and ``'relations'`` (per
    example: entity-1 spans and entity-2 spans of every relation, as token
    index pairs), plus ``'tokens_seq'`` / ``'tokens_to_ignore'`` for
    attention-based context.

  Raises:
    ValueError: If the batch has neither ``'text'`` nor ``'tokens'``, or if an
      entity-marker representation is used with multi-span entities.
    NotImplementedError: If relations must be extracted from pre-tokenized
      (``'tokens'``) input.
  """
  uses_entity_markers = relation_representation.startswith('EM')
  has_text = 'text' in examples

  if has_text:
    token_key = 'text_with_entity_marker' if uses_entity_markers else 'text'
    tokenized_inputs = tokenizer(
        examples[token_key],
        padding=padding,
        truncation=True,
        max_length=max_seq_length,
    )
  elif 'tokens' in examples:
    token_key = 'tokens_with_marker' if uses_entity_markers else 'tokens'
    tokenized_inputs = tokenizer(
        examples[token_key],
        padding=padding,
        truncation=True,
        max_length=max_seq_length,
        # We use this argument because the texts in our dataset are lists of words.
        is_split_into_words=True,
    )
  else:
    raise ValueError("There is no tokens element in the data!")

  attn_based = use_context == 'attn_based'
  if attn_based:
    # Built once per batch instead of once per token.
    base_special_tokens = set(tokenizer.all_special_tokens) - set(tokenizer.additional_special_tokens)
    alphanumeric = re.compile('[a-zA-Z0-9]')

  labels = []
  relations = []
  tokens_seq = []
  tokens_to_ignore = []

  # Most data has a single relation per example, but some data such as SciERC
  # has multiple relations in a sentence.
  for i, rel_list in enumerate(examples['relation']):
    label_ids = []
    relation_spans = []

    for rel in rel_list:
      if not has_text:
        raise NotImplementedError(
            "Span extraction for pre-tokenized ('tokens') input is not implemented."
        )

      if uses_entity_markers:
        e1_idx = rel['entity_1_idx_in_text_with_entity_marker']
        e2_idx = rel['entity_2_idx_in_text_with_entity_marker']

        ## TODO: remove this! The first token is used for separate tokens.
        if np.asarray(e1_idx).ndim > 1 or np.asarray(e2_idx).ndim > 1:
          raise ValueError("For now, entity marker representations do not support separate entities.")
      else:
        e1_idx = rel['entity_1_idx']
        e2_idx = rel['entity_2_idx']

      text_length = len(examples[token_key][i])
      e1_span_idx_list = _entity_token_spans(tokenized_inputs, i, text_length, e1_idx)
      e2_span_idx_list = _entity_token_spans(tokenized_inputs, i, text_length, e2_idx)

      label_ids.append(rel['relation_id'])
      relation_spans.extend([e1_span_idx_list, e2_span_idx_list])

    labels.append(label_ids)
    relations.append(relation_spans)

    if attn_based:
      input_tokens = tokenized_inputs.tokens(batch_index=i)

      # Token indices covered by any entity span (span ends are exclusive).
      entity_indices = {
          idx
          for spans in relation_spans
          for span_s, span_e in spans
          for idx in range(span_s, span_e)
      }

      # 1 marks a token starting with '##' (a continuation sub-word), 0 anything else.
      tokens_seq.append([1 if tok.startswith('##') else 0 for tok in input_tokens])
      # -100 marks tokens that must not be picked as context: tokens without
      # any alphanumeric character, the tokenizer's special tokens (additional
      # special tokens excluded) and tokens inside an entity span.
      tokens_to_ignore.append([
          -100 if (alphanumeric.search(tok) is None or tok in base_special_tokens or idx in entity_indices) else 0
          for idx, tok in enumerate(input_tokens)
      ])

  tokenized_inputs['labels'] = labels
  tokenized_inputs['relations'] = relations
  if attn_based:
    tokenized_inputs['tokens_seq'] = tokens_seq
    tokenized_inputs['tokens_to_ignore'] = tokens_to_ignore

  return tokenized_inputs


def featurize_data(dataset, tokenizer, padding, max_seq_length, relation_representation, use_context):
  """Tokenize every split of ``dataset`` and restrict it to the model columns.

  Args:
    dataset: Mapping from split name to a dataset object exposing ``map``,
      ``column_names`` and ``set_format``.
    tokenizer, padding, max_seq_length, relation_representation, use_context:
      Forwarded to ``tokenize_and_set_relation_labels``.

  Returns:
    Dict mapping each split name to its featurized dataset.
  """
  columns = ['input_ids', 'attention_mask', 'labels', 'token_type_ids', 'relations']
  if use_context == "attn_based":
    columns += ['tokens_seq', 'tokens_to_ignore']

  features = {}

  for phase, phase_dataset in dataset.items():
    features[phase] = phase_dataset.map(
        tokenize_and_set_relation_labels,
        fn_kwargs={
            'tokenizer': tokenizer,
            'padding': padding,
            'max_seq_length': max_seq_length,
            'relation_representation': relation_representation,
            'use_context': use_context,
        },
        batched=True,
        load_from_cache_file=False,
    )

    # Only expose columns that were actually produced: not every tokenizer
    # emits 'token_type_ids', and set_format rejects unknown columns.
    available = set(features[phase].column_names)
    features[phase].set_format(
        type=None,
        columns=[column for column in columns if column in available],
    )

  return features


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from transformers import (
    BertModel,
    BertPreTrainedModel,
    RobertaModel,
    RobertaPreTrainedModel,
)

from torch.nn import MSELoss, CrossEntropyLoss

from modeling_outputs import RelationClassifierOutput

class BertForRelationalClassification(BertPreTrainedModel):
  """BERT encoder with a linear head that classifies entity-pair relations.

  The vector given to the classifier is either the pooled [CLS] output or the
  concatenation of the two entity representations, optionally combined with a
  context vector (``use_context`` of ``'attn_based'`` or ``'local'``).
  """

  _CLS_REPRESENTATIONS = ('STANDARD_cls_token', 'EM_cls_token')
  _POOLING_REPRESENTATIONS = ('STANDARD_mention_pooling', 'EM_mention_pooling')

  def __init__(self, config, **kwargs):
    """Build the encoder and the classification head.

    Args:
      config: Model configuration (``num_labels``, ``hidden_size``, dropout
        settings and ``finetuning_task`` are read).
      **kwargs: Must contain ``relation_representation``, ``use_context`` and
        ``tokenizer``; a missing key raises ``KeyError``.
    """
    super().__init__(config)
    self.num_labels = config.num_labels
    self.config = config

    self.bert = BertModel(config)
    # Fall back to the hidden dropout probability when no classifier-specific
    # dropout is configured.
    classifier_dropout = (
        config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
    )
    self.dropout = nn.Dropout(classifier_dropout)

    self.hidden_size = config.hidden_size
    self.finetuning_task = config.finetuning_task

    self.relation_representation = kwargs['relation_representation']
    self.use_context = kwargs['use_context']
    self.tokenizer = kwargs['tokenizer']

    if self.relation_representation in ['STANDARD_mention_pooling', 'EM_mention_pooling', 'EM_entity_start']:
      # double sized input for prediction head for RE task since it concats two embeddings. 04-04-2021
      pred_head_input_size = 2
    else:
      pred_head_input_size = 1

    # The context vector adds one more hidden-size slice to the head input.
    if self.use_context:
      pred_head_input_size += 1

    self.classifier = nn.Linear(config.hidden_size * pred_head_input_size, config.num_labels)

    # Initialize weights and apply final processing
    self.post_init()

  @staticmethod
  def _max_pool_spans(token_states, spans):
    """Max-pool the rows ``token_states[start:end]`` of every span in ``spans``."""
    pieces = [token_states[start:end, :] for start, end in spans]
    return torch.max(torch.cat(pieces), dim=0)[0]

  @staticmethod
  def _max_pool_attention(attention, spans):
    """Sum attention over dim 0, then max-pool over the rows selected by ``spans``.

    Assumes ``attention`` is (heads, query_tokens, key_tokens) for one example
    - TODO confirm.
    """
    pieces = [attention[:, start:end, :] for start, end in spans]
    return torch.max(torch.cat(pieces, dim=1).sum(0), dim=0)[0]

  @staticmethod
  def _word_boundary(subword_flags, start, reverse=False):
    """Walk from ``start`` over entries equal to 1 and return where the walk stops.

    Walking backwards (``reverse=True``) stops on the first entry that is not
    1; walking forwards stops on the first entry that is not 1 after
    ``start``. ``start`` itself is returned when there is nothing to walk over.
    """
    step = -1 if reverse else 1
    offset = 0
    for offset, flag in enumerate(subword_flags[start::step]):
      if flag != 1:
        break
    return start - offset if reverse else start + offset

  def _zero_context(self, token_states):
    """Return an all-zero context vector matching ``token_states``."""
    return torch.zeros([self.hidden_size], dtype=token_states.dtype, device=token_states.device)

  def _attention_context(self, token_states, attention, e1_spans, e2_spans, subword_flags, ignore_flags):
    """Max-pool the tokens that receive the most attention from both entities.

    Tokens flagged with -100 in ``ignore_flags`` are excluded. The number of
    picked tokens is 20% (rounded) of the remaining candidates. A picked token
    that belongs to a split word is widened to the whole word.
    """
    neg_inf = float("-Inf")
    e1_attn = self._max_pool_attention(attention, e1_spans)
    e2_attn = self._max_pool_attention(attention, e2_spans)

    ignored = ignore_flags == -100
    e1_attn[ignored] = neg_inf
    e2_attn[ignored] = neg_inf

    num_of_attentive_tokens = int(torch.round((e1_attn != neg_inf).count_nonzero() * 0.2))

    flags = subword_flags.tolist()
    contexts = []
    for _ in range(num_of_attentive_tokens):
      top = int(torch.argmax(torch.add(e1_attn, e2_attn)))

      # check if a token is a part of a split token.
      is_continuation = flags[top] == 1
      next_is_continuation = top + 1 < len(flags) and flags[top + 1] == 1
      if is_continuation or next_is_continuation:
        word_s = self._word_boundary(flags, top, reverse=True) if is_continuation else top
        word_e = self._word_boundary(flags, top + 1, reverse=False)
      else:
        word_s, word_e = top, top + 1

      contexts.append(token_states[word_s:word_e, :])

      # Never pick the same tokens twice.
      e1_attn[word_s:word_e] = neg_inf
      e2_attn[word_s:word_e] = neg_inf

    if not contexts:
      return self._zero_context(token_states)
    return torch.max(torch.cat(contexts), dim=0)[0]  # max_pooling

  def _local_context(self, token_states, e1_spans, e2_spans, lc_offset):
    """Max-pool the tokens strictly between the two entities (zeros if none)."""
    # Get the min start index and max end index.
    e1_start = torch.min(e1_spans, dim=0)[0][0]
    e1_end = torch.max(e1_spans, dim=0)[0][1]
    e2_start = torch.min(e2_spans, dim=0)[0][0]
    e2_end = torch.max(e2_spans, dim=0)[0][1]

    # if entity 1 appears before entity 2, and at least one token exists between them.
    if e1_end + lc_offset < e2_start - lc_offset:
      between = token_states[e1_end + lc_offset:e2_start - lc_offset, :]
    # if entity 2 appears before entity 1, and at least one token exists between them.
    elif e2_end + lc_offset < e1_start - lc_offset:
      between = token_states[e2_end + lc_offset:e1_start - lc_offset, :]
    else:
      return self._zero_context(token_states)
    return torch.max(between, dim=0)[0]  # max_pooling

  def forward(
      self,
      input_ids=None,
      attention_mask=None,
      token_type_ids=None,
      position_ids=None,
      head_mask=None,
      inputs_embeds=None,
      labels=None,
      output_attentions=None,
      output_hidden_states=None,
      return_dict=None,

      relations=None,
      tokens_seq=None,
      tokens_to_ignore=None,

      directed=None,
      reverse=None,
    ):
    """Encode the batch and classify every relation in it.

    Args:
      relations: Per example, entity-1 and entity-2 span lists stacked along
        dim 0 and padded with ``[-100, -100]`` - presumably
        (batch, 2 * n_relations, max_spans, 2); TODO confirm.
      tokens_seq, tokens_to_ignore: Per-token flags, only read for
        attention-based context.
      labels: Optional relation ids; ``squeeze(1)`` is applied before the loss.
      directed, reverse: Accepted but not used here.
      Remaining arguments are forwarded to the BERT encoder.

    Returns:
      ``RelationClassifierOutput`` when ``return_dict`` resolves to true,
      otherwise a tuple ``(loss?, logits, *encoder_extras)``.

    Raises:
      ValueError: For an unsupported ``relation_representation`` or
        ``use_context``, or when attention-based context is requested but the
        encoder returned no attentions.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        # Attribute access below needs a ModelOutput; ``return_dict`` only
        # controls the format of *this* method's return value.
        return_dict=True,
    )

    sequence_output = self.dropout(outputs.last_hidden_state)
    # This is used for [CLS] token.
    pooled_output_dropout = self.dropout(outputs.pooler_output)

    # Attention probabilities of the last layer, when the encoder returned them.
    attention_output = outputs.attentions[-1] if outputs.attentions is not None else None
    if self.use_context == 'attn_based' and attention_output is None:
      raise ValueError("use_context='attn_based' requires the encoder to output attentions.")

    # offset used to find local context. In case of entity markers, ignore marker tokens for local context.
    # E.g., in the sample [E1] gene1 [/E1] activates [E2] gene2 [/E2], the local context should be just 'activates'.
    lc_offset = 1 if self.relation_representation.startswith('EM') else 0

    uses_cls = self.relation_representation in self._CLS_REPRESENTATIONS

    buffer = []

    # iterate batch & collect
    for i in range(sequence_output.size(0)):
      token_states = sequence_output[i]

      # In case of EM, a sample has a single relation. In case of marker-free, a sample can have multiple relations.
      for rel in torch.split(relations[i], 2):
        # Skip chunks that consist of padding only.
        if bool(torch.all(rel == -100)):
          continue

        e1_span_idx_list = rel[0]
        e2_span_idx_list = rel[1]

        # Delete pad index [-100, -100].
        e1_span_idx_list = e1_span_idx_list[e1_span_idx_list.sum(dim=1) > 0]
        e2_span_idx_list = e2_span_idx_list[e2_span_idx_list.sum(dim=1) > 0]

        if uses_cls:
          cls_token = pooled_output_dropout[i]
        elif self.relation_representation == 'EM_entity_start':
          ## TODO: find a better way later.
          # Get the min start index; the token right before it is used as the
          # entity representation (presumably the entity start marker).
          e1_start = torch.min(e1_span_idx_list, dim=0)[0][0]
          e2_start = torch.min(e2_span_idx_list, dim=0)[0][0]
          e1_rep = token_states[e1_start - 1, :]
          e2_rep = token_states[e2_start - 1, :]
        elif self.relation_representation in self._POOLING_REPRESENTATIONS:
          e1_rep = self._max_pool_spans(token_states, e1_span_idx_list)
          e2_rep = self._max_pool_spans(token_states, e2_span_idx_list)
        else:
          raise ValueError(f"Unsupported relation_representation: {self.relation_representation!r}")

        if self.use_context == 'attn_based':
          context = self._attention_context(
              token_states, attention_output[i], e1_span_idx_list, e2_span_idx_list,
              tokens_seq[i], tokens_to_ignore[i],
          )
        elif self.use_context == 'local':
          context = self._local_context(token_states, e1_span_idx_list, e2_span_idx_list, lc_offset)
        elif self.use_context:
          raise ValueError(f"Unsupported use_context: {self.use_context!r}")

        if uses_cls:
          rel_rep = torch.cat((cls_token, context)) if self.use_context else cls_token
        else:
          rel_rep = torch.cat((e1_rep, context, e2_rep)) if self.use_context else torch.cat((e1_rep, e2_rep))

        buffer.append(rel_rep)

    rel_rep = torch.stack(buffer, dim=0)

    logits = self.classifier(rel_rep)

    # Inference calls may come without labels; only then is the loss skipped.
    loss = None
    if labels is not None:
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(logits, labels.squeeze(1))

    if not return_dict:
      output = (logits,) + outputs[2:]
      return ((loss,) + output) if loss is not None else output

    return RelationClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
    )


### ***In this section, we cloned the model repository to investigate some bugs that prevented the model from running.***

In [None]:
!git clone https://github.com/BNLNLP/PPI-Relation-Extraction.git

Cloning into 'PPI-Relation-Extraction'...
remote: Enumerating objects: 899, done.
remote: Counting objects: 100% (15/15), done.
remote: Compressing objects: 100% (9/9), done.
remote: Total 899 (delta 7), reused 14 (delta 6), pack-reused 884
Receiving objects: 100% (899/899), 20.93 MiB | 12.54 MiB/s, done.
Resolving deltas: 100% (499/499), done.
Updating files: 100% (199/199), done.


In [None]:
%cd /content/PPI-Relation-Extraction/script
!./run.sh

/content/PPI-Relation-Extraction/script
1) Run RE
2) Quit
Please enter your choice: 1
you chose RE.
./run.sh: line 36: srun: command not found


In [None]:
%cd /content/PPI-Relation-Extraction/script
%bash run.sh

/content/PPI-Relation-Extraction/script


UsageError: Line magic function `%bash` not found (But cell magic `%%bash` exists, did you mean that instead?).


In [None]:
%cd /content/PPI-Relation-Extraction/script
%%bash run.sh

/content/PPI-Relation-Extraction/script


UsageError: Line magic function `%%bash` not found.


In [None]:
!chmod +x /content/PPI-Relation-Extraction/script/run.sh


In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 4.8 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


## ***Recent attempts to update the preprocessing module***

In [None]:
import PyPDF2
import os
import json


# Function to clean the text
def clean_text(text):
    """Normalize whitespace in *text*, line by line.

    Each line has its runs of whitespace collapsed to a single space and its
    leading/trailing whitespace removed; lines that end up empty are dropped.

    Args:
        text (str): Raw text, with lines separated by '\\n'.

    Returns:
        str: The surviving lines joined back together with '\\n'.
    """
    # split() with no arguments both trims and collapses whitespace runs.
    normalized = (' '.join(raw_line.split()) for raw_line in text.split('\n'))
    return '\n'.join(piece for piece in normalized if piece)

# Set the path to the directory containing the PDF files
pdf_directory = "/content/drive/MyDrive/Dengue Vaccine Development Papers"
output_json_file = "literature_data.json"

# One {'Title': ..., 'Abstract': ...} record per processed PDF.
# The closing summary reports len(literature_data), so the list must exist.
literature_data = []

# Record of the most recently processed PDF (still dumped to pdf_data.json below)
pdf_data = {
    'Title': '',
    'Abstract': ''
    }

# Variables to track title and abstract
title = ''
abstract = ''
abstract_found = False  # Flag to check if abstract has been found

# Iterate through the PDF files in the directory
for filename in os.listdir(pdf_directory):
    if not filename.endswith(".pdf"):
        continue
    pdf_file_path = os.path.join(pdf_directory, filename)

    # Open the PDF file
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf_reader.pages)
        if page_count == 0:
            # pages[0] below would raise IndexError on an empty document.
            print(f"Skipping {filename}: no pages found")
            continue

        # Per-file state: without this reset, a hit in one PDF leaks into the next.
        abstract_found = False

        # The whole text of the first page is taken as the title
        title = pdf_reader.pages[0].extract_text()
        abstract = ""
        # Page 0 is used for the title, so the abstract is accumulated from the
        # pages that follow it (at most three: indices 1-3).
        for page_num in range(1, min(4, page_count)):
            # Extract once and bind the name; the cleaning step below needs it.
            page_text = pdf_reader.pages[page_num].extract_text()
            abstract += page_text
            # Clean the page text
            cleaned_text = clean_text(page_text)
            # Fallback when the first page yielded no text
            if not title:
                # Assume the first non-empty line as the title
                for line in cleaned_text.split('\n'):
                    if line.strip():
                        title = line.strip()
                        break

            # Once an abstract page has been seen, pages without an all-caps
            # line are not checked for the "Abstract:" marker again.
            if abstract_found and not any(line.strip().isupper() for line in cleaned_text.split('\n')):
                continue

            if "Abstract:" in cleaned_text:
                # Replace the accumulated text with the page that holds the marker
                abstract = cleaned_text
                abstract_found = True

    # Store the title and abstract as a fresh record so earlier files are kept
    pdf_data = {'Title': title, 'Abstract': abstract}
    literature_data.append(pdf_data)

# Save every record to the file named in the summary message
with open(output_json_file, 'w') as json_file:
    json.dump(literature_data, json_file, indent=4)

# Create a JSON object from the last extracted record
json_data = json.dumps(pdf_data, indent=4)

# Save the JSON data to a file
with open('pdf_data.json', 'w') as json_file:
    json_file.write(json_data)

# Read and print the JSON data from the file
with open('pdf_data.json', 'r') as json_file:
    loaded_json_data = json.load(json_file)
    print("\nLoaded JSON Data:")
    print(loaded_json_data)

print(f"Extracted data from {len(literature_data)} literature reviews and saved to {output_json_file}")





Loaded JSON Data:
{'Title': 'The Potent and Broadly Neutralizing Human Dengue Virus-Speciﬁc\nMonoclonal Antibody 1C19 Reveals a Unique Cross-Reactive Epitopeon the bc Loop of Domain II of the Envelope Protein\nScott A. Smith ,a,bA. Ruklanthi de Alwis ,cNurgun Kose ,bEva Harris ,dKristie D. Ibarra ,dKristen M. Kahle ,eJennifer M. Pfaff ,e\nXiaoxiao Xiang ,eBenjamin J. Doranz ,eAravinda M. de Silva ,cS. Kyle Austin ,fSoila Sukupolvi-Petty ,fMichael S. Diamond ,f\nJames E. Crowe , Jr.b,g,h\nDepartment of Medicine, Vanderbilt University Medical Center, Vanderbilt University, Nashville, Tennessee, USAa; The Vanderbilt Vaccine Center, Vanderbilt University\nMedical Center, Vanderbilt University, Nashville, Tennessee, USAb; Department of Microbiology and Immunology, University of North Carolina School of Medicine, Chapel\nHill, North Carolina, USAc; Division of Infectious Diseases and Vaccinology, School of Public Health, University of California, Berkeley, Berkeley, California, USAd; Integr

In [None]:

# Read one review paper and dump the text of all its pages into a JSON file
with open('/content/drive/MyDrive/Dengue Vaccine Development Papers/A Review on Dengue Vaccine Development.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Single-key container for the concatenated page text
    pdf_text = {'content': ''}

    # Walk the pages in document order, appending each page's text
    for page_num, page in enumerate(pdf_reader.pages):
        page_text = page.extract_text()
        pdf_text['content'] = pdf_text['content'] + page_text


# Serialize the collected text
json_data = json.dumps(pdf_text, indent=4)

# Persist it to disk
with open('pdf_text.json', 'w') as json_file:
    json_file.write(json_data)

# Echo the serialized JSON
print(json_data)


{
    "content": "Review\nA Review on Dengue Vaccine Development\nSheng-Qun Deng1,y, Xian Yang1,y, Yong Wei1, Jia-Ting Chen1, Xiao-Jun Wang2and\nHong-Juan Peng1,*\n1Department of Pathogen Biology, Guangdong Provincial Key Laboratory of Tropical Disease Research,\nSchool of Public Health, Southern Medical University, Guangzhou 510515, China;\ndengshengqun@163.com (S.-Q.D.); xianluebuzixian28@163.com (X.Y.); smuweiyong@163.com (Y.W.);\njiating723@i.smu.edu.cn (J.-T.C.)\n2Department of Epidemiology and Biostatistics, School of Public Health, Guangdong Medical University,\nDongguan 523808, China; wangxj1664@gdmu.edu.cn\n*Correspondence: \ufb02oriapeng@hotmail.com; Tel.: +86-20-61648526\nyThese authors contributed equally to this work.\nReceived: 31 December 2019; Accepted: 31 January 2020; Published: 2 February 2020\n/gid00030/gid00035/gid00032/gid00030/gid00038/gid00001/gid00033/gid00042/gid00045 /gid00001\n/gid00048/gid00043/gid00031/gid00028/gid00047/gid00032/gid00046\nAbstract: Dengue 

In [None]:



# Open the PDF file
with open('sample.pdf', 'rb') as pdf_file:
    # PdfFileReader / numPages / getPage / extractText were removed in
    # PyPDF2 3.0 (this notebook installs 3.0.1); use the current names.
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Initialize a dictionary to store the PDF content
    pdf_data = {
        'Title': '',
        'Abstract': ''
    }

    # Variables to track title and abstract
    title = ''
    abstract = ''

    # Iterate through pages and extract text
    for page_num, page in enumerate(pdf_reader.pages):
        page_text = page.extract_text()

        # Clean the page text
        cleaned_text = clean_text(page_text)

        # Title detection is independent of abstract detection: the first page
        # usually holds both, and an `elif` would skip the title on that page.
        if not title:
            # Assume the first non-empty line as the title
            title = next(
                (line.strip() for line in cleaned_text.split('\n') if line.strip()),
                '',
            )

        # Check if the page contains the abstract
        if "Abstract:" in cleaned_text:
            # Keep the whole cleaned page as the abstract
            abstract = cleaned_text

    # Store the title and abstract
    pdf_data['Title'] = title
    pdf_data['Abstract'] = abstract

# Create a JSON object from the extracted text
json_data = json.dumps(pdf_data, indent=4)

# Save the JSON data to a file
with open('pdf_data.json', 'w') as json_file:
    json_file.write(json_data)

# Read and print the JSON data from the file
with open('pdf_data.json', 'r') as json_file:
    loaded_json_data = json.load(json_file)
    print("\nLoaded JSON Data:")
    print(loaded_json_data)


In [None]:
import pandas as pd

def load_links_from_file(file_path, column_name):
    """
    Read one column of a CSV or Excel file and return its values as a list.

    Args:
        file_path (str): Path to the file; must end in '.csv' or '.xlsx'.
        column_name (str): Name of the column containing the links.

    Returns:
        list: The values of the requested column, or an empty list if
        anything goes wrong (the error message is printed, not raised).
    """
    try:
        # Pick the pandas reader from the file extension.
        if file_path.endswith('.csv'):
            reader = pd.read_csv
        elif file_path.endswith('.xlsx'):
            reader = pd.read_excel
        else:
            raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")

        return reader(file_path)[column_name].tolist()
    except Exception as e:
        # Best-effort loader: report the problem and hand back no links.
        print(f"Error: {str(e)}")
        return []

# Example usage:
# load_links_from_file prints the error and returns [] on any failure
# (missing file, unsupported extension, unknown column), so `links` may be
# empty after this call.
file_path = 'links.csv'  # Replace with your file path
column_name = 'Links'  # Replace with the name of the column containing links
links = load_links_from_file(file_path, column_name)

# Now, 'links' contains the list of links from the file.


In [None]:
import os
import requests

def save_html_from_urls(links, output_folder, *, timeout=30):
    """
    Fetch HTML content from a list of URLs and save them as HTML files.

    Files are named by 1-based position in *links* ("1.html", "2.html", ...),
    so a failed URL leaves a gap in the numbering rather than shifting names.

    Args:
        links (list): A list of URLs to fetch HTML content from.
        output_folder (str): The directory where HTML files will be saved.
            It is created if it does not exist.
        timeout (float): Seconds to wait for each request before giving up.
            Without a timeout a single unresponsive server would block the
            whole run indefinitely.

    Returns:
        list: A list of filenames of the saved HTML files.
    """
    saved_files = []

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)

    for position, link in enumerate(links, start=1):
        try:
            response = requests.get(link, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to fetch URL: {link}")
                continue

            # Name the file after the link's position in the input list
            filename = f"{position}.html"
            file_path = os.path.join(output_folder, filename)

            with open(file_path, "w", encoding="utf-8") as html_file:
                html_file.write(response.text)

            saved_files.append(filename)
            print(f"Saved: {file_path}")
        except Exception as e:
            # Deliberately broad: one bad link (network error, timeout,
            # malformed URL, write failure) must not abort the batch.
            print(f"Error fetching URL {link}: {str(e)}")

    return saved_files

# Example usage:
# NOTE(review): this reassigns `links` with placeholder URLs, replacing the
# list loaded by load_links_from_file earlier in the notebook — drop this
# line to download the loaded links instead.
links = ["https://example.com/page1", "https://example.com/page2"]
output_folder = "html_files"  # Replace with your desired output directory
saved_files = save_html_from_urls(links, output_folder)

# 'saved_files' now contains the list of filenames of the saved HTML files.


In [None]:
from bs4 import BeautifulSoup

def extract_title_from_html(html_file_path):
    """
    Return the text of the <title> element of a saved HTML page.

    Args:
        html_file_path (str): Path to the HTML file (read as UTF-8).

    Returns:
        str: The title text with surrounding whitespace removed, or None
        when the document has no <title> element.
    """
    with open(html_file_path, "r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")

    # Adjust the tag/class searched for here if the title lives elsewhere.
    title_tag = soup.find("title")
    if not title_tag:
        return None

    return title_tag.get_text().strip()


In [None]:
from bs4 import BeautifulSoup

def extract_abstract_from_html(html_file_path):
    """
    Return the text of the first <p class="abstract"> element of a saved HTML page.

    Args:
        html_file_path (str): Path to the HTML file (read as UTF-8).

    Returns:
        str: The abstract text with surrounding whitespace removed, or None
        when no matching element is found.
    """
    with open(html_file_path, "r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")

    # Adjust the tag/class searched for here to match the target site's markup.
    abstract_tag = soup.find("p", class_="abstract")
    if not abstract_tag:
        return None

    return abstract_tag.get_text().strip()
