In [None]:
# Report the parameter count of the loaded model.
# NOTE(review): within the visible notebook, `model` is first assigned in the
# next cell (In [3]); run that cell before this one on a fresh kernel.
print(model.num_parameters())

In [3]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Load the BioBERT masked-language model and its matching tokenizer.
model = AutoModelForMaskedLM.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')

# Probe sentence containing a single [MASK] placeholder to be filled in.
text = 'the a/t-rich mut sequence indicates that normal splicing was abolished by a g-to-a transition at the first [MASK] of intron 2.'

inputs = tokenizer(text, return_tensors="pt")
# Inference only: run the forward pass without autograd so no computation
# graph is recorded for the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
print("mask_token_index: ", mask_token_index)

mask_token_logits = token_logits[0, mask_token_index, :]
print("mask_token_logits: ", mask_token_logits)
# Pick the [MASK] candidates with the highest logits.
# NOTE: despite its name, `top_5_tokens` holds the ids of the 10020
# highest-scoring candidates (k=10020), ordered from best to worst; the name
# is kept so that any other cell referring to it keeps working.
top_5_tokens = torch.topk(mask_token_logits, 10020, dim=1).indices[0].tolist()

# Display the full logits tensor as the cell output.
token_logits

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


mask_token_index:  tensor([29])
mask_token_logits:  tensor([[-4.6412, -3.5301, -3.9047,  ..., -2.6799, -4.3767, -3.4781]],
       grad_fn=<IndexBackward0>)


tensor([[[ -7.1181,  -7.3490,  -7.2368,  ...,  -5.9377,  -5.8142,  -5.6505],
         [ -8.9271,  -9.0926,  -9.4347,  ...,  -5.4978,  -7.8965,  -7.1630],
         [ -9.7153, -10.0035, -10.7065,  ...,  -9.1228,  -9.3101, -10.5179],
         ...,
         [ -7.5503,  -7.9151,  -7.2499,  ...,  -5.8271,  -6.6470,  -5.8421],
         [-11.1996, -11.9432, -11.5165,  ..., -10.1310, -11.2889,  -9.8898],
         [ -7.1008,  -8.0511,  -7.6138,  ...,  -6.1989,  -6.6256,  -6.3939]]],
       grad_fn=<ViewBackward0>)

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy==3.5.0
!python -m spacy download en_core_web_sm

In [5]:
import re

def assign_labels(sentence, arguments):
    """Tag each token of ``sentence`` with a BIO label derived from ``arguments``.

    Both the sentence and every argument text are lowercased and split into
    tokens (runs of word characters, or one of ``; , .``). Every place where an
    argument's token sequence occurs in the sentence is labelled ``B-A<key>``
    on its first token and ``I-A<key>`` on the remaining ones; all other
    tokens keep the label ``'O'``. Later matches overwrite earlier labels.

    Args:
        sentence: Text to tokenize and label.
        arguments: Mapping from an argument key to the argument's text.

    Returns:
        A ``(words, labels)`` tuple of two equally long lists.
    """
    token_pattern = r'\w+|[;,.]'

    words = re.findall(token_pattern, sentence.lower())
    labels = ['O' for _ in words]

    for arg_id, arg_text in arguments.items():
        arg_words = re.findall(token_pattern, arg_text.lower())
        span = len(arg_words)

        # Slide a window of the argument's length across the sentence tokens.
        for start in range(len(words) - span + 1):
            if words[start:start + span] != arg_words:
                continue
            # First token of the match opens the span, the rest continue it.
            for offset in range(span):
                prefix = 'B' if offset == 0 else 'I'
                labels[start + offset] = f'{prefix}-A{arg_id}'

    return words, labels

# Demo: label one sentence with two argument spans and print a token/label pair per line.
sentence = "A G-to-A transition at the first nucleotide of intron 2 of patient 1 abolished normal splicing."
arguments = {
    0: "a G-to-A transition at the first nucleotide of intron 2",
    1: "normal splicing",
}

words, labels = assign_labels(sentence, arguments)

for word, label in zip(words, labels):
    print(word, label, sep=": ")


a: B-A0
g: I-A0
to: I-A0
a: I-A0
transition: I-A0
at: I-A0
the: I-A0
first: I-A0
nucleotide: I-A0
of: I-A0
intron: I-A0
2: I-A0
of: O
patient: O
1: O
abolished: O
normal: B-A1
splicing: I-A1
.: O


In [None]:
import spacy

# Load the spaCy English pipeline "en_core_web_sm" (downloaded by the install cell).
# `nlp` is a module-level object used by the functions defined in the cells below.
nlp = spacy.load("en_core_web_sm")

# Define a function to detect base verbs
def detect_base_verbs(sentence, verbose=True):
    """Return the lemmas of the non-auxiliary verbs found in ``sentence``.

    The sentence is processed with the module-level spaCy pipeline ``nlp``.
    A token is kept when its ``pos_`` tag starts with ``'V'`` and its
    dependency label is not ``'aux'``.

    Args:
        sentence: Text to analyse.
        verbose: When true (the default, which matches the previous behaviour)
            print every token's text, POS tag, dependency label and lemma, and
            print a marker line for each kept token whose lemma is
            ``'truncate'``. Pass ``False`` to silence this debug output.

    Returns:
        list: Lemma of every kept token, in sentence order (duplicates kept).
    """
    # Process the input sentence with spaCy
    doc = nlp(sentence)

    base_verbs = []

    for token in doc:
        if verbose:
            print("token:", token)
            print("token.pos:", token.pos_)
            print("token.dep:", token.dep_)
            print("token.lemma:", token.lemma_)
            print('\n')
        # Keep verbs (POS tag starts with 'V') that are not auxiliaries (dep 'aux')
        if token.pos_.startswith('V') and token.dep_ != 'aux':
            # Debug marker highlighting occurrences of the verb 'truncate'
            if verbose and token.lemma_ == 'truncate':
                print("DAY NE", token)
            base_verbs.append(token.lemma_)

    return base_verbs

# Example sentences, each containing a form of "truncate" (truncating / truncated / truncation).
# Only `sen2` is passed to the detector below; `sentence` and `sen` are alternative inputs.
# NOTE: `sentence` rebinds the name used earlier by the assign_labels example cell.
sentence = "frameshift sox9 mutations, as our data show, have the probability of actually truncating its two activation domains, while all missense mutations reported to date lie in the high mobility group (hmg) dna-binding domain." 
sen = "electrophoretic truncation shift assays truncated that the full-length bcl6 protein extracted from transfected cos cells and a bacterially expressed protein that contains the bcl6 zinc fingers and may be remarkably truncated can bind specifically to dna from the u3 promoter/enhancer region of hiv-1."

sen2 = "Specifically, the Stat5a molecule in which the C-terminus can be truncated at amino acids 740 or 751 effectively blocked the induction of both CIS and OSM, whereas the C-terminal truncations at amino acids 762 or 773 had no effect on the induction of either gene."
# Run the detector on `sen2`; as the last expression of the cell, the returned
# list of lemmas is shown as the cell output.
detect_base_verbs(sen2)

In [None]:
# Sample token lists.
# NOTE(review): this first assignment is immediately overwritten by the second
# one below, so only the second list remains bound to `words`.
words = ['a', 'g-to-a', 'transition', 'at', 'the', 'first', 'nucleotide', 'of', 'intron', '2', 'of', 'patient', '1', '#abolish', 'normal', 'splicing', '.']

# NOTE(review): `words` is not referenced by the rest of this cell; it also
# rebinds the name produced by the assign_labels example cell.
words = ['frameshift', 'sox9', 'mutations', ',', 'as', 'our', 'data', 'show', ',', 'have', 'the', 'probability', 'of', 'actually', 'truncating', 'its', 'two', 'activation', 'domains', ',', 'while', 'all', 'missense', 'mutations', 'reported', 'to', 'date', 'lie', 'in', 'the', 'high', 'mobility', 'group', '(hmg)', 'dna-binding', 'domain', '.']
def analyze_word(word, lowercase=True):
    """Return ``(lemma, pos)`` for the first token of ``word``.

    ``word`` is processed with the module-level spaCy pipeline ``nlp`` and
    only the first resulting token is inspected.

    Args:
        word: Text to analyse (expected to be a single word).
        lowercase: If true, lowercase the returned lemma.

    Returns:
        tuple: ``(lemma, pos)`` where ``pos`` is the first token's ``pos_``
        tag and ``lemma`` is its lemma when the token is a non-auxiliary verb
        (``pos_`` starts with ``'V'`` and ``dep_`` is not ``'aux'``),
        otherwise ``None``. ``(None, None)`` is returned when the pipeline
        produces no tokens (e.g. for an empty string).
    """
    doc = nlp(word)
    # nlp() returns a Doc; POS/dependency/lemma live on its tokens, not on the Doc.
    if len(doc) == 0:
        return None, None
    token = doc[0]

    # Initialise so the return below never hits an unbound local for non-verbs.
    lemma = None
    if token.pos_.startswith('V') and token.dep_ != 'aux':
        lemma = token.lemma_
        if lowercase:
            lemma = lemma.lower()
    return lemma, token.pos_

# Quick check of analyze_word on a single inflected verb form.
print(analyze_word('truncated'))

In [39]:
# Run the xml2conll.py script on the GramVar "abolish" XML file, writing to 'tessst'.
# NOTE(review): the recorded output below reports "Converting 0 documents" and
# "0 lines written" — check the input path/format before relying on the result.
!python xml2conll/xml2conll.py --input='./MLM/data/GramVar/abolish_full.xml' --output='tessst'

Converting 0 documents
[nltk_data] Downloading package punkt to /home/phatpham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0 lines written
