In [1]:
# https://huggingface.co/transformers/master/notebooks.html
!pip install datasets transformers seqeval



Collecting datasets
  Downloading datasets-1.5.0-py3-none-any.whl (192 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
Collecting tqdm<4.50.0,>=4.27
  Downloading tqdm-4.49.0-py2.py3-none-any.whl (69 kB)
Collecting xxhash
  Downloading xxhash-2.0.0-cp38-cp38-win_amd64.whl (35 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.11.1-py38-none-any.whl (126 kB)
Collecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.7-py3-none-any.whl (33 kB)
Collecting dill
  Downloading dill-0.3.3-py2.py3-none-any.whl (81 kB)
Collecting pyarrow>=0.17.1
  Downloading pyarrow-3.0.0-cp38-cp38-win_amd64.whl (12.7 MB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16175 sha256=93cb7eda31e28308c5e9cb4dc9dd3fe5a8bb4fdab9db06fe8e0ea8cdd4d6b096
  Stored in directory: c:\users\관리자\appdat

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyserini 0.11.0.0 requires tqdm>=4.56.0, but you have tqdm 4.49.0 which is incompatible.


In [None]:
import pandas as pd
import os
import re
from pathlib import Path
# Walk every corpus folder below `root` and collect its raw documents.
# Files named train.* / valid.* feed the training pool, test.* feeds the held-out pool.
# Each list element is the list of documents (blank-line separated blocks) of one file.
traindata = []
testdata = []

# root = 'E:/2021 NLP/BIOCorpus/Chemical'
root = 'E:/2021 NLP/BIOCorpus/Gene'

# Loop variables deliberately avoid the names `dir` and `round`: binding them at
# notebook scope would shadow the builtins for every later cell.
for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        split_name = os.path.splitext(filename)[0]
        if split_name in ('train', 'valid'):
            target = traindata
        elif split_name == 'test':
            target = testdata
        else:
            continue
        # NOTE(review): read_text() decodes with the platform default encoding;
        # confirm the corpus encoding and pass encoding=... explicitly if it differs.
        raw_text = Path(dirpath, filename).read_text().strip()
        # Documents are separated by an empty line (optionally holding a single tab).
        target.append(re.split(r'\n\t?\n', raw_text))


In [None]:
from pathlib import Path
import re, pdb

def read_wnut(merged, keep_tags=('B-PRGE', 'O', 'I-PRGE')):
    """Convert raw CoNLL-style documents into parallel token / tag lists.

    Args:
        merged: iterable of files, each an iterable of document strings. A document
            holds one ``token<TAB>tag`` pair per line.
        keep_tags: tags that are kept as-is; every other tag is rewritten to ``'O'``.
            Defaults to the gene/protein tags used by this notebook.

    Returns:
        ``(token_docs, tag_docs)``: two lists of equal length, one entry per
        document, holding that document's tokens and its (filtered) tags.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    # https://github.com/BaderLab/Biomedical-Corpora
    allowed = frozenset(keep_tags)
    token_docs = []
    tag_docs = []
    for raw_docs in merged:
        for doc in raw_docs:
            tokens = []
            tags = []
            for line in doc.split('\n'):
                # Splitting a file on blank lines can leave empty or whitespace-only
                # lines at document edges; they carry no token, so skip them instead
                # of failing on the unpacking below.
                if not line.strip():
                    continue
                token, tag = line.split('\t')
                if tag not in allowed:
                    tag = 'O'
                tokens.append(token)
                tags.append(tag)
            # A document made only of blank lines yields no tokens; drop it so
            # downstream tokenization never receives an empty sequence.
            if tokens:
                token_docs.append(tokens)
                tag_docs.append(tags)

    return token_docs, tag_docs

# Parse both pools. The held-out (test) files serve as the evaluation split here.
train_texts, train_tags = read_wnut(traindata)
val_texts, val_tags = read_wnut(testdata)

from sklearn.model_selection import train_test_split
# train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

# Tag vocabulary comes from the training tags only; ids follow reverse
# alphabetical order of the tag names.
unique_tags = {tag for doc_tags in train_tags for tag in doc_tags}
ordered_tags = sorted(unique_tags, reverse=True)
tag2id = {tag: index for index, tag in enumerate(ordered_tags)}
id2tag = dict(enumerate(ordered_tags))


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# from transformers import DistilBertTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Both splits are tokenized with identical settings: inputs are pre-split words,
# offsets are requested (encode_tags reads them), and sequences are padded and
# truncated to at most 256 tokens.
tokenize_options = dict(
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
    max_length=256,
)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

In [None]:
import numpy as np
def encode_tags(tags, encodings, tag_to_id=None):
    """Align word-level tags with tokenizer output.

    For every document a label vector with one entry per token is built. Tokens
    whose offset starts at 0 and has a non-zero end receive the next word label;
    all other positions are set to -100.

    Args:
        tags: list of documents, each a list of tag strings (one per word).
        encodings: object exposing ``offset_mapping``: per document, a sequence of
            ``(start, end)`` pairs, one per token.
        tag_to_id: optional mapping from tag string to integer id. When omitted,
            the notebook-level ``tag2id`` is used.

    Returns:
        List of label lists (plain Python ints), one per document, each as long
        as that document's offset mapping.

    Raises:
        KeyError: if a tag is missing from the mapping.
    """
    mapping = tag2id if tag_to_id is None else tag_to_id
    labels = [[mapping[tag] for tag in doc] for doc in tags]
    encoded_labels = []

    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # reshape keeps the (n, 2) layout even when the offset list is empty,
        # so the column indexing below cannot fail on a 1-D array.
        arr_offset = np.array(doc_offset).reshape(-1, 2)
        # Computed once and reused for both counting and assignment.
        is_word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_starts = int(is_word_start.sum())

        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)
        # Truncation can drop trailing words, so only the first n_starts labels
        # still have a token to attach to.
        # NOTE(review): this presumes every word yields at least one token with a
        # non-empty offset; otherwise labels shift — confirm against the tokenizer.
        doc_enc_labels[is_word_start] = np.asarray(doc_labels[:n_starts], dtype=int)
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

# Build token-level label vectors for the training split first, then the evaluation split.
train_labels, val_labels = (
    encode_tags(split_tags, split_encodings)
    for split_tags, split_encodings in (
        (train_tags, train_encodings),
        (val_tags, val_encodings),
    )
)

In [None]:
import torch

class BioDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-token label lists.

    Indexing returns a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor for the same example.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences
        # labels: per-example label sequences; its length defines the dataset size
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# The offsets were only needed to align labels; the model must not receive them.
# A default is passed to pop() so that re-running this cell (after the key has
# already been removed) does not raise KeyError.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = BioDataset(train_encodings, train_labels)
val_dataset = BioDataset(val_encodings, val_labels)


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import concatenate_datasets, ReadInstruction, load_dataset, load_metric, list_datasets, list_metrics

# Store the tag vocabulary in the model config. Without id2label/label2id the saved
# checkpoint only knows generic LABEL_<n> names, so a pipeline built from it later
# cannot report the actual tag strings.
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# NOTE(review): data_collator is created here but the Trainer call below does not
# receive it; the datasets are already padded at tokenization time.
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")
def compute_metrics(pred):
    """Score token-classification predictions with the loaded seqeval metric.

    Args:
        pred: object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        Dict with the overall precision, recall, f1 and accuracy.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labelled -100 are ignored; drop them from both sequences.
        scored = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_predictions.append([id2tag[p] for p, _ in scored])
        true_labels.append([id2tag[g] for _, g in scored])

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
import pickle
# with open('NER.p', 'wb') as file:    # open NER.p in binary write mode (wb)
#     pickle.dump(train_dataset, file)
#     pickle.dump(val_dataset, file)
#     pickle.dump(compute_metrics, file)

# with open('NER.p', 'rb') as file:    # open NER.p in binary read mode (rb)
#     train_dataset = pickle.load(file)
#     val_dataset = pickle.load(file)
#     compute_metrics = pickle.load(file)

In [None]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
batch_size = 16

# The task name doubles as the output directory for checkpoints.
args = TrainingArguments(
    output_dir=task,
    evaluation_strategy="epoch",              # evaluate at the end of every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.1,                         # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    logging_steps=10,
)

# https://huggingface.co/transformers/master/main_classes/trainer.html
# Fine-tune on the training split, evaluating on the held-out split with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Persist the fine-tuned weights together with the tokenizer files and vocabulary.
trained_dir = 'E:/2021 NLP/Trained/'
trainer.save_model(trained_dir)
trainer.tokenizer.save_pretrained(trained_dir)
trainer.tokenizer.save_vocabulary(trained_dir)

# Store the tag mappings next to the notebook so predictions can be decoded later.
for pickle_name, payload in (
    ('id2tag.pickle', id2tag),
    ('tag2id.pickle', tag2id),
    ('uniquetags.pickle', unique_tags),
):
    with open(pickle_name, 'wb') as fw:
        pickle.dump(payload, fw)

In [None]:
# Run the trained model over the evaluation split and take the highest-scoring class per token.
prediction_output = trainer.predict(val_dataset)
predictions, labels, _ = prediction_output
predictions = predictions.argmax(axis=2)

# Remove ignored index (special tokens)
true_predictions = []
true_labels = []
for predicted_row, gold_row in zip(predictions, labels):
    scored = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
    true_predictions.append([id2tag[p] for p, _ in scored])
    true_labels.append([id2tag[g] for _, g in scored])

results = metric.compute(predictions=true_predictions, references=true_labels)
results


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import DataCollatorForTokenClassification

# Reload a fine-tuned checkpoint from disk and wrap it in a token-classification
# ("ner") pipeline. Note that this rebinds the notebook-level `tokenizer` and `model`.
# NOTE(review): the training cell saves to 'E:/2021 NLP/Trained/', while this cell
# loads from 'E:/2021 NLP/Trained_Gene/' — presumably the folder was renamed by
# hand; confirm the path before running.
tokenizer = AutoTokenizer.from_pretrained("E:/2021 NLP/Trained_Gene/")
model = AutoModelForTokenClassification.from_pretrained("E:/2021 NLP/Trained_Gene")

from transformers import pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Candidate input texts. Each assignment overwrites the previous one, so only the
# last `example` is actually sent through the pipeline.
example = "Glioblastoma and lung cancer can be treated with anti-VEGF inhibitors."
example="Postoperative radiotherapy combined with chemotherapy is a commonly used treatment for glioblastoma (GBM) but radiotherapy often fails to achieve the expected results mainly due to tumor radioresistance. In this study, we established a radioresistant subline from human glioma cell line U251 and found that Cathepsin D (CTSD), a gene closely related to the clinical malignancy and prognosis in glioma, had higher expression level in radioresistant clones than that in parental cells, and knocking down CTSD by small interfering RNA (siRNA) or its inhibitor Pepstatin‐A increased the radiosensitivity. The level of autophagy was enhanced in the radioresistant GBM cells compared with its parent cells, and silencing autophagy by light chain 3 (LC3) siRNA significantly sensitized GBM cells to ionizing radiation (IR). Moreover, the protein expression level of CTSD was positively correlated with the autophagy marker LC3 II/I and negatively correlated with P62 after IR in radioresistant cells. As expected, through the combination of Western blot and immunofluorescence assays, inhibition of CTSD increased the formation of autophagosomes, while decreased the formation of autolysosomes, which indicating an attenuated autophagy level, leading to radiosensitization ultimately. Our results revealed for the first time that CTSD regulated the radiosensitivity of glioblastoma by affecting the fusion of autophagosomes and lysosomes. In significance, CTSD might be a potential molecular biomarker and a new therapeutic target in glioblastoma."

# The sentence-per-line variant is wrapped in parentheses: the previous form ended
# the statement on its first line, so the indented continuation lines made the whole
# cell fail with an IndentationError. The first fragment also gains the trailing
# space that the other fragments already have, so sentences do not run together.
example = (
    "Postoperative radiotherapy combined with chemotherapy is a commonly used treatment for glioblastoma (GBM) but radiotherapy often fails to achieve the expected results mainly due to tumor radioresistance. "
    "In this study, we established a radioresistant subline from human glioma cell line U251 and found that Cathepsin D (CTSD), a gene closely related to the clinical malignancy and prognosis in glioma, had higher expression level in radioresistant clones than that in parental cells, and knocking down CTSD by small interfering RNA (siRNA) or its inhibitor Pepstatin‐A increased the radiosensitivity. "
    "The level of autophagy was enhanced in the radioresistant GBM cells compared with its parent cells, and silencing autophagy by light chain 3 (LC3) siRNA significantly sensitized GBM cells to ionizing radiation (IR). "
    "Moreover, the protein expression level of CTSD was positively correlated with the autophagy marker LC3 II/I and negatively correlated with P62 after IR in radioresistant cells. "
    "As expected, through the combination of Western blot and immunofluorescence assays, inhibition of CTSD increased the formation of autophagosomes, while decreased the formation of autolysosomes, which indicating an attenuated autophagy level, leading to radiosensitization ultimately. "
)

example = "NF1 deficiency was identified as a probable cause of differential chemotaxis in mesenchymal GBMs. Other alterations, such as phosphatase and tensin homolog (PTEN) deletion in glioma cells, have been shown to activate the transcription factor Yes-associated protein 1, which directly upregulates LOX expression. LOX is a po- tent chemokine recruiting macrophages via activation of the β1 integrin/proline-rich tyrosine kinase 2 pathway in macrophages. Inhibition of LOX suppresses macrophage infiltration and tumor progression specifically in PTEN-null glioma models.60 Amplification of the epidermal growth factor receptor (EGFR) gene and its truncation mutant EGFR variant (v)III is another common genetic alteration in glioblastoma. EGFR and EGFRvIII cooperate to recruit macrophages in GBM via induction of chemokine MCP-1."
# Tag the selected text and show the raw pipeline output.
ner_results = nlp(example)
print(ner_results)