<a href="https://colab.research.google.com/github/bbanzai88/Data-Science-Repository/blob/main/UMLS_Triplet_Extraction_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook takes a file containing a DOI and an abstract for each paper. For each sentence in an abstract, it identifies the UMLS-related terms. If a sentence contains two such terms with a verb between them, the notebook extracts a triple consisting of a subject, a verb, and an object. When running each cell, check its output: if it asks you to restart the runtime, please do so, or the rest of the code will not run correctly.

In [None]:
!pip install scispacy
!pip install allennlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy<3.5.0,>=3.4.0
  Using cached spacy-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
Collecting thinc<8.2.0,>=8.1.0
  Using cached thinc-8.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (803 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.0.17
    Uninstalling thinc-8.0.17:
      Successfully uninstalled thinc-8.0.17
  Attempting uninstall: spacy
    Found existing installation: spacy 3.3.1
    Uninstalling spacy-3.3.1:
      Successfully uninstalled spacy-3.3.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.0 requires spacy<3.4,>=2.1.0, but you have spacy 3.4.1 which is incompatible.[0m
Successfully installed spacy-3.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy<3.4,>=2.1.0
  Using cached spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
Collecting thinc<8.1.0,>=8.0.14
  Using cached thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.1.1
    Uninstalling thinc-8.1.1:
      Successfully uninstalled thinc-8.1.1
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.1
    Uninstalling spacy-3.4.1:
      Successfully uninstalled spacy-3.4.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scispacy 0.5.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.3.1 which is incompatible.
en-core-web-sm 3.4.0 requires spacy<3.

In [None]:
import os
import spacy
from scispacy.umls_linking import UmlsEntityLinker
from allennlp.predictors.predictor import Predictor
import json
import re

In [None]:
def check_uml(ent):
    """
    Pick the UMLS concept linked to an entity with a perfect score.

    :param ent: entity whose linker candidates are available as
        ``ent._.umls_ents``, an iterable of ``(concept, score)`` pairs
    :return: the first concept whose score equals 1.0, or an empty list
        when no candidate has that score
    """
    candidates = ent._.umls_ents
    exact_matches = []
    for concept, score in candidates:
        # Only a score of exactly 1.0 is accepted.
        if score == 1.0:
            exact_matches.append(concept)
    if not exact_matches:
        return []
    return exact_matches[0]

In [None]:
def get_oie(sentence, oie_pred, entities, relation):
    '''
    Build [subject, relation, object] triples for one OpenIE verb frame.

    :param sentence: original sentence; it is split on whitespace and each
        word is paired by position with a tag from ``oie_pred``
    :param oie_pred: sequence of tag strings for the verb frame; words whose
        tag contains 'ARG0' form the subject span, 'ARG1' the object span
    :param entities: iterable of (entity_text, concept) pairs
    :param relation: relation placed in the middle of each triple (the verb)
    :return: list of [subject_concept, relation, object_concept] lists, one
        per subject/object combination; empty when either side has no entity
    '''
    # zip() stops at the shorter sequence, so a tag list shorter than the
    # whitespace split no longer raises IndexError.
    tagged_words = list(zip(sentence.split(), oie_pred))
    # Join each argument span once instead of once per entity.
    arg0_text = ' '.join(w for w, tag in tagged_words if 'ARG0' in tag)
    arg1_text = ' '.join(w for w, tag in tagged_words if 'ARG1' in tag)
    # An entity belongs to a span when its text is a substring of that span.
    subjects = [u for e, u in entities if str(e) in arg0_text]
    objects = [u for e, u in entities if str(e) in arg1_text]
    return [[s, relation, o] for s in subjects for o in objects]


def get_triples(sentence):
    '''
    Extract concept triples from a sentence using the OpenIE predictor.

    :param sentence: sentence text
    :return: list of [subject, relation, object] triples collected from
        every verb frame in the prediction; empty list when there are none
    '''
    # Blank fragments (e.g. left over from splitting text on '.') hold no triple.
    if not sentence or not sentence.strip():
        return []
    predicted_ie = predictor.predict(sentence)
    entities = get_entities(sentence)
    triples = []
    for p_i in predicted_ie['verbs']:
        # Accumulate per-verb results. Assigning here instead would keep
        # only the triples of the last verb frame.
        triples.extend(get_oie(sentence, p_i['tags'], entities, p_i['verb']))
    return triples

In [None]:
def get_entities(sentence):
    """
    Find the entities of a sentence that resolve to a UMLS concept.

    :param sentence: sentence text, processed with the module-level ``nlp``
    :return: list of (entity_text, concept) pairs, keeping only entities for
        which ``check_uml`` returns a truthy value
    """
    linked = []
    for span in nlp(sentence).ents:
        concept = check_uml(span)
        if concept:
            linked.append((span.text, concept))
    return linked


def get_paper_triples(abstract, doi, seen_triples=None, out_file=None):
    '''
    Extract the triples of one abstract and write each new one as a TSV line.

    :param abstract: abstract text (or an iterable of text pieces, which are
        concatenated); the text is split into sentences on '.'
    :param doi: identifier of the paper; accepted but not used
    :param seen_triples: list of triples already written; new triples are
        appended to it. Defaults to the module-level ``all_triples``.
    :param out_file: object with a ``write`` method receiving one
        tab-separated line per new triple. Defaults to the module-level
        ``out_kg``.
    :return: None
    '''
    paper_text = ''.join(abstract)
    for sent in paper_text.split('.'):
        print('Getting triples for text')
        try:
            triples = get_triples(sent)
        except KeyError:
            # Best effort: skip a sentence the extractor cannot handle.
            continue
        if not triples:
            continue
        # Fall back to the notebook globals so two-argument calls keep
        # working; passing the arguments removes the hidden dependency.
        if seen_triples is None:
            seen_triples = all_triples
        if out_file is None:
            out_file = out_kg
        print('Found triples:' + str(len(triples)))
        for t in triples:
            if t in seen_triples:
                continue
            seen_triples.append(t)
            print(t)
            print('Writing triples to file, length of all triples =' + str(len(seen_triples)))
            out_file.write(t[0] + '\t' + t[1] + '\t' + t[2] + '\n')

def add_bi(abstractList, doiList):
    '''
    Tabulate the entities found in each abstract.

    NOTE(review): ``nlp_bi`` is read from module scope and is not defined in
    the visible cells; confirm where it is created.

    :param abstractList: list of abstract texts, processed with ``nlp_bi.pipe``
    :param doiList: list of identifiers, parallel to ``abstractList``
    :return: dict of equal-length lists keyed "doi", "Entity" (entity text)
        and "Class" (entity label), with one entry per entity
    '''
    # Every key that is appended to must exist up front; without "Entity"
    # and "Class" the first entity raises KeyError.
    table = {"doi": [], "Entity": [], "Class": []}
    for i, doc in enumerate(nlp_bi.pipe(abstractList)):
        doi = doiList[i]
        for x in doc.ents:
            table["doi"].append(doi)
            table["Entity"].append(x.text)
            table["Class"].append(x.label_)
    return table

In [None]:
import spacy
# The loaded pipeline is not bound to a name, so this call only confirms
# that the 'en_core_web_sm' model can be loaded.
spacy.load('en_core_web_sm')
from spacy.lang.en import English



In [None]:
import pandas as pd

#Read in file
# on_bad_lines='warn' skips malformed rows and warns about each one; it
# replaces the deprecated error_bad_lines=False.
meta_dfu = pd.read_csv("/content/sample_data/PM_dfu.csv", sep=",", on_bad_lines='warn', engine='python')

#Sort out blank abstracts
df = meta_dfu.dropna(subset=['abstract'])

#Create lists (parallel: doiList[k] belongs to abstractList[k])
doiList = df['doi'].tolist()
abstractList = df['abstract'].tolist()



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
!pip install allennlp-models

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz (15.9 MB)
Collecting spacy<3.5.0,>=3.4.1
  Using cached spacy-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
Collecting thinc<8.2.0,>=8.1.0
  Using cached thinc-8.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (803 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.0.17
    Uninstalling thinc-8.0.17:
      Successfully uninstalled thinc-8.0.17
  Attempting uninstall: spacy
    Found existing installation: spacy 3.3.1
    Uninstalling spacy-3.3.1:
      Successfully uninstalled spacy-3.3.1
[31mERROR: pip's dependency resolver does not currently take into account all the

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import pandas as pd
import en_core_web_sm
import spacy
import scispacy

from scispacy.linking import EntityLinker

# Load the en_core_sci_sm model and add the "scispacy_linker" pipe configured for UMLS.
nlp = spacy.load("en_core_sci_sm")

linker = nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# on_bad_lines='warn' skips malformed rows and warns about each one; it
# replaces the deprecated error_bad_lines=False.
meta_dfu = pd.read_csv("/content/sample_data/PM_dfu.csv", sep=",", on_bad_lines='warn', engine='python')

#Sort out blank abstracts
df = meta_dfu.dropna(subset=['abstract'])

#Create lists (parallel: doiList[k] belongs to abstractList[k])
doiList = df['doi'].tolist()
abstractList = df['abstract'].tolist()

# OpenIE model used by get_triples through the module-level name `predictor`.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz")


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
def check_uml(ent):
    """
    Return the UMLS concept that the linker matched to an entity exactly.

    :param ent: entity carrying ``(concept, score)`` pairs in
        ``ent._.umls_ents``
    :return: first concept with a score equal to 1.0; an empty list if no
        pair has that score
    """
    perfect = [concept for concept, score in ent._.umls_ents if score == 1.0]
    # An empty list is the "no match" value that callers test for truthiness.
    return perfect[0] if perfect else []

In [None]:
def get_oie(sentence, oie_pred, entities, relation):
    '''
    Build [subject, relation, object] triples for one OpenIE verb frame.

    :param sentence: original sentence; it is split on whitespace and each
        word is paired by position with a tag from ``oie_pred``
    :param oie_pred: sequence of tag strings for the verb frame; words whose
        tag contains 'ARG0' form the subject span, 'ARG1' the object span
    :param entities: iterable of (entity_text, concept) pairs
    :param relation: relation placed in the middle of each triple (the verb)
    :return: list of [subject_concept, relation, object_concept] lists, one
        per subject/object combination; empty when either side has no entity
    '''
    # zip() stops at the shorter sequence, so a tag list shorter than the
    # whitespace split no longer raises IndexError.
    tagged_words = list(zip(sentence.split(), oie_pred))
    # Join each argument span once instead of once per entity.
    arg0_text = ' '.join(w for w, tag in tagged_words if 'ARG0' in tag)
    arg1_text = ' '.join(w for w, tag in tagged_words if 'ARG1' in tag)
    # An entity belongs to a span when its text is a substring of that span.
    subjects = [u for e, u in entities if str(e) in arg0_text]
    objects = [u for e, u in entities if str(e) in arg1_text]
    return [[s, relation, o] for s in subjects for o in objects]


def get_triples(sentence):
    '''
    Extract concept triples from a sentence using the OpenIE predictor.

    :param sentence: sentence text
    :return: list of [subject, relation, object] triples collected from
        every verb frame in the prediction; empty list when there are none
    '''
    # Blank fragments (e.g. left over from splitting text on '.') hold no triple.
    if not sentence or not sentence.strip():
        return []
    predicted_ie = predictor.predict(sentence)
    entities = get_entities(sentence)
    triples = []
    for p_i in predicted_ie['verbs']:
        # Accumulate per-verb results. Assigning here instead would keep
        # only the triples of the last verb frame.
        triples.extend(get_oie(sentence, p_i['tags'], entities, p_i['verb']))
    return triples

In [None]:
def get_entities(sentence):
    """
    Collect the UMLS-linked entities of a sentence.

    :param sentence: sentence text, run through the module-level ``nlp``
    :return: list of (entity_text, concept) pairs; entities for which
        ``check_uml`` returns a falsy value are left out
    """
    spans = nlp(sentence).ents
    candidates = ((span.text, check_uml(span)) for span in spans)
    return [(text, concept) for text, concept in candidates if concept]


def get_paper_triples(abstract, doi, seen_triples=None, out_file=None):
    '''
    Extract the triples of one abstract and write each new one as a TSV line.

    :param abstract: abstract text (or an iterable of text pieces, which are
        concatenated); the text is split into sentences on '.'
    :param doi: identifier of the paper; accepted but not used
    :param seen_triples: list of triples already written; new triples are
        appended to it. Defaults to the module-level ``all_triples``.
    :param out_file: object with a ``write`` method receiving one
        tab-separated line per new triple. Defaults to the module-level
        ``out_kg``.
    :return: None
    '''
    paper_text = ''.join(abstract)
    for sent in paper_text.split('.'):
        print('Getting triples for text')
        try:
            triples = get_triples(sent)
        except KeyError:
            # Best effort: skip a sentence the extractor cannot handle.
            continue
        if not triples:
            continue
        # Fall back to the notebook globals so two-argument calls keep
        # working; passing the arguments removes the hidden dependency.
        if seen_triples is None:
            seen_triples = all_triples
        if out_file is None:
            out_file = out_kg
        print('Found triples:' + str(len(triples)))
        for t in triples:
            if t in seen_triples:
                continue
            seen_triples.append(t)
            print(t)
            print('Writing triples to file, length of all triples =' + str(len(seen_triples)))
            out_file.write(t[0] + '\t' + t[1] + '\t' + t[2] + '\n')


In [None]:
#def get_paper_triples(abstract,doiList,i):
  #  #write triples into text
  #  # print (json_file)
  #  #with open(json_file, 'r') as f:
 #   paper_text= abstract
  #  paper_text = ''.join(t for t in paper_text)
   # i = 0
   # doi = doiList[i]
   # table= {"doi":[], "triples":[]}
    #for sent in paper_text.split('.'):
     #   print ('Getting triples for text')
     #   try:
      #     triples = get_triples(sent)
       #    if triples:
        #    print ('Found triples:'+str(len(triples)))
         #   for t in triples:
         #       if t not in all_triples:
         #           all_triples.append(t)
         #           table["doi"].append(doi)
         #           table['triples'].append(all_triples)
        #            print (t)
        #            i +=1
       #     return table
      #  except KeyError as e:
         #      continue



In [None]:
#def get_abstract_triples(abstract,doiList,i):
#  paper_text= abstract
#  paper_text = ''.join(t for t in paper_text)
#  doi = doiList[i]
#  all_triples = pd.DataFrame(columns=['doi', 'subject', 'relation', 'object'])
#  table=
#  for sent in paper_text.split('.'):
#        try:
#           triples = get_triples(sent)
#           if len(triples)<2:
#              table["doi"].append(doi)
#              continue
#           else:
#             all_triples.append(triples)
#             table['triples'].append(all_triples)
#             return table
#        except KeyError as e:
#                continue

In [None]:
#def get_paper_triples(abstract,doiList,i):
 #   #write triples into text
 #   # print (json_file)
 #   #with open(json_file, 'r') as f:
 #   paper_text= abstract
 #   paper_text = ''.join(t for t in paper_text)
 #   i = 0
 #   doi = doiList[i]
 #   table= {"doi":[], "triples":[]}
 #   for sent in paper_text.split('.'):
 #       print ('Getting triples for text')
 #       try:
 #          triples = get_triples(sent)
 #          if triples:
 #           print ('Found triples:'+str(len(triples)))
 #           for t in triples:
 #               if t not in all_triples:
 #                   all_triples.append(t)
 #                   table["doi"].append(doi)
 #                   table['triples'].append(all_triples)
 #                   print (t)
 #                   i +=1
 #           return table
 #       except KeyError as e:
 #               continue



In [None]:
def get_abstract_triples(abstractList,doiList,i):
    '''
    Extract the triples of the i-th abstract as a DataFrame.

    :param abstractList: list of abstract texts
    :param doiList: list of DOIs, parallel to ``abstractList``
    :param i: position of the abstract to process
    :return: DataFrame with columns 'doi', 'Subject', 'Relation', 'Object',
        one row per triple; an empty frame with those columns when the
        abstract yields no triple
    '''
    doi = doiList[i]
    paper_text = ''.join(abstractList[i])
    rows = []
    for sent in paper_text.split('.'):
        # Exactly one row per triple. Appending a sentence's triples a second
        # time when it has two or more would duplicate those rows.
        for subject, relation, obj in get_triples(sent):
            rows.append((doi, subject, relation, obj))
    return pd.DataFrame(rows, columns=['doi', 'Subject', 'Relation', 'Object'])
#table_df3

In [None]:
# Run the extractor on the abstract at position 2; the returned DataFrame is
# the cell's last expression, so it is displayed as the cell output.
i=2
get_abstract_triples(abstractList,doiList,i)

Unnamed: 0,doi,Subject,Relation,Object
0,10.1016/j.bone.2022.116421,C0004048,cause,C0175256
1,10.1016/j.bone.2022.116421,C0220825,elevated,C0229671
2,10.1016/j.bone.2022.116421,C0220825,elevated,C0428152
3,10.1016/j.bone.2022.116421,C0220825,elevated,C0016327
4,10.1016/j.bone.2022.116421,C0220825,elevated,C0229671
5,10.1016/j.bone.2022.116421,C0220825,elevated,C0428152
6,10.1016/j.bone.2022.116421,C0220825,elevated,C0016327


In [None]:
import tqdm

from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import pandas as pd
import en_core_web_sm
import spacy
import scispacy

from scispacy.linking import EntityLinker

# Load the en_core_sci_sm model and add the "scispacy_linker" pipe configured for UMLS.
nlp = spacy.load("en_core_sci_sm")

linker = nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})


SyntaxError: ignored

In [None]:
import tqdm
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import pandas as pd
import en_core_web_sm
import spacy
import scispacy

from scispacy.linking import EntityLinker

# Load the en_core_sci_sm model and add the "scispacy_linker" pipe configured for UMLS.
nlp = spacy.load("en_core_sci_sm")

linker = nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})


#fcn subset
# on_bad_lines='warn' skips malformed rows and warns about each one; it
# replaces the deprecated error_bad_lines=False.
meta_dfu = pd.read_csv("/content/sample_data/FCN_Pubmed3.csv", sep=",", on_bad_lines='warn', engine='python')

#All
#meta_dfu = pd.read_csv("/content/sample_data/PM_dfu.csv", sep=",", on_bad_lines='warn', engine='python')

#Sort out blank abstracts
df = meta_dfu.dropna(subset=['abstract'])

#Create lists (parallel: doiList[k] belongs to abstractList[k])
doiList = df['doi'].tolist()
abstractList = df['abstract'].tolist()

# OpenIE model used by get_triples through the module-level name `predictor`.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz")

# Collect one frame per abstract and concatenate once at the end, rather
# than re-copying a growing frame on every iteration.
per_abstract_triples = []
for i in tqdm.tqdm(range(len(abstractList))):
    per_abstract_triples.append(get_abstract_triples(abstractList, doiList, i))

# pd.concat raises on an empty list, so fall back to an empty frame.
# ignore_index is left off: row labels restart at 0 for each abstract.
All_triples = pd.concat(per_abstract_triples) if per_abstract_triples else pd.DataFrame()

# Write the result once, after the loop, instead of rewriting the whole
# file on every iteration.
All_triples.to_csv('/content/sample_data/All_triples.csv')

100%|██████████| 491/491 [13:32<00:00,  1.65s/it]


In [None]:
# Save the collected triples. The frame is named All_triples; `ll_triples`
# is undefined and raises NameError.
All_triples.to_csv('/content/sample_data/All_triples.csv')

Unnamed: 0,doi,Subject,Relation,Object
0,10.1016/j.envint.2013.02.005,C1418481,depending,C1418481
0,10.1016/j.parkreldis.2022.07.005,C0243144,showed,C0589120
1,10.1016/j.parkreldis.2022.07.005,C0205329,showed,C0589120
2,10.1016/j.parkreldis.2022.07.005,C0301630,showed,C0589120
3,10.1016/j.parkreldis.2022.07.005,C0243144,showed,C0589120
...,...,...,...,...
1,10.1016/j.envint.2017.07.024,C2919405,contained,C0086045
2,10.1016/j.envint.2017.07.024,C2919405,contained,C0599638
3,10.1016/j.envint.2017.07.024,C2919405,contained,C1418482
4,10.1016/j.envint.2017.07.024,C2919405,contained,C0086045
