<a href="https://colab.research.google.com/github/akash-kaul/Using-scispaCy-for-Named-Entity-Recognition/blob/master/scispaCyNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages

In [0]:
!pip install -U spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_craft_md-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bionlp13cg_md-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz

# Import Packages

In [0]:
import scispacy
import spacy

#Core models
import en_core_sci_sm
import en_core_sci_lg

#NER specific models
import en_ner_craft_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_bionlp13cg_md

#Tools for extracting & displaying data
from spacy import displacy
import pandas as pd


# Read in Single Text (Test Run)

In [0]:
#Read in csv file
meta_df = pd.read_csv("/content/sample.csv")

#Pick specific abstract to use (row 0, column "abstract")
text = meta_df.loc[0, "abstract"]

#A blank abstract is read by pandas as NaN (a float), which the pipeline cannot
#process; stop here with a clear message instead of failing inside spaCy
if not isinstance(text, str):
    raise ValueError(
        "Row 0 has no abstract text; choose a row with a non-empty abstract."
    )

#Load specific model and pass text through
nlp = en_ner_jnlpba_md.load()
doc = nlp(text)

#Display resulting entity extraction
#(with jupyter=True the markup is shown inline and render() returns None,
#so the return value is not stored in a variable)
displacy.render(doc, style="ent", jupyter=True)


# Load Models

In [0]:
  #Instantiate each NER pipeline once; the add_* helper functions read these globals
  nlp_cr, nlp_bc, nlp_bi, nlp_jn = (
      model_package.load()
      for model_package in (
          en_ner_craft_md,
          en_ner_bc5cdr_md,
          en_ner_bionlp13cg_md,
          en_ner_jnlpba_md,
      )
  )

# Methods to add entity/value pairs to table

In [0]:
def add_cr(abstractList, doiList, nlp=None):
    """Collect the entities found in each abstract into a column-oriented table.

    By default the abstracts are processed with the module-level ``nlp_cr``
    pipeline (loaded from ``en_ner_craft_md``).

    Args:
        abstractList: Iterable of abstract strings to run through the pipeline.
        doiList: Sequence of identifiers positionally aligned with
            ``abstractList``; ``doiList[i]`` labels the i-th abstract.
        nlp: Optional pipeline object exposing ``pipe(texts)`` that yields
            documents with an ``ents`` attribute. When omitted, ``nlp_cr`` is
            used, so existing two-argument calls behave as before.

    Returns:
        dict with the keys ``"doi"``, ``"Entity"`` and ``"Class"``. Each maps
        to a list holding one entry per extracted entity.

    Raises:
        IndexError: If ``doiList`` is a list with fewer items than the number
            of documents produced from ``abstractList``.
    """
    # Resolve the default at call time so the global model only has to exist
    # when no pipeline is passed in explicitly.
    pipeline = nlp_cr if nlp is None else nlp
    table = {"doi": [], "Entity": [], "Class": []}
    for index, doc in enumerate(pipeline.pipe(abstractList)):
        doi = doiList[index]
        for ent in doc.ents:
            table["doi"].append(doi)
            table["Entity"].append(ent.text)
            table["Class"].append(ent.label_)
    return table


In [0]:
def add_bc(abstractList, doiList, nlp=None):
    """Collect the entities found in each abstract into a column-oriented table.

    By default the abstracts are processed with the module-level ``nlp_bc``
    pipeline (loaded from ``en_ner_bc5cdr_md``).

    Args:
        abstractList: Iterable of abstract strings to run through the pipeline.
        doiList: Sequence of identifiers positionally aligned with
            ``abstractList``; ``doiList[i]`` labels the i-th abstract.
        nlp: Optional pipeline object exposing ``pipe(texts)`` that yields
            documents with an ``ents`` attribute. When omitted, ``nlp_bc`` is
            used, so existing two-argument calls behave as before.

    Returns:
        dict with the keys ``"doi"``, ``"Entity"`` and ``"Class"``. Each maps
        to a list holding one entry per extracted entity.

    Raises:
        IndexError: If ``doiList`` is a list with fewer items than the number
            of documents produced from ``abstractList``.
    """
    # Resolve the default at call time so the global model only has to exist
    # when no pipeline is passed in explicitly.
    pipeline = nlp_bc if nlp is None else nlp
    table = {"doi": [], "Entity": [], "Class": []}
    for index, doc in enumerate(pipeline.pipe(abstractList)):
        doi = doiList[index]
        for ent in doc.ents:
            table["doi"].append(doi)
            table["Entity"].append(ent.text)
            table["Class"].append(ent.label_)
    return table

In [0]:
def add_jn(abstractList, doiList, nlp=None):
    """Collect the entities found in each abstract into a column-oriented table.

    By default the abstracts are processed with the module-level ``nlp_jn``
    pipeline (loaded from ``en_ner_jnlpba_md``).

    Args:
        abstractList: Iterable of abstract strings to run through the pipeline.
        doiList: Sequence of identifiers positionally aligned with
            ``abstractList``; ``doiList[i]`` labels the i-th abstract.
        nlp: Optional pipeline object exposing ``pipe(texts)`` that yields
            documents with an ``ents`` attribute. When omitted, ``nlp_jn`` is
            used, so existing two-argument calls behave as before.

    Returns:
        dict with the keys ``"doi"``, ``"Entity"`` and ``"Class"``. Each maps
        to a list holding one entry per extracted entity.

    Raises:
        IndexError: If ``doiList`` is a list with fewer items than the number
            of documents produced from ``abstractList``.
    """
    # Resolve the default at call time so the global model only has to exist
    # when no pipeline is passed in explicitly.
    pipeline = nlp_jn if nlp is None else nlp
    table = {"doi": [], "Entity": [], "Class": []}
    for index, doc in enumerate(pipeline.pipe(abstractList)):
        doi = doiList[index]
        for ent in doc.ents:
            table["doi"].append(doi)
            table["Entity"].append(ent.text)
            table["Class"].append(ent.label_)
    return table

In [0]:
def add_bi(abstractList, doiList, nlp=None):
    """Collect the entities found in each abstract into a column-oriented table.

    By default the abstracts are processed with the module-level ``nlp_bi``
    pipeline (loaded from ``en_ner_bionlp13cg_md``).

    Args:
        abstractList: Iterable of abstract strings to run through the pipeline.
        doiList: Sequence of identifiers positionally aligned with
            ``abstractList``; ``doiList[i]`` labels the i-th abstract.
        nlp: Optional pipeline object exposing ``pipe(texts)`` that yields
            documents with an ``ents`` attribute. When omitted, ``nlp_bi`` is
            used, so existing two-argument calls behave as before.

    Returns:
        dict with the keys ``"doi"``, ``"Entity"`` and ``"Class"``. Each maps
        to a list holding one entry per extracted entity.

    Raises:
        IndexError: If ``doiList`` is a list with fewer items than the number
            of documents produced from ``abstractList``.
    """
    # Resolve the default at call time so the global model only has to exist
    # when no pipeline is passed in explicitly.
    pipeline = nlp_bi if nlp is None else nlp
    table = {"doi": [], "Entity": [], "Class": []}
    for index, doc in enumerate(pipeline.pipe(abstractList)):
        doi = doiList[index]
        for ent in doc.ents:
            table["doi"].append(doi)
            table["Entity"].append(ent.text)
            table["Class"].append(ent.label_)
    return table

# Read in Entire File (Main Function)

In [0]:
#Load the source CSV
meta_df = pd.read_csv("/content/sample.csv")

#Keep only the rows that actually have an abstract
df = meta_df.dropna(subset=["abstract"])

#Pull the two columns out as parallel Python lists
abstractList = df["abstract"].tolist()
doiList = df["doi"].tolist()

#Build the entity table with ONE model per run (each one takes ~20 min);
#to use a different model, swap in one of the alternative calls below
table = add_cr(abstractList, doiList)
# table = add_bc(abstractList, doiList)
# table = add_bi(abstractList, doiList)
# table = add_jn(abstractList, doiList)

#Export the normalized entity/value pairs as a CSV file
trans_df = pd.DataFrame(data=table)
trans_df.to_csv("Entity_all.csv", index=False)