<a href="https://colab.research.google.com/github/akash-kaul/Using-scispaCy-for-Named-Entity-Recognition/blob/master/scispaCyNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Preview

In [1]:
import pandas as pd

#Read in csv file (hard-coded path: the file must exist at /content/sample.csv)
meta_df = pd.read_csv("/content/sample.csv")
#Last expression of the cell, so the notebook displays the first rows as a preview
meta_df.head()

Unnamed: 0,title,doi,license,abstract,publish_time,authors,journal,url
0,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,green-oa,Nidovirus subgenomic mRNAs contain a leader se...,12/17/01,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...
1,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,green-oa,CEACAM1 is a member of the carcinoembryonic an...,5/1/02,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...
2,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,no-cc,Hepatitis C virus (HCV) is an important human ...,7/16/01,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,Structure of coronavirus main proteinase revea...,10.1093/emboj/cdf327,green-oa,The key enzyme in coronavirus polyprotein proc...,7/1/02,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,http://europepmc.org/articles/pmc126080?pdf=re...
4,Discontinuous and non-discontinuous subgenomic...,10.1093/emboj/cdf635,green-oa,"Arteri-, corona-, toro- and roniviruses are ev...",12/1/02,"van Vliet, A.L.W.; Smits, S.L.; Rottier, P.J.M...",The EMBO Journal,http://europepmc.org/articles/pmc136939?pdf=re...


# Install Packages

In [1]:
!pip install -U spacy
!pip install scispacy

## Install scispaCy models
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz

Requirement already up-to-date: spacy in /usr/local/lib/python3.7/dist-packages (3.0.5)
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
[?25l  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz (15.6MB)
[K     |████████████████████████████████| 15.6MB 34.4MB/s 
Building wheels for collected packages: en-core-sci-sm
  Building wheel for en-core-sci-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-sci-sm: filename=en_core_sci_sm-0.4.0-cp37-none-any.whl size=15660355 sha256=3516e0d8bc9cef405a4e8183ae82e265090c4a9ce1f16f90861cdb1137da6c88
  Stored in directory: /root/.cache/pip/wheels/f9/f0/40/7b2fce8bf7438ab151361245b1e91d4dc78e690189e8d83271
Successfully built en-core-sci-sm
Installing collected packages: en-core-sci-sm
  Found existing installation: en-core-sci-sm 0.2.4
    Uninstalling en-core-sci-sm-0.2.4:
      Successfully uninstalled en-core-sci-sm-0.2.4

# Import Packages

In [2]:
import scispacy
import spacy

#Core models
import en_core_sci_sm
import en_core_sci_lg

#NER specific models
import en_ner_craft_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_bionlp13cg_md

#Tools for extracting & displaying data
from spacy import displacy
import pandas as pd


# Read in Single Text (Test Run)

In [5]:
#Read in csv file
meta_df = pd.read_csv("/content/sample.csv")

#Pick specific abstract to use (row 0, column "abstract")
#NOTE(review): assumes row 0 has a non-empty abstract -- confirm for other input files
text = meta_df.loc[0, "abstract"]

#Load specific model and pass text through
nlp = en_core_sci_lg.load()
doc = nlp(text)

#Display resulting entity extraction.
#With jupyter=True displacy renders the markup inline and returns None, so the
#result is deliberately not bound to a name (it would only ever hold None).
displacy.render(doc, style='ent', jupyter=True)


# Load Models

In [None]:
  #Load the four NER-specific scispaCy models imported above.
  #Each load() result is kept in a module-level name read by the matching add_* function:
  #nlp_cr -> add_cr, nlp_bc -> add_bc, nlp_bi -> add_bi, nlp_jn -> add_jn.
  #NOTE(review): this cell body carries a stray two-space indent -- presumably tolerated by
  #IPython's cell dedenting, but it would be an IndentationError as a plain .py script.
  nlp_cr = en_ner_craft_md.load()
  nlp_bc = en_ner_bc5cdr_md.load()
  nlp_bi = en_ner_bionlp13cg_md.load()
  nlp_jn = en_ner_jnlpba_md.load()

# Methods to add entity/value pairs to table

In [None]:
def add_cr(abstractList, doiList):
    """Collect the entities the `nlp_cr` pipeline finds in each abstract.

    `nlp_cr` is the module-level pipeline loaded from `en_ner_craft_md`.

    Args:
        abstractList: texts handed to `nlp_cr.pipe`, processed in order.
        doiList: indexable collection; the item at position k is recorded for
            every entity found in the k-th text.

    Returns:
        A dict of three equal-length lists under the keys "doi", "Entity" and
        "Class", holding one entry per extracted entity.

    Raises:
        IndexError: if a text is processed whose position has no entry in
            `doiList`.
    """
    dois, entities, classes = [], [], []
    for position, parsed in enumerate(nlp_cr.pipe(abstractList)):
        # Looked up once per document, even when the document has no entities.
        current_doi = doiList[position]
        for ent in parsed.ents:
            dois.append(current_doi)
            entities.append(ent.text)
            classes.append(ent.label_)
    return {"doi": dois, "Entity": entities, "Class": classes}


In [None]:
def add_bc(abstractList, doiList):
    """Collect the entities the `nlp_bc` pipeline finds in each abstract.

    `nlp_bc` is the module-level pipeline loaded from `en_ner_bc5cdr_md`.

    Args:
        abstractList: texts handed to `nlp_bc.pipe`, processed in order.
        doiList: indexable collection; the item at position k is recorded for
            every entity found in the k-th text.

    Returns:
        A dict of three equal-length lists under the keys "doi", "Entity" and
        "Class", holding one entry per extracted entity.

    Raises:
        IndexError: if a text is processed whose position has no entry in
            `doiList`.
    """
    dois, entities, classes = [], [], []
    for position, parsed in enumerate(nlp_bc.pipe(abstractList)):
        # Looked up once per document, even when the document has no entities.
        current_doi = doiList[position]
        for ent in parsed.ents:
            dois.append(current_doi)
            entities.append(ent.text)
            classes.append(ent.label_)
    return {"doi": dois, "Entity": entities, "Class": classes}

In [None]:
def add_jn(abstractList, doiList):
    """Collect the entities the `nlp_jn` pipeline finds in each abstract.

    `nlp_jn` is the module-level pipeline loaded from `en_ner_jnlpba_md`.

    Args:
        abstractList: texts handed to `nlp_jn.pipe`, processed in order.
        doiList: indexable collection; the item at position k is recorded for
            every entity found in the k-th text.

    Returns:
        A dict of three equal-length lists under the keys "doi", "Entity" and
        "Class", holding one entry per extracted entity.

    Raises:
        IndexError: if a text is processed whose position has no entry in
            `doiList`.
    """
    dois, entities, classes = [], [], []
    for position, parsed in enumerate(nlp_jn.pipe(abstractList)):
        # Looked up once per document, even when the document has no entities.
        current_doi = doiList[position]
        for ent in parsed.ents:
            dois.append(current_doi)
            entities.append(ent.text)
            classes.append(ent.label_)
    return {"doi": dois, "Entity": entities, "Class": classes}

In [None]:
def add_bi(abstractList, doiList):
    """Collect the entities the `nlp_bi` pipeline finds in each abstract.

    `nlp_bi` is the module-level pipeline loaded from `en_ner_bionlp13cg_md`.

    Args:
        abstractList: texts handed to `nlp_bi.pipe`, processed in order.
        doiList: indexable collection; the item at position k is recorded for
            every entity found in the k-th text.

    Returns:
        A dict of three equal-length lists under the keys "doi", "Entity" and
        "Class", holding one entry per extracted entity.

    Raises:
        IndexError: if a text is processed whose position has no entry in
            `doiList`.
    """
    dois, entities, classes = [], [], []
    for position, parsed in enumerate(nlp_bi.pipe(abstractList)):
        # Looked up once per document, even when the document has no entities.
        current_doi = doiList[position]
        for ent in parsed.ents:
            dois.append(current_doi)
            entities.append(ent.text)
            classes.append(ent.label_)
    return {"doi": dois, "Entity": entities, "Class": classes}

# Read in Entire File (Main Function)

In [None]:
#Load the metadata CSV
meta_df = pd.read_csv("/content/sample.csv")

#Keep only the rows that actually have an abstract
df = meta_df[meta_df['abstract'].notna()]

#Parallel lists: the DOI at each position belongs to the abstract at the same position
abstractList = df['abstract'].tolist()
doiList = df['doi'].tolist()

#Add all entity value pairs to table (run one at a time, each one takes ~20 min).
#To use a different model, swap the active call below for one of the commented
#alternatives; every run writes to the same Entity_pairings.csv file.
table = add_cr(abstractList, doiList)

# table = add_bc(abstractList, doiList)

# table = add_bi(abstractList, doiList)

# table = add_jn(abstractList, doiList)

#Turn table into an exportable CSV file (one row per doi/entity/class triple, no index column)
trans_df = pd.DataFrame(table)
trans_df.to_csv("Entity_pairings.csv", index=False)