# 4. Named Entity Recognition

In [2]:
# Execute the project's __init__.py inside this notebook's namespace.
# NOTE(review): presumably this is where DATA_DIR (used further down) is
# defined — confirm against __init__.py.
%run __init__.py

In [3]:
import os
import pandas as pd



In [15]:
from spacy import displacy
from collections import Counter

class NamedEntityRecognizer:
    """Convenience wrapper around a spaCy model package for entity work.

    Args:
        spacy_model: An imported spaCy model package (any object exposing a
            ``load()`` callable that returns a pipeline). The loaded
            pipeline is stored as ``self.nlp``.
        disable: Collection of entity *labels* (not pipeline components) to
            exclude from extraction and visualization. ``None`` means no
            label is excluded.
    """

    def __init__(self, spacy_model, disable=None):
        self.nlp = spacy_model.load()
        # Avoid a shared mutable default: fall back to a fresh list.
        self.disable = disable if disable is not None else []

    def _is_allowed(self, ent):
        """Return True when the entity's label has not been disabled."""
        return ent.label_ not in self.disable

    def get_entities(self, text):
        """Return ``(entity_text, entity_label)`` pairs found in ``text``.

        Entities with a disabled label are skipped, as are entities whose
        text is two characters long or shorter.
        """
        doc = self.nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents
                if self._is_allowed(ent) and len(ent.text) > 2]

    def get_most_common_entities(self, text, n=10):
        """Return the ``n`` most frequent entity strings in ``text``.

        The result is a list of ``(entity_text, count)`` tuples ordered from
        most to least frequent; counting is done on the entity text only,
        so the same string under different labels is merged.
        """
        entity_texts = [entity_text
                        for entity_text, _label in self.get_entities(text)]
        return Counter(entity_texts).most_common(n)

    def visualize_entities(self, text, jupyter=True):
        """Render the entities of ``text`` with displaCy's ``ent`` style.

        Entities whose label is in ``self.disable`` are removed from the
        document before rendering, so the picture honours the same label
        filter as ``get_entities``. The short-text filter is not applied
        here.

        Returns:
            Whatever ``displacy.render`` returns. Per the spaCy docs that is
            the markup string when ``jupyter`` is False and ``None`` when
            the output is displayed inline in Jupyter.
        """
        doc = self.nlp(text)
        if self.disable:
            # Only rewrite doc.ents when there is something to hide, so the
            # default configuration renders exactly what the model produced.
            doc.ents = [ent for ent in doc.ents if self._is_allowed(ent)]
        return displacy.render(doc, jupyter=jupyter, style='ent')


## Agriculture

In [5]:
# Paths to the agriculture dataset folder and its pickled dataframe.
# DATA_DIR is not defined in this notebook's visible cells; it is expected to
# already be in scope when this cell runs.
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(AGRICULTURE_DATASET_DIR, 'pmc_dataframe.pkl')

# Load the dataframe and keep only the 'text_cleaned' column as an array of
# documents. NOTE: read_pickle executes pickle data — only load trusted files.
pmc_df = pd.read_pickle(PMC_FILE_PATH)
publications = pmc_df['text_cleaned'].values

In [29]:
# Model package handed to the recognizer, which calls its load().
# NOTE(review): presumably the scispaCy large scientific/biomedical model —
# confirm against the environment's installed packages.
import en_core_sci_lg

# Recognizer with no label filtering (disable defaults to None).
ner = NamedEntityRecognizer(en_core_sci_lg)

In [30]:
# Use the last document in the corpus as the running example.
text = publications[-1]

# Extract (entity_text, label) pairs and preview the first ten.
ents = ner.get_entities(text)
ents[:10]

[('Ectomycorrhizal', 'ENTITY'),
 ('fungi', 'ENTITY'),
 ('live', 'ENTITY'),
 ('symbiosis', 'ENTITY'),
 ('tree', 'ENTITY'),
 ('shrubs', 'ENTITY'),
 ('forest functioning', 'ENTITY'),
 ('biogeochemical cycles', 'ENTITY'),
 ('boreal forests', 'ENTITY'),
 ('carbon', 'ENTITY')]

In [31]:
# Most frequent entity strings in the example document (n defaults to 10).
ner.get_most_common_entities(text)

[('soil', 60),
 ('aestivum', 37),
 ('trees', 29),
 ('production', 28),
 ('DNA', 26),
 ('temperature', 25),
 ('species', 16),
 ('tree', 15),
 ('II-9', 14),
 ('melanosporum', 12)]

In [32]:
# Render entity highlights for the first 1500 characters only.
ner.visualize_entities(text[:1500])

In [26]:
# Second model package, for comparison on the same text excerpt.
# NOTE(review): presumably spaCy's general-purpose English model — confirm.
import en_core_web_md

# Entity labels handed to the recognizer as its `disable` list.
disallowed_types = ['CARDINAL', 'DATE', 'MONEY', 'ORDINAL', 'PERCENT', 'QUANTITY', 'TIME']
ner_basic = NamedEntityRecognizer(en_core_web_md, disable=disallowed_types)
ner_basic.visualize_entities(text[:1500])

In [24]:
# Third model package, rendered on the same excerpt.
# NOTE(review): presumably a scispaCy NER model trained on BC5CDR — confirm.
import en_ner_bc5cdr_md

# Rebinds `ner_basic`; any recognizer previously held under that name is
# replaced, and no label filter is passed this time.
ner_basic = NamedEntityRecognizer(en_ner_bc5cdr_md)
ner_basic.visualize_entities(text[:1500])

In [25]:
# Fourth model package, rendered on the same excerpt.
# NOTE(review): presumably a scispaCy NER model trained on CRAFT — confirm.
import en_ner_craft_md

# Rebinds `ner_basic` once more; no label filter is passed.
ner_basic = NamedEntityRecognizer(en_ner_craft_md)
ner_basic.visualize_entities(text[:1500])