## Install spaCy
- install spacy
- download the language package
  - web core large package (`en_core_web_lg`)

In [None]:
!pip install spacy -q
!python -m spacy download en_core_web_lg -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m842.4 kB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Text
- need a text to perform named entity recognition (NER)

In [None]:
# Sample biographical passage about Abraham Lincoln; it mentions a person,
# dates, places and an event, which makes it a convenient NER input.
text = """
Abraham Lincoln (/ˈlɪŋkən/ LINK-ən; February 12, 1809 – April 15, 1865) was an
American lawyer, politician, and statesman who served as the 16th president of
the United States from 1861 until his assassination in 1865. He led the
United States through the American Civil War, defending the nation as a
constitutional union, defeating the insurgent Confederacy, playing a major
role in the abolition of slavery, expanding the power of the federal government,
and modernizing the U.S. economy.
"""

## Perform NER
- convert the text to doc object
- find the entities from the doc object (doc.ents)
  - `ent.label_` -> entity type label (e.g. `PERSON`, `DATE`)
  - `ent.text` -> text of the identified entity

In [None]:
import spacy

# Run the pretrained large English pipeline over the sample passage.
nlp = spacy.load("en_core_web_lg")
doc = nlp(text)

# Group the distinct entity texts by entity label, keeping first-seen order.
items = {}
for ent in doc.ents:
  texts_for_label = items.setdefault(ent.label_, [])
  if ent.text not in texts_for_label:
    texts_for_label.append(ent.text)

# One line per label, e.g. "PERSON: ['Abraham Lincoln']".
for label, texts in items.items():
  print(f'{label}: {texts}')

PERSON: ['Abraham Lincoln']
DATE: ['February 12, 1809', 'April 15, 1865', '1861', '1865']
NORP: ['American']
ORDINAL: ['16th']
GPE: ['the United States', 'United States', 'U.S.']
EVENT: ['the American Civil War']
ORG: ['Confederacy']


## Rendering the identified entities

In [None]:
from spacy import displacy

# Visualise the entities of `doc` with displaCy's entity ("ent") style.
displacy.render(doc, style="ent")

## Train a custom NER pipeline using spaCy
- quick start guide: https://spacy.io/usage/training
- config.cfg file
  - generate initial configuration file from the quick start page
  - hyper parameters
- run `init fill-config` command
- run `train` command
- data
  - https://www.kaggle.com/datasets/finalepoch/medical-ner
  - required data format for spacy
    - dict
    - `{'text': 'text', 'entities': [(start, end, label)]}`



In [None]:
# Load the annotated corpus. The file is one JSON object; its 'examples' list
# holds the annotated documents.
import json

# Decode explicitly as UTF-8 rather than with the platform default encoding, so
# the text (and therefore the character offsets stored in the annotations) is
# read identically on every operating system.
with open('data/Corona2.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

# Show which fields each example carries. Only the last expression of a cell is
# displayed, so this is the single inspection statement kept here.
data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

### process the data
- as per the spacy requirements

In [None]:
# Reshape every raw example into {'text': ..., 'entities': [(start, end, LABEL), ...]}.
# Labels are upper-cased versions of the annotation's 'tag_name'.
training_data = [
  {
    'text': example['content'],
    'entities': [
      (annotation['start'], annotation['end'], annotation['tag_name'].upper())
      for annotation in example['annotations']
    ],
  }
  for example in data['examples']
]

print(training_data[0])

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]", 'entities': [(360, 371, 'MEDICINE'), (383, 408, 'MEDICINE'), (104, 112, 'MEDICALCONDITION'), (679,

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

# Convert `training_data` into spaCy's binary training format.
# For each example: text -> Doc, (start, end, label) triples -> entity Spans,
# then the finished Doc is stored in a DocBin that is written to 'train.spacy'.
nlp = spacy.blank('en') # blank English pipeline; only its tokenizer is used here
doc_bin = DocBin()

for training_example in tqdm(training_data):
  text = training_example['text']
  labels = training_example['entities']
  doc = nlp.make_doc(text) # generate a doc from the text (tokenization only)
  ents = [] # entity spans collected for this doc
  for start, end, label in labels:
    # Map the character offsets onto token boundaries. With
    # alignment_mode='contract' the span is shrunk to the tokens lying fully
    # inside [start, end); char_span returns None when nothing remains.
    span = doc.char_span(start, end, label=label, alignment_mode='contract')
    if span is None: # offsets could not be aligned to any token
      print('skipping entity')
    else:
      ents.append(span) # add span to the entities
  # Finalize the doc once, after ALL of its annotations were processed.
  # Doing this inside the entity loop would add the same doc to the DocBin once
  # per annotation (with only partial entities each time) and would never add
  # a doc that has no annotations.
  doc.ents = filter_spans(ents) # drop duplicate/overlapping spans (doc.ents must not overlap)
  doc_bin.add(doc)

doc_bin.to_disk('train.spacy')

 74%|███████▍  | 23/31 [00:00<00:00, 101.48it/s]

skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity


100%|██████████| 31/31 [00:00<00:00, 112.51it/s]


skipping entity


### config file
- config.cfg
- https://spacy.io/usage/training#quickstart
- create a file base_config.cfg
- copy the config and save it to the file

In [None]:
# Expand the partial base_config.cfg into a complete config.cfg (the command
# reports "Auto-filled config with all values" on success).
# base_config.cfg must already exist in the working directory.
! python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train the pipeline described by config.cfg; `--output ./` saves the results
# into the current directory (a later cell loads './model-best' from there).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R columns are presumably measured on the training data
# itself and will overstate real accuracy — TODO supply a held-out dev.spacy.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     76.71    1.86    0.99   14.50    0.02
  0     200        255.15   4397.39   45.29   51.04   40.70    0.45
  1     400        126.23   2032.84   61.80   66.50   57.73    0.62
  2     600         61.28   1960.53   71.71   68.94   74.72    0.72
  2     800         83.53   1801.73   68.95   68.38   69.52    0.69
  3    1000         53.86   1657.82   71.14   73.72   68.73    0.71
  4    1200         64.99   1763.98   62.73   80.16   51.53    0.63
  5    1400         91.99   1813.12   67.75   74.66   62.01    0.68
  6    1600         85.56   1941.82   49.97   86.37   35.15    0.50
  7    1800        103.68   2083.13   74.24   70.09

In [None]:
# Load the best checkpoint written by the training run.
nlp_ner = spacy.load("./model-best")

# Short medical passage to try the custom entity recognizer on.
text = """Diabetes mellitus refers to a group of diseases that affect how the
body uses blood sugar (glucose). Glucose is an important source of energy for
the cells that make up the muscles and tissues. It's also the brain's main
source of fuel."""

doc = nlp_ner(text)

# Highlight the entities predicted by the custom model.
spacy.displacy.render(doc, style="ent")

