# Import the existing Indonesian model

In [33]:
import spacy

In [34]:
from spacy.lang.id import Indonesian

In [4]:
# Instantiate spaCy's built-in Indonesian language class; no trained components are added here.
nlp = Indonesian()

# Save it to a folder so we can modify and extend it outside of mainline spaCy

In [5]:
# Write the pipeline to the local 'Indo' directory, then rebind `nlp` to the copy
# loaded back from that directory.
nlp.to_disk('Indo')
nlp = spacy.load('Indo')

# Tokenization

In [6]:
# Tokenize a four-word sample sentence and confirm the normalized form of each token.
doc = nlp("Saya berasal dari Australia")
tokens = [tok.norm_ for tok in doc]
assert tokens == ['saya','berasal','dari','australia']

# Lemmatization

In [50]:
# Collect the lemma string of every token in the sample sentence.
lemmas = [token.lemma_ for token in doc]

In [51]:
# Count check on the lemma list: one lemma is expected for each token of the
# four-word sample sentence. A plain conditional is used rather than an assert
# inside a bare `except:` so that unrelated errors are never silently swallowed.
lemma_count = len(lemmas)
if lemma_count == 4:
    print("There are 4 lemmas")
else:
    print("Lemmatization didn't work. There are only {} lemmas".format(lemma_count))

There are 4 lemmas


In [52]:
# 'berasal' is the second token (index 1); its lemma is expected to be the root 'asal'.
assert (lemmas[1] == 'asal'),"Berasal should have been lemmatized to asal. Instead it was {}".format(lemmas[1])

# Parts of Speech

In [10]:
# Collect the part-of-speech tag string (`pos_`) of every token in the sample sentence.
pos = [token.pos_ for token in doc]

In [11]:
pos

['', '', '', '']

In [22]:
# 'Australia' (index 3) should be tagged as a proper noun. The check fails at
# this point in the notebook, so the failure is caught and displayed instead of
# raising — an uncaught exception would stop a Restart & Run All before the
# training cells further down.
try:
    assert pos[3] == 'PROPN', '{} is not a proper noun, but it should be if the POS tagging worked'.format(doc[3])
except AssertionError as err:
    print('AssertionError: {}'.format(err))

AssertionError: Australia is not a proper noun, but it should be if the POS tagging worked

# Compare with the English model

In [13]:
# Load the packaged small English pipeline and run it on the English equivalent
# of the sample sentence, for comparison with the Indonesian output above.
import en_core_web_sm
English = en_core_web_sm.load()
eng_doc = English("I am from Australia")

In [14]:
# Display the POS tags the English pipeline assigns to the comparison sentence.
[token.pos_ for token in eng_doc]

['PRON', 'VERB', 'ADP', 'PROPN']

# Train a new model on the Universal Dependencies treebank (UD_Indonesian-GSD)

In [15]:
import pyconll

In [16]:
# Load the training split of the UD_Indonesian-GSD treebank from its CoNLL-U file.
train = pyconll.load_from_file('UD_Indonesian-GSD/id_gsd-ud-train.conllu')

In [21]:
# Preview the first training sentence: one "FORM UPOS" pair per line.
for sentence in train[0:1]:
    for token in sentence:
        print("{} {}".format(token.form, token.upos))

Sembungan PROPN
adalah AUX
sebuah DET
desa NOUN
yang PRON
terletak VERB
di ADP
kecamatan NOUN
Kejajar PROPN
, PUNCT
kabupaten NOUN
Wonosobo PROPN
, PUNCT
Jawa PROPN
Tengah PROPN
, PUNCT
Indonesia PROPN
. PUNCT


In [26]:
!python -m spacy convert 'UD_Indonesian-GSD/id_gsd-ud-train.conllu' 'Spacy_Universal_Dependencies_Indonesian' --converter conllu
!python -m spacy convert 'UD_Indonesian-GSD/id_gsd-ud-test.conllu' 'Spacy_Universal_Dependencies_Indonesian' --converter conllu
!python -m spacy convert 'UD_Indonesian-GSD/id_gsd-ud-dev.conllu' 'Spacy_Universal_Dependencies_Indonesian' --converter conllu

[38;5;2m✔ Generated output file (4477 documents)[0m
Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-train.json
[38;5;2m✔ Generated output file (557 documents)[0m
Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-test.json
[38;5;2m✔ Generated output file (559 documents)[0m
Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-dev.json


In [76]:
# Train a blank 'id' pipeline (tagger, parser, ner) on the converted GSD train/dev JSON;
# the resulting models are written under ./models.
# NOTE(review): `-n 1` runs a single training iteration (one row in the log below), and the
# NER loss stays at 0.000 — presumably the UD data carries no entity annotations; confirm
# whether `ner` belongs in this pipeline.
!python -m spacy train -n 1 id --pipeline tagger,parser,ner models 'Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-train.json' 'Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-dev.json'

Training pipeline: ['tagger', 'parser', 'ner']
Starting with blank model 'id'
Counting training words (limit=0)

Itn    Dep Loss    NER Loss      UAS    NER P    NER R    NER F    Tag %  Token %  CPU WPS  GPU WPS
---  ----------  ----------  -------  -------  -------  -------  -------  -------  -------  -------
  0   83473.979       0.000   78.112    0.000    0.000    0.000   87.274  100.000     6070        0
[38;5;2m✔ Saved model to output directory[0m
models/model-final
[2K[38;5;2m✔ Created best model[0m
models/model-best


### Let's have a look

In [77]:
# Rebind `nlp` to the best model produced by the training run above.
nlp = spacy.load('models/model-best/')

In [78]:
# Save the trained pipeline to the same 'Indo' directory the untrained pipeline was written to earlier.
nlp.to_disk('Indo')

In [79]:
# Re-run the same sample sentence through the newly trained pipeline.
doc = nlp("Saya berasal dari Australia")

In [80]:
# For each token, print its text and POS tag on separate lines, followed by the
# lemma whenever it differs from the surface text.
for token in doc:
    print(token, token.pos_, sep="\n")
    if token.lemma_ == token.text:
        continue
    print("Lemmatized to {}".format(token.lemma_))

Saya
PRON
berasal
VERB
Lemmatized to asal
dari
ADP
Australia
NOUN


### Seems to have worked

In [82]:
token = doc[0]

In [83]:
# NOTE(review): ent_id_ comes back empty here; if the goal was to inspect the predicted
# entity label, `ent_type_` is the attribute that holds it — confirm intent.
token.ent_id_

''

In [46]:
# Re-check the POS tag of 'Australia' (index 3) with the trained pipeline. The
# check still fails at this point, so the failure is caught and displayed rather
# than raised — an uncaught exception would stop a Restart & Run All before the
# NER section that follows.
pos = [token.pos_ for token in doc]
try:
    assert pos[3] == 'PROPN', '{} is not a proper noun, but it should be if the POS tagging worked. It is tagged as a {}'.format(doc[3], pos[3])
except AssertionError as err:
    print('AssertionError: {}'.format(err))

AssertionError: Australia is not a proper noun, but it should be if the POS tagging worked. It is tagged as a NOUN

### Hmm, still not tagging Australia as a proper noun

# Now let's add NER

In [181]:
# Validate the NER training and evaluation JSON before training on it.
# NOTE(review): these JSON files are written by the `spacy convert` calls in the next cell,
# so on a fresh run this cell needs them to exist already — confirm the intended cell order.
!python -m spacy debug-data id 'Spacy_NER_Indonesian/training_data.json' 'Spacy_NER_Indonesian/testing_data.json'

[1m
[2K[38;5;2m✔ Loaded training_data.json[0m
[2K[38;5;2m✔ Loaded testing_data.json[0m
[38;5;2m✔ Training data JSON format is valid[0m
[38;5;2m✔ Development data JSON format is valid[0m
[2K[38;5;2m✔ Corpus is loadable[0m
[1m
Training pipeline: tagger, parser, ner
Starting with blank model 'id'
30969 training docs
7715 evaluation docs
[38;5;3m⚠ 1593 training examples also in evaluation data[0m
[1m
[38;5;4mℹ 30969 total words in the data (7402 unique)[0m
[38;5;4mℹ No word vectors present in the model[0m
[1m
[38;5;4mℹ 3 new labels, 0 existing labels[0m
0 missing values (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[1m
[38;5;4mℹ 2 labels in data (85 labels in tag map)[0m
[38;5;1m✘ Label '-' not found in tag map for language 'id'[0m
[38;5;1m✘ Label '' not found in tag map for language 'i

In [182]:
# Convert the .iob NER files to spaCy JSON, written into Spacy_NER_Indonesian.
!python -m spacy convert 'training_data.iob' 'Spacy_NER_Indonesian'
!python -m spacy convert 'testing_data.iob' 'Spacy_NER_Indonesian'
# Remove any previous output so training starts from an empty ner_models directory.
!rm -rf ner_models
# Train an NER-only pipeline from a blank 'id' model (`-n 100`), evaluating on testing_data.json.
# NOTE(review): the debug-data output earlier reports 1593 training examples that also appear
# in the evaluation data, so the NER scores below may be optimistic — confirm the split.
!python -m spacy train -n 100 \
id \
--pipeline ner \
ner_models \
'Spacy_NER_Indonesian/training_data.json' \
'Spacy_NER_Indonesian/testing_data.json'

[38;5;2m✔ Generated output file (30969 documents)[0m
Spacy_NER_Indonesian/training_data.json
[38;5;2m✔ Generated output file (7715 documents)[0m
Spacy_NER_Indonesian/testing_data.json
Training pipeline: ['ner']
Starting with blank model 'id'
Counting training words (limit=0)

Itn    Dep Loss    NER Loss      UAS    NER P    NER R    NER F    Tag %  Token %  CPU WPS  GPU WPS
---  ----------  ----------  -------  -------  -------  -------  -------  -------  -------  -------
  0       0.000    5539.709    0.000   71.782   67.532   69.592   19.482  100.000     6104        0
  1       0.000    2753.326    0.000   72.582   69.395   70.952   19.482  100.000     6819        0
  2       0.000    2021.067    0.000   74.082   71.124   72.573   19.482  100.000     6826        0
  3       0.000    1616.341    0.000   72.771   70.060   71.390   19.482  100.000     6632        0
  4       0.000    1402.029    0.000   71.468   68.995   70.210   19.482  100.000     6522        0
  5       0.000    

In [183]:
# Rebind `nlp` to the best model from the NER-only training run (pipeline: ner).
nlp = spacy.load('ner_models/model-best/')

In [184]:
# Run the NER model on a multi-sentence Indonesian sample passage.
doc = nlp('''Wabah virus corona yang menyebar di sejumlah negara direspons secara global, termasuk Indonesia. Sejauh ini, pemerintah menyebutkan bahwa virus corona belum masuk ke Indonesia meski sudah banyak pasien yang diduga terinfeksi virus dengan nama 2019-nCoV. Namun hasil pemeriksaannya pun sejauh ini selalu negatif. Menteri Koordinator Pembangunan Manusia dan Kebudayaan (Menko PMK) Muhadjir Effendy menegaskan, Indonesia masih aman dari virus corona. ''')

In [185]:
# Visualize the entity spans predicted for the Indonesian passage.
spacy.displacy.render(doc, style='ent')

In [186]:
# Load the small English pipeline for a side-by-side NER comparison
# (the same package was already loaded earlier via en_core_web_sm.load() as `English`).
eng = spacy.load("en_core_web_sm")

In [187]:
# Run the English pipeline on an English rendering of the passage; this rebinds `eng_doc`.
# NOTE(review): this text ends with an extra sentence ("This was conveyed by Muhadjir ...")
# that the Indonesian passage above does not contain, so the two renderings are not
# strictly like-for-like.
eng_doc = eng('''The corona virus outbreak that spread in a number of countries responded globally, including Indonesia. So far, the government has stated that the corona virus has not yet entered Indonesia even though there have been many patients suspected of being infected with the virus with the name 2019-nCoV. But the results of the examination so far have always been negative. The Coordinating Minister for Human Development and Culture (Menko PMK) Muhadjir Effendy stressed that Indonesia was still safe from the corona virus. This was conveyed by Muhadjir after holding a coordination meeting with the Minister of Health, the Minister of Foreign Affairs, the Minister of Communication and Information, the Minister of Maritime Affairs and Fisheries and the leadership of the BNPB and BPOM discussing the spread of the corona virus, Tuesday (1/28/2020).
''')

In [188]:
# Visualize the entity spans predicted by the English pipeline for comparison.
spacy.displacy.render(eng_doc, style='ent')