# Import spaCy's existing Indonesian language class

In [2]:
import spacy

In [3]:
from spacy.lang.id import Indonesian

In [4]:
# Create an instance of the Indonesian language class imported above;
# it is bound to `nlp` and used for the first experiments below.
nlp = Indonesian()

# Save it to a folder so we can modify and extend it outside of mainline spaCy

In [5]:
# Round-trip the pipeline through the 'Indo' folder: write it out, then
# rebind `nlp` to the copy loaded back from disk.
model_dir = 'Indo'
nlp.to_disk(model_dir)
nlp = spacy.load(model_dir)

# Tokenization

In [6]:
# Run the pipeline on a short Indonesian sentence (roughly "I come from Australia").
doc = nlp("Saya berasal dari Australia")

# Collect the normalized form (`norm_`) of every token.
tokens = [token.norm_ for token in doc]
expected_tokens = ['saya', 'berasal', 'dari', 'australia']

# Like the other checks in this notebook, report what was actually produced on failure.
assert tokens == expected_tokens, "Unexpected tokens: {}".format(tokens)

# Lemmatization

In [50]:
# One lemma string per token of the sentence parsed above.
lemmas = [tok.lemma_ for tok in doc]

In [51]:
# Sanity check: the four-word sentence should yield exactly four lemmas.
# A plain conditional is used instead of assert + bare `except:`, which would
# also swallow unrelated errors (including KeyboardInterrupt) and report them
# as a lemmatization failure.
lemma_count = len(lemmas)
if lemma_count == 4:
    print("There are 4 lemmas")
else:
    print("Lemmatization didn't work. There are only {} lemmas".format(lemma_count))

There are 4 lemmas


In [52]:
# The second token ("berasal") is expected to reduce to the root "asal".
berasal_lemma = lemmas[1]
assert berasal_lemma == 'asal', (
    "Berasal should have been lemmatized to asal. Instead it was {}".format(berasal_lemma)
)

# Parts of Speech

In [10]:
# Part-of-speech label (`pos_`) for every token in `doc`.
pos = [tok.pos_ for tok in doc]

In [11]:
# Display the collected tags.
pos

['', '', '', '']

In [22]:
# "Australia" (the fourth token) should be tagged as a proper noun.
# The tags collected so far are all empty, so this check is expected to fail.
# The failure is caught and printed so that it is still shown, but
# Restart & Run All can continue to the cells that follow.
try:
    assert pos[3] == 'PROPN', '{} is not a proper noun, but it should be if the POS tagging worked'.format(doc[3])
except AssertionError as err:
    print("AssertionError: {}".format(err))

AssertionError: Australia is not a proper noun, but it should be if the POS tagging worked

# Compare with the English model

In [13]:
# Load the packaged English model (`en_core_web_sm`) to compare its tagging
# with the Indonesian pipeline.
import en_core_web_sm
English = en_core_web_sm.load()
# English counterpart of the Indonesian test sentence.
eng_doc = English("I am from Australia")

In [14]:
# POS tags produced by the English pipeline, for comparison with the Indonesian result.
[tok.pos_ for tok in eng_doc]

['PRON', 'VERB', 'ADP', 'PROPN']

# Train a new model on the Universal Dependencies Indonesian GSD treebank

In [15]:
import pyconll

In [16]:
# Read the training split of the treebank (CoNLL-U format) with pyconll.
train_conllu_path = 'UD_Indonesian-GSD/id_gsd-ud-train.conllu'
train = pyconll.load_from_file(train_conllu_path)

In [21]:
# Peek at the first training sentence: print each word form next to its
# universal POS tag (`upos`).
first_sentence_only = train[0:1]
for sent in first_sentence_only:
    for word in sent:
        print(word.form, word.upos)

Sembungan PROPN
adalah AUX
sebuah DET
desa NOUN
yang PRON
terletak VERB
di ADP
kecamatan NOUN
Kejajar PROPN
, PUNCT
kabupaten NOUN
Wonosobo PROPN
, PUNCT
Jawa PROPN
Tengah PROPN
, PUNCT
Indonesia PROPN
. PUNCT


In [26]:
!python -m spacy convert 'UD_Indonesian-GSD/id_gsd-ud-train.conllu' 'Spacy_Universal_Dependencies_Indonesian' --converter conllu
!python -m spacy convert 'UD_Indonesian-GSD/id_gsd-ud-test.conllu' 'Spacy_Universal_Dependencies_Indonesian' --converter conllu
!python -m spacy convert 'UD_Indonesian-GSD/id_gsd-ud-dev.conllu' 'Spacy_Universal_Dependencies_Indonesian' --converter conllu

[38;5;2m✔ Generated output file (4477 documents)[0m
Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-train.json
[38;5;2m✔ Generated output file (557 documents)[0m
Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-test.json
[38;5;2m✔ Generated output file (559 documents)[0m
Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-dev.json


In [68]:
!python -m spacy train -n 10 id models 'Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-train.json' 'Spacy_Universal_Dependencies_Indonesian/id_gsd-ud-dev.json'

Training pipeline: ['tagger', 'parser', 'ner']
Starting with blank model 'id'
Counting training words (limit=0)

Itn    Dep Loss    NER Loss      UAS    NER P    NER R    NER F    Tag %  Token %  CPU WPS  GPU WPS
---  ----------  ----------  -------  -------  -------  -------  -------  -------  -------  -------
  0   83473.979       0.000   78.112    0.000    0.000    0.000   87.274  100.000     5887        0
  1   66288.797       0.000   80.813    0.000    0.000    0.000   89.423  100.000     8411        0
  2   62484.110       0.000   81.499    0.000    0.000    0.000   90.580  100.000     8537        0
  3   60027.811       0.000   82.109    0.000    0.000    0.000   91.254  100.000     8149        0
  4   57800.709       0.000   82.242    0.000    0.000    0.000   91.595  100.000     8171        0
  5   55631.824       0.000   82.646    0.000    0.000    0.000   91.881  100.000     7554        0
  6   55007.907       0.000   82.723    0.000    0.000    0.000   92.134  100.000     7

### Let's have a look

In [72]:
# Rebind `nlp` to the final model written by the training run above
# (this replaces the pipeline loaded from 'Indo').
nlp = spacy.load('models/model-final/')

In [73]:
# Re-run the same test sentence through the newly loaded pipeline so the
# results can be compared with the earlier run.
doc = nlp("Saya berasal dari Australia")

In [84]:
# For each token print its text and POS tag, and mention the lemma only
# when it differs from the surface form.
for tok in doc:
    print(tok)
    print(tok.pos_)
    lemma = tok.lemma_
    if lemma != tok.text:
        print("Lemmatized to {}".format(lemma))

Saya
PRON
berasal
VERB
Lemmatized to asal
dari
ADP
Australia
NOUN


### Seems to have worked: POS tags are now populated and "berasal" is lemmatized to "asal"

In [85]:
# Re-collect the POS tags from the trained pipeline and check "Australia"
# (the fourth token) again.
pos = [token.pos_ for token in doc]

# The trained model is expected to tag it PROPN. If it does not, print the
# assertion message instead of raising, so the finding is shown without
# aborting Restart & Run All.
try:
    assert pos[3] == 'PROPN', '{} is not a proper noun, but it should be if the POS tagging worked. It is tagged as a {}'.format(doc[3], pos[3])
except AssertionError as err:
    print("AssertionError: {}".format(err))

AssertionError: Australia is not a proper noun, but it should be if the POS tagging worked. It is tagged as a NOUN

### Hmm, still not tagging Australia as a proper noun