In [1]:
import spacy

### English trial

In [3]:
nlp_en = spacy.load('en')

In [6]:
par_en = (
    'After an uneventful first half, Romelu Lukaku gave United the lead on 55 minutes with a close-range volley.'
    'Sanchez was then fouled by Huddersfield defender Michael Hefele to win a penalty and the Chilean, a January signing from Arsenal, stepped up to take the spot-kick.'
    'The forward saw his low shot saved by Jonas Lossl, but made no mistake with the rebound to double United\'s lead on his home debut.'
)

doc_en = nlp_en(par_en)

for sentence in doc_en.sents:
    print(sentence)

After an uneventful first half, Romelu Lukaku gave United the lead on 55 minutes with a close-range volley.
Sanchez was then fouled by Huddersfield defender Michael Hefele to win a penalty and the Chilean, a January signing from Arsenal, stepped up to take the spot-kick.
The forward saw his low shot saved by Jonas Lossl, but made no mistake with the rebound to double United's lead on his home debut.


In [5]:
sent

The forward saw his low shot saved by Jonas Lossl, but made no mistake with the rebound to double United's lead on his home debut.

### In Bahasa

In [10]:
nlp = spacy.blank('id')

In [4]:
nlp = spacy.load('id')

OSError: [E050] Can't find model 'id'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

### Tokenizer

In [5]:
s = 'Galaxy Note 8, flagship terbaru dari Samsung, bisa ditebus dengan harga 11 juta rupiah (cashback 1 juta).'
doc = nlp(s)
print(type(doc))
print(type(doc[0]))

<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.token.Token'>


In [6]:
for i, token in enumerate(doc):
    print(f'token-{i}', token)

token-0 Galaxy
token-1 Note
token-2 8
token-3 ,
token-4 flagship
token-5 terbaru
token-6 dari
token-7 Samsung
token-8 ,
token-9 bisa
token-10 ditebus
token-11 dengan
token-12 harga
token-13 11
token-14 juta
token-15 rupiah
token-16 (
token-17 cashback
token-18 1
token-19 juta
token-20 )
token-21 .


In [7]:
par = ('Seiring perkembangan, kebutuhan kini semakin mahal saja harganya. '
'Lalu apa yang bisa kita lakukan? '
'Alih-alih mengeluh sepanjang hari dan menyalahkan banyak orang, kini Anda harus memulai perubahan pada kehidupan Anda.'
)
print(par)

doc_par = nlp(par)
for sentence in doc_par.sents:
    print(sentence)

Seiring perkembangan, kebutuhan kini semakin mahal saja harganya. Lalu apa yang bisa kita lakukan? Alih-alih mengeluh sepanjang hari dan menyalahkan banyak orang, kini Anda harus memulai perubahan pada kehidupan Anda.


ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: nlp.add_pipe(nlp.create_pipe('sentencizer')) Alternatively, add the dependency parser, or set sentence boundaries by setting doc[i].is_sent_start.

In [11]:
from spacy.lang.id.stop_words import STOP_WORDS

# STOP WORDS is a set
# convert to list
print(list(STOP_WORDS)[:10])

['sekalipun', 'bertutur', 'mungkin', 'khususnya', 'aku', 'bung', 'bermacam', 'terkira', 'antara', 'dipertanyakan']


In [14]:
doc = nlp('saya menyukai kemewahan')
for token in doc:
    print(token, token.is_stop)

saya True
menyukai False
kemewahan False


### Attributes

In [15]:
token = doc[0]
attributes = [attr for attr in dir(token) if not attr.startswith('_')]
for attr in attributes:
    print(attr)

ancestors
check_flag
children
cluster
conjuncts
dep
dep_
doc
ent_id
ent_id_
ent_iob
ent_iob_
ent_kb_id
ent_kb_id_
ent_type
ent_type_
get_extension
has_extension
has_vector
head
i
idx
is_alpha
is_ancestor
is_ascii
is_bracket
is_currency
is_digit
is_left_punct
is_lower
is_oov
is_punct
is_quote
is_right_punct
is_sent_start
is_space
is_stop
is_title
is_upper
lang
lang_
left_edge
lefts
lemma
lemma_
lex_id
like_email
like_num
like_url
lower
lower_
morph
n_lefts
n_rights
nbor
norm
norm_
orth
orth_
pos
pos_
prefix
prefix_
prob
rank
remove_extension
right_edge
rights
sent
sent_start
sentiment
set_extension
shape
shape_
similarity
string
subtree
suffix
suffix_
tag
tag_
tensor
text
text_with_ws
vector
vector_norm
vocab
whitespace_


In [16]:
doc = nlp('HP Samsung Galaxy Note 8 bisa ditebus dengan harga 11 juta rupiah (cashback 1jt).')
str_template = '{:>15} {:>10} {:>10} {:>10} {:>10} {:>10}'
print(str_template.format('token', 'is_lower', 'is_title', 'is_upper', 'is_digit', 'is_punct'))
for token in doc:
    print(str_template.format(str(token),
                              str(token.is_lower),
                              str(token.is_title),
                              str(token.is_upper),
                              str(token.is_digit),
                              str(token.is_punct)))

          token   is_lower   is_title   is_upper   is_digit   is_punct
             HP      False      False       True      False      False
        Samsung      False       True      False      False      False
         Galaxy      False       True      False      False      False
           Note      False       True      False      False      False
              8      False      False      False       True      False
           bisa       True      False      False      False      False
        ditebus       True      False      False      False      False
         dengan       True      False      False      False      False
          harga       True      False      False      False      False
             11      False      False      False       True      False
           juta       True      False      False      False      False
         rupiah       True      False      False      False      False
              (      False      False      False      False       True
      

### Lemma

In [18]:
from spacy.lang.id import LOOKUP
import random
lemma_as_list = list(LOOKUP.items())
samples = random.choices(lemma_as_list, k=20)
for k, v in samples:
    print(f'{k}: {v}')

ImportError: cannot import name 'LOOKUP' from 'spacy.lang.id' (/Users/mtjokro/miniconda3/envs/ipy37/lib/python3.7/site-packages/spacy/lang/id/__init__.py)

In [21]:
import spacy.lang.id as id

In [22]:
id.

<module 'spacy.lang.id' from '/Users/mtjokro/miniconda3/envs/ipy37/lib/python3.7/site-packages/spacy/lang/id/__init__.py'>

In [23]:
doc = nlp('tertidur tidur tercyduk')
for token in doc:
    ori = token.text
    lemma = token.lemma_  # token.lemma is integer index
    print(ori, lemma)

tertidur tertidur
tidur tidur
tercyduk tercyduk


In [26]:
from spacy.lang.id import Indonesian

In [27]:
nlp = Indonesian()

In [28]:
tes = nlp("Saya berasal dari Australia")

In [29]:
tes.ents

()

In [30]:
for token in tes:
    print(token.pos_)







In [36]:
for token in tes:
    print(token)
    print(token.pos_)
    print(token.lemma_)

Saya

Saya
berasal

berasal
dari

dari
Australia

Australia


In [None]:
nlp.s

In [35]:
tes.ents

()