# 0. Installation

Install the required packages, pinned to the versions listed below.

```
pip install config==0.4.2 gensim==3.8.1 gpustat==0.6.0 GPUtil==1.4.0 h5py==2.10.0 JPype1==0.7.1 Keras==2.2.4 konlpy==0.5.2 nltk==3.4.5 numpy==1.18.1 pandas==1.0.1 scikit-learn==0.22.1 scipy==1.4.1 silence-tensorflow==1.1.1 soynlp==0.0.493 tensorflow==1.14.0 tensorflow-gpu==1.14.0
```

Then install _keras-contrib_ from source by running the commands below.

```
git clone https://www.github.com/keras-team/keras-contrib.git 
cd keras-contrib 
python setup.py install
```

Suppress WARNING messages using the following modules.

In [9]:
# Quiet the notebook output: silence_tensorflow() is called first, then the
# Python warning categories below are ignored for the rest of the session.
import warnings
from silence_tensorflow import silence_tensorflow

silence_tensorflow()

warnings.filterwarnings(action='ignore', category=DeprecationWarning)
for noisy_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=noisy_category)

# 1. Data Preparation

## 1.1 Labels

In [1]:
from connlp.analysis import NER_Labels

# Entity tag -> integer class id:
#   NON = None (no entity), PER = PERSON, FOD = FOOD.
label_dict = dict(NON=0, PER=1, FOD=2)

ner_labels = NER_Labels(label_dict=label_dict)

Using TensorFlow backend.


## 1.2 Corpus

In [2]:
from connlp.preprocess import EnglishTokenizer
from connlp.analysis import NER_LabeledSentence, NER_Corpus

tokenizer = EnglishTokenizer()

# Toy sentences, keyed by a sentence tag.
data_sents = {
    'sent1': 'Sam likes pizza',
    'sent2': 'Erik eats pizza',
    'sent3': 'Erik and Sam are drinking soda',
    'sent4': 'Flora cooks chicken',
    'sent5': 'Sam ordered a chicken',
    'sent6': 'Flora likes chicken sandwitch',
    'sent7': 'Erik likes to drink soda',
}

# Integer labels per sentence tag, using the ids from label_dict
# (0 = NON, 1 = PER, 2 = FOD); expected to line up one-to-one with the tokens.
data_labels = {
    'sent1': [1, 0, 2],
    'sent2': [1, 0, 2],
    'sent3': [1, 0, 1, 0, 0, 2],
    'sent4': [1, 0, 2],
    'sent5': [1, 0, 0, 2],
    'sent6': [1, 0, 2, 2],
    'sent7': [1, 0, 0, 0, 2],
}

# Pair every tokenized sentence with the labels stored under the same tag.
docs = [
    NER_LabeledSentence(
        tag=tag,
        words=[str(token) for token in tokenizer.tokenize(text=sentence)],
        labels=data_labels[tag],
    )
    for tag, sentence in data_sents.items()
]

max_sent_len = 10
ner_corpus = NER_Corpus(docs=docs, ner_labels=ner_labels, max_sent_len=max_sent_len)

## 1.3 Word Embedding

In [5]:
from connlp.preprocess import EnglishTokenizer
from connlp.embedding import Vectorizer

tokenizer, vectorizer = EnglishTokenizer(), Vectorizer()

# Build a Word2Vec model from the tokenized toy sentences.
tokenized_sents = list(map(tokenizer.tokenize, data_sents.values()))
w2v_model = vectorizer.word2vec(docs=tokenized_sents)

# Pass the word vectors and the model's vector size on to the corpus.
feature_size = w2v_model.vector_size
word2vector = vectorizer.get_word_vectors(w2v_model)
ner_corpus.word_embedding(word2vector=word2vector, feature_size=feature_size)

# 2. Model Development

## 2.1 Initialization

In [8]:
from connlp.analysis import NER_Model

# Parameters for Bi-LSTM.
bilstm_parameters = {
    'lstm_units': 512,
    'lstm_return_sequences': True,
    'lstm_recurrent_dropout': 0.2,
    'dense_units': 100,
    'dense_activation': 'relu',
}

# Parameters for model training.
training_parameters = {
    'test_size': 0.3,
    'batch_size': 1,
    'epochs': 100,
    'validation_split': 0.1,
}

# Single flat dict consumed by the model (Bi-LSTM keys first, then training keys).
parameters = {**bilstm_parameters, **training_parameters}

ner_model = NER_Model()
ner_model.initialize(ner_corpus=ner_corpus, parameters=parameters)

## 2.2 Training

In [10]:
# Train the initialized model with the settings held in `parameters`;
# the run reports one progress line per epoch.
ner_model.train(parameters=parameters)

Train on 3 samples, validate on 1 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100


Epoch 100/100


## 2.3 Evaluation

In [11]:
# Report the confusion matrix together with the overall and per-label F1 scores.
ner_model.evaluate()

|--------------------------------------------------
|Confusion Matrix:
[[ 2  1  1  4]
 [ 0  3  1  4]
 [ 1  0  1  2]
 [ 3  4  3 10]]
|--------------------------------------------------
|F1 Score: 0.680
|--------------------------------------------------
|    [NON]: 0.571
|    [PER]: 0.750
|    [FOD]: 0.400


## 2.4 Save & Load

In [12]:
from connlp.util import makedir

# Target path for the saved model, relative to the current working directory.
fpath_model = 'test/ner/model.pk'
# presumably ensures the directory for the path exists before saving — confirm in connlp.util
makedir(fpath=fpath_model)
ner_model.save(fpath_model=fpath_model)

In [13]:
# Load the saved model into a fresh NER_Model instance, replacing the
# in-memory `ner_model`; the corpus and parameters are passed to load() as well.
# NOTE(review): the '.pk' suffix suggests a pickle-based file — if so, only load files you trust; confirm.
fpath_model = 'test/ner/model.pk'
ner_model = NER_Model()
ner_model.load(fpath_model=fpath_model, ner_corpus=ner_corpus, parameters=parameters)

# 3. Application

## 3.1 Prediction

In [14]:
from connlp.preprocess import EnglishTokenizer

# Instantiate the tokenizer imported above so this cell does not depend on a
# `tokenizer` variable left over from an earlier cell. No vectorizer is
# created here because nothing in this cell uses one.
tokenizer = EnglishTokenizer()

# Tokenize an unseen sentence and tag each token with the trained model.
new_sent = 'Tom eats apple'
tokenized_sent = tokenizer.tokenize(new_sent)
ner_result = ner_model.predict(sent=tokenized_sent)
print(ner_result)

Tom/PER eats/NON apple/FOD
