In [92]:
import pandas as pd
import numpy as np
import os
import sagemaker
import tensorflow as tf

from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sagemaker.tensorflow import TensorFlow

In [93]:
# TensorFlow version installed in this notebook kernel. The training container's
# version is chosen separately through `framework_version` on the estimator.
tf.__version__

'1.14.0'

## Initial data load

In [94]:
# Load the token-level NER dataset (one row per token) and forward-fill missing
# cells. Presumably only the first token of each sentence carries its
# "Sentence #" value -- TODO confirm against the raw CSV.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1").ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


## Preprocessing

In [95]:
# Distinct words in order of first appearance. `Series.unique()` is deterministic,
# whereas iterating a `set` of strings changes order between kernel restarts
# (string hash randomisation), which would make word indices irreproducible.
words = data["Word"].unique().tolist()
n_words = len(words)
n_words

35178

In [96]:
# Distinct tags in order of first appearance. `Series.unique()` keeps the
# tag -> index mapping stable across runs, unlike iterating a `set` of strings.
tags = data["Tag"].unique().tolist()
n_tags = len(tags)
n_tags

17

In [97]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of (word, POS, tag) tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes
    ----------
    grouped : pandas.Series
        Indexed by the "Sentence #" value; each entry is that sentence's list
        of (word, POS, tag) tuples.
    sentences : list
        The entries of `grouped` in groupby order. Keys are sorted as strings
        by `groupby`, so "Sentence: 10" precedes "Sentence: 2".
    n_sent : int
        Number of the sentence that `get_next` returns next (starts at 1).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._collect_tokens)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _collect_tokens(group):
        """Return the rows of one sentence as a list of (word, POS, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence by number, or None once none is left.

        Only the missing-key case maps to None; any other error propagates
        instead of being silently swallowed.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [98]:
getter = SentenceGetter(data)

# Each grouped sentence is a list of (word, POS, tag) tuples: pull the tags out
# as label sequences and join the words into one space-separated string.
labels = [[tag for _, _, tag in tokens] for tokens in getter.sentences]
sentences = [" ".join(word for word, _, _ in tokens) for tokens in getter.sentences]

In [99]:
# Keep only the 5000 most frequent words; everything else maps to "UNK".
word_cnt = Counter(data["Word"].values)
most_common_words = [w for w, _ in word_cnt.most_common(5000)]
vocabulary = set(most_common_words)

max_len = 50

# Ids 0 and 1 are reserved for padding and unknown words, so real words start
# at 2. Previously ids came from `enumerate(words)` starting at 0, which let
# two real words share the PAD and UNK ids. Ranking by frequency also makes the
# mapping reproducible across kernel restarts.
word2idx = {"PAD": 0, "UNK": 1}
word2idx.update({w: i for i, w in enumerate(most_common_words, start=2)})
tag2idx = {t: i for i, t in enumerate(tags)}

# Encode from the original token tuples rather than re-splitting the joined
# sentence strings: `str.split()` also splits on whitespace-like characters
# inside a single token, which would misalign tokens and labels.
X = [[word2idx.get(w, word2idx["UNK"]) for w, _, _ in sent] for sent in getter.sentences]
y = [[tag2idx[l_i] for l_i in l] for l in labels]

assert all(len(x_i) == len(y_i) for x_i, y_i in zip(X, y)), "token/label length mismatch"

# Pad (or truncate) every sequence to max_len; label padding uses the "O" tag.
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

## Train-test split

In [100]:
# Hold out 10% of the sentences for evaluation; the fixed seed keeps the split
# reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=666,
)

## Upload data to S3

In [101]:
# SageMaker session, the notebook's execution role, and the session's default
# S3 bucket.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# S3 key prefix for the uploaded data, and the local directory it is read from.
prefix = "sagemaker/named_entity_recognition"
data_dir = "data"

In [102]:
# Training file layout: label columns first (one per position), then the token-id
# columns, written without header or index.
train_frame = pd.concat([pd.DataFrame(y_tr), pd.DataFrame(X_tr)], axis=1)
train_csv_path = os.path.join(data_dir, 'train.csv')
train_frame.to_csv(train_csv_path, header=False, index=False)

In [103]:
# Upload the local data directory to S3 under `prefix`. Note that this directory
# also holds the raw ner_dataset.csv read earlier, not only train.csv.
input_data = sagemaker_session.upload_data(
    path=data_dir,
    bucket=bucket,
    key_prefix=prefix,
)

## Model using GPU instance

In [104]:
# Print the training entry point (train/train_bilstm.py) with syntax highlighting;
# this is the script the estimator is configured to run.
! pygmentize train/train_bilstm.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m, [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m

[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mimport[39;49;00m [04m[36mkeras[39;49;00m
[34mfrom[39;49;00m [04m[36mkeras[39;49;00m [34mimport[39;49;00m backend [34mas[39;49;00m K
[34mfrom[39;49;00m [04m[36mkeras.models[39;49;00m [34mimport[39;49;00m Sequential
[34mfrom[39;49;00m [04m[36mkeras.layers[39;49;00m [34mimport[39;49;00m LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional


[34mif[39;49;00m [31m__name__[39;49;00m == [33m'[39;49;00m[33m__main__[39;49;00m[33m'[39;49;00m:
        
    parser = argparse.ArgumentParser()

    parser.add_argument([33m'[39;49;0

In [105]:
# Hyperparameters handed to the entry point script (it builds an argparse parser).
bilstm_hyperparameters = {
    'epochs': 3,
    'batch-size': 32,
    'max-len': max_len,
    'n-tags': n_tags,
    'n-words': n_words,
}

# Script-mode TensorFlow estimator running train/train_bilstm.py on one GPU instance.
tf_estimator = TensorFlow(
    entry_point='train_bilstm.py',
    source_dir="train",
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    framework_version='2.0.0',
    py_version='py3',
    script_mode=True,
    distributions={'parameter_server': {'enabled': True}},
    hyperparameters=bilstm_hyperparameters,
)

In [110]:
# Launch the training job; the uploaded S3 data is exposed as the 'training' channel.
training_channels = {'training': input_data}
tf_estimator.fit(training_channels)

## Deploy model

In [109]:
# Deploy the trained model behind a single-instance real-time endpoint.
tf_predictor = tf_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
from IPython.display import Audio, display


def all_done():
    """Display an auto-playing audio clip, e.g. to signal a long cell has finished."""
    notification = Audio(
        url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav',
        autoplay=True,
    )
    display(notification)

## Evaluate on test set

In [None]:
!pip install seqeval

In [None]:
from seqeval.metrics import f1_score

In [None]:
# Query the endpoint in small batches: endpoint invocations are payload-size
# limited, and sending the whole test set in one request risks exceeding that.
# NOTE(review): assumes each response is either {"predictions": [...]} or the
# bare list of per-sentence outputs -- confirm against the deployed model.
PREDICT_BATCH_SIZE = 64

predictions = []
for start in range(0, len(X_te), PREDICT_BATCH_SIZE):
    response = tf_predictor.predict(X_te[start:start + PREDICT_BATCH_SIZE])
    batch_output = response["predictions"] if isinstance(response, dict) else response
    predictions.extend(batch_output)

In [None]:
# seqeval scores sequences of tag *strings* and expects (y_true, y_pred) in that
# order, so decode both the gold ids and the model output before scoring.
idx2tag = {i: t for t, i in tag2idx.items()}

# NOTE(review): assumes the model output is either {"predictions": [...]} or the
# bare list, holding per-token class scores of shape (n, max_len, n_tags) or
# already-decoded tag ids of shape (n, max_len) -- confirm against the endpoint.
raw_predictions = predictions["predictions"] if isinstance(predictions, dict) else predictions
pred_scores = np.asarray(raw_predictions)
pred_idx = pred_scores.argmax(axis=-1) if pred_scores.ndim == 3 else pred_scores.astype(int)

y_true_tags = [[idx2tag[int(i)] for i in row] for row in y_te]
y_pred_tags = [[idx2tag[int(i)] for i in row] for row in pred_idx]

test_f1 = f1_score(y_true_tags, y_pred_tags)
print(f"Test F1-Score: {test_f1}")