In [1]:
import pandas as pd
import numpy as np
import os
import sagemaker
import tensorflow as tf

from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sagemaker.tensorflow import TensorFlow

Using TensorFlow backend.


## Initial data load

In [3]:
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1").fillna(method="ffill")
# data.tail(10)

## Preprocessing

In [4]:
words = list(set(data["Word"].values))
n_words = len(words); n_words

35178

In [5]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

17

In [6]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [7]:
getter = SentenceGetter(data)
sentences = getter.sentences

labels = [[s[2] for s in sent] for sent in sentences]
sentences = [" ".join([s[0] for s in sent]) for sent in sentences]

In [8]:
word_cnt = Counter(data["Word"].values)
vocabulary = set(w[0] for w in word_cnt.most_common(5000))

max_len = 50
word2idx = {"PAD": 0, "UNK": 1}
word2idx.update({w: i for i, w in enumerate(words) if w in vocabulary})
tag2idx = {t: i for i, t in enumerate(tags)}

X = [[word2idx.get(w, word2idx["UNK"]) for w in s.split()] for s in sentences]

X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])

y = [[tag2idx[l_i] for l_i in l] for l in labels]

y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

## Train-test split

In [9]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=666)

## Upload data to S3

In [10]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

prefix = 'named_entity_recognition/data'
data_dir = 'data'

In [11]:
pd.concat([pd.DataFrame(y_tr), pd.DataFrame(X_tr)], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [12]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

## Model using GPU instance

In [53]:
! pygmentize train/train_bilstm.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m, [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m

[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mfrom[39;49;00m [04m[36mtensorflow.keras.models[39;49;00m [34mimport[39;49;00m Sequential
[34mfrom[39;49;00m [04m[36mtensorflow.keras.layers[39;49;00m [34mimport[39;49;00m LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional


[34mif[39;49;00m [31m__name__[39;49;00m == [33m'[39;49;00m[33m__main__[39;49;00m[33m'[39;49;00m:
        
    parser = argparse.ArgumentParser()

    parser.add_argument([33m'[39;49;00m[33m--epochs[39;49;00m[33m'[39;49;00m, [36mtype[39;49;00m=[36mint[39;49;00m, default=[34m5[39;49;00m)
    parser.add

In [26]:
tf_estimator = TensorFlow(entry_point='train_bilstm.py', 
                          source_dir="train",
#                           model_dir = '/opt/ml/model',
                          role=role,
                          train_instance_count=1, 
                          train_instance_type='ml.p2.xlarge',
                          framework_version='2.1.0', 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 3,
                              'batch-size': 32,
                              'max-len': max_len,
                              'n-tags': n_tags,
                              'n-words': n_words
                          }
                         )

In [27]:
tf_estimator.fit({'training': input_data})

2020-05-25 17:22:51 Starting - Starting the training job...
2020-05-25 17:22:53 Starting - Launching requested ML instances.........
2020-05-25 17:24:25 Starting - Preparing the instances for training......
2020-05-25 17:25:36 Downloading - Downloading input data...
2020-05-25 17:26:19 Training - Downloading the training image......
2020-05-25 17:27:08 Training - Training image download completed. Training in progress..[34m2020-05-25 17:27:13,114 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-05-25 17:27:13,784 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "training": "/opt/ml/input/data/training"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "batch-size": 32,
        "max-len": 

In [48]:
job_name = tf_estimator.latest_training_job.name
client = tf_estimator.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=job_name)

s3_output_path = description["DebugHookConfig"]["S3OutputPath"] + job_name + '/' + 'debug-output/'
print(s3_output_path)

s3://sagemaker-eu-west-1-087816224558/tensorflow-training-2020-05-25-17-22-50-737/debug-output/


In [30]:
%%sh -s "$s3_output_path"

aws s3 ls --recursive $1

2020-05-25 17:27:26       6465 tensorflow-training-2020-05-25-17-22-50-737/debug-output/collections/000000000/worker_0_collections.json
2020-05-25 17:27:26        274 tensorflow-training-2020-05-25-17-22-50-737/debug-output/events/000000000000/000000000000_worker_0.tfevents
2020-05-25 17:27:27        280 tensorflow-training-2020-05-25-17-22-50-737/debug-output/events/000000000029/000000000029_worker_0.tfevents
2020-05-25 17:27:27        292 tensorflow-training-2020-05-25-17-22-50-737/debug-output/events/000000000032/000000000032_worker_0.tfevents
2020-05-25 17:27:28        292 tensorflow-training-2020-05-25-17-22-50-737/debug-output/events/000000000065/000000000065_worker_0.tfevents
2020-05-25 17:27:28        295 tensorflow-training-2020-05-25-17-22-50-737/debug-output/events/000000000098/000000000098_worker_0.tfevents
2020-05-25 17:27:26        302 tensorflow-training-2020-05-25-17-22-50-737/debug-output/index/000000000/000000000000_worker_0.json
2020-05-25 17:27:27        301 tensorf

## Deploy model

In [31]:
tf_predictor = tf_estimator.deploy(initial_instance_count=1,
                                   instance_type='ml.m4.xlarge')
#                                    endpoint_type='tensorflow-serving')

-----------!

## Evaluate on test set

In [33]:
!pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Running setup.py bdist_wheel for seqeval ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [35]:
from seqeval.metrics import f1_score

In [36]:
predictions = tf_predictor.predict(X_te)

In [44]:
test_pred = np.array(predictions['predictions'][0])

In [45]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [47]:
test_f1 = f1_score(pred_labels, test_labels)
print(f"Test F1-Score: {test_f1}")

Test F1-Score: 0.00693953874922086


## Delete endpoint

In [50]:
tf_predictor.delete_endpoint()

## Empty Bucket

In [52]:
import boto3
bucket_to_delete = boto3.resource('s3').Bucket(bucket)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '296A44D39494426D',
   'HostId': 'r+P9//ZtlQKSobW9RYylTLoFuO58h2z9CB1a5sjgU11MUSiwsgk+o1LvxlEB4VWyztX7I9u8vj4=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'r+P9//ZtlQKSobW9RYylTLoFuO58h2z9CB1a5sjgU11MUSiwsgk+o1LvxlEB4VWyztX7I9u8vj4=',
    'x-amz-request-id': '296A44D39494426D',
    'date': 'Mon, 25 May 2020 17:58:58 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'tensorflow-training-2020-05-25-17-16-19-751/debug-output/collections/000000000/worker_0_collections.json'},
   {'Key': 'tensorflow-training-2020-05-25-17-16-19-751/debug-output/index/000000000/000000000032_worker_0.json'},
   {'Key': 'tensorflow-training-2020-05-25-17-20-35-168/source/sourcedir.tar.gz'},
   {'Key': 'tensorflow-training-2020-05-25-17-22-50-737/debug-output/collections/000000000/worker_0_collections.json'},
   {'Key': 'tensorflo