# Data preparation for BiLSTM model

In [1]:
import pandas as pd
import numpy as np
import os
import sagemaker
import boto3
import json

from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from helpers import SentenceGetter

Using TensorFlow backend.


## Initial data load

In [2]:
# Load the token-level NER dataset and forward-fill missing values in every column.
# NOTE(review): presumably the gaps are in "Sentence #", set only on the first token of
# each sentence - confirm against the raw CSV.
# DataFrame.ffill() is the non-deprecated spelling of fillna(method="ffill").
data = pd.read_csv("../../data/interim/ner_dataset.csv", encoding="latin1").ffill()

In [3]:
# (rows, columns) of the loaded frame
data.shape

(1048575, 4)

In [4]:
# Preview the first rows to check the column layout
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


## Preprocessing

#### Transform data into sentences and labels using SentenceGetter from helpers

In [5]:
# SentenceGetter groups the token rows into sentences. For every token, index 0 is
# used as the word and index 2 as the label.
getter = SentenceGetter(data)
tagged_sentences = getter.sentences

labels = []
sentences = []
for tagged in tagged_sentences:
    labels.append([token[2] for token in tagged])
    sentences.append(" ".join([token[0] for token in tagged]))

#### Create lists of words and tags

In [6]:
# Distinct words in order of first appearance. Series.unique() preserves that order,
# so the list is the same on every run; list(set(...)) depends on string-hash
# randomisation and changes between kernel restarts.
words = data["Word"].unique().tolist()
print(f"Distinct words: {len(words)}")

Distinct words: 35178


In [7]:
# Distinct tags in order of first appearance. Series.unique() preserves that order,
# so the tag ids derived from this list are the same on every run; list(set(...))
# depends on string-hash randomisation and changes between kernel restarts.
tags = data["Tag"].unique().tolist()
print(f"Distinct tags: {len(tags)}")

Distinct tags: 17


#### Constants

In [8]:
# Shared settings reused by preprocessing, training and inference.
N_WORDS, N_TAGS = len(words), len(tags)
MAX_LEN = 45

constants = dict(N_WORDS=N_WORDS, N_TAGS=N_TAGS, MAX_LEN=MAX_LEN)

#### Tokenize words and pad sequences

In [9]:
# Narrow the vocabulary to the 5000 most common words for regularization.
word_cnt = Counter(data["Word"].values)
most_common_words = [w for w, _ in word_cnt.most_common(5000)]
vocabulary = set(most_common_words)

# Two special tokens come first: PAD for padding and UNK for unknown words.
# Vocabulary words then receive the next free ids (2, 3, ...) in frequency order.
# Numbering them by their position in `words` would hand some real word id 0 or 1,
# making it indistinguishable from PAD/UNK, and would tie the mapping to the order of
# `words`. setdefault also stops a corpus token literally spelled "PAD" or "UNK" from
# overwriting the reserved ids.
# With 35178 distinct words the largest id (5001) stays below N_WORDS = len(words).
word2idx = {"PAD": 0, "UNK": 1}
for w in most_common_words:
    word2idx.setdefault(w, len(word2idx))
tag2idx = {t: i for i, t in enumerate(tags)}

# Tokenize words with word2idx (words that are not in word2idx map to the UNK token).
# Split on the single space that was used to join the words: a bare split() also
# breaks on other whitespace characters (e.g. "\x85" or "\xa0" decoded from latin1),
# which would misalign tokens and labels.
X = [[word2idx.get(w, word2idx["UNK"]) for w in s.split(" ")] for s in sentences]
y = [[tag2idx[l_i] for l_i in l] for l in labels]

# Pad the sequences to a common length: sequences shorter than MAX_LEN are extended
# (with PAD ids for X and the "O" tag id for y), longer ones are shortened to MAX_LEN.
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["O"])

#### Save objects to JSON files

In [10]:
# These objects are used in preprocessing, training and inference, so they are saved
# to the utils/objects folder. Each file is opened in a `with` block so the handle is
# closed even if json.dump raises.
with open("../../src/utils/objects/constants_dict.json", "w") as constants_file:
    json.dump(constants, constants_file)

with open("../../src/utils/objects/word2idx.json", "w") as word2idx_file:
    json.dump(word2idx, word2idx_file)

with open("../../src/utils/objects/tag2idx.json", "w") as tag2idx_file:
    json.dump(tag2idx, tag2idx_file)

## Train-test split

In [11]:
# Split the data into training and testing sets (0.9 : 0.1); the fixed random_state
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=666,
)

## Upload data to S3

In [12]:
# Local directory that holds the processed data files.
DATA_DIR = "../../data/processed"

# Key prefix for the data inside the S3 bucket.
PREFIX = "named_entity_recognition/bilstm_data"

# Training needs the data on S3: use the session's default bucket together with the
# execution role returned by sagemaker.get_execution_role().
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

#### Save files locally

In [13]:
# Write one headerless, index-free CSV per split; each row holds the label ids
# followed by the token ids.
for csv_name, split_y, split_X in (
    ("bilstm_train.csv", y_train, X_train),
    ("bilstm_test.csv", y_test, X_test),
):
    split_frame = pd.concat([pd.DataFrame(split_y), pd.DataFrame(split_X)], axis=1)
    split_frame.to_csv(os.path.join(DATA_DIR, csv_name), header=False, index=False)

#### Upload to S3

In [14]:
# Upload both CSV files under PREFIX in the bucket and keep the values returned by
# upload_data for later use.
train_csv_path = os.path.join(DATA_DIR, "bilstm_train.csv")
test_csv_path = os.path.join(DATA_DIR, "bilstm_test.csv")

train_data_directory = sagemaker_session.upload_data(
    path=train_csv_path, bucket=bucket, key_prefix=PREFIX
)
test_data_directory = sagemaker_session.upload_data(
    path=test_csv_path, bucket=bucket, key_prefix=PREFIX
)

In [15]:
# The S3 locations of the uploaded files are needed for training, so they are saved
# next to the other shared objects.
data_directories = {'train_data_directory': train_data_directory,
                    'test_data_directory': test_data_directory
                   }

# `with` closes the file even if json.dump raises.
with open("../../src/utils/objects/data_directories.json", "w") as data_directories_file:
    json.dump(data_directories, data_directories_file)

In [None]:
# Empty Bucket (optional)

# import boto3
# bucket_to_delete = boto3.resource('s3').Bucket(bucket)
# bucket_to_delete.objects.all().delete()