In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
np.random.seed(0)
import tensorflow as tf

In [2]:
data = pd.read_excel('ML_data.xlsx')
data.head(20)

Unnamed: 0,word,course,sentence,tag
0,master,course 0,Sentence 0,O
1,machine,course 0,Sentence 0,B-skill
2,learning,course 0,Sentence 0,I-skill
3,on,course 0,Sentence 0,O
4,python,course 0,Sentence 0,B-skill
5,r,course 0,Sentence 0,O
6,have,course 0,Sentence 1,O
7,a,course 0,Sentence 1,O
8,great,course 0,Sentence 1,O
9,intuition,course 0,Sentence 1,O


In [3]:
print("Unique words in corpus:", data['word'].nunique())
print("Unique tags in corpus:", data['tag'].nunique())

Unique words in corpus: 651
Unique tags in corpus: 3


In [4]:
words = list(set(data["word"].values))
words.append("ENDPAD")
num_words = len(words)

In [5]:
tags = list(set(data["tag"].values))
num_tags = len(tags)

In [6]:
def sentence_integrate(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                 s["tag"].values.tolist())]
    return data.groupby('sentence').apply(agg_func).tolist()

In [7]:
sentences=sentence_integrate(data)

In [8]:
sentences[0]

[('master', 'O'),
 ('machine', 'B-skill'),
 ('learning', 'I-skill'),
 ('on', 'O'),
 ('python', 'B-skill'),
 ('r', 'O')]

In [9]:
word2idx = {w: i+1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 15

X = [[word2idx[w[0]]-1 for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras

In [14]:
model = keras.Sequential()
model.add(InputLayer((max_len)))
model.add(Embedding(input_dim=num_words, output_dim=max_len, input_length=max_len))
model.add(SpatialDropout1D(0.1))
model.add( Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 15)            9780      
                                                                 
 spatial_dropout1d (SpatialD  (None, 15, 15)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 15, 200)          92800     
 l)                                                              
                                                                 
Total params: 102,580
Trainable params: 102,580
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [16]:
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test,y_test),
    batch_size=32,
    epochs=3,
    verbose=1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [34]:
i = np.random.randint(0, x_test.shape[0])
print("This is sentence:",i)
p = model.predict(np.array([x_test[i]]))
p = np.argmax(p, axis=-1)

print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(x_test[i], y_test[i], p[0]):
    print("{:15}{}\t{}".format(words[w], tags[true], tags[pred]))

This is sentence: 17
Word           True 	 Pred

------------------------------
sagemaker      O	O
builtin        O	O
algorithms     O	O
such           O	O
as             O	O
linear         O	O
learner        O	O
xgboost        O	O
principal      O	O
component      O	O
analysis       O	O
pca            O	O
and            O	O
knearest       O	O
neighbors      O	O
