## Code Template


In [None]:
import tensorflow as tf
import os
import numpy as np
import random

# One seed shared by every random number generator used in this notebook.
seed = 23791
#seed = 23511

# Seed Python's `random`, TensorFlow and NumPy with the same value.
for seed_fn in (random.seed, tf.random.set_seed, np.random.seed):
    seed_fn(seed)

# Set the TF_DETERMINISTIC_OPS flag for TensorFlow.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

### Step 1: Read File

In [None]:
def read_file(f, encoding=None):
    """Read a tab-separated file and split every row into an ID and its tokens.

    The first line of the file is treated as a header and skipped. For each
    remaining non-blank line, the first tab-separated field is the row ID and
    the second field is split on single spaces into a list (a sentence or a
    tag sequence).

    Args:
        f: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A tuple ``(row_id, data)`` where ``row_id`` is a list of ID strings and
        ``data`` is a list of token lists, in file order.

    Raises:
        IndexError: If a non-blank row has no second tab-separated field.
    """
    row_id = []
    data = []
    # The context manager closes the file handle even if parsing fails.
    with open(f, 'r', encoding=encoding) as handle:
        next(handle, None)  # skip the header row (no-op for an empty file)
        for line in handle:
            if not line.strip():
                # Ignore blank lines, e.g. an extra newline at the end of the file.
                continue
            fields = line.split('\t')
            row_id.append(fields[0].strip())
            data.append(fields[1].strip().split(' '))
    return row_id, data

In [None]:
# Locations of the review sentences and of their per-token tag sequences.
text_path = './data/REVIEW_TEXT.txt'
tag_path = './data/REVIEW_LABELSEQ.txt'

row_id_text, texts = read_file(text_path)
row_id_tags, tags = read_file(tag_path)

# Optional: uncomment both lines to work with only the first 200 examples.
#texts = texts[:200]
#tags = tags[:200]

In [None]:
# Number of rows read from the tag file.
print(len(row_id_tags))
# Total number of tokens, then total number of tags, summed over all rows.
for sequences in (texts, tags):
    print(sum(map(len, sequences)))

### Step 2: Modify Input Data

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Sort the vocabularies so every word/tag gets the same index on every run.
# Iterating a bare set of strings depends on Python's per-process hash seed,
# which would make results differ between kernel restarts despite the seeds above.
unique_words = sorted({word for sentence in texts for word in sentence})
word2idx = {word: idx + 1 for idx, word in enumerate(unique_words)}  # index 0 is reserved for padding
word2idx["PAD"] = 0

unique_tags = sorted({tag for tag_seq in tags for tag in tag_seq})
label2idx = {tag: idx for idx, tag in enumerate(unique_tags)}
idx2label = {idx: tag for tag, idx in label2idx.items()}

input_length = 20  # set input length to 20.  This is the value of the maxlen parameter for pad_sequences below

X = [[word2idx[word] for word in sentence] for sentence in texts]
# Add padding to inputs and set the length of inputs to 20.
# Sentences longer than 20 tokens are truncated (pad_sequences drops tokens from the
# start by default, truncating="pre"), and padding tokens ("PAD") are appended to
# inputs shorter than 20.
X = pad_sequences(maxlen = input_length, sequences = X, padding = "post", value = word2idx["PAD"])
y = [[label2idx[tag] for tag in tag_seq] for tag_seq in tags]
# Add padding labels (These are "O"s since they are outside entities).  This maxlen must be the same as for X above.
y = pad_sequences(maxlen = input_length, sequences = y, padding = "post", value = label2idx["O"])
# One-hot encode every tag index so the targets match the categorical_crossentropy loss.
y = [to_categorical(i, num_classes = len(unique_tags)) for i in y]

### Split Data into Training and Validation set

In [None]:
# Hold out 20% of the examples for validation.  random_state pins the shuffle to the
# notebook seed so re-running this cell always produces the same partition.
X_train, X_validation, y_train, y_validation  = train_test_split(X, y, test_size = 0.2, random_state = seed)
print(len(y_validation))

### Build an LSTM model
You can add as many layers as you want.

In [None]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import CategoricalCrossentropy
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Embedding -> bidirectional LSTM -> per-token tag classifier.
model = Sequential()
model.add(Embedding(input_dim=len(word2idx),output_dim=20,input_length=input_length))
# return_sequences=True keeps one output vector per token, as needed for sequence tagging.
model.add(Bidirectional(LSTM(units=50,return_sequences=True,dropout=0.2), merge_mode = 'concat'))
# softmax turns the per-token scores into a probability distribution over the tags,
# which is what the categorical_crossentropy loss below expects (relu does not).
model.add(Dense(len(label2idx), activation="softmax"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

### Step 5: Results of how the LSTM model performs

In [None]:
# Train for a single epoch, with validation_split=0.1 passed to Keras.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=16,
    epochs=1,
    validation_split=0.1,
)

print("fit done")

# Highest-scoring tag index per token, for the predictions and for the one-hot ground truth.
pred_ids = np.argmax(model.predict(X_validation), axis=-1)
true_ids = np.argmax(y_validation, -1)

# Translate the tag indices back into tag strings.
y_pred = [[idx2label[tag_id] for tag_id in row] for row in pred_ids]
y_validation = [[idx2label[tag_id] for tag_id in row] for row in true_ids]

In [None]:
# Test cell: run every cell above before this one.
# There must be exactly one predicted tag sequence per validation example.
assert len(y_pred) == len(y_validation)

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report

# Trim every predicted sequence to the length of its reference sequence.
for row_idx, true_row in enumerate(y_validation):
    y_pred[row_idx] = y_pred[row_idx][:len(true_row)]

# Per-tag precision / recall / F1 computed over all tokens, printed with 7 digits.
report = flat_classification_report(y_true=y_validation, y_pred=y_pred, digits=7)
print(report)


In [None]:
accuracy = # TODO: fill in the accuracy number, copied from the classification report printed above

In [None]:
# Check reported accuracy

### Step 6: Now Tweak Parameters


In [None]:
# Run this cell and
# see how we are cutting some sentences short
long_idx = 132
print("length of actual sentence:", len(texts[long_idx]))
print("length of used sentence:  ", len(X[long_idx]))

In [None]:
# (Run this cell)
# But there are examples like this as well:
example_idx = 113
example_tokens = texts[example_idx]
example_ids = X[example_idx]
print("actual:      ", len(example_tokens))
print("fed to model:", len(example_ids))
print(example_tokens)
print(example_ids)

In [None]:
#import numpy as np
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import CategoricalCrossentropy
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn_crfsuite.metrics import flat_classification_report


In [None]:
unique_words = list(set([j for i in texts for j in i]))
word2idx = {j:i+1 for i,j in enumerate(unique_words)}
word2idx["PAD"] = 0

unique_tags = list(set([j for i in tags for j in i]))
label2idx = {j:i for i,j in enumerate(unique_tags)}
idx2label = {j:i for i,j in label2idx.items()}

input_length_2 = ### set input length here!

X = [[word2idx[j] for j in i] for i in texts]
# Add padding inputs       ************
X = pad_sequences(maxlen = input_length_2, sequences = X, padding = "post", value = word2idx["PAD"])
y = [[label2idx[j] for j in i] for i in tags]
# Add padding labels.      ************     This must have the same maxlen as for X above.
y = pad_sequences(maxlen = input_length_2, sequences = y, padding = "post", value = label2idx["O"])
y = [to_categorical(i, num_classes = len(unique_tags)) for i in y]

X_train, X_validation, y_train, y_validation  = train_test_split(X, y, test_size = 0.2)

- Remember, this should also be the input_length parameter of the Embedding layer, hence using the same variable to set both.
- Run the above and below cell again each time you change the input_length variable to see updated results.

In [None]:
# Embedding -> bidirectional LSTM -> per-token tag classifier, sized for input_length_2.
model = Sequential()
model.add(Embedding(input_dim=len(word2idx),output_dim=20,input_length=input_length_2))
model.add(Bidirectional(LSTM(units=50,return_sequences=True,dropout=0.2), merge_mode = 'concat'))
# softmax turns the per-token scores into a probability distribution over the tags,
# which is what the categorical_crossentropy loss below expects (relu does not).
model.add(Dense(len(label2idx), activation="softmax"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


history = model.fit(X_train,np.array(y_train),batch_size=16,epochs=1,validation_split=0.1)

print("fit done")

# Highest-scoring tag index per token, then mapped back to tag strings.
y_pred = model.predict(X_validation)
y_pred = np.argmax(y_pred, axis=-1)
y_validation = np.argmax(y_validation, -1)
y_pred = [[idx2label[i] for i in row] for row in y_pred]
y_validation = [[idx2label[i] for i in row]
                  for row in y_validation]


# Trim every predicted sequence to the length of its reference sequence.
for i in range(len(y_validation)):
    y_pred[i] = y_pred[i][:len(y_validation[i])]

report = flat_classification_report(y_pred=y_pred, y_true=y_validation, digits=7)
print(report)

In [None]:
# Test for setting input_length_2

### One final, simple change that often improves performance is adjusting the number of epochs.


In [None]:
unique_words = list(set([j for i in texts for j in i]))
word2idx = {j:i+1 for i,j in enumerate(unique_words)}
word2idx["PAD"] = 0

unique_tags = list(set([j for i in tags for j in i]))
label2idx = {j:i for i,j in enumerate(unique_tags)}
idx2label = {j:i for i,j in label2idx.items()}


input_length_3 = #input_length_2  ### set input length.  You may uncomment  input_length_2  if you wish to use that!

X = [[word2idx[j] for j in i] for i in texts]
# Add padding inputs       ************
X = pad_sequences(maxlen = input_length_3, sequences = X, padding = "post", value = word2idx["PAD"])
y = [[label2idx[j] for j in i] for i in tags]
# Add padding labels.      ************     This must have the same maxlen as for X above.
y = pad_sequences(maxlen = input_length_3, sequences = y, padding = "post", value = label2idx["O"])
y = [to_categorical(i, num_classes = len(unique_tags)) for i in y]

X_train, X_validation, y_train, y_validation  = train_test_split(X, y, test_size = 0.2)

In [None]:
epochs = ### set your epochs here!!

model = Sequential()
model.add(Embedding(input_dim=len(word2idx.keys()),output_dim=20,input_length=input_length_3))
model.add(Bidirectional(LSTM(units=50,return_sequences=True,dropout=0.2), merge_mode = 'concat'))
model.add(Dense(len(label2idx.keys()), activation="relu"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


history = model.fit(X_train,np.array(y_train),batch_size=16,epochs=epochs,validation_split=0.1)

print("fit done")

y_pred = model.predict(X_validation)
y_pred = np.argmax(y_pred, axis=-1)
y_validation = np.argmax(y_validation, -1)
y_pred = [[idx2label[i] for i in row] for row in y_pred]
y_validation = [[idx2label[i] for i in row]
                  for row in y_validation]


for i in range(len(y_validation)):
    y_pred[i] = y_pred[i][:len(y_validation[i])]

report = flat_classification_report(y_pred=y_pred, y_true=y_validation, digits=7)
print(report)

In [None]:
# REPORT YOUR FINAL ACCURACY HERE:
# (copy the accuracy number from the last classification report printed above)
accuracy_final =    ### fill this in!