In [None]:
import sys

from keras.models import Sequential
from keras.layers import LSTM, Embedding
from keras.layers import Masking
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import Flatten

import numpy as np
import random
import re
import string

mxlen=20

char_map={"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "A": 27, "B": 28, "C": 29, "D": 30, "E": 31, "F": 32, "G": 33, "H": 34, "I": 35, "J": 36, "K": 37, "L": 38, "M": 39, "N": 40, "O": 41, "P": 42, "Q": 43, "R": 44, "S": 45, "T": 46, "U": 47, "V": 48, "W": 49, "X": 50, "Y": 51, "Z": 52, "0": 53, "1": 54, "2": 55, "3": 56, "4": 57, "5": 58, "6": 59, "7": 60, "8": 61, "9": 62, "_": 63, "UNK":64}


def getTrainingData(hashtags, mxlen):
    trainingData = []
    labels = []

    for i in range(len(hashtags)):
        hashtag = hashtags[i]
        train_hashtag = []
        label = []
        
        for i in range(len(hashtag)-1):
            letter = hashtag[i]
            next_letter = hashtag[i+1]
            if letter != " ":
                if next_letter == " ":
                    label.append(1)
                else:
                    label.append(0)
                train_hashtag.append(letter)
                
        if hashtag != " ":
            train_hashtag.append(hashtag[-1])
            label.append(0)
            
        labels.append(label)
        trainingData.append(train_hashtag[:mxlen])

    return trainingData, labels

# pad input sequence to fixed length
def pad(trainingData, labels, mxlen):
    for i in range(len(trainingData)):
        sample = trainingData[i]
        label = labels[i]

        if len(sample) < mxlen:
            sample += [-1] * (mxlen-len(sample)) 
            label += [-1] * (mxlen-len(label))

        sample=np.array(sample[:mxlen])
        label=np.array(label[:mxlen])

    return np.array(trainingData), labels


def get_data(filename):
    with open(filename, 'r') as f:
        hashtag_data = f.read().split('\n')
        hashtag_data = [h for h in hashtag_data if len(h) > 0]

    originalData, labels = getTrainingData(hashtag_data, mxlen)
    data, labels = pad(originalData, labels, mxlen)

    samples = len(data)
    data = np.asarray(data).reshape((samples*mxlen, 1))
    new_trainingData=[]

    for char in data:
        cc=char[0]
        if cc == "-1":
            val=0
        else:
            if cc in char_map:
                val=char_map[cc]
            else:
                val=char_map["UNK"]

        new_trainingData.append(val)

    labels = np.asarray(labels).reshape((samples, mxlen, 1))
    data = np.array(new_trainingData).reshape(samples, mxlen)

    return originalData, data, labels


In [2]:
# get data
originalTraining, trainingData, labels = get_data("train.txt")
originalDev, devData, devLabels = get_data("dev.txt")

## 1 Bidirectional LSTM

In [12]:
hidden_neurons=512
char_dim=20

# define LSTM
model = Sequential()
model.add(Embedding(input_dim=len(char_map)+1, output_dim=char_dim, input_length=mxlen, mask_zero=True))

#add Bidirectional LSTM layer here
model.add(Bidirectional(LSTM(hidden_neurons, return_sequences=True)))
#add Dense Time Distributed output layer here
model.add(TimeDistributed(Dense(1, activation='sigmoid')))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 20)            1300      
_________________________________________________________________
bidirectional_4 (Bidirection (None, 20, 1024)          2183168   
_________________________________________________________________
time_distributed_3 (TimeDist (None, 20, 1)             1025      
Total params: 2,185,493
Trainable params: 2,185,493
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
BATCH_SIZE=256
NUM_EPOCHS=2

# Fit model on training data, evaluating on dev data
model.fit(trainingData, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=(devData, devLabels))

Train on 705490 samples, validate on 1282 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x23c9c3582b0>

## 2 Predictions to Segmentations

In [5]:
# convert predictions to segmentation
def segment(input_seq, ys):
    """
    Return the original hashtag and the segmented hashtag
       >>> input_seq = [g, o, b, e, a, r, s, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
       >>> ys = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0]
       >>> segment(input_seq, ys)
       gobears , go bears 
    """
    
    original=[]
    segmentation=[]
    
    for i, char in enumerate(input_seq):
        if char == -1:
            break
        original.append(char)
        segmentation.append(char)
        if ys[i] == 1:
            segmentation.append(' ')
    
    return original, segmentation

In [6]:
# Generate predictions for test data written to output.txt
out=open("output.txt", "w")
yhat = model.predict_classes(testData, verbose=0)
idx=0
samples,_,_ = yhat.shape
for batch_num in range(samples):
    vals=[]
    for seq in range(mxlen):
        vals.append(yhat[batch_num][seq][0])
    original, segmentation=segment(originalTest[idx], vals)
    out.write ("%s\t%s\n" % (''.join(original), ''.join(segmentation)))
    idx+=1
out.close()

## 3 Chunking-System Evaluation F-1 Score

In [7]:
def segment_F1_score(pred_labels, true_labels):
    """
    Return average F1 score of segmentations provided by the model
    """
    cumF1 = count = 0
    for true, pred in zip(true_labels, pred_labels):
        chunks_pred = chunks_true = 1
        last_pred = last_true = chunks_common = 0
        for i, label_lst in enumerate(true):
            label = label_lst[0]
            if label == -1:
                if last_pred == last_true:
                    chunks_common += 1
                break

            if label == pred[i][0]:
                if label == 1:
                    chunks_pred += 1
                    chunks_true += 1
                    if last_pred == last_true:
                        chunks_common += 1
                    last_true = last_pred = i
            else:
                if label == 1:
                    last_true = i
                    chunks_true += 1
                else:
                    last_pred = i
                    chunks_pred += 1
                
        precision = 1.0*chunks_common / chunks_pred
        recall = 1.0*chunks_common / chunks_true
        count += 1
        if (precision + recall) != 0:
            cumF1 += 1.0*(2*precision*recall) / (precision + recall)
            
    return 1.0*cumF1 / count

In [8]:
# make predictions for devData
yhat = model.predict_classes(devData, verbose=0)
segment_F1_score(yhat, devLabels)

0.7538054379318021