In [None]:
import re
import string
import sys

import keras
import nltk
import numpy as np
import pandas as pd

Using TensorFlow backend.


In [None]:
# Read the raw injury reports from disk.
df = pd.read_csv('severeinjury.csv')

# Collapse every body-part title to its first whitespace-separated word and
# strip commas from that word, so the label column holds one short token.
df['Part of Body Title'] = [
    title.split()[0].replace(',', '')
    for title in df['Part of Body Title']
]

display(df.head())

In [None]:
# Un-processed model inputs (free-text narratives) and targets (body-part labels).
X_dirty, y_dirty = df['Final Narrative'], df['Part of Body Title']

In [None]:
#print(np.array(tokens))

In [None]:
def clean_sentences(sentences):
    """Normalise raw sentences into per-sentence arrays of content words.

    Every sentence has ASCII punctuation and digits deleted, is lower-cased,
    is split with ``nltk.wordpunct_tokenize`` and finally has the NLTK English
    stop words removed.

    Parameters
    ----------
    sentences : iterable of str
        Raw text, one entry per document.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``object`` array with one element per input sentence.
        Each element is a NumPy string array holding the surviving tokens in
        their original order (possibly empty).
    """
    translator = str.maketrans('', '', string.punctuation + string.digits)
    print('Starting translations...')
    stripped = [s.translate(translator) for s in sentences]
    stopset = set(nltk.corpus.stopwords.words('english'))
    print('Lowercasing...')
    tokenized = [nltk.wordpunct_tokenize(s.lower()) for s in stripped]
    print('Splitting...')
    # Sentences keep different numbers of tokens, so the result is ragged.
    # Filling a pre-allocated object array element by element always yields a
    # 1-D container, whereas np.array(list_of_arrays) raises on ragged input
    # with recent NumPy and silently becomes 2-D when lengths happen to match.
    cleaned = np.empty(len(tokenized), dtype=object)
    for position, words in enumerate(tokenized):
        # Set membership is O(1) per word; no per-sentence list(stopset) copy.
        cleaned[position] = np.array(
            [word for word in words if word not in stopset], dtype=str
        )
    return cleaned

def _show_progress(done, total):
    """Redraw a 20-character text progress bar on the current stdout line."""
    fraction = done / total
    sys.stdout.write('\r')
    sys.stdout.write('[{0:20}] {1:4.1%}'.format('=' * int(fraction * 20), fraction))
    sys.stdout.flush()


tokens = clean_sentences(X_dirty)

# Tokenizer with default settings: no num_words cap and no OOV token, so every
# word seen while fitting receives its own index.
tokenizer = keras.preprocessing.text.Tokenizer()

n_sentences = len(tokens)

print('Fitting....')
for count, line in enumerate(tokens, start=1):
    # `line` is an array of single words; each word is fed as its own text.
    tokenizer.fit_on_texts(line)
    if count % 100 == 0 or count == n_sentences:
        _show_progress(count, n_sentences)
print('\nDone fitting!')

print('Sequencing....')
X_all = []
for count, line in enumerate(tokens, start=1):
    # texts_to_sequences returns one list of indices per word. Concatenate
    # them explicitly so a word that maps to zero or several indices cannot
    # leave nested lists behind (np.array(...).flatten() would not flatten a
    # ragged result).
    X_all.append(
        [index for word_seq in tokenizer.texts_to_sequences(line) for index in word_seq]
    )
    if count % 100 == 0 or count == n_sentences:
        _show_progress(count, n_sentences)
print('\nFinished Tokening!')

In [None]:
# Dump the vocabulary to Vocab.dat as "<index> <word>" lines ordered by index
# (ties broken by word). Index 0 comes first with the placeholder word 'x';
# the file is written without a trailing newline.
with open('Vocab.dat', 'w') as vfile:
    vocab_lines = ['{} {}'.format(0, 'x')]
    vocab_lines.extend(
        '{} {}'.format(index, word)
        for word, index in sorted(tokenizer.word_index.items(),
                                  key=lambda pair: (pair[1], pair[0]))
    )
    vfile.write('\n'.join(vocab_lines))

In [None]:
def get_vocab_dict(path='Vocab.dat'):
    """Load a word -> index mapping from a vocabulary file.

    The file is expected to contain one ``"<index> <word>"`` pair per line.
    The entry with index 0 is stored under the empty-string key regardless of
    the word written next to it. Blank lines are ignored.

    Parameters
    ----------
    path : str, optional
        Location of the vocabulary file. Defaults to ``'Vocab.dat'``.

    Returns
    -------
    dict
        Mapping from word (``str``) to index (``int``).

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two whitespace
        separated fields, or its first field is not an integer.
    """
    vocab_dict = {}
    with open(path) as vocab_file:
        for line_number, line in enumerate(vocab_file, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines / a trailing newline at end of file.
                continue
            if len(fields) != 2:
                raise ValueError(
                    'Malformed vocabulary entry on line {} of {!r}: {!r}'.format(
                        line_number, path, line.rstrip('\n'))
                )
            val = int(fields[0])
            # Index 0 is always keyed by '' instead of the word in the file.
            key = '' if val == 0 else fields[1]
            vocab_dict[key] = val
    return vocab_dict

In [None]:
def tokenize_sentence_from_dict(sentence, vocab_dict):
    """Encode a raw sentence as a list of vocabulary indices.

    The sentence is normalised the same way the training text is: ASCII
    punctuation and digits are deleted and the text is lower-cased before it
    is split into words. Without this, tokens such as ``"The"`` or ``"head."``
    would never match the lower-case, punctuation-free vocabulary.

    Parameters
    ----------
    sentence : str
        Raw input text.
    vocab_dict : dict
        Mapping from word to integer index.

    Returns
    -------
    list of int
        Indices of the words found in ``vocab_dict``, in sentence order.
        Words missing from the vocabulary are skipped.
    """
    translator = str.maketrans('', '', string.punctuation + string.digits)
    normalised = sentence.translate(translator).lower()
    # Same word/punctuation-run pattern that nltk's wordpunct_tokenize is
    # documented to use, so splitting matches the training pipeline.
    words = re.findall(r'\w+|[^\w\s]+', normalised)
    return [vocab_dict[word] for word in words if word in vocab_dict]

In [None]:
def predict_from_sentence(model, sentence, vocab_dict=None, maxlen=40):
    """Run ``model.predict`` on a single raw sentence.

    The sentence is encoded with ``tokenize_sentence_from_dict`` and then
    left-padded with zeros / left-truncated to ``maxlen`` entries, mirroring
    the default behaviour of the Keras ``pad_sequences`` call that prepares
    the training data. This keeps the input shape fixed even when no word of
    the sentence is in the vocabulary.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch)``.
    sentence : str
        Raw input text.
    vocab_dict : dict, optional
        Word -> index mapping. When omitted it is loaded with
        ``get_vocab_dict()``; pass it explicitly to avoid re-reading the
        vocabulary file on every call.
    maxlen : int, optional
        Positive sequence length fed to the model. Defaults to 40.

    Returns
    -------
    Whatever ``model.predict`` returns for a batch of shape ``(1, maxlen)``.
    """
    if vocab_dict is None:
        vocab_dict = get_vocab_dict()
    tokenized = tokenize_sentence_from_dict(sentence, vocab_dict)

    # Keep only the last `maxlen` indices and right-align them in a zero row.
    kept = tokenized[-maxlen:]
    batch = np.zeros((1, maxlen), dtype='int32')
    if kept:
        batch[0, maxlen - len(kept):] = kept
    return model.predict(batch)

In [None]:
# The encoded narratives have different lengths, so request an object array
# explicitly; recent NumPy versions refuse to build a ragged array implicitly.
print(np.array(X_all, dtype=object))

In [None]:
#X_all = np.load('X_all.npy')
#y_all = np.load('y_all.npy')

# Bring every encoded narrative to a fixed length of 40 and one-hot encode
# the body-part labels.
X_all = keras.preprocessing.sequence.pad_sequences(X_all, maxlen=40)
one_hot_labels = pd.get_dummies(y_dirty)
y_all = np.array(one_hot_labels)

# Cache both arrays on disk.
for filename, array in (('X_all.npy', X_all), ('y_all.npy', y_all)):
    np.save(filename, array)


In [None]:
# Fixed seed so the 75/25 split, and every score derived from it, is the same
# on each Restart & Run All.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(len(X_all))
num_train = int(.75*len(X_all))

# First 75% of the shuffled row indices train the model; the rest are held out.
train_idx, test_idx = indices[:num_train], indices[num_train:]
X_train, X_test = X_all[train_idx], X_all[test_idx]
y_train, y_test = y_all[train_idx], y_all[test_idx]

In [None]:
def build_model(vocab_size=12000, num_classes=61, embedding_dim=50):
    """Build and compile the narrative -> body-part classifier.

    Architecture: Embedding (zero index masked) -> LSTM(64) ->
    Dense(256, relu) -> Dropout(0.5) -> Dense(num_classes, softmax), compiled
    with the Adam optimizer, categorical cross-entropy loss and accuracy as
    the reported metric.

    Parameters
    ----------
    vocab_size : int, optional
        Number of rows in the embedding table; it must be larger than the
        biggest token index fed to the model. Defaults to 12000.
    num_classes : int, optional
        Width of the softmax output, i.e. the number of one-hot label
        columns. Defaults to 61.
    embedding_dim : int, optional
        Size of each word vector. Defaults to 50.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = keras.models.Sequential([
        # mask_zero=True makes downstream layers skip the 0 padding index.
        keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True),
        keras.layers.LSTM(64),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dropout(.5),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


In [None]:
# Train a freshly built network for 10 epochs, evaluating on the held-out
# split after every epoch.
model = build_model()
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# Write the current `model` to the HDF5 file body_parts2.h5.
model.save("body_parts2.h5")

In [None]:
#6405
# Row label of the report used for the spot check; defined once so the
# sentence and its true label always come from the same row.
SAMPLE_ROW = 6405
test_sentence = df['Final Narrative'][SAMPLE_ROW]
#test_sentence = 'The employee fell off of the ladder and hit his head.'
print("Sentence: \n\n{}".format(test_sentence))
# Load the file this notebook saves ("body_parts2.h5"). The previous name,
# "body_parts.h5", is never written by any cell, so a fresh run would either
# fail here or evaluate a stale model.
model = keras.models.load_model("body_parts2.h5")
pred = predict_from_sentence(model, test_sentence)
idx = np.argmax(pred)
# Column order of get_dummies is the class order of the one-hot targets.
label_names = pd.get_dummies(y_dirty).columns
print("\nPrediction: \n{}".format(label_names[idx]))
print("\nActual: \n{}".format(df['Part of Body Title'][SAMPLE_ROW]))

In [None]:
# Show the label names in one-hot column order as a single-column table.
label_columns = pd.get_dummies(y_dirty).columns
pd.DataFrame(label_columns)