In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import keras
import sys

Using TensorFlow backend.


In [2]:
df = pd.read_csv('severeinjury.csv')
df['Part of Body Title'] = [e.split()[0] for e in df['Part of Body Title']]
df['Part of Body Title'] = [e.replace(',','') for e in df['Part of Body Title']]
df['NatureTitle'] = [e.split()[0] for e in df['NatureTitle']]
df['NatureTitle'] = [e.replace(',','') for e in df['NatureTitle']]
display(df.head())

Unnamed: 0,ID,UPA,EventDate,Employer,Address1,Address2,City,State,Zip,Latitude,...,Nature,NatureTitle,Part of Body,Part of Body Title,Event,EventTitle,Source,SourceTitle,Secondary Source,Secondary Source Title
0,2015010015,931176,1/1/2015,FCI Otisville Federal Correctional Institution,Two Mile Drive,,OTISVILLE,NEW YORK,10963.0,41.46,...,111,Fractures,513,Lower,1214,Injured by physical contact with person while ...,5721,Co-worker,5772.0,Inmate or detainee in custody
1,2015010016,930267,1/1/2015,Kalahari Manufacturing LLC,171 Progress Drive,,LAKE DELTON,WISCONSIN,53940.0,43.59,...,1522,Second,519,Leg(s),317,"Ignition of vapors, gases, or liquids",7261,"Welding, cutting, and blow torches",,
2,2015010018,929823,1/1/2015,Schneider National Bulk Carrier,420 CORAOPOLIS ROAD,,CORAOPOLIS,PENNSYLVANIA,15108.0,40.49,...,10,Traumatic,9999,Nonclassifiable,4331,Other fall to lower level less than 6 feet,8421,"Semi, tractor-trailer, tanker truck",741.0,Ladders-fixed
3,2015010019,929711,1/1/2015,PEPSI BOTTLING GROUP INC.,4541 HOUSTON AVE.,,MACON,GEORGIA,31206.0,32.77,...,1972,Soreness,510,Leg(s),640,Caught in or compressed by equipment or object...,8623,Pallet jack-powered,8420.0,"Truck-motorized freight hauling and utility, u..."
4,2015010020,929642,1/1/2015,North American Pipe Corporation,210 South Arch Street,,JANESVILLE,WISCONSIN,53545.0,42.67,...,111,Fractures,4429,Finger(s),6411,Caught in running equipment or machinery durin...,350,"Metal, woodworking, and special material machi...",,


In [3]:
X_dirty = df['Final Narrative']
y_body_dirty = df['Part of Body Title']
y_nature_dirty = df['NatureTitle']

In [4]:
def clean_sentences(sentences):
    translator = str.maketrans('', '', string.punctuation + string.digits)
    print('Starting translations...')
    sentences = [s.translate(translator) for s in sentences]
    stopset = set(nltk.corpus.stopwords.words('english'))
    print('Lowercasing...')
    tokens = [nltk.wordpunct_tokenize(s.lower()) for s in sentences]
    print('Splitting...')
    tokens = [np.array(t)[np.invert(np.isin(t, list(stopset)))] for t in tokens]
    return np.array(tokens)

tokens = clean_sentences(X_dirty)
tokenizer = keras.preprocessing.text.Tokenizer(num_words=1000, oov_token=1)
tokenizer = keras.preprocessing.text.Tokenizer()
    
X_all = []

count = 0
print('Fitting....')
for line in tokens:
    tokenizer.fit_on_texts(line)
    count += 1
    if count % 100 == 0 or count == len(X_dirty):
        if count != 0:
            sys.stdout.write('\r')
        bars = int(count/len(X_dirty)*20)
        sys.stdout.write('[{0:20}] {1:4.1%}'.format('='*bars, count/len(X_dirty)))
print('\nDone fitting!')
print('Sequencing....')
count = 0
for line in tokens:
    add = [list(np.array(tokenizer.texts_to_sequences(line)).flatten())]
    X_all += add
    count += 1
    if count % 100 == 0 or count == len(X_dirty):
        if count != 0:
            sys.stdout.write('\r')
        bars = int(count/len(X_dirty)*20)
        sys.stdout.write('[{0:20}] {1:4.1%}'.format('='*bars, count/len(X_dirty)))
print('\nFinished Tokening!')

Starting translations...
Lowercasing...
Splitting...
Fitting....
Done fitting!
Sequencing....
Finished Tokening!


In [5]:
with open('Vocab.dat', 'w') as vfile:
    vfile.write('{} {}'.format(0, 'x'))
    for key, value in sorted(tokenizer.word_index.items(),
                                 key=lambda kv: (kv[1], kv[0])):
        vfile.write('\n{} {}'.format(value, key))

In [6]:
def get_vocab_dict():
    vocab_dict = {}
    with open('Vocab.dat') as vocab_file:
        for line in vocab_file:
            (val, key) = line.split()
            val = int(val)
            if val == 0:
                key = ''
            vocab_dict[key] = val
    return vocab_dict

In [7]:
def tokenize_sentence_from_dict(sentence, vocab_dict):
    tokenized = []
    for word in sentence.split():
        if word in vocab_dict:
            tokenized += [vocab_dict[word]]
    return tokenized

In [8]:
def predict_from_sentence(model, sentence):
    vocab_dict = get_vocab_dict()
    tokenized = tokenize_sentence_from_dict(sentence, vocab_dict)
    return model.predict(np.array([tokenized]))

In [9]:
#X_all = np.load('X_all.npy')
#y_all = np.load('y_all.npy')

X_all = keras.preprocessing.sequence.pad_sequences(X_all, 50)
y_body_all = np.array(pd.get_dummies(y_body_dirty))
y_nature_all = np.array(pd.get_dummies(y_nature_dirty))

np.save('X_all.npy', X_all)
np.save('y_body_all.npy', y_body_all)
np.save('y_nature_all.npy', y_nature_all)


In [10]:
indices = list(range(len(X_all)))
np.random.shuffle(indices)
num_train = int(.75*len(X_all))

X_train = X_all[indices[:num_train]]
X_test = X_all[indices[num_train:]]
y_body_train = y_body_all[indices[:num_train]]
y_body_test = y_body_all[indices[num_train:]]
y_nature_train = y_nature_all[indices[:num_train]]
y_nature_test = y_nature_all[indices[num_train:]]

In [11]:
def build_model(num_output):
    model = keras.models.Sequential()

    model.add(keras.layers.Embedding(12000, 64, mask_zero=True))
    model.add(keras.layers.LSTM(64))
    model.add(keras.layers.Dropout(.5))
    model.add(keras.layers.Dense(256, activation='relu'))
    model.add(keras.layers.Dropout(.5))
    model.add(keras.layers.Dense(num_output, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )   

    return model


In [12]:
model_body = build_model(len(pd.get_dummies(y_body_dirty).columns))
model_body.fit(X_train, y_body_train, validation_data=(X_test, y_body_test), epochs=10)

model_nature = build_model(len(pd.get_dummies(y_nature_dirty).columns))
model_nature.fit(X_train, y_nature_train, validation_data=(X_test, y_nature_test), epochs=10)

Train on 16183 samples, validate on 5395 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 16183 samples, validate on 5395 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd0dd9b1860>

In [13]:
model_body.save("body_parts2.h5")
model_nature.save("nature.h5")

In [25]:
#6405
test_idx = 6969
test_sentence = df['Final Narrative'][test_idx]
test_sentence = 'The employee got his foot stuck in a machine. He lost a toe.'
print("Sentence: \n\n{}".format(test_sentence))
model_body = keras.models.load_model("body_parts2.h5")
model_nature = keras.models.load_model("nature.h5")
pred_body = predict_from_sentence(model_body, test_sentence)
pred_nature = predict_from_sentence(model_nature, test_sentence)
idx_body = np.argmax(pred_body)
idx_nature = np.argmax(pred_nature)
print("\nPrediction: \n\tBody:   {}\n\tNature: {}".format(pd.get_dummies(y_body_dirty).columns[idx_body], pd.get_dummies(y_nature_dirty).columns[idx_nature]))
print("\nActual: \n\tBody:   {}\n\tNature: {}".format(df['Part of Body Title'][test_idx], df['NatureTitle'][test_idx]))

Sentence: 

The employee got his foot stuck in a machine. He lost a toe.

Prediction: 
	Body:   Foot
	Nature: Amputations

Actual: 
	Body:   Eye(s)
	Nature: Fractures


In [29]:
print(df['Final Narrative'][0], '\n')
print(df['Final Narrative'][10])

Three correctional facility guards were escorting a restrained federal prison inmate when he became disruptive, requiring the use of force. 
Two guards and the inmate fell onto the Lieutenant's right leg, fracturing his fibula. He was transported to the hospital and released the following day. 

An employee was struck by an excavator on 01/02/2015 and hospitalized for injuries to his leg, several broken bones in his foot, and a fracture in his left arm.
