## Build classifer to label causes for OSHA data
* Use Malaysia pre-labelled data to build a classifier to label OSHA.xls data 
* Test Word2Vec model

In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn import metrics 

In [2]:
osha=pd.read_excel('osha.xlsx',names = ["Case", "Title", "Description","Summary", "Classification"])

In [3]:
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
osha['Title_tok'] = osha['Title'].apply(str).apply(tokenizer.tokenize)
osha['Description_tok'] = osha['Description'].apply(str).apply(tokenizer.tokenize)
osha['Summary_tok'] = osha['Summary'].apply(str).apply(tokenizer.tokenize)

In [5]:
osha[['Title','Summary_tok','Description_tok']].head()

Unnamed: 0,Title,Summary_tok,Description_tok
0,Employee Falls From Flatbed Trailer And Later...,"[truck, flatbed, truck, trailer, fall, abdomen]","[On, August, 30, 2013, Employee, 1, was, worki..."
1,Two Workers Are Struck By Motor Vehicle And O...,"[construction, undrgrd, power, line, highway, ...","[On, August, 27, 2013, Employees, 1, and, 2, o..."
2,Employee Is Struck By Bales Of Wire And Killed,"[waste, proc, fac, industrial, truck, struck, ...","[On, August, 26, 2013, Employee, 1, with, Lee,..."
3,Employee Is Splashed With Hot Water And Is Bu...,"[truck, driver, pump, tank, hot, water, struck...","[On, July, 14, 2013, Employee, 1, vacuum, pump..."
4,Employee Suffers Burns While Moving Soup,"[burn, spill, arm, chest, abdomen]","[On, June, 30, 2013, Employee, 1, was, working..."


In [6]:
osha['Description_tok'].apply(len).max(), osha['Description_tok'].apply(len).mean(), osha['Summary_tok'].apply(len).max(), osha['Summary_tok'].apply(len).mean()

(740, 115.91612547481927, 24, 10.16535963729935)

In [7]:
# Read in labelled Malaysia Data
report_train=pd.read_excel('MsiaAccidentCasesTrain.xlsx',names = ["Cause", "Summary", "Description"])
report_test=pd.read_excel('MsiaAccidentCasesTest.xlsx',names = ["Cause", "Summary", "Description"])

In [8]:
report_train['Summary_tok'] = report_train['Summary'].apply(str).apply(tokenizer.tokenize)
report_train['Description_tok'] = report_train['Description'].apply(str).apply(tokenizer.tokenize)

In [9]:
report_test['Summary_tok'] = report_test['Summary'].apply(str).apply(tokenizer.tokenize)
report_test['Description_tok'] = report_test['Description'].apply(str).apply(tokenizer.tokenize)


In [10]:
report_train[['Summary_tok','Description_tok']].head()

Unnamed: 0,Summary_tok,Description_tok
0,"[Died, being, caught, in, between, machines]","[The, accident, occurred, as, victim, was, ass..."
1,"[Died, been, buried]","[The, accident, occurred, during, the, floor, ..."
2,"[Died, crushed, by, entrance, arch]","[Victim, with, four, co, workers, were, instal..."
3,"[Died, due, to, mine, cave, in]","[A, series, of, avalanche, trapped, victim, wh..."
4,"[Died, being, run, over, by, a, lorry]","[Accident, involving, an, employee, who, has, ..."


In [11]:
report_train['Description_tok'].apply(len).max(), report_train['Description_tok'].apply(len).mean(), report_train['Summary_tok'].apply(len).max(), report_train['Summary_tok'].apply(len).mean()

(103, 27.835164835164836, 10, 4.862637362637362)

In [12]:
# combine all text to build word2vec model

all_reports = pd.DataFrame()
# all_reports['Summary'] = pd.concat([report_train['Summary_tok'], osha['Summary_tok']], axis=0, ignore_index=True)
# all_reports['Description'] = pd.concat([report_train['Description_tok'], report_test['Description_tok'], osha['Description_tok'], report_train['Summary_tok'], report_test['Summary_tok']], axis=0, ignore_index=True)
all_reports['Description'] = pd.concat([report_train['Description_tok'], report_test['Description_tok'], osha['Description_tok'], 
                                        report_train['Summary_tok'], report_test['Summary_tok'], osha['Summary_tok'],
                                        osha['Title_tok']], axis=0, ignore_index=True)


In [13]:
all_reports.head()

Unnamed: 0,Description
0,"[The, accident, occurred, as, victim, was, ass..."
1,"[The, accident, occurred, during, the, floor, ..."
2,"[Victim, with, four, co, workers, were, instal..."
3,"[A, series, of, avalanche, trapped, victim, wh..."
4,"[Accident, involving, an, employee, who, has, ..."


In [14]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 1   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(all_reports['Description'], workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, hs=1, negative=0)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "osha_reports"
model.save(model_name)

2017-10-08 16:40:33,339 : INFO : 'pattern' package not found; tag filters are not available for English
2017-10-08 16:40:33,347 : INFO : collecting all words and their counts
2017-10-08 16:40:33,348 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2017-10-08 16:40:33,572 : INFO : PROGRESS: at sentence #10000, processed 1173626 words, keeping 28319 word types
2017-10-08 16:40:33,728 : INFO : PROGRESS: at sentence #20000, processed 1932591 words, keeping 35957 word types
2017-10-08 16:40:33,749 : INFO : PROGRESS: at sentence #30000, processed 2035093 words, keeping 36453 word types
2017-10-08 16:40:33,771 : INFO : PROGRESS: at sentence #40000, processed 2115781 words, keeping 38130 word types
2017-10-08 16:40:33,788 : INFO : collected 39188 word types from a corpus of 2184261 raw words and 49436 sentences
2017-10-08 16:40:33,790 : INFO : Loading a fresh vocabulary
2017-10-08 16:40:33,965 : INFO : min_count=1 retains 39188 unique words (100% of original 39188, drops 0)
2017-10-08 16:40:33,966 : INFO : min_count=1 leaves 2184261 word corpus (100% of original 2184261, drops 0)
2017-10-08 16:40:34,088 : INFO : deleting the raw counts dictionary of 39188 items
2017-10-08 16:40:34,090 : INFO : sample=0.001 downsamples 37 most-common wor

In [15]:
model.doesnt_match("victim died fall August".split(" "))

'died'

In [16]:
model.most_similar("died"), model.most_similar("dead")

([('suffering', 0.4766765832901001),
  ('5047812', 0.4291504919528961),
  ('killed', 0.40602555871009827),
  ('dead', 0.3973725438117981),
  ('Prevost', 0.3949882686138153),
  ('paralyzed', 0.39073607325553894),
  ('suffered', 0.39008939266204834),
  ('awhile', 0.3765147030353546),
  ('receives', 0.37588953971862793),
  ('Meyers', 0.37171670794487)],
 [('deceased', 0.658421516418457),
  ('Arrival', 0.6127527356147766),
  ('CalPine', 0.46651631593704224),
  ('boll', 0.4663007855415344),
  ('Watchman', 0.4642772674560547),
  ('whales', 0.46344828605651855),
  ('Makeshift', 0.45510026812553406),
  ('sawhorses', 0.45261430740356445),
  ('flaws', 0.45000481605529785),
  ('surgical', 0.4470706880092621)])

In [17]:
#vector average for entire sentence
#np.mean(model.wv[report_train['Description_tok'][0]], axis=0)

In [18]:
# vector average for entire sentence
report_train['Desc_vec'] = ""
for index, row in report_train.iterrows():
    report_train.loc[index, 'Desc_vec'] = np.mean(model.wv[row['Description_tok']], axis=0).astype(object)

In [19]:
# vector average for entire sentence
report_test['Desc_vec'] = ""
for index, row in report_test.iterrows():
    report_test.loc[index, 'Desc_vec'] = np.mean(model.wv[row['Description_tok']], axis=0).astype(object)

In [20]:
# vector average for entire sentence
report_train['Sum_vec'] = ""
for index, row in report_train.iterrows():
    report_train.loc[index, 'Sum_vec'] = np.mean(model.wv[row['Summary_tok']], axis=0).astype(object)

In [21]:
# vector average for entire sentence
report_test['Sum_vec'] = ""
for index, row in report_test.iterrows():
    report_test.loc[index, 'Sum_vec'] = np.mean(model.wv[row['Summary_tok']], axis=0).astype(object)

In [22]:
# prepare test and train data

y_train = report_train['Cause'].values
# x_train = np.array(report_train['Desc_vec'].tolist())
x_train = np.array(report_train['Sum_vec'].tolist())
y_test = report_test['Cause'].values
# x_test = np.array(report_test['Desc_vec'].tolist())
x_test = np.array(report_test['Sum_vec'].tolist())

In [23]:
# Use SVM model

text_clf = SGDClassifier()
text_clf.fit(x_train, y_train)
# report_train['Cause']

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [24]:
predicted = text_clf.predict(x_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test))

[[ 5  1  0  4  0  0  0  0  0  0]
 [ 0  2  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  0]
 [ 1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 17  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0]
 [ 4  0  0  3  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 2  1  0  4  0  0  0  1  1  3]]
0.528301886792


In [25]:
print(metrics.classification_report(y_test, predicted))

                                  precision    recall  f1-score   support

       Caught in/between Objects       0.42      0.50      0.45        10
              Collapse of object       0.50      0.67      0.57         3
                        Drowning       0.00      0.00      0.00         1
                   Electrocution       0.00      0.00      0.00         1
Exposure to extreme temperatures       0.00      0.00      0.00         0
                           Falls       1.00      1.00      1.00        17
             Fires and Explosion       0.50      1.00      0.67         1
                           Other       0.00      0.00      0.00         8
                          Others       0.00      0.00      0.00         0
        Struck By Moving Objects       1.00      0.25      0.40        12

                     avg / total       0.66      0.53      0.54        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [26]:
osha['Sum_vec'] = ""
for index, row in osha.iterrows():
    osha.set_value(index, 'Sum_vec', np.mean(model.wv[row['Summary_tok']], axis=0).astype(object))

In [27]:
osha['Title_vec'] = ""
for index, row in osha.iterrows():
    osha.set_value(index, 'Title_vec', np.mean(model.wv[row['Title_tok']], axis=0).astype(object))

In [28]:
osha['Cause'] = ""
for index, row in osha.iterrows():
    osha.set_value(index, 'Cause', text_clf.predict(row['Sum_vec']))















































































































In [29]:
osha[['Cause','Title','Summary']]

Unnamed: 0,Cause,Title,Summary
0,[Struck By Moving Objects],Employee Falls From Flatbed Trailer And Later...,truck flatbed truck trailer fall abdomen
1,[Electrocution],Two Workers Are Struck By Motor Vehicle And O...,construction undrgrd power line highway ...
2,[Collapse of object],Employee Is Struck By Bales Of Wire And Killed,waste proc fac industrial truck struck b...
3,[Exposure to extreme temperatures],Employee Is Splashed With Hot Water And Is Bu...,truck driver pump tank hot water struc...
4,[Exposure to extreme temperatures],Employee Suffers Burns While Moving Soup,burn spill arm chest abdomen
5,[Caught in/between Objects],Employee Injures Self With Knife,knife puncture abdomen struck by slip ...
6,[Caught in/between Objects],Foreman Is Fatally Crushed When Forklift Tips...,construction equipment operator industri...
7,[Caught in/between Objects],Employee Fractures Abdomen When Run Over By T...,cart struck by run over fracture abdomen
8,[Collapse of object],Employee Suffers Abdominal Fracture In Fall F...,installing ladder scaffold structure mo...
9,[Caught in/between Objects],Carpenter Injured In Abdomen When Saw Kicks B...,carpenter saw table saw blade unguarde...
