### Build classifer to label causes for OSHA data

* Use Malaysia pre-labelled data to build a classifier to label OSHA.xls data
* Test three types of classifiers (Naive Bayes, Decision Tree, SVM)

In [1]:
# import dependencies

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics 
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import tree

import string

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
%matplotlib inline

In [3]:
osha=pd.read_excel('./Text Mining/data/raw/osha.xlsx', header=None, names = ["Case", "Title", "Description","Summary", "Classification"])

In [4]:
osha.head()

Unnamed: 0,Case,Title,Description,Summary,Classification
0,201079928,Employee Is Burned By Forklift Radiator Fluid,At approximately 11:30 a.m. on November 13 2...,burn industrial truck waste proc fac pa...,
1,202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...,truck flatbed truck trailer fall abdomen,
2,200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...,construction undrgrd power line highway ...,1 317290559 Fatality Other Occupation not re...
3,200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...,waste proc fac industrial truck struck b...,
4,201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...,truck driver pump tank hot water struc...,


In [5]:
# Read in labelled Malaysia Data
report_train=pd.read_excel('MsiaAccidentCasesTrain.xlsx',names = ["Cause", "Title", "Summary"])
report_test=pd.read_excel('MsiaAccidentCasesTest.xlsx',names = ["Cause", "Title", "Summary"])

In [6]:
report_train.groupby('Cause').describe()

Unnamed: 0_level_0,Summary,Summary,Summary,Summary,Title,Title,Title,Title
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Cause,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Caught in/between Objects,36,36,Victim was driving mechanical buffalo loaded w...,1,36,34,Died crushed by object,2
Collapse of object,5,5,Concrete wall of a building next to the car wa...,1,5,5,Died being crushed by pallet,1
Drowning,8,8,"Victim, a tractor driver was carrying land lev...",1,8,3,Died due to drowning,6
Electrocution,17,17,"Victim, a security guard was struck by lightni...",1,17,8,Died due to electrocution,4
Exposure to Chemical Substances,2,2,Victim was found unconscious after got out of ...,1,2,2,Died due to the exposure of hazardous chemical,1
Exposure to extreme temperatures,2,2,The victim suffered 3rd degree burns from spla...,1,2,2,Died due to burns,1
Falls,56,56,Victim fell from a high place while performing...,1,56,22,Died falling from height,32
Fires and Explosion,4,4,Three workers were died due to coal explosion,1,4,4,Killed in mine explosion,1
Other,9,9,The victim died after being stung by wasps whi...,1,9,8,Died being buried,2
Others,1,1,A worker has been stung by wasps while,1,1,1,Died stung by wasps,1


In [7]:
# Prepare two sets of training and test data
# One set based on the Summary column, One set bsed on the Title column

X_Summary_train = report_train['Summary']
X_Title_train = report_train['Title']
y_train = report_train['Cause']

X_Summary_test = report_test['Summary']
X_Title_test = report_test['Title']
y_test = report_test['Cause']

In [8]:
# Function to preprocess text
# - remove stop words
# - lemmanize
# - remove punctuation
# - remove numbers
# - convert to lower case
# - remove tokens which are less than 4 characters

def text_process(mess):
    stop = nltk.corpus.stopwords.words('english')
    wnl = nltk.WordNetLemmatizer()
    snowball = nltk.SnowballStemmer('english')
    
    tokens = word_tokenize(mess.replace("'", ""))
    tokens_nop = [ t for t in tokens if t not in string.punctuation ]
    tokens_char = [x for x in tokens_nop if not any(c.isdigit() for c in x)]
    tokens_lower = [ t.lower() for t in tokens_char ]
    tokens_nostop=[ t for t in tokens_lower if t not in stop ]
    tokens_lem = [ wnl.lemmatize(t) for t in tokens_nostop ]
    tokens_snow = [ snowball.stem(t) for t in tokens_lem ]
    tokens_clean = [ t for t in tokens_snow if len(t) >= 3 ]
    
    return tokens_clean

## Naive Bayes

* nb1_clf - classifier based on Title text
* nb2_clf - classifier based on Summary text

In [35]:
nb1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])
nb2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])

In [36]:
nb1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11240b950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [37]:
predicted = nb1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  1  0  0  0]
 [ 0  0  0  0  2  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0]
 [ 0  0  0  0 17  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 0  0  0  3  2  0  0  3]
 [ 2  0  0  0  0  0  0 10]]
0.698113207547


In [38]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.82      0.90      0.86        10
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.74      1.00      0.85        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.77      0.83      0.80        12

              avg / total       0.58      0.70      0.63        53



  'precision', 'predicted', average, warn_for)


In [39]:
nb2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11240b950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [40]:
predicted = nb2_clf.predict(X_Summary_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 1  0  0  0  8  0  0  1]
 [ 0  0  0  0  3  0  0  0]
 [ 0  0  0  0  1  0  0  0]
 [ 0  0  0  0  1  0  0  0]
 [ 0  0  0  0 16  0  0  1]
 [ 0  0  0  0  1  0  0  0]
 [ 0  0  0  0  7  0  0  1]
 [ 1  0  0  0  8  0  0  3]]
0.377358490566


In [41]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.50      0.10      0.17        10
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       0.00      0.00      0.00         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.36      0.94      0.52        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.50      0.25      0.33        12

              avg / total       0.32      0.38      0.27        53



  'precision', 'predicted', average, warn_for)


## Decision Tree
* dt1_clf - classifier based on Title text
* dt2_clf - classifier based on Summary text

In [42]:
dt1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
                    ])
dt2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
                    ])

In [43]:
dt1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11240b950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [44]:
predicted = dt1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 2  0  0  0  3  0  0  5  0]
 [ 0  2  0  0  0  0  1  0  0]
 [ 0  0  1  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 1  0  0  0 16  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  1  1  4  2]
 [ 1  0  0  0  0  0  0 10  1]
 [ 0  0  0  0  0  0  0  0  0]]
0.622641509434


In [45]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.40      0.20      0.27        10
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.84      0.94      0.89        17
      Fires and Explosion       0.50      1.00      0.67         1
                    Other       0.50      0.12      0.20         8
 Struck By Moving Objects       0.53      0.83      0.65        12
              Suffocation       0.00      0.00      0.00         0

              avg / total       0.63      0.62      0.59        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [46]:
dt2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11240b950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [47]:
predicted = dt2_clf.predict(X_Summary_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[10  0  0  0  0  0  0  0  0]
 [ 1  2  0  0  0  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 2  0  1  0 10  0  3  1  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 4  0  0  0  0  0  1  2  1]
 [ 6  0  0  0  2  0  1  3  0]
 [ 0  0  0  0  0  0  0  0  0]]
0.509433962264


In [48]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.40      1.00      0.57        10
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       0.50      1.00      0.67         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.83      0.59      0.69        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.20      0.12      0.15         8
 Struck By Moving Objects       0.50      0.25      0.33        12
              Suffocation       0.00      0.00      0.00         0

              avg / total       0.55      0.51      0.49        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## SVM

* sv1_clf - classifier based on Title text
* sv2_clf - classifier based on Summary text

In [28]:
sv1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 
                                             ))
                    ])

sv2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 
                                             ))
                    ])

In [29]:
sv1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11240b950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [30]:
predicted = sv1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  1  0  0  0]
 [ 0  2  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0]
 [ 1  0  1  0 14  0  0  1]
 [ 0  0  0  1  0  0  0  0]
 [ 1  0  0  3  1  0  1  2]
 [ 1  0  0  0  0  0  0 11]]
0.716981132075


In [31]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.69      0.90      0.78        10
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       0.50      1.00      0.67         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.88      0.82      0.85        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.50      0.12      0.20         8
 Struck By Moving Objects       0.79      0.92      0.85        12

              avg / total       0.73      0.72      0.70        53



  'precision', 'predicted', average, warn_for)


In [32]:
sv2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11240b950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [33]:
predicted = sv2_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  0  0  1  0]
 [ 0  0  0  0  2  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1]
 [ 0  0  1  0 15  1  0  0]
 [ 1  0  0  0  0  0  0  0]
 [ 3  0  0  3  2  0  0  0]
 [ 3  0  3  0  0  0  0  6]]
0.584905660377


In [34]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.56      0.90      0.69        10
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       0.20      1.00      0.33         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.79      0.88      0.83        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.86      0.50      0.63        12

              avg / total       0.56      0.58      0.55        53



  'precision', 'predicted', average, warn_for)


## Label osha data

Use SVM model with Summary data

In [49]:
osha['Cause'] = sv2_clf.predict(osha.Summary).tolist()

In [50]:
osha[['Cause','Summary']].head()

Unnamed: 0,Cause,Summary
0,Suffocation,burn industrial truck waste proc fac pa...
1,Struck By Moving Objects,truck flatbed truck trailer fall abdomen
2,Struck By Moving Objects,construction undrgrd power line highway ...
3,Caught in/between Objects,waste proc fac industrial truck struck b...
4,Exposure to extreme temperatures,truck driver pump tank hot water struc...


## Determine whether incident was fatal

In [51]:
list_ = ['Killed','Fatally','Dies', 'Asphyxiated']

osha['Fatal'] = "False"
for index, row in osha.iterrows():
    if any(word in str(row.Title) for word in list_):
        osha.loc[index, 'Fatal'] = "True"

In [52]:
osha.head()

Unnamed: 0,Case,Title,Description,Summary,Classification,Cause,Fatal
0,201079928,Employee Is Burned By Forklift Radiator Fluid,At approximately 11:30 a.m. on November 13 2...,burn industrial truck waste proc fac pa...,,Suffocation,False
1,202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...,truck flatbed truck trailer fall abdomen,,Struck By Moving Objects,True
2,200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...,construction undrgrd power line highway ...,1 317290559 Fatality Other Occupation not re...,Struck By Moving Objects,True
3,200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...,waste proc fac industrial truck struck b...,,Caught in/between Objects,True
4,201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...,truck driver pump tank hot water struc...,,Exposure to extreme temperatures,False


## Output results to CSV file

In [53]:
osha.to_csv("osha1.csv")

## Determine which Accident types are more commonly resulting in fatality or catastrophe

In [59]:
dfFatalOsha = osha[osha['Fatal']=='True']

In [60]:
dfFatalOsha.shape

(5165, 7)

In [61]:
dfFatalOsha.groupby('Cause').Cause.count()

Cause
Caught in/between Objects           2000
Collapse of object                    86
Drowning                              68
Electrocution                        290
Exposure to Chemical Substances       85
Exposure to extreme temperatures     136
Falls                               1191
Fires and Explosion                  184
Other                                146
Others                                 4
Struck By Moving Objects             697
Suffocation                          278
Name: Cause, dtype: int64