### Build classifer to label causes for OSHA data

* Use Malaysia pre-labelled data to build a classifier to label OSHA.xls data
* Test three types of classifiers (Naive Bayes, Decision Tree, SVM)

In [1]:
# import dependencies

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics 
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import tree

import string

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
%matplotlib inline

In [2]:
osha=pd.read_excel('./Text Mining/data/raw/osha.xlsx', header=None, index_col=0, names = ["Title", "Description","Summary", "Classification"])

In [3]:
osha.head()

Unnamed: 0_level_0,Title,Description,Summary,Classification
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
201079928,Employee Is Burned By Forklift Radiator Fluid,At approximately 11:30 a.m. on November 13 2...,burn industrial truck waste proc fac pa...,
202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...,truck flatbed truck trailer fall abdomen,
200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...,construction undrgrd power line highway ...,1 317290559 Fatality Other Occupation not re...
200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...,waste proc fac industrial truck struck b...,
201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...,truck driver pump tank hot water struc...,


In [4]:
# Read in labelled Malaysia Data
report_train=pd.read_excel('MsiaAccidentCasesTrain.xlsx',names = ["Cause", "Title", "Summary"])
report_test=pd.read_excel('MsiaAccidentCasesTest.xlsx',names = ["Cause", "Title", "Summary"])

In [5]:
report_train.groupby('Cause').describe()

Unnamed: 0_level_0,Summary,Summary,Summary,Summary,Title,Title,Title,Title
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Cause,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Caught in/between Objects,36,36,Victim was died due to being struck by machine...,1,36,34,Died caught between objects,2
Collapse of object,5,5,The incident took place during the removal of ...,1,5,5,Died crushed by collapsing wall,1
Drowning,8,8,Victims carry out land clearing work using bul...,1,8,3,Died due to drowning,6
Electrocution,17,17,The victim was died due to electrocution. He i...,1,17,8,Died due to electrocution,4
Exposure to Chemical Substances,2,2,Victim was diagnosed with organophosphorus as ...,1,2,2,Died due to the exposure of hazardous chemical,1
Exposure to extreme temperatures,3,3,"Accident happened due to the problem of a ""top...",1,3,3,Died due to contact with hot water,1
Falls,56,56,A general worker fell down from 26th floor as ...,1,56,22,Died falling from height,32
Fires and Explosion,3,3,A fire incident took place inside a building u...,1,3,3,Died due to fire,1
Other,9,9,The victim was thrown away from a tractor's se...,1,9,8,Died being buried,2
Struck By Moving Objects,40,40,"During the incident, the victim, a constructio...",1,40,39,Fatal accident involving Express Bus,2


In [6]:
report_test.groupby('Cause').describe()

Unnamed: 0_level_0,Summary,Summary,Summary,Summary,Title,Title,Title,Title
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Cause,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Caught in/between Objects,11,11,"The victim, a foreign worker, was crushed to d...",1,11,8,Struck by a tree,2
Collapse of object,3,3,"The victim, 44 years, was buried with rubbish ...",1,3,3,Death caused by sudden floor collapse,1
Drowning,1,1,"The victim, 7 years, was found drowned at a ri...",1,1,1,Drowned at river re-construction site,1
Falls,17,17,"A foreign worker, age 26, died when he fell of...",1,17,12,Fall from height,4
Fires and Explosion,1,1,"A local worker, age 30, died when barrel drums...",1,1,1,Death due to exploding barrel drums,1
Other,8,8,"The victim, 40 years, was found dead in a toil...",1,8,8,Found dead in a toilet,1
Struck By Moving Objects,12,12,"The victim, 14 years, was killed after being h...",1,12,11,Hit by bulldozer,2


In [7]:
# Prepare two sets of training and test data
# One set based on the Summary column, One set bsed on the Title column

X_Summary_train = report_train['Summary']
X_Title_train = report_train['Title']
y_train = report_train['Cause']

X_Summary_test = report_test['Summary']
X_Title_test = report_test['Title']
y_test = report_test['Cause']

In [8]:
# Function to preprocess text
# - remove stop words
# - lemmanize
# - remove punctuation
# - remove numbers
# - convert to lower case
# - remove tokens which are less than 4 characters

def text_process(mess):
    stop = nltk.corpus.stopwords.words('english')
    wnl = nltk.WordNetLemmatizer()
    snowball = nltk.SnowballStemmer('english')
    
    tokens = word_tokenize(mess.replace("'", ""))
    tokens_nop = [ t for t in tokens if t not in string.punctuation ]
    tokens_char = [x for x in tokens_nop if not any(c.isdigit() for c in x)]
    tokens_lower = [ t.lower() for t in tokens_char ]
    tokens_nostop=[ t for t in tokens_lower if t not in stop ]
    tokens_lem = [ wnl.lemmatize(t) for t in tokens_nostop ]
    tokens_snow = [ snowball.stem(t) for t in tokens_lem ]
    tokens_clean = [ t for t in tokens_snow if len(t) >= 3 ]
    
    return tokens_clean

## Naive Bayes

* nb1_clf - classifier based on Title text
* nb2_clf - classifier based on Summary text

In [9]:
nb1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])
nb2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])

In [10]:
nb1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11df6de18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
predicted = nb1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0 17  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 0  0  0  3  2  0  0  3]
 [ 2  0  0  0  0  0  0 10]]
0.698113207547


In [12]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.82      0.82      0.82        11
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         0
                    Falls       0.74      1.00      0.85        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.77      0.83      0.80        12

              avg / total       0.60      0.70      0.64        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [46]:
nb2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11df6de18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [47]:
predicted = nb2_clf.predict(X_Summary_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 1  0  0  9  0  0  1]
 [ 0  0  0  3  0  0  0]
 [ 0  0  0  1  0  0  0]
 [ 0  0  0 16  0  0  1]
 [ 0  0  0  1  0  0  0]
 [ 0  0  0  7  0  0  1]
 [ 1  0  0  8  0  0  3]]
0.377358490566


In [15]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.50      0.09      0.15        11
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       0.00      0.00      0.00         1
                    Falls       0.36      0.94      0.52        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.50      0.25      0.33        12

              avg / total       0.33      0.38      0.27        53



  'precision', 'predicted', average, warn_for)


## Decision Tree
* dt1_clf - classifier based on Title text
* dt2_clf - classifier based on Summary text

In [16]:
dt1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
                    ])
dt2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
                    ])

In [17]:
dt1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11df6de18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [18]:
predicted = dt1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 5  0  0  0  1  0  0  5  0]
 [ 0  2  0  0  0  0  1  0  0]
 [ 0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0 16  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  5  2]
 [ 1  0  0  0  0  0  0 10  1]
 [ 0  0  0  0  0  0  0  0  0]]
0.641509433962


In [19]:
print(metrics.classification_report(y_test, predicted))

                                  precision    recall  f1-score   support

       Caught in/between Objects       0.71      0.45      0.56        11
              Collapse of object       1.00      0.67      0.80         3
                        Drowning       1.00      1.00      1.00         1
Exposure to extreme temperatures       0.00      0.00      0.00         0
                           Falls       0.94      0.94      0.94        17
             Fires and Explosion       0.00      0.00      0.00         1
                           Other       0.00      0.00      0.00         8
        Struck By Moving Objects       0.50      0.83      0.62        12
                     Suffocation       0.00      0.00      0.00         0

                     avg / total       0.64      0.64      0.62        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [43]:
dt2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11df6de18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [44]:
predicted = dt2_clf.predict(X_Summary_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  0  1  1]
 [ 0  2  0  0  0  1  0]
 [ 1  0  0  0  0  0  0]
 [ 4  0  1 10  1  1  0]
 [ 0  0  0  0  1  0  0]
 [ 5  0  0  1  0  2  0]
 [ 5  0  0  2  1  0  4]]
0.528301886792


In [45]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.38      0.82      0.51        11
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       0.00      0.00      0.00         1
                    Falls       0.77      0.59      0.67        17
      Fires and Explosion       0.33      1.00      0.50         1
                    Other       0.40      0.25      0.31         8
 Struck By Moving Objects       0.80      0.33      0.47        12

              avg / total       0.63      0.53      0.53        53



## SVM

* sv1_clf - classifier based on Title text
* sv2_clf - classifier based on Summary text

In [23]:
sv1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 
                                             ))
                    ])

sv2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 
                                             ))
                    ])

In [24]:
sv1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11df6de18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [25]:
predicted = sv1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  0  0  0  2]
 [ 0  2  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 1  0  0  0 15  0  0  1]
 [ 0  0  0  1  0  0  0  0]
 [ 1  0  0  3  1  0  1  2]
 [ 2  0  0  0  0  0  0 10]]
0.716981132075


In [26]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.69      0.82      0.75        11
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         0
                    Falls       0.94      0.88      0.91        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.50      0.12      0.20         8
 Struck By Moving Objects       0.67      0.83      0.74        12

              avg / total       0.75      0.72      0.71        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [48]:
sv2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x11df6de18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [49]:
predicted = sv2_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 7  2  0  0  0  0  1  1]
 [ 0  1  0  0  1  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  1  0 16  0  0  0]
 [ 1  0  0  0  0  0  0  0]
 [ 2  0  0  3  1  1  0  1]
 [ 2  0  3  0  1  1  0  5]]
0.566037735849


In [50]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.58      0.64      0.61        11
       Collapse of object       0.33      0.33      0.33         3
                 Drowning       0.20      1.00      0.33         1
            Electrocution       0.00      0.00      0.00         0
                    Falls       0.84      0.94      0.89        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.71      0.42      0.53        12

              avg / total       0.58      0.57      0.56        53



  'recall', 'true', average, warn_for)


## Label osha data

Use SVM model with Summary data

In [30]:
osha['Cause'] = sv2_clf.predict(osha.Summary).tolist()

In [31]:
osha = osha.applymap(str)

In [32]:
osha['Cause'] = sv1_clf.predict(osha['Title']).tolist()

In [33]:
osha[['Cause','Summary']].head()

Unnamed: 0_level_0,Cause,Summary
0,Unnamed: 1_level_1,Unnamed: 2_level_1
201079928,Exposure to extreme temperatures,burn industrial truck waste proc fac pa...
202561825,Struck By Moving Objects,truck flatbed truck trailer fall abdomen
200361855,Struck By Moving Objects,construction undrgrd power line highway ...
200361863,Struck By Moving Objects,waste proc fac industrial truck struck b...
201079324,Exposure to extreme temperatures,truck driver pump tank hot water struc...


In [34]:
summary = osha.groupby('Cause').Cause.count()
dfSummary = pd.DataFrame(summary)
dfSummary = dfSummary.rename(columns={'Cause': 'Count'})
total = dfSummary['Count'].sum()
dfSummary['Percentage'] = (dfSummary['Count']/total)*100
dfSummary

Unnamed: 0_level_0,Count,Percentage
Cause,Unnamed: 1_level_1,Unnamed: 2_level_1
Caught in/between Objects,1967,12.050481
Collapse of object,409,2.505667
Drowning,177,1.084359
Electrocution,1381,8.460455
Exposure to Chemical Substances,319,1.954298
Exposure to extreme temperatures,1499,9.183361
Falls,3326,20.376156
Fires and Explosion,552,3.381731
Other,285,1.746003
Struck By Moving Objects,6159,37.732035


## Determine whether incident was fatal

In [35]:
list_ = ['killed','fatally', 'fatal', 'dies', 'asphyxiated', 'die', 
         'doa', 'deceased', 'dead', 'death', 'kills', 'kill', 'drowned']

osha['Fatal'] = False
for index, row in osha.iterrows():
    if (any(word in str(row.Title).lower() for word in list_)
        or any(word in str(row.Description).lower() for word in list_)):
        osha.loc[index, 'Fatal'] = True

In [36]:
osha.head()

Unnamed: 0_level_0,Title,Description,Summary,Classification,Cause,Fatal
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
201079928,Employee Is Burned By Forklift Radiator Fluid,At approximately 11:30 a.m. on November 13 2...,burn industrial truck waste proc fac pa...,,Exposure to extreme temperatures,False
202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...,truck flatbed truck trailer fall abdomen,,Struck By Moving Objects,True
200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...,construction undrgrd power line highway ...,1 317290559 Fatality Other Occupation not re...,Struck By Moving Objects,True
200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...,waste proc fac industrial truck struck b...,,Struck By Moving Objects,True
201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...,truck driver pump tank hot water struc...,,Exposure to extreme temperatures,False


## Output results to CSV file

In [37]:
osha.to_csv("osha1.csv", index_label='Case')

## Determine which Accident types are more commonly resulting in fatality or catastrophe

In [38]:
dfFatalOsha = osha[osha['Fatal']==True]
dfFatalOsha.shape

(6816, 6)

In [39]:
summary = dfFatalOsha.groupby('Cause').Cause.count()
summary

Cause
Caught in/between Objects            830
Collapse of object                   206
Drowning                             154
Electrocution                        409
Exposure to Chemical Substances      106
Exposure to extreme temperatures     306
Falls                               1430
Fires and Explosion                  348
Other                                146
Struck By Moving Objects            2774
Suffocation                          107
Name: Cause, dtype: int64

In [40]:
dfSummary = pd.DataFrame(summary)
dfSummary = dfSummary.rename(columns={'Cause': 'Count'})
total = dfSummary['Count'].sum()
dfSummary['Percentage'] = (dfSummary['Count']/total)*100
dfSummary

Unnamed: 0_level_0,Count,Percentage
Cause,Unnamed: 1_level_1,Unnamed: 2_level_1
Caught in/between Objects,830,12.17723
Collapse of object,206,3.0223
Drowning,154,2.25939
Electrocution,409,6.000587
Exposure to Chemical Substances,106,1.555164
Exposure to extreme temperatures,306,4.489437
Falls,1430,20.980047
Fires and Explosion,348,5.105634
Other,146,2.142019
Struck By Moving Objects,2774,40.698357
