### Build classifer to label causes for OSHA data

* Use Malaysia pre-labelled data to build a classifier to label OSHA.xls data
* Test three types of classifiers (Naive Bayes, Decision Tree, SVM)

In [1]:
# import dependencies

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics 
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import tree

import string

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
%matplotlib inline

In [2]:
osha=pd.read_excel('osha.xlsx',names = ["Case", "Title", "Description","Summary", "Classification"])

In [3]:
# Read in labelled Malaysia Data
report_train=pd.read_excel('MsiaAccidentCasesTrain.xlsx',names = ["Cause", "Title", "Summary"])
report_test=pd.read_excel('MsiaAccidentCasesTest.xlsx',names = ["Cause", "Title", "Summary"])

In [4]:
report_train.groupby('Cause').describe()

Unnamed: 0_level_0,Summary,Summary,Summary,Summary,Title,Title,Title,Title
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Cause,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Caught in/between Objects,36,36,Victim was found pinned between a truck's body...,1,36,34,Died crushed by object,2
Collapse of object,5,5,A building structure collapsed while two worke...,1,5,5,Death due to structural collapse,1
Drowning,8,8,A child who was not being accompanied by his g...,1,8,3,Died due to drowning,6
Electrocution,17,17,The victim was electrocuted as he was testing ...,1,17,8,Died due to electrocution,4
Exposure to Chemical Substances,2,2,Victim was diagnosed with organophosphorus as ...,1,2,2,Died due to the exposure of hazardous chemical,1
Exposure to extreme temperatures,2,2,The victim suffered 3rd degree burns from spla...,1,2,2,Died due to contact with hot water,1
Falls,56,56,Victim fell from a high place while performing...,1,56,22,Died falling from height,32
Fires and Explosion,4,4,Three workers were died due to coal explosion,1,4,4,Died due to fire,1
Other,9,9,Victim was concreting the floor as support str...,1,9,8,Died being buried,2
Others,1,1,A worker has been stung by wasps while,1,1,1,Died stung by wasps,1


In [5]:
# Prepare two sets of training and test data
# One set based on the Summary column, One set bsed on the Title column


X_Summary_train = report_train['Summary']
X_Title_train = report_train['Title']
y_train = report_train['Cause']

X_Summary_test = report_test['Summary']
X_Title_test = report_test['Title']
y_test = report_test['Cause']

In [6]:
# Function to preprocess text
# - remove stop words
# - lemmanize
# - remove punctuation
# - remove numbers
# - convert to lower case
# - remove tokens which are less than 4 characters

def text_process(mess):
    stop = nltk.corpus.stopwords.words('english')
    wnl = nltk.WordNetLemmatizer()
    snowball = nltk.SnowballStemmer('english')
    
    tokens = word_tokenize(mess.replace("'", ""))
    tokens_nop = [ t for t in tokens if t not in string.punctuation ]
    tokens_char = [x for x in tokens_nop if not any(c.isdigit() for c in x)]
    tokens_lower = [ t.lower() for t in tokens_char ]
    tokens_nostop=[ t for t in tokens_lower if t not in stop ]
    tokens_lem = [ wnl.lemmatize(t) for t in tokens_nostop ]
    tokens_snow = [ snowball.stem(t) for t in tokens_lem ]
    tokens_clean = [ t for t in tokens_snow if len(t) >= 3 ]
    
    return tokens_clean

## Naive Bayes

* nb1_clf - classifier based on Title text
* nb2_clf - classifier based on Summary text

In [7]:
nb1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])
nb2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])

In [8]:
nb1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x000000000CC07158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [9]:
predicted = nb1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  1  0  0  0]
 [ 0  0  0  0  2  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0]
 [ 0  0  0  0 17  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 0  0  0  3  2  0  0  3]
 [ 2  0  0  0  0  0  0 10]]
0.698113207547


In [10]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.82      0.90      0.86        10
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.74      1.00      0.85        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.77      0.83      0.80        12

              avg / total       0.58      0.70      0.63        53



  'precision', 'predicted', average, warn_for)


In [11]:
nb2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x000000000CC07158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [12]:
predicted = nb1_clf.predict(X_Summary_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 4  0  0  0  0  0  0  6]
 [ 1  0  0  0  1  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1]
 [ 1  0  0  0 15  0  0  1]
 [ 0  0  0  0  1  0  0  0]
 [ 2  0  0  1  3  0  0  2]
 [ 3  0  0  0  1  0  0  8]]
0.528301886792


In [13]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.36      0.40      0.38        10
       Collapse of object       0.00      0.00      0.00         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.71      0.88      0.79        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.44      0.67      0.53        12

              avg / total       0.42      0.53      0.46        53



  'precision', 'predicted', average, warn_for)


## Decision Tree
* dt1_clf - classifier based on Title text
* dt2_clf - classifier based on Summary text

In [14]:
dt1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
                    ])
dt2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
                    ])

In [15]:
dt1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x000000000CC07158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [16]:
predicted = dt1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 4  0  0  0  0  1  0  0  5]
 [ 0  2  0  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0 16  0  0  0]
 [ 0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  7]
 [ 1  0  0  0  0  0  0  0 11]]
0.641509433962


In [17]:
print(metrics.classification_report(y_test, predicted))

                                  precision    recall  f1-score   support

       Caught in/between Objects       0.57      0.40      0.47        10
              Collapse of object       1.00      0.67      0.80         3
                        Drowning       1.00      1.00      1.00         1
                   Electrocution       0.00      0.00      0.00         1
Exposure to extreme temperatures       0.00      0.00      0.00         0
                           Falls       0.94      0.94      0.94        17
             Fires and Explosion       0.00      0.00      0.00         1
                           Other       0.00      0.00      0.00         8
        Struck By Moving Objects       0.48      0.92      0.63        12

                     avg / total       0.59      0.64      0.60        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [18]:
dt2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x000000000CC07158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [19]:
predicted = dt2_clf.predict(X_Summary_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  1  0  0  0  0  0]
 [ 1  2  0  0  0  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0]
 [ 2  0  1  0  0 10  0  3  1]
 [ 0  0  0  1  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  2  2]
 [ 6  0  0  0  1  2  0  0  3]]
0.509433962264


In [20]:
print(metrics.classification_report(y_test, predicted))

                                 precision    recall  f1-score   support

      Caught in/between Objects       0.39      0.90      0.55        10
             Collapse of object       1.00      0.67      0.80         3
                       Drowning       0.50      1.00      0.67         1
                  Electrocution       0.00      0.00      0.00         1
Exposure to Chemical Substances       0.00      0.00      0.00         0
                          Falls       0.83      0.59      0.69        17
            Fires and Explosion       0.00      0.00      0.00         1
                          Other       0.40      0.25      0.31         8
       Struck By Moving Objects       0.50      0.25      0.33        12

                    avg / total       0.58      0.51      0.50        53



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## SVM

* sv1_clf - classifier based on Title text
* sv2_clf - classifier based on Summary text

In [21]:
sv1_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 
                                             ))
                    ])

sv2_clf = Pipeline([('vect', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 
                                             ))
                    ])

In [22]:
sv1_clf.fit(X_Title_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x000000000CC07158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [23]:
predicted = sv1_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  1  0  0  0]
 [ 0  2  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0]
 [ 1  0  0  0 16  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 1  0  0  3  1  0  1  2]
 [ 2  0  0  0  0  0  0 10]]
0.735849056604


In [24]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.64      0.90      0.75        10
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       1.00      1.00      1.00         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.89      0.94      0.91        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.50      0.12      0.20         8
 Struck By Moving Objects       0.83      0.83      0.83        12

              avg / total       0.75      0.74      0.72        53



  'precision', 'predicted', average, warn_for)


In [25]:
sv2_clf.fit(X_Summary_train,y_train ) 

Pipeline(steps=[('vect', CountVectorizer(analyzer=<function text_process at 0x000000000CC07158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [26]:
predicted = sv2_clf.predict(X_Title_test)
 
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )

[[ 9  0  0  0  0  0  1  0]
 [ 0  2  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1]
 [ 0  0  1  0 15  0  0  1]
 [ 1  0  0  0  0  0  0  0]
 [ 2  0  0  5  1  0  0  0]
 [ 3  0  3  0  0  0  0  6]]
0.622641509434


In [27]:
print(metrics.classification_report(y_test, predicted))

                           precision    recall  f1-score   support

Caught in/between Objects       0.60      0.90      0.72        10
       Collapse of object       1.00      0.67      0.80         3
                 Drowning       0.20      1.00      0.33         1
            Electrocution       0.00      0.00      0.00         1
                    Falls       0.94      0.88      0.91        17
      Fires and Explosion       0.00      0.00      0.00         1
                    Other       0.00      0.00      0.00         8
 Struck By Moving Objects       0.75      0.50      0.60        12

              avg / total       0.64      0.62      0.61        53



  'precision', 'predicted', average, warn_for)


## Label osha data

Use SVM model with Summary data

In [28]:
osha['Cause'] = sv2_clf.predict(osha.Summary).tolist()

In [29]:
osha[['Cause','Summary']].head()

Unnamed: 0,Cause,Summary
0,Caught in/between Objects,truck flatbed truck trailer fall abdomen
1,Struck By Moving Objects,construction undrgrd power line highway ...
2,Caught in/between Objects,waste proc fac industrial truck struck b...
3,Struck By Moving Objects,truck driver pump tank hot water struc...
4,Exposure to extreme temperatures,burn spill arm chest abdomen
