In [67]:
"""Assignment SVM Trial"""

# Dependencies
import os
import pandas as pd
import json
import string

import nltk
import numpy as np

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [68]:
# Initializing
stopwords = stopwords.words('english')
wnl = nltk.WordNetLemmatizer()
snowball = nltk.SnowballStemmer('english')
lancaster = nltk.LancasterStemmer()
porter = nltk.PorterStemmer()

In [69]:
stop = stopwords.words('english');
def my_tokenizer(text):
    tokens = nltk.tokenize.word_tokenize(str(text)) # split string into words (tokens)
    tokens = [t for t in tokens if t not in string.punctuation] # remove punctuations
    tokens = [t.lower() for t in tokens if len(t) > 2] # remove short words and downcase rest
    tokens = [t for t in tokens if t not in stop] # remove stopwords
    tokens = [wnl.lemmatize(t) for t in tokens] # put words into base form using Lemmatization
    #tokens = [snowball.stem(t) for t in tokens] # put words into base form using Snowbal Stemmer
    #tokens = [lancaster.stem(t) for t in tokens] # put words into base form using Lancaster Stemmer
    #tokens = [porter.stem(t) for t in tokens] # put words into base form using most popular stemmer
    text_after_process=" ".join(tokens)
    return(text_after_process)

AttributeError: 'list' object has no attribute 'words'

In [70]:
# Read Files
AccidentCases=pd.read_csv("Data/MsiaAccidentCases.csv", delimiter=',', encoding="utf-8-sig");
print(AccidentCases);


                               Cause   \
0           Caught in/between Objects   
1                               Other   
2            Struck By Moving Objects   
3           Caught in/between Objects   
4                 Fires and Explosion   
5                 Fires and Explosion   
6                               Falls   
7                       Electrocution   
8                               Falls   
9           Caught in/between Objects   
10                Fires and Explosion   
11          Caught in/between Objects   
12          Caught in/between Objects   
13                              Falls   
14                              Falls   
15   Exposure to extreme temperatures   
16                              Falls   
17                      Electrocution   
18                              Falls   
19                              Falls   
20                      Electrocution   
21           Struck By Moving Objects   
22                              Falls   
23   Exposure to

In [71]:
AccidentCases['Summary_Case'] = AccidentCases['Summary Case'].apply(my_tokenizer)
X_AccidentCases_Train = AccidentCases.Summary_Case
AccidentCases.head()

Unnamed: 0,Cause,Title Case,Summary Case,Summary_Case
0,Caught in/between Objects,Died being caught in between machines,The accident occurred as victim was assigned t...,accident occurred victim assigned inspect main...
1,Other,Died been buried,The accident occurred during the floor concret...,accident occurred floor concreting work falsew...
2,Struck By Moving Objects,Died crushed by entrance arch,Victim with four co-workers were installing wo...,victim four co-worker installing wood plate in...
3,Caught in/between Objects,Died due to mine cave-in,A series of avalanche trapped victim who was m...,series avalanche trapped victim mining prior i...
4,Fires and Explosion,Died being run over by a lorry,Accident involving an employee who has been ru...,accident involving employee run lorry riding m...


In [72]:
AccidentCases['Cause_Code'] = AccidentCases['Cause '].apply(my_tokenizer)
X_AccidentCases_Train = AccidentCases.Cause_Code
AccidentCases.head()


Unnamed: 0,Cause,Title Case,Summary Case,Summary_Case,Cause_Code
0,Caught in/between Objects,Died being caught in between machines,The accident occurred as victim was assigned t...,accident occurred victim assigned inspect main...,caught in/between object
1,Other,Died been buried,The accident occurred during the floor concret...,accident occurred floor concreting work falsew...,
2,Struck By Moving Objects,Died crushed by entrance arch,Victim with four co-workers were installing wo...,victim four co-worker installing wood plate in...,struck moving object
3,Caught in/between Objects,Died due to mine cave-in,A series of avalanche trapped victim who was m...,series avalanche trapped victim mining prior i...,caught in/between object
4,Fires and Explosion,Died being run over by a lorry,Accident involving an employee who has been ru...,accident involving employee run lorry riding m...,fire explosion


In [73]:
# Splitting data in training and test set
from sklearn.model_selection import train_test_split
X_Cases_Trn, X_Cases_Tst, Y_Cases_Trn, Y_Cases_Tst = train_test_split(AccidentCases.Summary_Case, AccidentCases.Cause_Code, test_size=0.25, random_state=12)

In [74]:
#print training data
print (X_Cases_Trn , Y_Cases_Trn)

106    victim found pinned truck body head time repai...
17     victim security guard struck lightning walking...
16     victim colleague dismantling shear wall lift s...
156    victim foreign worker fell mobile scaffold wir...
167    incident happened approximately around 1.00 in...
164    approximately engineer died diving installatio...
140    victim died scene due crushed overturned mecha...
32     victim lorry driver found dead believed died d...
150    victim malaysian citizen died falling height m...
181    victim 25-year old local citizen work welder y...
92     accident involving express bus occured 301.5 n...
103    accident involving local worker caught two con...
111    accident occured victim crushed 100 piece mirr...
38     unbalanced forklift driven victim caused overt...
31     victim colleague assigned cutting tree victim ...
108    incident victim driving tractor came steep sha...
62     victim carrying road leveling work using steam...
33     victim crane operator li

In [75]:
##############################################################################
#Create dtm using word frequency
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer( )

X_Cases_Trn_counts = count_vect.fit_transform(X_Cases_Trn)
print (X_Cases_Trn_counts.shape)

Y_Cases_Trn_counts = count_vect.fit_transform(Y_Cases_Trn)
print (Y_Cases_Trn_counts.shape)


(136, 675)
(136, 17)


In [76]:
#Create dtm by using Term Frequency. 
#Divide the number of occurrences of each word in a document 
#by the total number of words in the document: 
#these new features are called tf for Term Frequencies
#If set use_idf=True, which mean create dtm by using tf_idf
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=True).fit(X_Cases_Trn_counts)
X_Cases_Trn_tf = tfidf_transformer.transform(X_Cases_Trn_counts)
print (X_Cases_Trn_tf.shape)

tfidf_transformer = TfidfTransformer(use_idf=True).fit(Y_Cases_Trn_counts)
Y_Cases_Trn_tf = tfidf_transformer.transform(Y_Cases_Trn_counts)
print (Y_Cases_Trn_tf.shape)

(136, 675)
(136, 17)


In [77]:
#Build a pipeline: Combine multiple steps into one
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),  
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])

# SVM

In [78]:
#SVM
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(
                                            alpha=1e-3 #0.001
                                             ))
                    ])

In [79]:
print (X_Cases_Trn_tf)
#Use pipeline to train the model
text_clf.fit(X_Cases_Trn,Y_Cases_Trn ) 

  (0, 645)	0.0861130075538
  (0, 259)	0.242226031135
  (0, 444)	0.318241402322
  (0, 621)	0.669431561433
  (0, 78)	0.304780855071
  (0, 290)	0.260030690759
  (0, 597)	0.267067308926
  (0, 484)	0.355954948468
  (0, 668)	0.161117012093
  (1, 645)	0.0913575132596
  (1, 522)	0.409391569201
  (1, 280)	0.409391569201
  (1, 573)	0.262782729551
  (1, 360)	0.355100840671
  (1, 647)	0.409391569201
  (1, 517)	0.409391569201
  (1, 268)	0.355100840671
  (2, 645)	0.152743713456
  (2, 131)	0.260210540907
  (2, 186)	0.282241759677
  (2, 533)	0.342237799094
  (2, 648)	0.230615875855
  (2, 356)	0.260210540907
  (2, 531)	0.342237799094
  (2, 579)	0.185230624079
  :	:
  (133, 581)	0.318272931171
  (133, 245)	0.318272931171
  (133, 87)	0.345038892905
  (133, 219)	0.345038892905
  (134, 645)	0.154557002324
  (134, 668)	0.14458769424
  (134, 186)	0.285592377712
  (134, 239)	0.140016613949
  (134, 293)	0.20451188873
  (134, 252)	0.239668337333
  (134, 183)	0.140016613949
  (134, 302)	0.200724893156
  (134, 65

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [80]:
from sklearn import metrics 
predicted = text_clf.predict(X_Cases_Tst)
 #X_Cases_Tst
#    Y_Cases_Tst
print(metrics.confusion_matrix(Y_Cases_Tst, predicted))
print(np.mean(predicted == Y_Cases_Tst) )
#y_test.value_counts()

[[ 0  0  2  0  0  0  1  0  1  0]
 [ 0  4  1  0  0  0  2  0  5  0]
 [ 0  1  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0]
 [ 0  1  0  0  0  0  1  0  0  0]
 [ 0  1  0  0  0  0 10  0  4  0]
 [ 0  0  0  0  0  0  0  1  0  0]
 [ 0  2  0  0  1  0  0  0  4  0]
 [ 0  0  1  0  0  0  0  0  0  0]]
0.45652173913
