In [None]:
# import packages
import pandas as pd
import numpy as np
import re
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Notebook-wide plot defaults: figure size (inches) and base font size.
plt.rcParams['figure.figsize'] = (12,10)
plt.rcParams['font.size'] = 12

# Import scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer
# NLP modules
import nltk 
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder 
import spacy
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from pandas and scikit-learn.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the cell output.
%matplotlib inline




In [None]:
# Name of the spaCy pipeline package to load.
en='en_core_web_sm'
# Load the pipeline. NOTE(review): `nlp` is not referenced by any later cell
# visible in this notebook.
nlp=spacy.load(en)

In [None]:
# Load the claims data; the path is relative to the notebook's working directory.
general_df=pd.read_csv('data/NSS_DS_data.thegeneral.csv')

In [None]:
# Preview the first rows of the table.
general_df.head()

In [None]:
# Column names available in the table.
general_df.columns

In [None]:
# Column dtypes and non-null counts.
general_df.info()

In [None]:
# Number of claims in each claim group (index: ClaimGroup).
claimgroup_df = general_df.groupby('ClaimGroup')[['ClaimID_RGEN']].count()
# Total number of counted claims across all groups.
total_claim = claimgroup_df['ClaimID_RGEN'].sum()

# Convert the counts to each group's fraction of the total. The column is
# still called 'count' afterwards, although it now holds a share in [0, 1].
claimgroup_df = claimgroup_df.div(total_claim).rename(columns={'ClaimID_RGEN': 'count'})
claimgroup_df.plot(kind='bar')

In [None]:
# Number of claims recorded for each severity level, plotted as a bar chart.
severity_df = (
    general_df
    .groupby('SeverityTypeName')[['ClaimID_RGEN']]
    .count()
    .rename(columns={'ClaimID_RGEN': 'count'})
)
severity_df.plot(kind='bar')

In [None]:
# Count claims for every (state, severity) pair; the result is indexed by
# both StateName and SeverityTypeName.
severity_byState=general_df[['StateName','SeverityTypeName','ClaimID_RGEN']].groupby(['StateName','SeverityTypeName']).count()
severity_byState.head()

In [None]:
# Rename the counted ID column to 'count' (modifies severity_byState in place).
severity_byState.rename(columns={'ClaimID_RGEN':'count'},inplace=True)


In [None]:
severity_byState.head()

In [None]:
# Move the SeverityTypeName index level into an ordinary column.
# NOTE(review): rebinding the same name makes this cell order-dependent; it is
# expected to fail if run twice, because the level no longer exists.
severity_byState=severity_byState.reset_index('SeverityTypeName')

In [None]:
severity_byState.head()

In [None]:
# Move the remaining index (StateName) into a column too, so the later
# per-severity cells can filter on SeverityTypeName and re-index by StateName.
severity_byState=severity_byState.reset_index()

In [None]:
severity_byState.head()

In [None]:
# Per-state claim counts for the "Major" severity level, indexed by state name.
is_major = severity_byState['SeverityTypeName'] == 'Major (hospitalization 3+ days or ICU)'
severity_byState_major = (
    severity_byState
    .loc[is_major]
    .drop(columns='SeverityTypeName')
    .set_index('StateName')
)
severity_byState_major.head()

In [None]:
def _severity_counts_by_state(severity_label):
    """Return the per-state claim counts for one severity level.

    Filters the module-level ``severity_byState`` table to rows whose
    SeverityTypeName equals ``severity_label``, drops that column and
    indexes the remaining 'count' column by StateName.
    """
    rows = severity_byState[severity_byState.SeverityTypeName == severity_label]
    return rows.drop('SeverityTypeName', axis=1).set_index('StateName')


# moderate accidents by state
severity_byState_Moderate = _severity_counts_by_state('Moderate')
# minor accidents by state
severity_byState_Minor = _severity_counts_by_state('Minor')
# death causing accidents by state
severity_byState_Death = _severity_counts_by_state('Death')
# life threatening accidents by state (computed here, not plotted in this cell)
severity_byState_Life_threatening = _severity_counts_by_state('Life-threatening')

# One figure per severity level. The axes are created explicitly and handed to
# pandas via `ax=`: DataFrame.plot without `ax` opens a figure of its own, so
# preparing figures with plt.figure()/plt.subplot() beforehand only leaves
# empty figures behind.
severity_panels = [
    (severity_byState_major, 'number of major accidents', 'Major Accidents by State'),
    (severity_byState_Moderate, 'number of moderate accidents', 'Moderate Accidents by State'),
    (severity_byState_Minor, 'number of minor accidents', 'Minor Accidents by State'),
    (severity_byState_Death, 'number of death accidents', 'Death Accidents by State'),
]
for counts, y_label, title in severity_panels:
    fig, ax = plt.subplots()
    counts.plot(kind='bar', ax=ax)
    ax.set_ylabel(y_label)
    ax.set_title(title)

### Pennsylvania has the highest number of major accidents

In [None]:
# Bar chart of major-severity claim counts per state.
major_ax = severity_byState_major.plot.bar()
major_ax.set_title('Major Accidents category by states')
major_ax.set_ylabel('number of accidents')

In [None]:
# Distinct severity labels present in the data.
general_df['SeverityTypeName'].unique()

In [None]:
# dtype pandas assigned to the InjuryDescription column.
general_df['InjuryDescription'] .dtype

In [None]:
# Column dtypes and non-null counts for the whole table.
general_df.info()

In [None]:
# Same dtype check as two cells earlier.
general_df['InjuryDescription'].dtype

In [None]:
# Lower-case the accident descriptions so word matching is case-insensitive.
general_df['AccidentDescription'] = general_df['AccidentDescription'].str.lower() 

In [None]:
# Remove any whitespace before and after each description.
general_df['AccidentDescription'] =general_df['AccidentDescription'].str.strip()

In [None]:
# Strip punctuation: delete every character that is neither a word character
# nor whitespace. An explicit '' replacement is required - without it nothing
# is removed. The pattern is a raw string so \w and \s reach the regex engine
# unchanged. Missing descriptions stay missing.
general_df['AccidentDescription'] = general_df['AccidentDescription'].str.replace(r'[^\w\s]', '', regex=True)


In [None]:
# Build the English stop-word set that a later cell removes from the descriptions.
from nltk.corpus import stopwords

# Download the stop-word corpus.
nltk.download('stopwords')
# English stop words, held in a set.
stop_words = set(stopwords.words('english')) 

### To apply `split` or `word_tokenize` to a column, first convert it to string type with `astype(str)`

In [None]:
def _without_stop_words(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed."""
    kept_tokens = [token for token in text.split() if token not in (stop_words)]
    return ' '.join(kept_tokens)


# Remove the stop words from the AccidentDescription column and assign the
# result back to general_df. astype(str) comes first so every value supports
# .split(); it also turns missing values into the literal text 'nan'.
general_df['AccidentDescription'] = general_df['AccidentDescription'].astype(str).apply(_without_stop_words)

### - Predict the causes of loss one at a time
## - Comment and uncomment the `losscause` definitions below so that only one is active at a time

In [None]:
# Binary target: 1 when the claim's loss cause equals the selected cause, else 0.
# Exactly one `target_cause` line should be active at a time.
target_cause = 'IV rear-end CV'  # insured vehicle rear-ends claimant vehicle
# target_cause = 'Collision in an intersection'  # collision at an intersection
# target_cause = 'Collision with motor vehicle'  # collision with a motor vehicle
general_df['losscause'] = (general_df['LossCauseName'] == target_cause).astype(int)

In [None]:
# map severity levels to numeric
            
##general_df['LossCauseName']=general_df.SeverityTypeName.map({'Death':0,'Life-threatening':1,'Major (hospitalization 3+ days or ICU)':2,'Minor':3,'Moderate':4})


In [None]:
# Number of missing values in each column.
general_df.isna().sum()

In [None]:
# Feature: the cleaned accident description text. Target: the binary loss-cause flag.
X = general_df['AccidentDescription']
y = general_df['losscause']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.30,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from sklearn.metrics import classification_report

# Train and evaluate the model.
# Learn the bag-of-words vocabulary from the training descriptions and encode
# them as a term-count matrix in a single pass; `vect` keeps the fitted
# vocabulary so the test descriptions can be encoded the same way later.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)

In [None]:
# Multinomial Naive Bayes (smoothing alpha = 0.1), fitted on the vectorized training text.
clfrNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
# Encode the held-out descriptions with the training vocabulary and predict their loss-cause flag.
pred = clfrNB.predict(vect.transform(X_test))
# Fraction of test rows whose predicted flag equals the true flag.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)

### Actually IV rear-end CV, but predicted as not IV rear-end CV (FN) = 1335
### Actually not IV rear-end CV, but predicted as IV rear-end CV (FP) = 4485
### Actually IV rear-end CV, and predicted as IV rear-end CV (TP) = 7899

In [None]:
# Take the four confusion-matrix cells from the current predictions instead of
# typing them in by hand, so the figures below stay correct when the target
# loss cause (or the model) changes. With labels=[0, 1] the flattened matrix
# is ordered TN, FP, FN, TP.
TN, FP, FN, TP = metrics.confusion_matrix(y_test, pred, labels=[0, 1]).ravel()

# precision: proportion of the rows predicted positive that really are positive
# (i.e. how many of those identified as positives were actually positives)
precision=TP/(TP+FP)
print('precision',round((precision),2))

# recall: proportion of the actual positives that were predicted positive
# (i.e. how many of the positives were correctly identified as positives)
recall=TP/(TP+FN)
print('recall',round((recall),2))

# accuracy is the diagonal of the confusion matrix over all predictions
# cp = correct predictions
# wp = wrong predictions
# TotP = total predictions
cp=(TN + TP)
wp=(FN + FP)
TotP=cp+wp
accuracy_score=cp/TotP
print('accuracy_score',round((accuracy_score),2))

In [None]:
# Accuracy computed when the model was fitted, followed by the raw confusion matrix.
print('accuracy',round((score),2))
cm=confusion_matrix(y_test, pred)
print(cm)

# ROC AUC measures how well a continuous score ranks positives above negatives,
# so give it the predicted probability of the positive class rather than the
# hard 0/1 predictions. Column 1 of predict_proba is class 1, because
# classes_ is sorted.
positive_proba = clfrNB.predict_proba(vect.transform(X_test))[:, 1]
auc = roc_auc_score(y_test, positive_proba)
print('AUC',round((auc),2))

# Per-class precision, recall and F1 for the hard predictions.
print(classification_report(y_test,pred))

In [None]:
# plot confusion matrix
# `plot_confusion_matrix` is not defined or imported anywhere in this notebook,
# so the plot is drawn with scikit-learn's ConfusionMatrixDisplay. Rows and
# columns are ordered positive class (1) first, then 0.
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, pred, labels=[1, 0]),
    display_labels=[1, 0],
)
cm_display.plot()
cm_display.ax_.set_title('Confusion matrix, without normalization')

In [None]:
# word_tokenize relies on NLTK's Punkt tokenizer data. Fetch it before
# tokenizing so this cell also works on a fresh kernel or machine (the call
# only reports "up-to-date" when the data is already present).
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead - follow
# the LookupError message if one appears.
nltk.download('punkt')
# Tokenize each injury description. astype(str) guarantees string input and
# turns missing values into the literal text 'nan'.
general_df['InjuryDescription_tokens']=general_df['InjuryDescription'].astype(str).apply(nltk.word_tokenize)

In [None]:
# Preview the token lists for the first rows.
general_df['InjuryDescription_tokens'].head()

In [None]:
## test['tweet'].apply(lambda x: [item for item in x if item not in stop])

In [None]:
from collections import Counter

In [None]:
# For every row, map each token of the injury description to the number of
# times it occurs in that row (a plain dict per row).
general_df['word_counts_InjuryDescr'] = general_df['InjuryDescription_tokens'].apply(
    lambda tokens: dict(Counter(tokens))
)

In [None]:
# Preview the per-row token counts for the first rows.
general_df['word_counts_InjuryDescr'].head()

In [None]:
from gensim.corpora.dictionary import Dictionary 

In [None]:
#dictionary = Dictionary(general_df['InjuryDescription_tokens'])
#print(dictionary.token2id)

In [None]:
#corpus = [dictionary.doc2bow(doc) for doc in general_df['InjuryDescription_tokens']]

In [None]:
#corpus

In [None]:
# import Tfidf model from gensim.models.tfidfmodel

#from gensim.models.tfidfmodel import TfidfModel

In [None]:
# give weights to the tokens in the fifth document of the corpus
#tfidf=TfidfModel(corpus)
# calculate tfidf weights by passing corpus to tfidf
#tfidf[corpus[4]]

In [None]:
# Number of claims recorded for each loss cause (index: LossCauseName).
cause_df = (
    general_df
    .groupby('LossCauseName')[['ClaimID_RGEN']]
    .count()
    .rename(columns={'ClaimID_RGEN': 'count'})
)
print(cause_df)
cause_df.plot(kind='bar')

In [None]:
# Replace the raw counts with each cause's share of all rows in general_df
# (a fraction between 0 and 1), largest share first.
cause_df = (
    cause_df
    .assign(cause_share=cause_df['count'] / len(general_df))
    .drop(columns='count')
    .sort_values('cause_share', ascending=False)
)


In [None]:
# Keep only the first ten rows of cause_df, then preview the first five.
cause_df = cause_df.iloc[:10]
cause_df.head()

In [None]:
# cause_share holds fractions (count / number of rows), while the axis label
# and title promise percentages. Scale by 100 for the plot only; cause_df
# itself is left unchanged.
share_ax = cause_df.mul(100).plot(kind='bar')
share_ax.set_ylabel('Percentage')
share_ax.set_xlabel('')
share_ax.set_title("Percentage Share of Accident Causes")

In [None]:
# Total number of rows in the claims table.
len(general_df)

In [None]:
# NOTE(review): no mapping for LossCauseName is defined anywhere in this
# notebook, and assigning the uncalled `Series.map` method to the column would
# overwrite every loss cause with a method object. The column is therefore
# left untouched; this cell only shows how often each loss cause occurs.
general_df['LossCauseName'].value_counts(dropna=False)

In [None]:
# Download NLTK's Punkt tokenizer data, the resource nltk.word_tokenize relies on.
nltk.download('punkt')

In [None]:
# Who was at fault: number of claims per fault rating, plotted as a bar chart.
fualts = (
    general_df
    .groupby('FaultRatingName')[['ClaimID_RGEN']]
    .count()
    .rename(columns={'ClaimID_RGEN': 'count'})
)
fualts.plot(kind='bar')