In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [42]:
data = pd.read_excel("TextLabelDataframe.xlsx")
data = data.dropna()
data = data.reset_index(drop= True)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443 entries, 0 to 442
Data columns (total 2 columns):
Text       443 non-null object
FaxType    443 non-null object
dtypes: object(2)
memory usage: 7.0+ KB
None


In [43]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import words
porter = PorterStemmer()

In [44]:
temp = []
for i in range(data.shape[0]):
    temp.append(data['Text'][i])
temp = "".join(temp)

from collections import Counter
alphabets = "abcdefghijklmnopqrstuvwxyz"
alphabets = list(alphabets)
tempd = dict(Counter(temp))
keys = list(tempd.keys())
to_remove = []
for i in range(len(keys)):
    if keys[i] not in alphabets:
        to_remove.append(keys[i])
print(to_remove)

['0', '1', ' ', '2', '9', '4', '3', '6', '’', '8', '5', '@', '7', '®', '|', '$', '%', '‘', '&', '—', '“', '°', '\\', '£', '}', '=', '«', '»', '"', "'", '<', '[', '+', '#', '™', ']', '*', '~', '©', '§', 'é', '{', '”', '¢', '€', '¥']


In [45]:
to_remove.remove(' ')
print(to_remove)

['0', '1', '2', '9', '4', '3', '6', '’', '8', '5', '@', '7', '®', '|', '$', '%', '‘', '&', '—', '“', '°', '\\', '£', '}', '=', '«', '»', '"', "'", '<', '[', '+', '#', '™', ']', '*', '~', '©', '§', 'é', '{', '”', '¢', '€', '¥']


In [46]:
Text = []
for i in range(data.shape[0]):
    text = data['Text'][i]
    for item in to_remove:
        text = text.replace(item, '')
    Text.append(text)

In [47]:
en = []
words = set(words.words())
for i in range(len(Text)):
    s = " ".join(w for w in nltk.wordpunct_tokenize(Text[i]) if w.lower() in words or not w.isalpha())
    en.append(s)
    
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

stemmed = []
for i in range(len(en)):
    x = stemSentence(en[i])
    stemmed.append(x)

In [48]:
data['ProcessedText'] = stemmed

In [49]:
print(data.head())

                                                Text FaxType  \
0  01 11 2019 4 39 12 pm 0600 cvs caremark page 1...     Fly   
1  ne ity rar ri anna cn nna afl nn yn 4 fy janua...     Fly   
2  | ® | ann hn ba | | vagus aar january 11 2019 ...     Fly   
3  prime rf 10 6 1 11 2019 3 57 12 pm page 1 001 ...     Fly   
4  5125333179 11 26 21 01 11 2019 1 1 tas png jan...    Thmp   

                                       ProcessedText  
0  page pharmaci updat updat c hang layout networ...  
1  ne anna yn new plan inform black river memori ...  
2  ann ba vagu portal enhanc excit announc new en...  
3  prime page server f prime therapeut fraud tip ...  
4  e j pharmaci seal ni drug order form lile x co...  


In [50]:
from sklearn.feature_extraction.text import CountVectorizer
text_vectorizer = CountVectorizer()
text_features = text_vectorizer.fit_transform(data['ProcessedText'])
text_features.get_shape()

(443, 5927)

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_features = tfidf_vectorizer.fit_transform(data['ProcessedText'])
print('Shape of Sparse Matrix: ', tfidf_features.shape)
print('Amount of Non-Zero occurences: ', tfidf_features.nnz)

Shape of Sparse Matrix:  (443, 5927)
Amount of Non-Zero occurences:  67818


In [52]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(tfidf_features, data['FaxType'],
                                                 test_size = 0.25, random_state= 0)

In [53]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report

In [54]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators= 175)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test, pred))

0.6306306306306306


In [55]:
print(classification_report(y_test, pred, target_names= y_test.unique()))

              precision    recall  f1-score   support

         Fly       1.00      1.00      1.00         1
   Injection       1.00      0.33      0.50         3
          IV       1.00      0.80      0.89         5
        Oral       1.00      0.40      0.57         5
     Horizon       1.00      1.00      1.00         2
     General       1.00      1.00      1.00         1
         DME       1.00      0.50      0.67         2
       Order       0.95      0.87      0.91        23
         DVR       0.68      0.88      0.77        17
       Audit       0.50      0.25      0.33         4
 Phy Billing       1.00      0.60      0.75        10
        Auth       0.00      0.00      0.00         9
         Inv       0.21      0.67      0.32         6
    HCARES18       1.00      1.00      1.00         2
      Refill       0.44      0.58      0.50        12
         EOB       0.25      0.40      0.31         5
        Thmp       0.00      0.00      0.00         1
        Demo       0.00    

In [56]:
test_labels = list(y_test.unique())
print(test_labels)

['Fly', 'Injection', 'IV', 'Oral', 'Horizon', 'General', 'DME', 'Order', 'DVR', 'Audit', 'Phy Billing', 'Auth', 'Inv', 'HCARES18', 'Refill', 'EOB', 'Thmp', 'Demo', 'Ans']


In [57]:
print(confusion_matrix(y_test, pred, labels= y_test.unique()))
cm = confusion_matrix(y_test, pred, labels= y_test.unique())
print(type(cm))

[[20  1  0  1  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  4  0  1  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0  0  0  1  0  6  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  7  0  1  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  4  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  2  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  1  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0

In [58]:
print(confusion_matrix(y_test, pred, labels= ['Fly', 'IV']))

[[20  0]
 [ 0  0]]


In [59]:
error = []
for i in range(cm.shape[0]):
    p = 0
    for j in range(cm.shape[1]):
        p += cm[i][j]
    e = float((p-cm[i][i])/p)
    error.append(e)
print(error)

[0.13043478260869565, 0.3333333333333333, 1.0, 0.4166666666666667, 0.4, 0.11764705882352941, 0.6, 0.6, 0.0, 0.6666666666666666, 1.0, 0.2, 0.0, 0.75, 1.0, 0.5, 0.0, 0.0, 0.0]


In [60]:
print("The classification error when accuracy is between 63-65%")
print()
for i in range(len(test_labels)):
    print('For {} the error is {}'.format(test_labels[i], error[i]))
    print()

The classification error when accuracy is between 63-65%

For Fly the error is 0.13043478260869565

For Injection the error is 0.3333333333333333

For IV the error is 1.0

For Oral the error is 0.4166666666666667

For Horizon the error is 0.4

For General the error is 0.11764705882352941

For DME the error is 0.6

For Order the error is 0.6

For DVR the error is 0.0

For Audit the error is 0.6666666666666666

For Phy Billing the error is 1.0

For Auth the error is 0.2

For Inv the error is 0.0

For HCARES18 the error is 0.75

For Refill the error is 1.0

For EOB the error is 0.5

For Thmp the error is 0.0

For Demo the error is 0.0

For Ans the error is 0.0



In [21]:
import dill

In [61]:
data[data['FaxType'] == 'Injection']

Unnamed: 0,Text,FaxType,ProcessedText
215,1 11 2019 12 30 pm baylor +18558227838 p 1 sen...,Injection,p sent facsimil number phone date compani coll...
216,1 10 2019 6 54 pm fax server +17136614828 p 1 ...,Injection,server p sent compani phone phone e mail date ...
217,1 11 2019 12 47 pm fax server +17136614828 p 4...,Injection,server p sent new e script prescript by l pati...
218,jan 11 2019 03 34 pm 17136614828 page 1 1 sout...,Injection,page pharmaci fe fe h h sh oe ae us oe us lue ...
219,#570 p 001 001 med spec clin southside kim 01 ...,Injection,p spec kim sate f address harborsid p p box am...
220,1 11 2019 5 19 pm fax server +17136614828 p 1 ...,Injection,server p sent infus compani phone phone e mail...
221,dermatology dermatologkc surgery & dermatopath...,Injection,dermatolog surgeri dermatopatholog f caban spe...
222,1 11 2019 10 53 fax server fax southside organ...,Injection,server organ number phone number number p sent...
223,jan 11 2019 02 45 pm 17136614828 page 1 1 sout...,Injection,page pharmaci fe fe h h sh oe ae us oe us lue ...
224,jan 11 2019 02 46 pm 17136614828 page 1 1 sout...,Injection,page pharmaci fe fe h h sh oe ae us oe us lue ...


In [62]:
t_case = data['ProcessedText'][215]
print(t_case)

p sent facsimil number phone date compani colleg medicin subject messag c thank send refil request get pleas hold decid need continu need switch imag c clinic assist center diseas prevent travi st suit f e w medicin sent subject messag c notic inform facsimil may contain confidenti inform may health inform defin feder health insur portabl account act privaci rule privaci use individu entiti cover sheet intend recipi herebi notifi disclosur dissemin distribut inform strictli may subject legal restrict sanction receiv facsimil error pleas notifi sender immedi telephon number list arrang return destruct inform p sent power p sent am page pharmaci err eker eker ere er author request refer g ree rea k eh pharmaci main street phone phone patient steven dob sex male circl sugar land phone pharmaci written quantiti fill quantiti left day suppli inject subcutan everi pleas review indic sign back pharmaci thank plu refil refil request confidenti inform transmiss pharmaci pharmacist send data leg

In [76]:
print(X_test)
print(y_test)

  (0, 64)	0.041274014645028424
  (0, 188)	0.1291298127111653
  (0, 296)	0.07447951170705035
  (0, 406)	0.03221933559135732
  (0, 445)	0.04178508847489711
  (0, 541)	0.07326130874847989
  (0, 618)	0.06373012626131289
  (0, 727)	0.048278082572994006
  (0, 755)	0.06386355853186242
  (0, 784)	0.0884081956540403
  (0, 785)	0.06824137879810008
  (0, 794)	0.03146342230195861
  (0, 926)	0.12587711846639796
  (0, 931)	0.1489590234141007
  (0, 1012)	0.22102048913510075
  (0, 1051)	0.031188682467923095
  (0, 1084)	0.03878804708457463
  (0, 1086)	0.05076929274788121
  (0, 1117)	0.07579527113840588
  (0, 1209)	0.05107091987871742
  (0, 1259)	0.07152487017019568
  (0, 1337)	0.05370666722181623
  (0, 1341)	0.043043270903721766
  (0, 1402)	0.045453170686525136
  (0, 1414)	0.03438674958930767
  :	:
  (110, 2437)	0.08956916708754391
  (110, 3049)	0.051920753314937736
  (110, 3232)	0.08918791941955198
  (110, 3254)	0.05204973500867561
  (110, 3281)	0.08698487999148587
  (110, 3291)	0.10916607206761986
  

In [80]:
model.predict(X_test[2])

array(['Order'], dtype=object)