# LOADING CLEANED DATA INTO DATAFRAME

In [1]:
import pandas as pd
df = pd.read_csv('data_cleaned.csv', encoding='latin1')

# Work on a copy so the raw frame `df` is left untouched.
corpus = df.copy()

# 'mix' is the single text document per row: the 'summary' text followed by
# the 'data' text. Missing values are replaced with '' before joining: with a
# plain `+`, a NaN on either side makes the whole result NaN, which throws
# away the text that *is* present (and the later astype('U') cast would turn
# that NaN into the literal token 'nan').
corpus['mix'] = corpus['summary'].fillna('') + ' ' + corpus['data'].fillna('')

# The two source columns are redundant once 'mix' exists. Reassigning instead
# of using inplace=True keeps the cell free of in-place mutation.
corpus = corpus.drop(['summary', 'data'], axis=1)

corpus.head()

Unnamed: 0,previous_appointment,categories,sub_categories,mix
0,0,PRESCRIPTION,REFILL,pt aware need rov refil phone note call patien...
1,0,ASK_A_DOCTOR,MEDICATION RELATED,mom want know focalin need dosage adjust phone...
2,0,ASK_A_DOCTOR,MEDICATION RELATED,pt call discuss nortryptiline say weird tas ph...
3,0,MISCELLANEOUS,OTHERS,fyi nortryptline medicaid phone note call pati...
4,0,MISCELLANEOUS,"SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)",letter patient establish request phone note ca...


# DIVIDING INTO TRAIN AND TEST SETS

In [2]:
from sklearn.model_selection import train_test_split

# Feature text plus the two label columns: the coarse category (y1) and the
# finer sub-category (y2).
X = corpus['mix']
y1 = corpus['categories']
y2 = corpus['sub_categories']

# Hold out 30% of the rows for evaluating the category models; the fixed
# random_state makes the split repeatable.
X_train, X_test, y1_train, y1_test = train_test_split(
    X,
    y1,
    test_size=0.3,
    random_state=42,
)

# CONVERTING TRAIN AND TEST DATA INTO TERM DOCUMENT MATRICES

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training text only and build the training
# matrix in the same step. astype('U') casts every entry to a unicode string
# before it reaches the vectorizer.
train_docs = X_train.values.astype('U')
vect = TfidfVectorizer()
X_train_tdm = vect.fit_transform(train_docs)

In [5]:
# Project the held-out text onto the vocabulary already fitted on the training
# data (transform only, no refit).
test_docs = X_test.values.astype('U')
X_test_tdm = vect.transform(test_docs)

# PREDICTING CATEGORIES USING NAIVE_BAYES AND SVM

In [8]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Multinomial Naive Bayes on the TF-IDF features, predicting the category.
# fit() returns the estimator itself, so construction and fitting are chained.
nb = MultinomialNB().fit(X_train_tdm, y1_train)
y1_pred_naive = nb.predict(X_test_tdm)

# Linear SVM trained on the same features and labels.
s = LinearSVC().fit(X_train_tdm, y1_train)
y1_pred_svm = s.predict(X_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes).
(
    metrics.accuracy_score(y1_pred_svm, y1_test),
    metrics.accuracy_score(y1_pred_naive, y1_test),
)

(0.80505749969086182, 0.73061703969333502)

# PREDICTING SUB_CATEGORIES USING NAIVE_BAYES AND SVM 

In [9]:
# The sub-category models reuse the TF-IDF matrices built from the existing
# split (X_train_tdm / X_test_tdm). Pick the sub-category labels for exactly
# those rows via the index of X_train / X_test instead of calling
# train_test_split a second time: a second split only lines up with the
# matrices while its arguments stay identical to the first call, and a
# mismatch would silently pair documents with the wrong labels.
y2_train = y2.loc[X_train.index]
y2_test = y2.loc[X_test.index]

# Cheap guard against stale notebook state (e.g. the matrices were built from
# a different split than the one currently held in X_train / X_test).
assert X_train_tdm.shape[0] == len(y2_train), "train matrix and labels are out of sync"
assert X_test_tdm.shape[0] == len(y2_test), "test matrix and labels are out of sync"

# Naive Bayes for sub-categories. This refits the existing `nb` object, so
# from here on `nb` predicts sub-categories rather than categories.
nb.fit(X_train_tdm, y2_train)
y2_pred_naive = nb.predict(X_test_tdm)

# Linear SVM for sub-categories (same note: `s` is refitted).
s.fit(X_train_tdm, y2_train)
y2_pred_svm = s.predict(X_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes). Arguments follow the
# documented (y_true, y_pred) order; the accuracy value itself is unaffected.
(
    metrics.accuracy_score(y2_test, y2_pred_svm),
    metrics.accuracy_score(y2_test, y2_pred_naive),
)

(0.72505255348089526, 0.52769877581303326)

# PREDICTING SUB_CATEGORIES FOR EACH CATEGORY

In [62]:
#loading dataframe and creating corpus
import pandas as pd
df1 = pd.read_csv('data_cleaned.csv', encoding='latin1')

# Work on a copy so the freshly loaded frame `df1` is left untouched.
corpus = df1.copy()

# One text document per row: 'summary' followed by 'data'. Missing values are
# replaced with '' first, because a plain `+` with NaN on either side yields
# NaN and would discard whatever text is present in the other column.
corpus['mix'] = corpus['summary'].fillna('') + ' ' + corpus['data'].fillna('')

# Drop the source columns now that 'mix' holds their content (reassignment
# rather than inplace=True keeps the cell free of in-place mutation).
corpus = corpus.drop(['summary', 'data'], axis=1)

# Number of rows per category; the sections below train one model pair for
# each of these categories.
corpus['categories'].value_counts()

PRESCRIPTION     14499
APPOINTMENTS     12960
ASK_A_DOCTOR     11744
MISCELLANEOUS    10462
LAB               4246
Name: categories, dtype: int64

# FOR CATEGORY "PRESCRIPTION"

In [64]:
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Keep only the rows whose category is PRESCRIPTION.
corpus1 = corpus.loc[corpus['categories'].eq('PRESCRIPTION')]

# Text is the feature, sub-category is the label.
X1 = corpus1['mix']
y1 = corpus1['sub_categories']

# 70/30 train/test split with a fixed seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=42,
)

# TF-IDF fitted on this category's training text only, then applied to its
# test text.
vect1 = TfidfVectorizer()
X1_train_tdm = vect1.fit_transform(X1_train.values.astype('U'))
X1_test_tdm = vect1.transform(X1_test.values.astype('U'))

# Fit both classifiers on the same training matrix (fit() returns the
# estimator, so construction and fitting are chained).
s = LinearSVC().fit(X1_train_tdm, y1_train)
nb = MultinomialNB().fit(X1_train_tdm, y1_train)

# Predictions on the held-out rows.
y1_pred_svm = s.predict(X1_test_tdm)
y1_pred_naive = nb.predict(X1_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes).
(
    metrics.accuracy_score(y1_pred_svm, y1_test),
    metrics.accuracy_score(y1_pred_naive, y1_test),
)


(0.92229885057471261, 0.73195402298850576)

# FOR CATEGORY "APPOINTMENTS"

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Keep only the rows whose category is APPOINTMENTS.
corpus2 = corpus.loc[corpus['categories'].eq('APPOINTMENTS')]

# Text is the feature, sub-category is the label.
X2 = corpus2['mix']
y2 = corpus2['sub_categories']

# 70/30 train/test split with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=42,
)

# TF-IDF fitted on this category's training text only, then applied to its
# test text.
vect1 = TfidfVectorizer()
X2_train_tdm = vect1.fit_transform(X2_train.values.astype('U'))
X2_test_tdm = vect1.transform(X2_test.values.astype('U'))

# Fit both classifiers on the same training matrix.
s = LinearSVC().fit(X2_train_tdm, y2_train)
nb = MultinomialNB().fit(X2_train_tdm, y2_train)

# Predictions on the held-out rows.
y2_pred_svm = s.predict(X2_test_tdm)
y2_pred_naive = nb.predict(X2_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes). `metrics` is expected to
# be imported by an earlier cell.
(
    metrics.accuracy_score(y2_pred_svm, y2_test),
    metrics.accuracy_score(y2_pred_naive, y2_test),
)


(0.87860082304526754, 0.76851851851851849)

# FOR CATEGORY "ASK_A_DOCTOR"

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Keep only the rows whose category is ASK_A_DOCTOR.
corpus3 = corpus.loc[corpus['categories'].eq('ASK_A_DOCTOR')]

# Text is the feature, sub-category is the label.
X3 = corpus3['mix']
y3 = corpus3['sub_categories']

# 70/30 train/test split with a fixed seed.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    test_size=0.3,
    random_state=42,
)

# TF-IDF fitted on this category's training text only, then applied to its
# test text.
vect1 = TfidfVectorizer()
X3_train_tdm = vect1.fit_transform(X3_train.values.astype('U'))
X3_test_tdm = vect1.transform(X3_test.values.astype('U'))

# Fit both classifiers on the same training matrix.
s = LinearSVC().fit(X3_train_tdm, y3_train)
nb = MultinomialNB().fit(X3_train_tdm, y3_train)

# Predictions on the held-out rows.
y3_pred_svm = s.predict(X3_test_tdm)
y3_pred_naive = nb.predict(X3_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes). `metrics` is expected to
# be imported by an earlier cell.
(
    metrics.accuracy_score(y3_pred_svm, y3_test),
    metrics.accuracy_score(y3_pred_naive, y3_test),
)


(0.89982973893303064, 0.90607264472190696)

# FOR CATEGORY "MISCELLANEOUS"

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Keep only the rows whose category is MISCELLANEOUS.
corpus4 = corpus.loc[corpus['categories'].eq('MISCELLANEOUS')]

# Text is the feature, sub-category is the label.
X4 = corpus4['mix']
y4 = corpus4['sub_categories']

# 70/30 train/test split with a fixed seed.
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4,
    y4,
    test_size=0.3,
    random_state=42,
)

# TF-IDF fitted on this category's training text only, then applied to its
# test text.
vect1 = TfidfVectorizer()
X4_train_tdm = vect1.fit_transform(X4_train.values.astype('U'))
X4_test_tdm = vect1.transform(X4_test.values.astype('U'))

# Fit both classifiers on the same training matrix.
s = LinearSVC().fit(X4_train_tdm, y4_train)
nb = MultinomialNB().fit(X4_train_tdm, y4_train)

# Predictions on the held-out rows.
y4_pred_svm = s.predict(X4_test_tdm)
y4_pred_naive = nb.predict(X4_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes). `metrics` is expected to
# be imported by an earlier cell.
(
    metrics.accuracy_score(y4_pred_svm, y4_test),
    metrics.accuracy_score(y4_pred_naive, y4_test),
)


(0.83689072953169796, 0.72857597961134124)

# FOR CATEGORY "LAB"

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Keep only the rows whose category is LAB.
corpus5 = corpus.loc[corpus['categories'].eq('LAB')]

# Text is the feature, sub-category is the label.
X5 = corpus5['mix']
y5 = corpus5['sub_categories']

# 70/30 train/test split with a fixed seed.
X5_train, X5_test, y5_train, y5_test = train_test_split(
    X5,
    y5,
    test_size=0.3,
    random_state=42,
)

# TF-IDF fitted on this category's training text only, then applied to its
# test text.
vect1 = TfidfVectorizer()
X5_train_tdm = vect1.fit_transform(X5_train.values.astype('U'))
X5_test_tdm = vect1.transform(X5_test.values.astype('U'))

# Fit both classifiers on the same training matrix.
s = LinearSVC().fit(X5_train_tdm, y5_train)
nb = MultinomialNB().fit(X5_train_tdm, y5_train)

# Predictions on the held-out rows.
y5_pred_svm = s.predict(X5_test_tdm)
y5_pred_naive = nb.predict(X5_test_tdm)

# Held-out accuracy, displayed as (SVM, Naive Bayes). `metrics` is expected to
# be imported by an earlier cell.
(
    metrics.accuracy_score(y5_pred_svm, y5_test),
    metrics.accuracy_score(y5_pred_naive, y5_test),
)


(0.89560439560439564, 0.78178963893249609)