|imports|
|---|

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

import pickle
import os

from nltk import word_tokenize

|Change directory|
|---|

In [11]:
# change dir to /data where all our files reside
os.getcwd()
os.chdir("../data")

'c:\\Users\\abhinav.m\\Desktop\\abhi\\project\\DoctorTalkAnalyzer\\data'

In [8]:
os.getcwd()

'c:\\Users\\abhinav.m\\Desktop\\abhi\\project\\DoctorTalkAnalyzer\\data'

|Read dataset|
|---|

In [22]:
df = pd.read_excel("./dataset/sentiment.xlsx", engine="openpyxl", index_col=False)
df = df.iloc[: , 1:] # drop column of indices

In [23]:
df

Unnamed: 0,Sentence,Category
0,KDM from large NCI NCN center said they are us...,Recommend
1,twice as many Swiss oncologists voting for use...,Recommend
2,Pembrolizumab + Axitinib and Avelumab + Axitin...,Recommend
3,Identified current policy inconsistencies with...,Recommend
4,His go to parp is olaparib because he feels th...,Recommend
...,...,...
28394,Several patients who requested prior authoriza...,General
28395,"Not really, I guess we still have much to unde...",General
28396,Insights from one top SL: PD L1 testing in cis...,General
28397,HR is not low,General


In [24]:
df.Category.value_counts()

General          24261
Recommend         4040
Not Recommend       98
Name: Category, dtype: int64

In [25]:
df = df.dropna(axis=0)

|Sample equal sized sets from dataset|
|---|

In [26]:
df_0 = df[df['Category'] == 'Recommend'].sample(frac=1)
df_1 = df[df['Category'] == 'Not Recommend'].sample(frac=1)
df_2 = df[df['Category'] == 'General'].sample(frac=1)

# we want a balanced set for training against - there are 7072 `0` examples
sample_size = 98

data = pd.concat([df_0.head(sample_size), df_1.head(sample_size), df_2.head(sample_size)]).sample(frac=1)

In [27]:
data.Category.value_counts()

Not Recommend    98
Recommend        98
General          98
Name: Category, dtype: int64

|Load, edit and save pickles (run once)|
|---|

In [28]:
with open("./pickle/INCLUDE_ENTITY_DICT.pickle",'rb') as infile:
    drugs_list_list = pickle.load(infile)
# read pickle file
with open("./pickle/remove_drugs.pkl",'rb') as infile:
    remove_list = pickle.load(infile)

In [31]:
drugs_list_list['cancer'] = 'DISEASE'
drugs_list_list['rcc'] = 'DISEASE'
drugs_list_list['mesothelioma'] = 'DISEASE'
drugs_list_list['chemo'] = 'CHEMICAL'
drugs_list_list['chemotherapy'] = 'CHEMICAL'

disease = []
drug_surface_to_canonical = []
for i in drugs_list_list:
  if drugs_list_list[i]=='DISEASE':
    disease.append(i.lower())
  if drugs_list_list[i]=='CHEMICAL':
    drug_surface_to_canonical.append(i.lower())

drug_surface_to_canonical = list(set(drug_surface_to_canonical) - set(remove_list))

with open('./pickle/drug_surface_to_canonical.pkl', 'wb') as fp:
    pickle.dump(drug_surface_to_canonical, fp)

|Load final pickle (normal use)|
|---|

In [None]:
with open('./pickle/drug_surface_to_canonical.pkl', 'wb') as infile:
    drug_surface_to_canonical =  pickle.load(infile)

|Drug name & disease masking|
|---|

In [32]:
mask_df = data.copy()
for row in mask_df.index:
    line = mask_df.Sentence[row].lower()
    sent = ""
    for word in word_tokenize(line):
        if word in drug_surface_to_canonical:
            word = "DRUG"
        elif word in disease:
            word = "DISEASE"
        sent += f"{word} "
    mask_df.Sentence[row] = sent

In [33]:
mask_df.Category.value_counts()

Not Recommend    98
Recommend        98
General          98
Name: Category, dtype: int64

|Train & test data splitting|
|---|

In [97]:
X_train, X_test, y_train, y_test = train_test_split(mask_df.Sentence, mask_df.Category, test_size=0.05, random_state=16, shuffle=True)

In [98]:
text_clf = Pipeline([('vect', CountVectorizer()),
...                      ('tfidf', TfidfTransformer()),
...                      ('clf', MultinomialNB()),
... ])

In [99]:
text_clf = text_clf.fit(X_train, y_train)

In [100]:
text_clf.predict(X_test)

array(['Not Recommend', 'Not Recommend', 'Not Recommend', 'Not Recommend',
       'Not Recommend', 'Recommend', 'Not Recommend', 'Not Recommend',
       'Not Recommend', 'Recommend', 'Not Recommend', 'Recommend',
       'Not Recommend', 'Recommend', 'Not Recommend'], dtype='<U13')

In [103]:


y_pred = text_clf.predict(X_test)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

accuracy:   0.333


In [9]:
# pickle.dump(text_clf, open('model1', 'wb'))
with open("model1",'rb') as infile:
    text_clf = pickle.load(infile)

In [106]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
...                      ('tfidf', TfidfTransformer()),
...                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_jobs=5, random_state=42)),
... ])

In [107]:
text_clf_svm = text_clf_svm.fit(X_train,y_train)

In [108]:
y_predict = text_clf_svm.predict(X_test)

In [109]:
score1 = metrics.accuracy_score(y_test, y_predict)
print("accuracy:   %0.3f" % score1)

accuracy:   0.667


In [10]:
# pickle.dump(text_clf_svm, open('model_svm', 'wb'))
with open("model_svm",'rb') as infile:
    text_clf_svm = pickle.load(infile)

In [13]:
ddf = df.tail(500)
ddf['Masked_Sentence'] = ddf.Sentence


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddf['Masked_Sentence'] = ddf.Sentence


In [15]:
ddf

Unnamed: 0,Sentence,Category,Masked_Sentence
27899,He would be interested in conducting such a st...,General,He would be interested in conducting such a st...
27900,"The remaining patients, will be the best to re...",General,"The remaining patients, will be the best to re..."
27901,Focusing on areas where majority of patients t...,General,Focusing on areas where majority of patients t...
27902,The key of sucess for this NMIBC trials is to ...,General,The key of sucess for this NMIBC trials is to ...
27903,This is significant and encouraging for patien...,General,This is significant and encouraging for patien...
...,...,...,...
28394,Several patients who requested prior authoriza...,General,Several patients who requested prior authoriza...
28395,"Not really, I guess we still have much to unde...",General,"Not really, I guess we still have much to unde..."
28396,Insights from one top SL: PD L1 testing in cis...,General,Insights from one top SL: PD L1 testing in cis...
28397,HR is not low,General,HR is not low


In [16]:
for row in ddf.index:
    line = ddf.Masked_Sentence[row].lower()
    sent = ""
    for word in word_tokenize(line):
        if word in drug_surface_to_canonical:
            word = "DRUG"
        elif word in disease:
            word = "DISEASE"
        sent += f"{word} "
    ddf.Masked_Sentence[row] = sent

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddf.Masked_Sentence[row] = sent


In [18]:
y1 = text_clf.predict(list(ddf.Masked_Sentence))
y2 = text_clf_svm.predict(list(ddf.Masked_Sentence))
gh = ddf
gh["Predicted"] = y1
gh['SVM_Predicted'] = y2
gh.to_excel('Predicted_Results.xlsx',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gh["Predicted"] = y1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gh['SVM_Predicted'] = y2
