In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources before they are used: stopwords.words() raises
# LookupError when the 'stopwords' corpus has not been downloaded, which made
# this cell fail on a fresh environment.
# 'punkt' is presumably needed by word_tokenize -- TODO confirm it is still used.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [None]:
#setup imports and paths
import os
import sys
from os.path import expanduser


# Home directory of the current user; every data/model path below is built from it.
HOME_DIR = expanduser("~")

# Directory with locally installed packages (presumably where the Midas package lives).
PACKAGES_DIR = HOME_DIR+'/packages'
# Guard the append so re-running this cell does not keep growing sys.path
# with duplicate entries.
if PACKAGES_DIR not in sys.path:
    sys.path.append(PACKAGES_DIR)

In [None]:
#load custom Midas tools
# Midas is not a public package; presumably it is found through the
# '~/packages' entry added to sys.path in the path-setup cell -- verify.
from Midas import Midas_helper
# `helper` is used further down via helper.cd_main_data() before each CSV load.
helper = Midas_helper()

In [None]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like with the actual target classes. Either a 1-D
        sequence of integer class indices (one per sample) or an already
        one-hot encoded 2-D matrix with one column per class.
    :param predicted: Array-like matrix with class predictions, one
        probability per class (one row per sample).
    :param eps: Probabilities are clipped to [eps, 1 - eps] so that log(0)
        never occurs.
    :return: Mean negative log-likelihood over all samples.
    """
    # Accept plain lists / tuples as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Vectorised equivalent of setting one_hot[i, actual[i]] = 1 for each row.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    # Only the log-probability of the true class contributes to each row.
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / rows * log_likelihood

In [None]:
# Presumably switches the working directory to the main data folder so the
# relative CSV path below resolves -- verify against Midas_helper.
helper.cd_main_data()
#load main Midas labelled data table
df = pd.read_csv('midas_labeled_data_Q12018.csv')
# Rows without a final score get the explicit 'No Score' label.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not update `df` under pandas copy-on-write.
df['midas_final_unstructured'] = df['midas_final_unstructured'].fillna('No Score')

In [None]:
# Preview the first rows of the labelled table as a sanity check of the load.
df.head()

In [None]:
# Keep only the columns needed to eyeball a note, its snippet and its label.
insight_columns = ['cleaned_note_unstructured', 'snippet_unstructured', 'midas_final_unstructured']
df_insight = df[insight_columns]

In [None]:
#labels
# Encode the Midas score labels as integer class ids; `lbl_enc` is kept so
# the ids can be mapped back to label names later on.
lbl_enc = preprocessing.LabelEncoder()
midas_scores = df['midas_final_unstructured'].values
y = lbl_enc.fit(midas_scores).transform(midas_scores)

In [None]:
# Show the first 20 note / snippet / label triples for manual inspection.
df_insight.head(20)

In [None]:
#map labels to numerical value
# Per-row label names, kept for any later use of `labels`.
labels = list(lbl_enc.inverse_transform(y))
# Build the label -> id mapping from the encoder's classes instead of from
# every data row. The mapping content is the same, but the dict is now ordered
# by encoded id rather than by first appearance in the data, so
# list(class_dict.keys())[k] is the name of class k.
class_dict = dict(zip(lbl_enc.classes_, lbl_enc.transform(lbl_enc.classes_)))
class_dict

In [None]:
# Text snippets that are vectorised and fed to the classifier below.
# NOTE(review): CountVectorizer cannot process missing values -- confirm that
# 'snippet_unstructured' contains no NaN.
xpred = df['snippet_unstructured'].values
# Un-encoded labels for the same rows (the integer-encoded version is `y`).
ypred = df['midas_final_unstructured'].values

In [None]:
# Bag-of-words features: word uni-, bi- and tri-grams, with sklearn's built-in
# English stop words removed.
# NOTE(review): the vocabulary is learned from `xpred` here; the saved model
# loaded later presumably expects the vocabulary it was trained with -- confirm
# that this fit reproduces it.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Learn the vocabulary, then turn the snippets into a sparse count matrix.
ctv.fit(list(xpred))
xpred_ctv = ctv.transform(xpred)



In [None]:
# Work from the directory that holds the saved models.
os.chdir(HOME_DIR+'/main_data/models')
# load the model from disk
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; import the standalone joblib package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
filename = 'XGB_ctv_snippet_0.071.sav'
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# model files from a trusted source.
clf = joblib.load(filename)


In [None]:
# Class names indexed by encoded class id. LabelEncoder.classes_ is ordered by
# encoded value, so class_names[k] is the label of class k. The previous
# list(class_dict.values()) held integer codes in dict insertion order, so the
# printed "name" was not the label of the predicted class.
# Assumes column k of predict_proba corresponds to encoded label k -- TODO confirm.
class_names = list(lbl_enc.classes_)
predictions = clf.predict_proba(xpred_ctv.tocsc())
for i, j in enumerate(predictions):
    best = j.argmax()
    print(('IDX:{} | Class: {} - {} | Proba: {}').format(i, best, class_names[best], max(j)))

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

#classes
# LIME and the analysis functions index class_names by class id, so the list
# must be ordered by encoded value. LabelEncoder.classes_ guarantees that;
# the keys of class_dict were in order of first appearance in the data.
class_names = list(lbl_enc.classes_)

#make pipeline
# Raw text -> count features -> classifier, so LIME can call predict_proba on strings.
pipe = make_pipeline(ctv, clf)

#instantiate explainer
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
#Predict and analyze labelled data
def predict_analyze_labelled(idx, highlight=False):
    """Print the prediction and LIME explanation for one labelled snippet.

    :param idx: Row position into `xpred` / `xpred_ctv` / `y`.
    :param highlight: When True, also render LIME's HTML view in the notebook.

    Relies on the notebook globals `xpred`, `xpred_ctv`, `y`, `clf`, `pipe`,
    `explainer` and `class_names`.
    """
    text = xpred[idx]
    print('Row ID: {}| Text : {}'.format(idx, text))
    # NOTE(review): labels=[0..5] assumes the model has exactly six classes -- confirm.
    exp = explainer.explain_instance(text, pipe.predict_proba, num_features=6, labels=[0, 1, 2, 3, 4, 5])
    # Slice the single row first and convert only that row to CSC, instead of
    # converting the whole feature matrix on every call.
    pred_class = clf.predict(xpred_ctv[idx].tocsc()).reshape(1,-1)[0,0]
    print('Predicted class =', class_names[pred_class])
    print('True class: %s' % class_names[y[idx]])
    print ('Explanation for class %s' % class_names[pred_class])
    print ('\n'.join(map(str, exp.as_list(label=pred_class))))

    if highlight:
        # `text` is a boolean switch for showing the highlighted text, not the
        # text itself; the explanation already carries the explained string.
        exp.show_in_notebook(text=True, labels=(pred_class,))

In [None]:
# Explain the first five labelled rows (text output only, highlight left off).
for i in range(5):
    predict_analyze_labelled(i)

# Predict unlabelled data

In [None]:
#load file
# Presumably returns to the main data directory (the model-loading cell
# changed the working directory) -- verify against Midas_helper.
helper.cd_main_data()
csv_name = 'midas_unlabelled_patient_note_aggregate_Jan18.csv'
# Only the first 100000 rows of the file are read.
df_unlabelled = pd.read_csv(csv_name, nrows=100000)
df_unlabelled.head()

In [None]:
# Drop missing notes and keep the remaining text as an array.
# The previous dropna(inplace=True) on the column selection acted on an
# intermediate object, so NaN values could still reach the vectorizer
# (which cannot handle them) depending on the pandas version.
# Positions in `unlabelled_pred` are therefore positions among the non-null
# notes, not row numbers of `df_unlabelled`.
unlabelled_pred = df_unlabelled['cleaned_notes'].dropna().values

In [None]:
# Reuse the CountVectorizer that was already fitted on the labelled snippets.
# Re-creating it with identical settings and re-fitting it on the same `xpred`
# reproduced exactly the same vocabulary, so only the transform is needed.
unlabelled_pred_ctv = ctv.transform(unlabelled_pred)


In [None]:
# Class probabilities for every unlabelled note.
predictions_unl = clf.predict_proba(unlabelled_pred_ctv.tocsc())
# Report only the notes whose top class probability exceeds 0.8.
for row_idx, class_probs in enumerate(predictions_unl):
    top_proba = max(class_probs)
    if not top_proba > 0.8:
        continue
    top_class = class_probs.argmax()
    print(('IDX:{} | Class: {} - {} | Proba: {}').format(row_idx, top_class, class_names[top_class], top_proba))

In [None]:
#make pipeline
# Rebuilt here so the pipeline wraps whatever `ctv` is currently bound to.
pipe = make_pipeline(ctv, clf)

#instantiate explainer
# class_names must be ordered by class id for the explanation labels to be correct.
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
#Predict and analyze unlabelled data
# NOTE: this redefinition shadows the earlier function of the same name; the
# name is kept so the existing call sites keep working.
def predict_analyze_labelled(idx, highlight=False):
    """Print the prediction and LIME explanation for one unlabelled note.

    :param idx: Row position into `unlabelled_pred` / `unlabelled_pred_ctv`.
    :param highlight: When True, also render LIME's HTML view in the notebook.

    Relies on the notebook globals `unlabelled_pred`, `unlabelled_pred_ctv`,
    `clf`, `pipe`, `explainer` and `class_names`. No true class is printed
    because this data has no labels.
    """
    text = unlabelled_pred[idx]
    print('Row ID: {}| Text : {}'.format(idx, text))
    # NOTE(review): labels=[0..5] assumes the model has exactly six classes -- confirm.
    exp = explainer.explain_instance(text, pipe.predict_proba, num_features=6, labels=[0, 1, 2, 3, 4, 5])
    # Slice the single row first and convert only that row to CSC, instead of
    # converting the whole feature matrix on every call.
    pred_class = clf.predict(unlabelled_pred_ctv[idx].tocsc()).reshape(1,-1)[0,0]
    print('Predicted class =', class_names[pred_class])
    print ('Explanation for class %s' % class_names[pred_class])
    print ('\n'.join(map(str, exp.as_list(label=pred_class))))

    if highlight:
        # `text` is a boolean switch for showing the highlighted text. The old
        # code passed xpred[idx], which indexes the *labelled* array and can
        # raise IndexError for positions that only exist in the unlabelled data.
        exp.show_in_notebook(text=True, labels=(pred_class,))

In [None]:
# Smoke check: class probabilities (rounded to 3 decimals) for the first
# unlabelled note, computed through the text pipeline.
print(pipe.predict_proba([unlabelled_pred[0]]).round(3))

In [None]:
# Explain one unlabelled note (position 883) with the highlighted HTML view.
# This calls the most recent definition of predict_analyze_labelled.
predict_analyze_labelled(883, True)