In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence, text
from nltk import word_tokenize
from nltk.corpus import stopwords
# The alias must be exactly `plt`: the plotting helper's default colormap
# (`plt.cm.Blues`) and every figure cell in this notebook refer to that name.
import matplotlib.pyplot as plt
import nltk
import sklearn

# Fetch the 'punkt' tokenizer data used by nltk's word_tokenize.
nltk.download('punkt')

# English stop-word list from NLTK.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed already;
# only 'punkt' is downloaded by this notebook.
stop_words = stopwords.words('english')

In [None]:
# Standard-library imports and the search path for project-local packages.
import itertools
import os
import sys
from os.path import expanduser

# Home directory of the current user.
HOME_DIR = expanduser("~")

# Make modules stored under ~/packages importable from this notebook.
sys.path.append('{}/packages'.format(HOME_DIR))

In [None]:
#load custom Midas tools
# `Midas` is a project-local package (not on PyPI as far as this notebook shows);
# presumably it is found through the ~/packages entry appended to sys.path
# in the setup cell -- TODO confirm.
from Midas import Midas_helper
# Single helper instance reused by later cells (e.g. to locate the data folder).
helper = Midas_helper()

In [None]:
# Evaluation metric

def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like with the true targets. Either a 1-D sequence of
        integer class indices, or a 2-D one-hot matrix with one column per class.
    :param predicted: Array-like matrix with one row per sample and one
        probability per class.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` so the
        logarithm is never taken of exactly 0.
    :return: The negative log-likelihood averaged over the rows of ``actual``.
    """
    # Accept lists / pandas objects as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # One-hot encode 'actual' if it was given as class indices.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_rows * log_likelihood

def plot_confusion_matrix(cm, class_names,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current matplotlib figure; the caller is
    responsible for creating the figure and calling ``plt.show()``.

    :param cm: Square confusion matrix (rows = true class, columns = predicted).
    :param class_names: Tick labels, one per class, in matrix order.
    :param normalize: If True, each row is divided by its row total before
        printing and plotting. Rows whose total is 0 are shown as all zeros.
    :param title: Title placed above the plot.
    :param cmap: Matplotlib colormap used for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero so that a class with no
        # true samples yields zeros instead of NaN and a runtime warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Annotate each cell; switch to white text on dark (high-valued) cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are included in it.
    plt.tight_layout()

In [None]:
#load main Midas labelled data table
# NOTE(review): cd_main_data() presumably switches the working directory to the
# main data folder so the relative CSV path below resolves -- confirm in Midas.
helper.cd_main_data()
import pandas as pd

df = pd.read_csv('midas_labeled_data_Q12018.csv')
# Treat notes without a final grade as their own 'No Score' class.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form does not update `df` under pandas
# Copy-on-Write and emits a warning in recent pandas 2.x releases.
df['midas_final_unstructured'] = df['midas_final_unstructured'].fillna('No Score')

In [None]:
# Encode the string labels as integer class ids.
label_values = df['midas_final_unstructured'].values
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(label_values)

In [None]:
# Display the encoded label vector returned by the LabelEncoder.
y

In [None]:
#map labels to numerical value
# Per-row label strings recovered from the encoded vector.
labels = list(lbl_enc.inverse_transform(y))
# Build the label -> id mapping from the encoder's own class list rather than
# from every row: the result has the same content, but its insertion order now
# follows the encoded id (0, 1, 2, ...), so list(class_dict.keys()) can be
# indexed by class id.
class_dict = dict(zip(lbl_enc.classes_, lbl_enc.transform(lbl_enc.classes_)))
class_dict

In [None]:
# Display names for the classes, used as tick labels on the confusion matrix.
# NOTE(review): assumed to be listed in the same order as the encoded class
# ids -- confirm against lbl_enc.classes_.
class_names = [
    'Grade I',
    'Grade II',
    'Grade III',
    'Grade IV',
    'Invalid',
    'No Score',
]

In [None]:
# Hold out 10% of the notes for validation, stratified by encoded label.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    df['cleaned_note_unstructured'].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Shapes of the training and validation note arrays.
for split in (xtrain, xvalid):
    print(split.shape)

In [None]:
# Word-level 1- to 3-gram counts with scikit-learn's English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is learned from the training and validation notes together;
# no labels are involved in this step.
all_notes = [*xtrain, *xvalid]
ctv.fit(all_notes)
xtrain_ctv, xvalid_ctv = (ctv.transform(split) for split in (xtrain, xvalid))

Future directions:
- Try LightGBM
- Tune hyperparameters with GridSearchCV
- Experiment with different n-gram ranges

In [None]:
# Gradient-boosted trees trained on the n-gram count features.
clf = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10,
)
clf.fit(xtrain_ctv.tocsc(), ytrain)

# Class probabilities for the held-out notes, one column per class.
predictions = clf.predict_proba(xvalid_ctv.tocsc())

valid_logloss = multiclass_logloss(yvalid, predictions)
print ("logloss: %0.3f " % valid_logloss)

In [None]:
# `sklearn.externals.joblib` was deprecated and then removed from scikit-learn;
# use the standalone joblib package and fall back to the vendored copy only on
# old scikit-learn installs that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Work from the models directory so the relative filename below lands there.
os.chdir(HOME_DIR+'/main_data/models')
# save the model to disk
filename = 'XGB_ctv_0.117.sav'
joblib.dump(clf, filename)


In [None]:
# IPython automagic: list the current working directory to check the saved file.
ls

In [None]:
# Predicted class id per validation note = index of its highest probability.
y_pred = [class_probs.argmax() for class_probs in predictions]

In [None]:
# Fraction of validation notes whose predicted class matches the true class.
metrics.accuracy_score(y_true=yvalid, y_pred=y_pred)

In [None]:
# Per-class F1 averaged with weights proportional to each class's support.
metrics.f1_score(y_true=yvalid, y_pred=y_pred, average='weighted')

In [None]:
# Confusion matrix of true vs. predicted classes on the validation split.
from sklearn.metrics import confusion_matrix
print(class_names)
cnf_matrix = confusion_matrix(y_true=yvalid, y_pred=y_pred)
np.set_printoptions(precision=2)

# Plot the row-normalized confusion matrix.
plt.figure(figsize=(9, 9))
plot_confusion_matrix(
    cnf_matrix,
    class_names,
    normalize=True,
    title='Confusion matrix, with normalization',
)
plt.show()

In [None]:
# clf.get_booster().get_score()

In [None]:
# Show eli5's feature-weight table for the fitted XGBoost model. The fitted
# CountVectorizer is passed so eli5 can name features by their n-gram text
# rather than by column index.
from eli5 import show_weights
show_weights(clf, vec=ctv)

# LIME Feature Analysis

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from lime import lime_text
from sklearn.pipeline import make_pipeline

In [None]:
# LIME looks class names up by predicted class index, so the list must follow
# the encoder's id order. lbl_enc.classes_[i] is the label encoded as i.
class_names = [str(label) for label in lbl_enc.classes_]

In [None]:
# Chain the fitted vectorizer and classifier so raw note strings can be passed
# straight to predict_proba, which is the callable handed to LIME.
c = make_pipeline(ctv, clf)

In [None]:
# LIME text explainer; class_names supplies the display name for each class index.
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# xvalid holds the raw note strings (before vectorization), so the pipeline
# can score one directly. Show the class probabilities for the first note.
first_note = xvalid[0]
first_note_proba = c.predict_proba([first_note])
print(first_note_proba.round(3))

In [None]:
# Explain the pipeline's prediction for one validation note with LIME.
idx = 40
exp = explainer.explain_instance(xvalid[idx], c.predict_proba, num_features=6, labels=[0, 1, 2, 3, 4, 5])
print('Text to predict: {}'.format(xvalid[idx]))
print('Document id: %d' % idx)
# predict() returns a one-element array for the single row; take that element.
predicted_class = clf.predict(xvalid_ctv.tocsc()[idx])[0]
print('Predicted class =', class_names[predicted_class])
# The true class comes from the held-out labels, not from the predictions.
print('True class: %s' % class_names[yvalid[idx]])

In [None]:

# Print the (word, weight) pairs LIME produced for every class it explained,
# in ascending class-id order. Iterating over the labels the explanation
# actually holds removes the need to swallow KeyError for missing ones.
for i in sorted(exp.available_labels()):
    print ('Explanation for class %s' % class_names[i])
    print ('\n'.join(map(str, exp.as_list(label=i))))
    print ()


In [None]:
# Render the LIME explanation inline, without the note text panel.
exp.show_in_notebook(text=False)

In [None]:
# Render the explanation for class index 0 only, with the note text shown.
# NOTE(review): LIME's `text` option is normally a boolean flag; the raw note
# string passed here is merely truthy -- confirm this is what was intended.
exp.show_in_notebook(text=xvalid[idx], labels=(0,))

In [None]:
# Explain the model's prediction for a single validation note with eli5.
from eli5 import show_prediction
# Use `ctv`, the CountVectorizer the classifier was trained on; no `tfv`
# vectorizer is defined anywhere in this notebook.
show_prediction(clf, xvalid[1], vec=ctv, show_feature_values=True)

In [None]:
def no_missing(feature_name, feature_value):
    """Feature filter for eli5: keep a feature only if its value is not NaN.

    :param feature_name: Name of the feature (unused; part of eli5's filter signature).
    :param feature_value: Value of the feature for the explained document.
    :return: True when the value is present (not NaN).
    """
    return not np.isnan(feature_value)

# Same explanation as before, restricted to features present in the note.
# `ctv` is the vectorizer the classifier was trained on; `tfv` is undefined.
show_prediction(clf, xvalid[1], vec=ctv, show_feature_values=True, feature_filter=no_missing)