In [None]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from keras.callbacks import EarlyStopping
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
from keras.preprocessing import sequence, text
from keras.utils import np_utils
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm

stop_words = stopwords.words('english')
nltk.download('punkt')

In [None]:
# Set up path-related imports and extend the module search path.
import os
import sys
from os.path import expanduser
import itertools

# Expand "~" to the current user's home directory instead of hard-coding it.
HOME_DIR = expanduser("~")
# Add <home>/packages to sys.path so modules stored there can be imported below.
sys.path.append(HOME_DIR+'/packages')

In [None]:
# Load the custom Midas tools.
# Midas is not a published package; presumably it is found through the
# <home>/packages entry appended to sys.path above -- TODO confirm.
from Midas import Midas_helper
# Shared helper instance; the notebook calls helper.cd_main_data() on it before reading the CSV.
helper = Midas_helper()

In [None]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like with the true targets: either a 1-D sequence of
        integer class indices, or a 2-D one-hot matrix shaped like ``predicted``
    :param predicted: Array-like matrix with class predictions, one row per
        sample and one probability per class
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken, so log(0) can never occur
    :return: Mean negative log-likelihood of the true classes
    """
    # Accept lists / pandas objects as well as ndarrays; the original relied on
    # a ``.shape`` attribute and failed on plain Python sequences.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Convert 'actual' to a one-hot matrix if it holds class indices.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Vectorised equivalent of setting one_hot[i, actual[i]] = 1 row by row.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [None]:
# Presumably switches the working directory to the main data folder so the
# relative CSV path below resolves -- TODO confirm against Midas_helper.
helper.cd_main_data()
import pandas as pd
# Load the main Midas labelled data table.
df = pd.read_csv('midas_labeled_data_Q12018.csv')
# Notes without a final label become their own 'No Score' class. The result is
# assigned back to the column: fillna(inplace=True) on a column selection can
# operate on a temporary copy and leave df itself unchanged.
df['midas_final_unstructured'] = df['midas_final_unstructured'].fillna('No Score')

In [None]:
# Encode the string labels as integer class ids.
lbl_enc = preprocessing.LabelEncoder()
raw_labels = df['midas_final_unstructured'].values
lbl_enc.fit(raw_labels)
y = lbl_enc.transform(raw_labels)

In [None]:
# Display the integer-encoded labels produced by lbl_enc.fit_transform.
y

In [None]:
# Map label names to their numerical value.
# Per-row label names, kept for any later cell that still reads `labels`.
labels = list(lbl_enc.inverse_transform(y))
# Build the mapping from the encoder's own class list so that the keys are
# inserted in encoded order (code 0 first, then 1, ...). Zipping the per-row
# labels inserted keys in order of first appearance in the data, which made
# list(class_dict.keys())[code] return the wrong name for most codes.
class_dict = dict(zip(lbl_enc.classes_, lbl_enc.transform(lbl_enc.classes_)))
class_dict

In [None]:
# Hold out 10% of the notes for validation, stratified on the encoded label.
notes = df['cleaned_note_unstructured'].values
xtrain, xvalid, ytrain, yvalid = train_test_split(
    notes, y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42
)

In [None]:
# Shapes of the training and validation text arrays.
for split in (xtrain, xvalid):
    print(split.shape)

In [None]:
# Always start with these features. They work (almost) everytime!
# The on/off options are passed as real booleans: scikit-learn validates
# constructor parameters and rejects the integer 1 where a bool is expected.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

# Fitting TF-IDF to both training and validation sets (semi-supervised learning).
# NOTE(review): the vocabulary and IDF weights therefore see the validation
# text, so validation scores may be slightly optimistic -- confirm this is intended.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [None]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
# TruncatedSVD's default solver is randomized, so the seed is pinned to keep the
# projection (and every plot built on it) identical from run to run.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. The scaled arrays get new names so the
# unscaled SVD output stays available for the plots below.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [None]:
import matplotlib
# One value per training note: the sum of its SVD components.
svd_sum = xtrain_svd.sum(axis=1)
colors = ['red','green','blue','purple', 'orange', 'black']
label = ytrain
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(svd_sum, ytrain, c=label, cmap=matplotlib.colors.ListedColormap(colors))
# lbl_enc.classes_ is ordered by encoded value, so position k is the name of class k.
print(list(lbl_enc.classes_))
# Pin the tick positions to the integer class codes before labelling them;
# set_yticklabels alone attaches the names to whatever ticks matplotlib chose.
ax.set_yticks(np.arange(len(lbl_enc.classes_)))
ax.set_yticklabels(lbl_enc.classes_)
ax.set_xlabel('Sum of SVD components')
ax.set_ylabel('Class')
plt.show()


In [None]:
# 2-D plot data: collapse every SVD row into two coordinates.
# X is the sum of the row's components that are not negative,
# Y is the sum of the row's negative components.
xtrain_svd_X = [sum(value for value in row if not value < 0) for row in xtrain_svd]
xtrain_svd_Y = [sum(value for value in row if value < 0) for row in xtrain_svd]

# Both lists hold one entry per training row.
print(len(xtrain_svd_X))
print(len(xtrain_svd_Y))


In [None]:

colors = ['red','green','blue','purple', 'orange', 'black']
label = ytrain
# NOTE: the palette has six entries, so this assumes at most six classes.
n_classes = len(lbl_enc.classes_)
fig, ax = plt.subplots(1, 1, figsize=(9, 6))
# vmin/vmax are set half a step outside the label range so each integer label
# falls in the middle of its own colour band.
points = ax.scatter(xtrain_svd_X, xtrain_svd_Y, c=label,
                    cmap=matplotlib.colors.ListedColormap(colors[:n_classes]),
                    vmin=-0.5, vmax=n_classes - 0.5)
cb = fig.colorbar(points, ax=ax)
# One tick per class, centred on its band, named in encoded-label order
# (the original ticks sat on band edges and used first-appearance key order).
loc = np.arange(n_classes)
cb.set_ticks(loc)
cb.set_ticklabels(list(lbl_enc.classes_))
ax.set_xlabel('Sum of non-negative SVD components')
ax.set_ylabel('Sum of negative SVD components')

In [None]:
from mpl_toolkits.mplot3d import Axes3D
colors = ['red','green','blue','purple', 'orange', 'black']
label = ytrain
fig = plt.figure(figsize=(9, 9))
ax = fig.add_subplot(111, projection='3d')
# Draw on the 3-D axes directly. plt.scatter treats its third positional
# argument as the marker size, so the class id never reached the z axis and
# every point was drawn at z=0.
ax.scatter(xtrain_svd_X, xtrain_svd_Y, ytrain, c=label, cmap=matplotlib.colors.ListedColormap(colors))
ax.set_xlabel('Sum of non-negative SVD components')
ax.set_ylabel('Sum of negative SVD components')
ax.set_zlabel('Encoded class')
plt.show()

In [None]:
# Line plot of the SVD component values of the training row at index 1.
component_profile = xtrain_svd[1]
plt.gca().plot(component_profile)
plt.show()

In [None]:
# Display the configuration of the fitted TruncatedSVD object.
svd.get_params()

Future directions - Try LightGBM

In [None]:
# Fit a simple xgboost classifier on the TF-IDF features.
xgb_settings = dict(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                    subsample=0.8, nthread=10, learning_rate=0.1)
clf = xgb.XGBClassifier(**xgb_settings)

# Both matrices are converted to CSC before being handed to xgboost.
train_matrix = xtrain_tfv.tocsc()
valid_matrix = xvalid_tfv.tocsc()

# `history` holds whatever fit() returns (for scikit-learn style estimators
# that is normally the fitted estimator, not a training log -- TODO confirm).
history = clf.fit(train_matrix, ytrain)
predictions = clf.predict_proba(valid_matrix)

valid_logloss = multiclass_logloss(yvalid, predictions)
print ("logloss: %0.3f " % valid_logloss)

In [None]:
# sklearn.metrics.accuracy_score(yvalid, predictions)
# Display the held-out validation labels (encoded class ids).
yvalid

In [None]:
# Hard prediction per validation row: the index of its largest class probability.
y_pred = [class_probs.argmax() for class_probs in predictions]

In [None]:
# Fraction of validation notes whose predicted class matches the true class.
sklearn.metrics.accuracy_score(y_true=yvalid, y_pred=y_pred)

In [None]:
# F1 score on the validation set, using the 'weighted' averaging mode.
sklearn.metrics.f1_score(y_true=yvalid, y_pred=y_pred, average='weighted')

In [None]:
# Raw confusion matrix for the validation predictions.
sklearn.metrics.confusion_matrix(y_true=yvalid, y_pred=y_pred)

In [None]:
# Compute confusion matrix
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=None):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    No visible cell of the notebook defines this helper, so it is defined here
    to keep the cell runnable on a fresh kernel.

    :param cm: Square matrix of integer counts (as returned by confusion_matrix)
    :param classes: Class names in the same order as the rows/columns of ``cm``
    :param title: Title drawn above the plot
    :param cmap: Matplotlib colormap; ``plt.cm.Blues`` when omitted
    """
    classes = list(classes)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues if cmap is None else cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each count into its cell; switch to white text on dark cells.
    threshold = cm.max() / 2.0
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, format(cm[row, col], 'd'),
                     horizontalalignment='center',
                     color='white' if cm[row, col] > threshold else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Names in encoded-label order, so tick k is labelled with the name of class k
# (class_dict.values() only gave the numeric codes).
class_names = list(lbl_enc.classes_)
print(class_names)
# Passing `labels` keeps one row/column per class even if a class is missing
# from the validation split.
cnf_matrix = confusion_matrix(yvalid, y_pred, labels=np.arange(len(class_names)))
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, class_names,
                      title='Confusion matrix, without normalization')
plt.show()

In [None]:
# Print the fitted classifier's feature_importances_ array.
print(clf.feature_importances_)

In [None]:
from xgboost import plot_importance
# Use one variable for both the metric and the axis label. The original passed
# the builtin `type` as xlabel, which is not a label string.
importance_type = 'weight'
fig, AX = plt.subplots(1,1,figsize=(10,10))
plot_importance(clf, xlabel='Importance (%s)' % importance_type,
                importance_type=importance_type, ax=AX, max_num_features=10)
plt.show()

In [None]:
# clf.get_booster().get_score()

In [None]:
from eli5 import show_weights
# Show the classifier's feature weights, passing the fitted TF-IDF vectorizer as `vec`.
show_weights(clf, vec=tfv)

In [None]:
from eli5 import show_weights
# NOTE(review): clf was fitted on the TF-IDF matrix (xtrain_tfv), while svd is
# the TruncatedSVD reducer fitted on that matrix, not a text vectorizer. Passing
# it as `vec` looks inconsistent with the features clf was trained on -- confirm intent.
show_weights(clf, vec=svd)

# LIME Feature Analysis

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from lime import lime_text
from sklearn.pipeline import make_pipeline

In [None]:
# Class names in encoded-label order, so class_names[k] is the name of class k.
# This list is indexed by predicted/true label below, and class_dict's key
# order does not guarantee that alignment.
class_names = list(lbl_enc.classes_)

In [None]:
# Chain the fitted TF-IDF vectorizer and the classifier so raw note strings
# can be passed to c.predict_proba, as the LIME cells below do.
c = make_pipeline(tfv, clf)

In [None]:
# LIME explains a prediction from randomly perturbed copies of the text; the
# seed is fixed so the same note yields the same explanation on every run.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# xvalid holds the original note strings (before vectorization), so the
# pipeline can score them directly. Show the class probabilities of the first note.
first_note_probs = c.predict_proba([xvalid[0]])

print(first_note_probs.round(3))

In [None]:
# Position of the validation note to explain (must be < len(xvalid)).
idx = 1340
# Request an explanation for every known class instead of a hard-coded 0..5.
exp = explainer.explain_instance(xvalid[idx], c.predict_proba, num_features=6,
                                 labels=list(range(len(class_names))))
print('Text to predict: {}'.format(xvalid[idx]))
print('Document id: %d' % idx)
predicted_label = clf.predict(xvalid_tfv.tocsc()[idx])[0]
print('Predicted class =', class_names[predicted_label])
# The ground truth comes from yvalid; y_pred holds the model's predictions, so
# the original line printed the predicted class a second time.
print('True class: %s' % class_names[yvalid[idx]])

In [None]:

# Print the entries returned by exp.as_list for every encoded class.
for class_id in class_dict.values():
    try:
        print ('Explanation for class %s' % class_names[class_id])
        explanation_rows = [str(entry) for entry in exp.as_list(label=class_id)]
        print ('\n'.join(explanation_rows))
        print ()
    except KeyError:
        # Presumably raised when exp holds no explanation for this label; skip it.
        pass


In [None]:
# Render the LIME explanation inline with text=False.
exp.show_in_notebook(text=False)

In [None]:
# Render the explanation for class 0 together with the highlighted note text.
# `text` is an on/off flag; the explanation object already holds the note, so
# the raw string (which only worked by being truthy) is replaced with True.
exp.show_in_notebook(text=True, labels=(0,))