In [2]:
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-

# RNN to classify text
# Author: adriamoya

#%matplotlib inline
#import matplotlib.pyplot as plt

import re
import datetime
import numpy as np
import pandas as pd
from collections import Counter
import random as rn
import tensorflow as tf

# Reproducibility setup: seed every random number generator the notebook
# touches before any model is built.

# Seed NumPy's global generator so NumPy-generated random numbers start
# from a well-defined initial state.
np.random.seed(1337)

# Seed core Python's `random` module (imported here as `rn`).
rn.seed(12345)

# Force TensorFlow to use a single thread for both intra-op and inter-op
# parallelism. Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
# NOTE(review): ConfigProto / Session / set_random_seed look like the
# TensorFlow 1.x API names -- confirm the pinned TensorFlow version.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K
# tf.set_random_seed() gives random number generation in the TensorFlow
# backend a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

# Create a session on the default graph with the single-threaded config and
# register it as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, LSTM, Embedding, Reshape, Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split

In [3]:
# load data
# Both competition files live in the same directory; the location is kept in
# one place so it can be changed without editing every read_csv call.
DATA_DIR = "/Users/moyandreu/.kaggle/competitions/bcpnews"

df = pd.read_csv("%s/train.csv" % DATA_DIR)
df_test = pd.read_csv("%s/test.csv" % DATA_DIR)

In [4]:
# ip calculation
def ip(y_target, y_pred):
    """ Return 100 * (2 * AUC - 1) for the given targets and scores.

    The notebook reports this value under the label "GINI" / "IP Score".
    """
    auc = metrics.roc_auc_score(y_target, y_pred)
    return 100*(2*auc - 1)

In [6]:
def preprocessing(df, column="text"):

    """ Preprocessing (lower case, remove urls, punctuations)

    Normalises the strings stored in ``df[column]``: lower-cases them, strips
    URLs and unwanted characters, isolates periods as separate tokens and
    collapses whitespace.

    The column is overwritten on the frame that is passed in, and that same
    frame is returned.

    Args:
        df: DataFrame holding the text column to clean.
        column: name of the column to clean (default "text").

    Returns:
        The same DataFrame object, with ``df[column]`` cleaned.
    """

    print("\nPreprocessing %s ..." % (column))

    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = [
        (r'http[\w:/\.]+', ''),              # remove urls
        (r'[^\.(a-zA-ZÀ-ÿ0-9)\s]', ''),      # keep only letters, digits, periods, brackets and whitespace
        (r'(?<=\d)(\.)(?=\d)', ''),          # remove dots between digits (thousands separators; also hits decimals!)
        (r'\.\.+', '.'),                     # replace multiple periods with a single one
        (r'\.', ' .'),                       # put a space before each period so it becomes its own token
        (r'\(', ' '),                        # replace opening brackets with white space
        (r'\)', ' '),                        # replace closing brackets with white space
        (r'\s\s+', ' '),                     # replace multiple white spaces with a single one
    ]

    cleaned = df[column].str.lower()
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would silently turn
        # every step above into a no-op.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[column] = cleaned.str.strip()

    return df

In [7]:
def build_dictionary(df, min_count_word=5):

    """ Build the vocabulary and the word <-> integer lookup tables

    Tokens are the whitespace-separated words of ``df.text``. Only tokens that
    occur strictly more than ``min_count_word`` times are kept, numbered from 0
    in descending order of frequency. Two special entries follow the real
    words: '<Other>' and then '<PAD>'.

    Returns:
        (word2num, num2word, n_words) where n_words is the number of real
        words, which is also the index assigned to '<Other>'.
    """

    print("\nBuilding dictionary ..." )

    # count every token over the whole text column
    token_counts = Counter(' '.join(df.text.values).split())

    # most frequent first; drop tokens at or below the count threshold
    u_words = [token for token, count in token_counts.most_common() if count > min_count_word]
    n_words = len(u_words)

    print('The number of unique words is:', n_words )

    # create the dictionary
    word2num = {token: index for index, token in enumerate(u_words)}
    word2num['<Other>'] = n_words
    num2word = {index: token for token, index in word2num.items()}

    # the padding token takes the next free index in both tables
    pad_index = len(word2num)
    num2word[pad_index] = '<PAD>'
    word2num['<PAD>'] = pad_index

    return word2num, num2word, n_words

In [8]:
def word2int(df, n_u_words, column='text', word_threshold=500, vocab=None):

    """ Convert words to integers and prepad sentences

    Every whitespace-separated word of ``df[column]`` is replaced by its index
    in the vocabulary; words missing from the vocabulary get ``n_u_words``.
    Each sequence is then forced to exactly ``word_threshold`` entries:
    shorter ones are left-padded with the '<PAD>' index, longer ones keep only
    their first ``word_threshold`` entries.

    Args:
        df: DataFrame holding the text column.
        n_u_words: index used for out-of-vocabulary words.
        column: name of the column to convert.
        word_threshold: fixed length of every returned sequence.
        vocab: optional word -> index mapping containing a '<PAD>' key. When
            omitted, the notebook-level ``word2num`` dictionary is used, which
            is what existing callers rely on.

    Returns:
        A list with one list of ``word_threshold`` integers per row.
    """

    print("\nConverting words to integers and prepadding ..." )

    # explicit vocabulary if given, otherwise the notebook-level dictionary
    lookup = word2num if vocab is None else vocab
    pad_id = lookup['<PAD>']

    int_text = [[lookup.get(word, n_u_words) for word in text.split()] for text in df[column].values]

    print('The number of texts greater than %s in length is: ' % str(word_threshold), np.sum(np.array([len(t)>word_threshold for t in int_text])))
    print('The number of texts less than 50 in length is: ', np.sum(np.array([len(t)<50 for t in int_text])) )

    padded = []
    for tokens in int_text:
        missing = word_threshold - len(tokens)
        if missing > 0:
            # pre-pad so the real words sit at the end of the sequence
            tokens = [pad_id]*missing + tokens
        elif missing < 0:
            # keep only the first word_threshold words
            tokens = tokens[:word_threshold]
        padded.append(tokens)

    return padded

In [9]:
def fit_evaluate_model(X_train, X_valid, y_train, y_valid, params):

    """ Fit and evaluate Many to One RNN

    Builds Embedding -> (Conv1D + MaxPooling1D) x 2 -> LSTM -> sigmoid Dense,
    fits it on the training data and prints AUC / GINI for train and valid.
    The embedding vocabulary size is read from the notebook-level ``word2num``.

    Args:
        X_train, X_valid: integer-encoded sequences for train / valid.
        y_train, y_valid: targets for train / valid.
        params: dict with the keys 'embedding_size', 'loss_func', 'optimizer',
            'metrics', 'batch_size' and 'epochs'.

    Returns:
        (model, pred_train, pred_valid): the fitted model and its predictions
        on the train and valid inputs.
    """

    def _print_auc_gini(y_true, y_prob):
        # one report line: AUC and the GINI value derived from it by ip()
        auc_text = "\nAUC: {0:.2f}%".format(100*metrics.roc_auc_score(y_true, y_prob))
        gini_text = "| GINI: {0:.2f}%".format(ip(y_true, y_prob))
        print(auc_text, gini_text)

    print("\nCreating Sequential RNN: Many to One..." )

    model = Sequential()
    model.add(Embedding(len(word2num), params['embedding_size']))
    # two identical convolution + pooling stages
    for _ in range(2):
        model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=params['loss_func'], optimizer=params['optimizer'], metrics=params['metrics'])
    model.summary()

    batch_size = params['batch_size']
    # stop when the training loss has not improved for two epochs
    stop_on_loss = EarlyStopping(monitor='loss', patience=2)

    print("\nFitting the model ..." )
    model.fit(X_train, y_train, batch_size=batch_size, epochs=params['epochs'], callbacks=[stop_on_loss])

    print("\nPredicting probs on train ..." )
    pred_train = model.predict(X_train)
    _print_auc_gini(y_train, pred_train)

    print("\nEvaluating in valid ..." )
    print(model.evaluate(X_valid, y_valid, batch_size=batch_size))

    print("\nPredicting probs on valid ..." )
    pred_valid = model.predict(X_valid)
    _print_auc_gini(y_valid, pred_valid)

    return model, pred_train, pred_valid

In [10]:
def predict_test(model, df_test, column):

    """ Score one text column of df_test with a fitted model

    The vocabulary size (``n_u_words``) and the sequence length
    (``word_threshold``) are read from the notebook-level variables, so they
    must still hold the values used to fit ``model``.

    Returns:
        A flat list with the first value of each prediction row.
    """

    # words to numbers
    X = np.array(word2int(df_test, n_u_words, column, word_threshold))

    # keep the first element of every prediction row
    return [row[0] for row in model.predict(X)]

In [11]:
# preprocessing steps: lower case, remove urls, punctuations ...
# Clean the article text first, then the title, on train and test alike.
for text_column in ('text', 'title'):
    df = preprocessing(df, text_column)
    df_test = preprocessing(df_test, text_column)


Preprocessing text ...

Preprocessing text ...

Preprocessing title ...

Preprocessing title ...


In [12]:
# build dictionary
# With the strict "greater than" test in build_dictionary, 4 keeps the words
# that occur at least 5 times.
min_count_word = 4
word2num, num2word, n_u_words = build_dictionary(df, min_count_word)

# train / valid split
print("\nTrain / Valid split ..." )

# Seeded uniform draw: rows whose draw is <= 0.9 go to train, the rest to valid.
np.random.seed(0)
msk = np.random.rand(len(df)) <= 0.9

# Boolean indexing yields new frames; reset_index(drop=True) renumbers their
# rows from 0 without keeping the old index as a column. No helper column is
# added to df, so nothing has to be dropped afterwards.
df_train = df[msk].reset_index(drop=True)
df_valid = df[~msk].reset_index(drop=True)

print("Train shape:", df_train.shape )
print("Valid shape:", df_valid.shape )


Building dictionary ...
The number of unique words is: 41112

Train / Valid split ...
Train shape: (13196, 7)
Valid shape: (1441, 7)


In [13]:
# Preview the first rows of the validation split.
df_valid.head()

Unnamed: 0,id,keywords,summary,title,text,date,flag
0,9,"[del, estados, gobierno, el, que, gps, se, méx...","Desde los años 90 el paisaje ha cambiado, de h...",gobierno de donald trump cómo donald trump est...,se ha hablado mucho de la errática política ex...,2017-05-08,0
1,14,"[cartera, pone, bbva, por, una, en, venta, y, ...",El grupo BBVA ha decidido poner a la venta una...,bbva pone a la venta una cartera de antiguas s...,el grupo bbva ha decidido poner a la venta una...,2016-06-29,1
2,21,"[españa, que, por, del, en, cooperación, ibéri...",Portugal y España suscribieron hoy ocho acuerd...,portugal y españa sellan ocho acuerdos de coop...,portugal y españa suscribieron hoy ocho acuerd...,2017-05-30,0
3,28,"[digitalización, en, una, transparencia, y, de...","Esta conferencia, organizada por los diarios E...",independencia digitalización y transparencia l...,el encuentro organizado por expansión y el mun...,2016-12-16,1
4,39,"[la, del, contrasplit, noticias, entidad, el, ...",Los titulares de acciones de la entidad a fech...,noticias de bankia bankia ejecuta hoy su contr...,bankia ejecutará hoy su contrasplit de accione...,2017-06-05,0


## Model text

In [14]:
# Sequence length for article texts. predict_test reads this notebook-level
# variable as well, so it must stay set while the text model is scored.
word_threshold = 500

# settings consumed by fit_evaluate_model
params = dict(
    loss_func='binary_crossentropy', # binary_crossentropy
    optimizer='rmsprop', # adam, rmsprop
    metrics=['accuracy'],
    embedding_size=100,
    batch_size=128,
    epochs=3,
)

# word to integer (train first, then valid)
X_train, X_valid = [
    np.array(word2int(frame, n_u_words, 'text', word_threshold))
    for frame in (df_train, df_valid)
]

y_train, y_valid = df_train['flag'].values, df_valid['flag'].values

model_text, pred_train, pred_valid = fit_evaluate_model(X_train, X_valid, y_train, y_valid, params)

print("\nTest results ..." )
test_pred = predict_test(model_text, df_test, 'text')


Converting words to integers and prepadding ...
The number of texts greater than 500 in length is:  6014
The number of texts less than 50 in length is:  145

Converting words to integers and prepadding ...
The number of texts greater than 500 in length is:  694
The number of texts less than 50 in length is:  18

Creating Sequential RNN: Many to One...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         4111400   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         82048     
___________________________________________________

In [15]:
# Store the text-model scores as a feature column on each frame.
# pred_train / pred_valid come straight from model.predict (one row per text,
# presumably shape (n, 1)); flatten them so a 1-D column is assigned, the same
# way predict_test already flattens the test predictions.
df_train['pred_text'] = np.ravel(pred_train)
df_valid['pred_text'] = np.ravel(pred_valid)
df_test['pred_text'] = test_pred

## Model title

In [16]:
# Sequence length for titles. predict_test reads this notebook-level variable
# as well, so it must stay set while the title model is scored.
word_threshold = 15

# settings consumed by fit_evaluate_model (same values as for the text model)
params = dict(
    loss_func='binary_crossentropy', # binary_crossentropy
    optimizer='rmsprop', # adam, rmsprop
    metrics=['accuracy'],
    embedding_size=100,
    batch_size=128,
    epochs=3,
)

# word to integer (train first, then valid)
X_train, X_valid = [
    np.array(word2int(frame, n_u_words, 'title', word_threshold))
    for frame in (df_train, df_valid)
]

y_train, y_valid = df_train['flag'].values, df_valid['flag'].values

model_title, pred_train, pred_valid = fit_evaluate_model(X_train, X_valid, y_train, y_valid, params)

print("\nTest results ..." )
test_pred = predict_test(model_title, df_test, 'title')


Converting words to integers and prepadding ...
The number of texts greater than 15 in length is:  2182
The number of texts less than 50 in length is:  13196

Converting words to integers and prepadding ...
The number of texts greater than 15 in length is:  241
The number of texts less than 50 in length is:  1441

Creating Sequential RNN: Many to One...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         4111400   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         82048     
_________________________________________________

In [17]:
# Store the title-model scores as a feature column on each frame.
# pred_train / pred_valid come straight from model.predict (one row per title,
# presumably shape (n, 1)); flatten them so a 1-D column is assigned, the same
# way predict_test already flattens the test predictions.
df_train['pred_title'] = np.ravel(pred_train)
df_valid['pred_title'] = np.ravel(pred_valid)
df_test['pred_title'] = test_pred

# Stacking

In [18]:
# The two base-model scores are the only inputs of the stacked model.
X_train, X_valid = [
    frame[['pred_text', 'pred_title']].values
    for frame in (df_train, df_valid)
]

y_train, y_valid = [frame['flag'].values for frame in (df_train, df_valid)]

In [20]:
# Stacked model: gradient-boosted trees on the two base-model scores.
# NOTE(review): the train features are predictions the base models made on the
# very rows they were fitted on, so train / cross-validation scores of the
# stacker may be optimistic -- confirm against the valid scores below.

# xgboost DMatrix wrappers for the native xgb.cv API.
# xgvalid is built here but not used anywhere else in this cell.
xgtrain = xgb.DMatrix(X_train, label= y_train)
xgvalid = xgb.DMatrix(X_valid, label= y_valid)

# n_estimators is only an upper bound here; it is replaced further down by the
# number of rounds reported by cross-validation.
clf = XGBClassifier(
    booster = 'gbtree',
    learning_rate =0.01,
    n_estimators=3000, #3000
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=99)

# same hyper-parameters in the form expected by the native xgboost API
xgb_param = clf.get_xgb_params()

# cross-validation
# ------------------------------------------------------------------------------

cv_folds = 5
early_stopping_rounds = 100

print('\nInitializing cross-validation...')
cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=clf.get_params()['n_estimators'],
    nfold=cv_folds,
    metrics='auc',
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=1)

# use the number of rows of the cross-validation result as the number of trees
print('\nXGBClassifier parameters')
clf.set_params(n_estimators=cvresult.shape[0])

# fit the algorithm on the training data
# NOTE(review): newer xgboost releases expect eval_metric on the estimator
# rather than in fit() -- confirm against the installed version.
print('\nFit algorithm on train data...')
clf.fit(X_train, y_train, eval_metric='auc')

# Predict training set
# ------------------------------------------------------------------------------
print('\nPredicting on training set...' )
dtrain_predictions = clf.predict(X_train)
# column 1 of predict_proba is used as the score of the positive class
dtrain_predprob = clf.predict_proba(X_train)[:,1]

# print model report:
print('Model Report' )
print('Accuracy : %.4g' % metrics.accuracy_score(y_train , dtrain_predictions) )
print('AUC Score (Train): %f' % metrics.roc_auc_score(y_train, dtrain_predprob) )
print('IP Score  (Train): %f' % ip(y_train, dtrain_predprob) )

# Predict valid set
# ------------------------------------------------------------------------------
print('\nPredicting on valid set...' )
dvalid_predprob = clf.predict_proba(X_valid)[:,1]

# print model report:
print('Model Report' )
print('AUC Score (Valid): %f' % metrics.roc_auc_score(y_valid, dvalid_predprob) )
print('IP Score  (Valid): %f' % ip(y_valid, dvalid_predprob) )


Initializing cross-validation...
[0]	train-auc:0.980278+0.000595949	test-auc:0.978449+0.00189885
[1]	train-auc:0.988446+0.0034081	test-auc:0.986811+0.00330488
[2]	train-auc:0.989829+0.000239477	test-auc:0.988423+0.00160333
[3]	train-auc:0.989879+0.00069279	test-auc:0.988399+0.00186405
[4]	train-auc:0.989678+0.000775535	test-auc:0.988073+0.00183153
[5]	train-auc:0.989687+0.000619573	test-auc:0.987901+0.00203283
[6]	train-auc:0.989836+0.00078628	test-auc:0.988134+0.0022735
[7]	train-auc:0.989861+0.000919643	test-auc:0.988196+0.00230157
[8]	train-auc:0.99001+0.000590956	test-auc:0.988397+0.00193445
[9]	train-auc:0.990017+0.000469416	test-auc:0.988573+0.00172401
[10]	train-auc:0.99004+0.000537906	test-auc:0.988519+0.00181943
[11]	train-auc:0.990088+0.000615662	test-auc:0.988619+0.00177716
[12]	train-auc:0.990084+0.000699344	test-auc:0.988585+0.00183207
[13]	train-auc:0.990047+0.000727366	test-auc:0.988514+0.00186119
[14]	train-auc:0.990078+0.000790925	test-auc:0.98859+0.00184521
[15]	trai

[125]	train-auc:0.991515+0.000169942	test-auc:0.989567+0.000894771
[126]	train-auc:0.991519+0.00016866	test-auc:0.989563+0.000892847
[127]	train-auc:0.991524+0.000170611	test-auc:0.989563+0.000893844
[128]	train-auc:0.991527+0.000169764	test-auc:0.989561+0.000889371
[129]	train-auc:0.991535+0.000163392	test-auc:0.98956+0.00089398
[130]	train-auc:0.991544+0.000160753	test-auc:0.989555+0.000904823
[131]	train-auc:0.991547+0.000162743	test-auc:0.989551+0.00090834
[132]	train-auc:0.991552+0.000160179	test-auc:0.989559+0.000902389
[133]	train-auc:0.991562+0.000161224	test-auc:0.989562+0.000900549
[134]	train-auc:0.991562+0.000162093	test-auc:0.989564+0.000900249
[135]	train-auc:0.991564+0.000164792	test-auc:0.989565+0.000896012
[136]	train-auc:0.99157+0.000165116	test-auc:0.989567+0.000897518
[137]	train-auc:0.991574+0.000160747	test-auc:0.989562+0.000894222
[138]	train-auc:0.99158+0.000161115	test-auc:0.989559+0.000888826
[139]	train-auc:0.991586+0.000164807	test-auc:0.989562+0.000887803
[

[249]	train-auc:0.992026+0.00016564	test-auc:0.989614+0.000858845
[250]	train-auc:0.992029+0.000165252	test-auc:0.989617+0.000860003
[251]	train-auc:0.992034+0.000164953	test-auc:0.989617+0.000860861
[252]	train-auc:0.992036+0.000166461	test-auc:0.989617+0.00086118
[253]	train-auc:0.992041+0.000165076	test-auc:0.989617+0.000857418
[254]	train-auc:0.992044+0.000164858	test-auc:0.989618+0.00085915
[255]	train-auc:0.992047+0.000166562	test-auc:0.989619+0.000861278
[256]	train-auc:0.99205+0.000167605	test-auc:0.989621+0.00086051
[257]	train-auc:0.992054+0.000165143	test-auc:0.989621+0.000861276
[258]	train-auc:0.992059+0.000165359	test-auc:0.989617+0.000859954
[259]	train-auc:0.992061+0.000166124	test-auc:0.989618+0.000858235
[260]	train-auc:0.992063+0.000168255	test-auc:0.989614+0.000856418
[261]	train-auc:0.992069+0.000170032	test-auc:0.989613+0.00085792
[262]	train-auc:0.992073+0.000171717	test-auc:0.989612+0.000863012
[263]	train-auc:0.992076+0.000172878	test-auc:0.98961+0.000861201
[2

[373]	train-auc:0.99243+0.000183996	test-auc:0.989653+0.000863044
[374]	train-auc:0.992433+0.000184713	test-auc:0.989651+0.000861482
[375]	train-auc:0.992435+0.000184741	test-auc:0.989651+0.000857981
[376]	train-auc:0.992437+0.000184551	test-auc:0.989649+0.00085899
[377]	train-auc:0.99244+0.000185629	test-auc:0.989646+0.000862934
[378]	train-auc:0.992444+0.000184752	test-auc:0.989645+0.000866916
[379]	train-auc:0.992447+0.000185056	test-auc:0.989647+0.000866363
[380]	train-auc:0.992451+0.00018696	test-auc:0.989645+0.00086408
[381]	train-auc:0.992454+0.000186752	test-auc:0.989648+0.000863291
[382]	train-auc:0.992457+0.000187867	test-auc:0.989646+0.000863739
[383]	train-auc:0.992459+0.000187791	test-auc:0.989645+0.000865054
[384]	train-auc:0.992463+0.000187805	test-auc:0.989643+0.000864356
[385]	train-auc:0.992466+0.000186643	test-auc:0.989645+0.000862904
[386]	train-auc:0.992468+0.000186556	test-auc:0.989648+0.000862064
[387]	train-auc:0.992471+0.00018651	test-auc:0.98965+0.000859094
[3

[497]	train-auc:0.99277+0.000178882	test-auc:0.989648+0.000872921
[498]	train-auc:0.992773+0.000179451	test-auc:0.989646+0.000872028
[499]	train-auc:0.992774+0.000179087	test-auc:0.989645+0.000869551
[500]	train-auc:0.992776+0.000180092	test-auc:0.989645+0.000869597
[501]	train-auc:0.992778+0.000179913	test-auc:0.989646+0.000873187
[502]	train-auc:0.99278+0.000180407	test-auc:0.989643+0.000872693
[503]	train-auc:0.992782+0.000180065	test-auc:0.989643+0.000875293
[504]	train-auc:0.992787+0.000179481	test-auc:0.989639+0.000871123
[505]	train-auc:0.992788+0.000178999	test-auc:0.98964+0.000869911
[506]	train-auc:0.992792+0.000180288	test-auc:0.989643+0.000873368
[507]	train-auc:0.992794+0.00017981	test-auc:0.989644+0.000871993
[508]	train-auc:0.992796+0.000179443	test-auc:0.989644+0.00087369
[509]	train-auc:0.992798+0.000179886	test-auc:0.989643+0.000874207
[510]	train-auc:0.992802+0.000179291	test-auc:0.989642+0.000873109
[511]	train-auc:0.992804+0.000179904	test-auc:0.989641+0.000874179


  if diff:


In [21]:
# Predict test set
# ------------------------------------------------------------------------------
print('\nPredicting on test set...' )

# same two features, in the same order, as the matrices used to fit clf
X_test = df_test[['pred_text', 'pred_title']].values

# column 1 of predict_proba is used as the score of the positive class
dtest_predprob = clf.predict_proba(X_test)[:,1]


Predicting on test set...


In [22]:
# Final stacked score for each test row; this column is what gets submitted.
df_test['pred'] = dtest_predprob

In [23]:
# Preview the test frame with the base-model scores and the stacked score.
df_test.head()

Unnamed: 0,id,keywords,summary,title,text,date,pred_text,pred_title,pred
0,1,"[despedir, se, administración, que, el, en, co...","La administración concursal de Unipost, que ge...",la administración concursal de unipost empezar...,la empresa se cerrará a finales del mes de abr...,2018-02-26,0.397246,0.595022,0.126336
1,2,"[algo, superación, blogs, volver, últimos, tie...",La compañía española se mueve lenta pero consi...,algo se mueve en zardoya . blogs de bolságora,la compañía española se mueve lenta pero consi...,2018-01-29,0.009619,0.375674,0.024883
2,3,"[le, tras, viral, primera, trasplante, que, un...","Jennifer Jones, de 40 años, es una mujer estad...",virales respira por primera vez tras un traspl...,jennifer jones de 40 años es una mujer estadou...,2018-03-12,0.032414,0.55394,0.027032
3,4,"[necesario, y, compras, los, en, la, rebaja, q...",¿Qué supone esta decisión para la economía de ...,las claves de la rebaja en las compras del bce...,la economía de la zona euro ha alcanzado el pu...,2017-10-27,0.685469,0.727556,0.563398
4,5,"[presidente, como, blankfein, sus, schwartz, s...","Lloyd Bankfeild, consejero delegado de Goldman...",david solomon se postula como el próximo presi...,lloyd bankfeild consejero delegado de goldman ...,2018-03-12,0.465237,0.177519,0.069221


___

In [None]:
# Rebuild the integer-encoded text matrices: X_train / X_valid and
# word_threshold were overwritten by the title model and the stacking cells.
word_threshold = 500
X_train = np.array(word2int(df_train, n_u_words, 'text', word_threshold))
X_valid = np.array(word2int(df_valid, n_u_words, 'text', word_threshold))

In [None]:
# Sanity check: shape of the padded training matrix.
X_train.shape

In [None]:
# Pick the second validation row as the example instance.
# NOTE(review): valid_instance is not referenced by the cells that follow.
valid_instance = X_valid[1]

In [None]:
# Scratch check: shape of the transposed training matrix.
np.transpose(X_train).shape

In [None]:
# Wrap the encoded matrices in DataFrames (one column per sequence position)
# for the LIME explainer cells.
df_train_txt, df_valid_txt = pd.DataFrame(X_train), pd.DataFrame(X_valid)

In [None]:
import lime
import lime.lime_tabular

# create the lime explainer
# The training frame defined in this notebook is df_train_txt, so that name is
# used for both the data and the feature names. `.values` replaces
# DataFrame.as_matrix(), which newer pandas no longer provides.
explainer = lime.lime_tabular.LimeTabularExplainer(df_train_txt.values, feature_names=df_train_txt.columns) # X_train.values, , class_names=(0,1)

# prediction function handed to LIME: scores a batch of encoded texts
# NOTE(review): model_text.predict presumably returns one column per row;
# confirm this is the output shape LIME expects for this explainer mode.
predict_fn = lambda x: model_text.predict(x)

In [None]:
# Explain validation row 1 with the text model; 500 matches the
# word_threshold used to build the text matrices.
exp = explainer.explain_instance(np.reshape(df_valid_txt.loc[1], 500), predict_fn, num_features=500)

In [None]:
# Lime
# ------------------------------------------------------------------------------
# NOTE(review): `features` is not defined in any cell of this notebook that is
# visible here -- confirm where it is expected to come from.
print('\nUsing Lime to explain instances...')
import lime
import lime.lime_tabular
import re

# create the lime explainer
# `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
explainer = lime.lime_tabular.LimeTabularExplainer(df_train[features].values, feature_names=features) # X_train.values, , class_names=(0,1)

def lime_explain_instance(id):

    """ Explain the stacked classifier's prediction for one document with LIME

    Looks up the first row of ``test`` whose ``col_id`` equals ``id``, explains
    it with the notebook-level ``explainer`` and ``clf``, prints a short
    summary, draws a bar chart of the explanation weights and returns them.

    NOTE(review): ``test``, ``col_id``, ``col_target``, ``features`` and
    ``pyplot`` are read from the notebook namespace but are not defined in the
    cells visible here -- confirm before running.

    Args:
        id: identifier of the document to explain.

    Returns:
        DataFrame with the columns "variable", "value", "explanation" and
        "criteria", sorted by "explanation" in descending order.
    """

    test_instance_tot = test.loc[test[col_id]==id].head(1)
    # clip extreme values, then take the single row as a flat array
    test_instance = test_instance_tot[features].clip(-10000000.0, 10000000.0).values[0] # convert int to float instead?

    # prediction function: for classifiers, this should be function that takes a numpy array and outputs probability predictions
    predict_fn_xgb = lambda x: clf.predict_proba(x).astype(float)

    exp = explainer.explain_instance(test_instance, predict_fn_xgb, num_features=200) # test_instance.values
    print('Document id     : %d' % (id))
    print('Probability (=1):', clf.predict_proba([test_instance])[0,1])
    print('True class      : %s' % test_instance_tot[col_target].values[0])

    # evaluate the explanation once instead of on every loop iteration
    weighted_rules = exp.as_list()
    feature_positions = exp.as_map()[1]

    ll = []
    # NOTE(review): the loop starts at 1, so entry 0 of the explanation is
    # left out -- confirm whether that is intended.
    for i in range(1, len(weighted_rules)):
        id_var = feature_positions[i][0]
        ll.append({
            "variable": features[id_var],
            "value": test_instance[id_var],
            "explanation": weighted_rules[i][1],
            "criteria": weighted_rules[i][0]
        })

    explainer_df = pd.DataFrame(ll).sort_values('explanation', ascending=False)

    pyplot.bar(range(len(explainer_df)), explainer_df['explanation'].values)
    ind = np.arange(len(explainer_df['variable'].values))    # the x locations for the groups
    pyplot.xticks(ind, explainer_df['variable'].values, rotation='vertical')
    # pyplot.savefig('3_gbm_raw_feature_importance.png', bbox_inches='tight')
    pyplot.show()

    return explainer_df

# check top 15 of largest estimated probabilities
# NOTE(review): `test` and its 'TARGET' / 'predprob' columns are not created in
# any cell of this notebook that is visible here -- confirm before running.
test[['id', 'TARGET', 'predprob']].sort_values('predprob', ascending=False).head(15)

# The bare string below is a list of document ids kept as a note; the first
# one is the id explained next.
"""
2016030520890380
2014120519399710
2015120012335320
2015060519288510
2015090014583910
2014120013445730
"""
explainer_df = lime_explain_instance(2016030520890380)

# Only the last expression of a cell is displayed, so the head(10) result is
# discarded and just tail(10) is shown.
explainer_df.head(10)
explainer_df.tail(10)

# Output

In [None]:
# Submission frame: the document id and the final stacked score only.
df_submission = df_test.loc[:, ['id', 'pred']]

In [None]:
# Preview the first rows of the submission frame.
df_submission.head()

In [None]:
# Timestamp (year to second, digits only) used to name the submission file.
submission_time = datetime.datetime.strftime(datetime.datetime.now(), "%Y%m%d%H%M%S")

In [None]:
# Show the timestamp, then write the submission next to earlier ones.
# print is called as a function, like everywhere else in this notebook; the
# statement form is a syntax error on a Python 3 kernel.
print(submission_time)
df_submission.to_csv('../submissions/submission_%s.csv' % submission_time, sep=",", na_rep="", mode="w", index=False, encoding='utf-8')