In [1]:
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-

# RRN to classify text
# Author: adriamoya

#%matplotlib inline
#import matplotlib.pyplot as plt

import re
import pickle
import datetime
import numpy as np
import pandas as pd
from collections import Counter
import random as rn
import tensorflow as tf

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
np.random.seed(1337)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.
rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K
# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, LSTM, Embedding, Reshape, Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping

import xgboost as xgb
from xgboost import XGBClassifier

import warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

from sklearn import metrics
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# variables
flag = 'flag'

In [3]:
# load data
df = pd.read_csv("../../1_construction/3_newspaper_scraper/analyses/cleaned_datasets/train.csv")
df_test  = pd.read_csv("../../1_construction/3_newspaper_scraper/analyses/cleaned_datasets/test.csv")

In [5]:
# ip calculation
def ip(y_target, y_pred):
    return 100*(2*(metrics.roc_auc_score(y_target, y_pred))-1)

In [6]:
def preprocessing(df, column="text"):
    """Preprocessing (lower case, remove urls, punctuations).
    
    Args:
        df     : Dataset with articles information (pandas.DataFrame).
        column : Name of the column that contains the text of the article. Default is `text`.
        
    Returns:
        df     : Dataset with articles information (pandas.DataFrame).
        
    """

    print("\nPreprocessing %s ..." % (column))

    # preprocessing steps: lower case, remove urls, punctuations ...
    df[column] = df[column].str.lower()
    df[column] = df[column].str.replace(r'http[\w:/\.]+','') # remove urls
    df[column] = df[column].str.replace(r'[^\.(a-zA-ZÀ-ÿ0-9)\s]','') #remove everything but characters and punctuation ( [^\.\w\s] )
    df[column] = df[column].str.replace(r'(?<=\d)(\.)(?=\d)','') #remove dots in thousands (careful with decimals!)
    df[column] = df[column].str.replace(r'\.\.+','.') #replace multple periods with a single one
    df[column] = df[column].str.replace(r'\.',' .') #replace multple periods with a single one
    df[column] = df[column].str.replace(r'\(',' ') # replace brackets with white spaces
    df[column] = df[column].str.replace(r'\)',' ') # replace brackets with white spaces
    df[column] = df[column].str.replace(r'\s\s+',' ') #replace multple white space with a single one
    df[column] = df[column].str.strip()

    return df

In [7]:
def build_dictionary(df, min_count_word=5):
    """Build dictionary and relationships between words and integers.
    
    Args:
        df             : Dataset with articles information (pandas.DataFrame).
        min_count_word : Only consider words that have been used more than n times. Default is 5.
        
    Returns:
        word2num       : Dictionary (words to numbers).
        num2word       : Dictionary (numbers to words).
        n_u_words      : Length of the dictionary (number of unique words).
        
    """

    print("\nBuilding dictionary ..." )

    # get all unique words (only consider words that have been used more than 5 times)
    all_text = ' '.join(df.text.values)
    words = all_text.split()
    u_words = Counter(words).most_common()
    u_words = [word[0] for word in u_words if word[1]>min_count_word] # we will only consider words that have been used more than 5 times

    print('The number of unique words is:', "{:,}".format(len(u_words)))

    # create the dictionary
    word2num = dict(zip(u_words,range(len(u_words))))
    word2num['<Other>'] = len(u_words)
    num2word = dict(zip(word2num.values(), word2num.keys()))

    num2word[len(word2num)] = '<PAD>'
    word2num['<PAD>'] = len(word2num)
    
    n_u_words = len(u_words)

    return word2num, num2word, n_u_words

In [8]:
def word2int(df, n_u_words, column='text', word_threshold=500):
    """Convert words to integers and prepad sentences
    
    Args:
        df             : Dataset with articles information (pandas.DataFrame)
        n_u_words      : Length of the dictionary (number of unique words).
        column         : Name of the column that contains the text of the article. Default is `text`.
        word_threshold : Number of words to consider for each text (padding). Default is 500.
        
    Returns:
        int_text       : Array with texts translated to integers.
        """

    print("\nConverting words to integers and prepadding ..." )

    int_text = [[word2num[word] if word in word2num else n_u_words for word in Text.split()] for Text in df[column].values] # Text.split() python2

    print('The number of texts greater than %s in length is: ' % str(word_threshold), "{:,}".format(np.sum(np.array([len(t)>word_threshold for t in int_text]))))
    print('The number of texts less than 50 in length is: ', "{:,}".format(np.sum(np.array([len(t)<50 for t in int_text]))))

    for i, t in enumerate(int_text):
        if len(t)<word_threshold:
            int_text[i] = [word2num['<PAD>']]*(word_threshold-len(t)) + t
        elif len(t)>word_threshold:
            int_text[i] = t[:word_threshold]
        else:
            continue

    return int_text

In [9]:
def predict_test(model, X_test, column):
    """Make predictions in test dataset.
    
    Args:
        model     : Model trained.
        X_test    : Array with test features.
        column    : Name of the column that contains desired feature.
        
    Returns:
        pred_test : Array with test predictions.
        
    """

    # words to numbers
    int_text = word2int(X_test, n_u_words, column, word_threshold)

    X = np.array(int_text)

    pred = model.predict(X)

    l_pred = []
    for item in pred:
        l_pred.append(item[0])
        
    return l_pred

In [11]:
# preprocessing steps: lower case, remove urls, punctuations ...

# text
df = preprocessing(df, 'text')
df_test = preprocessing(df_test, 'text')

# title
df = preprocessing(df, 'title')
df_test = preprocessing(df_test, 'title')

# summary
df = preprocessing(df, 'summary')
df_test = preprocessing(df_test, 'summary')


Preprocessing text ...

Preprocessing text ...

Preprocessing title ...

Preprocessing title ...

Preprocessing summary ...

Preprocessing summary ...


In [12]:
# build dictionary
min_count_word = 4
word2num, num2word, n_u_words = build_dictionary(df, min_count_word)


Building dictionary ...
The number of unique words is: 41,112


In [13]:
# train / validation split
print("\nTrain / Validation split ...")

X, y = df[df.columns[~df.columns.str.contains(flag)]].values, df[flag].values
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=42)

print("X_train:", X_train.shape)
print("X_valid:", X_valid.shape)
print("y_train:", y_train.shape)
print("y_valid:", y_valid.shape)


Train / Validation split ...
X_train: (11709, 6)
X_valid: (2928, 6)
y_train: (11709,)
y_valid: (2928,)


In [14]:
columns = df.columns[~df.columns.str.contains(flag)].values

In [15]:
df_train = pd.DataFrame(X_train, columns=columns); df_train[flag] = y_train
df_valid = pd.DataFrame(X_valid, columns=columns); df_valid[flag] = y_valid

In [16]:
print("Train:", df_train.shape)
print("Valid:", df_valid.shape)

Train: (11709, 7)
Valid: (2928, 7)


In [19]:
def model_CNNLSTM(X_train, y_train, X_valid, y_valid, params):
    
    early_stopping = EarlyStopping(monitor='loss', patience=2)

    model = Sequential()
    
    model.add(Embedding(len(word2num), params['embedding_size'])) # , batch_size=batch_size
    model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid')) # sigmoid
    
    model.compile(loss=params['loss_func'], optimizer=params['optimizer'], metrics=params['metrics'])
    model.summary()

    batch_size = params['batch_size']
    model.fit(X_train, y_train, batch_size=batch_size, epochs=params['epochs'], callbacks=[early_stopping])
    pred_train = model.predict(X_train)
    pred_valid = model.predict(X_valid)
    
    print("\nAUC (train): {0:.2f}%".format(100*metrics.roc_auc_score(y_train, pred_train)))
    print("\nAUC (valid): {0:.2f}%".format(100*metrics.roc_auc_score(y_valid, pred_valid)))

In [21]:
from sklearn.model_selection import StratifiedKFold

models = ['CNN', 'LSTM', 'BiLSTM', 'CNNLSTM']

kf = StratifiedKFold(n_splits=5, shuffle=False, random_state=0)

word_threshold = 15

params = {
    'loss_func': 'binary_crossentropy', # binary_crossentropy
    'optimizer': 'rmsprop', # adam, rmsprop
    'metrics': ['accuracy'],
    'embedding_size': 100,
    'batch_size': 128,
    'epochs': 3
}

X_train = np.array(word2int(df_train, n_u_words, 'title', word_threshold))
X_valid = np.array(word2int(df_valid, n_u_words, 'title', word_threshold))

for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
    print("")
    print("-"*80)
    print("Fold %d" % fold_counter)
    print("-"*80)
    X_tr = X_train[tr_index]; y_tr = y_train[tr_index]
    X_te = X_train[te_index]; y_te = y_train[te_index]
    
    model_CNNLSTM(X_tr, y_tr, X_te, y_te, params)


Converting words to integers and prepadding ...
The number of texts greater than 15 in length is:  1,957
The number of texts less than 50 in length is:  11,709

Converting words to integers and prepadding ...
The number of texts greater than 15 in length is:  466
The number of texts less than 50 in length is:  2,928

--------------------------------------------------------------------------------
Fold 0
--------------------------------------------------------------------------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         4111400   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 128)         0         
_________________________________________________

Epoch 1/3
Epoch 2/3
Epoch 3/3

AUC (train): 98.82%

AUC (valid): 91.74%


# Modelling

In [15]:
def fit_evaluate_model(X_train, X_valid, y_train, y_valid, params):
    """Fit and evaluate Many to One RNN
    
    Args:
        X_train    : Array with train features.
        X_valid    : Array with validation features.
        y_train    : Array with train flag.
        y_valid    : Array with validation flag.
        params     : Dictionary with parameter configuration.
        
    Returns:
        model      : Model already trained.
        pred_train : Array with train predictions.
        pred_valid : Array with validation predictions.
    
    """

    print("\nCreating Sequential RNN: Many to One..." )
    
    early_stopping = EarlyStopping(monitor='loss', patience=2)

    model = Sequential()
    
    model.add(Embedding(len(word2num), params['embedding_size'])) # , batch_size=batch_size
    model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    #model.add(Dropout(0.2))
    model.add(LSTM(100))
    #model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid')) # sigmoid
    
    model.compile(loss=params['loss_func'], optimizer=params['optimizer'], metrics=params['metrics'])
    model.summary()

    batch_size = params['batch_size']
    print("\nFitting the model ..." )
    model.fit(X_train, y_train, batch_size=batch_size, epochs=params['epochs'], callbacks=[early_stopping])
    
    print("\nPredicting probs on train ..." )
    pred_train = model.predict(X_train)
    print("\nAUC: {0:.2f}%".format(100*metrics.roc_auc_score(y_train, pred_train)), "| GINI: {0:.2f}%".format(ip(y_train, pred_train)))

    print("\nEvaluating in valid ..." )
    print(model.evaluate(X_valid, y_valid, batch_size=batch_size))
    
    print("\nPredicting probs on valid ..." )
    pred_valid = model.predict(X_valid)
    print("\nAUC: {0:.2f}%".format(100*metrics.roc_auc_score(y_valid, pred_valid)), "| GINI: {0:.2f}%".format(ip(y_valid, pred_valid)))

    return model, pred_train, pred_valid

## Text

In [16]:
word_threshold = 500

params = {
    'loss_func': 'binary_crossentropy', # binary_crossentropy
    'optimizer': 'rmsprop', # adam, rmsprop
    'metrics': ['accuracy'],
    'embedding_size': 100,
    'batch_size': 128,
    'epochs': 3
}

# word to integers
print("\nTrain")
X_train = np.array(word2int(df_train, n_u_words, 'text', word_threshold))
print("\nValid")
X_valid = np.array(word2int(df_valid, n_u_words, 'text', word_threshold))


Train

Converting words to integers and prepadding ...
The number of texts greater than 500 in length is:  5,372
The number of texts less than 50 in length is:  130

Valid

Converting words to integers and prepadding ...
The number of texts greater than 500 in length is:  1,336
The number of texts less than 50 in length is:  33


In [17]:
model_text, pred_train, pred_valid = fit_evaluate_model(X_train, X_valid, y_train, y_valid, params)

print("\nTest results ..." )
pred_test = predict_test(model_text, df_test, 'text')


Creating Sequential RNN: Many to One...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         4111400   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dense_1 (Dense)              (None,

In [18]:
df_train['pred_text'] = pred_train
df_valid['pred_text'] = pred_valid
df_test['pred_text'] = pred_test

## Title

In [19]:
word_threshold = 15

params = {
    'loss_func': 'binary_crossentropy', # binary_crossentropy
    'optimizer': 'rmsprop', # adam, rmsprop
    'metrics': ['accuracy'],
    'embedding_size': 100,
    'batch_size': 128,
    'epochs': 3
}

# word to integer
X_train = np.array(word2int(df_train, n_u_words, 'title', word_threshold))
X_valid = np.array(word2int(df_valid, n_u_words, 'title', word_threshold))


Converting words to integers and prepadding ...
The number of texts greater than 15 in length is:  1,957
The number of texts less than 50 in length is:  11,709

Converting words to integers and prepadding ...
The number of texts greater than 15 in length is:  466
The number of texts less than 50 in length is:  2,928


In [20]:
model_title, pred_train, pred_valid = fit_evaluate_model(X_train, X_valid, y_train, y_valid, params)

print("\nTest results ..." )
pred_test = predict_test(model_title, df_test, 'title')


Creating Sequential RNN: Many to One...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         4111400   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dense_2 (Dense)              (None,

In [21]:
df_train['pred_title'] = pred_train
df_valid['pred_title'] = pred_valid
df_test['pred_title'] = pred_test

## Summary

In [22]:
word_threshold = 250

params = {
    'loss_func': 'binary_crossentropy', # binary_crossentropy
    'optimizer': 'rmsprop', # adam, rmsprop
    'metrics': ['accuracy'],
    'embedding_size': 100,
    'batch_size': 128,
    'epochs': 3
}

# word to integer
X_train = np.array(word2int(df_train, n_u_words, 'summary', word_threshold))
X_valid = np.array(word2int(df_valid, n_u_words, 'summary', word_threshold))


Converting words to integers and prepadding ...
The number of texts greater than 250 in length is:  39
The number of texts less than 50 in length is:  160

Converting words to integers and prepadding ...
The number of texts greater than 250 in length is:  8
The number of texts less than 50 in length is:  35


In [23]:
model_summary, pred_train, pred_valid = fit_evaluate_model(X_train, X_valid, y_train, y_valid, params)

print("\nTest results ..." )
pred_test = predict_test(model_summary, df_test, 'summary')


Creating Sequential RNN: Many to One...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         4111400   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dense_3 (Dense)              (None,

In [24]:
df_train['pred_summary'] = pred_train
df_valid['pred_summary'] = pred_valid
df_test['pred_summary'] = pred_test

# Stacking

In [25]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

params = {
        'min_child_weight': [1, 3, 5],
        'gamma': [0.5, 1, 1.5, 2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.01, 0.005]
        }

xgb = XGBClassifier(learning_rate=0.001, n_estimators=10000,
                    objective='binary:logistic', silent=True)
folds = 4
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

predictors = ['pred_text', 'pred_title', 'pred_summary']
X_train = df_train[predictors].values
y_train = df_train[flag].values.flatten()

print("Randomized search...")
random_search = RandomizedSearchCV(xgb,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   scoring='roc_auc',
                                   n_jobs=-1,
                                   cv=skf.split(X_train, y_train),
                                   verbose=1,  # 2
                                   random_state=1001 )
random_search.fit(X_train, y_train)

Randomized search...
Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.9min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x11d5a0f68>,
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.001, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=10000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=5, n_jobs=-1,
          param_distributions={'min_child_weight': [1, 3, 5], 'gamma': [0.5, 1, 1.5, 2], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5], 'learning_rate': [0.1, 0.01, 0.005]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=1)

In [27]:
X_test = df_test[predictors].values

In [31]:
pred_train = random_search.predict_proba(X_train)[:,1]
pred_test = random_search.predict_proba(X_test)[:,1]

In [32]:
print( "AUC Score (train): %f" % metrics.roc_auc_score(y_train, pred_train))

AUC Score (train): 0.994422


In [34]:
import pickle

pickle.dump(random_search.best_estimator_, open('xgboost_random_search.dat', "wb"))

In [35]:
random_search2 = pickle.load(open('xgboost_random_search.dat', "rb"))

In [36]:
pred_train2 = random_search2.predict_proba(X_train)[:,1]

In [37]:
print( "AUC Score (train): %f" % metrics.roc_auc_score(y_train, pred_train2))

AUC Score (train): 0.994422


In [87]:
def fit_evaluate_xgboost(alg, 
             dtrain, 
             dtest, 
             predictors, 
             verbose=0, 
             useTrainCV=True, 
             cv_folds=5, 
             early_stopping_rounds=50, 
             flag='flag'):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[flag].values.flatten())
        cvresult = xgb.cv(
            xgb_param, 
            xgtrain, 
            num_boost_round=alg.get_params()['n_estimators'], 
            nfold=cv_folds,
            metrics='auc', 
            early_stopping_rounds=early_stopping_rounds, 
            verbose_eval=verbose)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(alg.get_params())
    
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[flag].values.flatten(),eval_metric='auc')
        
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    # Print model report:
    print( "\nModel Report (Train)")
    print( "Accuracy : %.4g" % metrics.accuracy_score(dtrain[flag].values, dtrain_predictions))
    print( "AUC Score: %f" % metrics.roc_auc_score(dtrain[flag].values, dtrain_predprob))
    
    # Predict validation set:
    dtest_predprob = alg.predict_proba(dtest[predictors])[:,1]

    # Print model report:
    print( "\nModel Report (Test)")
    print( "AUC Score: %f" % metrics.roc_auc_score(dtest[flag].values, dtest_predprob))
    
    return alg

In [92]:
predictors = ['pred_text', 'pred_title', 'pred_summary']

# xgb sparse matrix
xgtrain = xgb.DMatrix(X_train, label= y_train)
xgvalid = xgb.DMatrix(X_valid, label= y_valid)

model_xgb = XGBClassifier(
 booster = 'gbtree',
 learning_rate =0.01,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 reg_alpha=1,
 scale_pos_weight=1,
 seed=27)

model_xgb = fit_evaluate_xgboost(model_xgb, df_train, df_valid, predictors)

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 1, 'missing': None, 'n_estimators': 187, 'n_jobs': 1, 'nthread': 4, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': 27, 'silent': True, 'subsample': 0.8}

Model Report (Train)
Accuracy : 0.9675
AUC Score: 0.993087

Model Report (Test)
AUC Score: 0.939585


# Saving the models

In [111]:
def save_model(model, model_name):
    print("\nSaving", model_name,"...")
    model.save("./models/%s.h5" % model_name)
    
save_model(model_text, "model_text")
save_model(model_title, "model_title")
save_model(model_summary, "model_summary")


Saving model_text ...

Saving model_title ...

Saving model_summary ...


In [94]:
# save model to file
pickle.dump(model_xgb, open("./models/model_xgb.pickle.dat", "wb"))

___

In [104]:
from keras.models import load_model

model_summary.save('./models/model_summary.h5')
model = load_model('./models/model_summary.h5')

In [110]:
metrics.roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 0])

0.8982512372064424

In [112]:
word_threshold = 500
X_train = np.array(word2int(df_train, n_u_words, 'text', word_threshold))
X_valid = np.array(word2int(df_valid, n_u_words, 'text', word_threshold))


Converting words to integers and prepadding ...
The number of texts greater than 500 in length is:  5,372
The number of texts less than 50 in length is:  130

Converting words to integers and prepadding ...
The number of texts greater than 500 in length is:  1,336
The number of texts less than 50 in length is:  33


In [113]:
X_train.shape

(11709, 500)

In [114]:
valid_instance = X_valid[1]

In [115]:
np.transpose(X_train).shape

(500, 11709)

In [116]:
df_train_txt = pd.DataFrame(X_train)
df_valid_txt = pd.DataFrame(X_valid)

In [119]:
import lime
import lime.lime_tabular

# create the lime explainer
explainer = lime.lime_tabular.LimeTabularExplainer(df_train_txt.as_matrix(), feature_names=df_train_txt.columns) # X_train.values, , class_names=(0,1)

predict_fn = lambda x: model_text.predict(x)



In [120]:
exp = explainer.explain_instance(np.reshape(df_valid_txt.loc[1], 500), predict_fn, num_features=500)

  return getattr(obj, method)(*args, **kwds)
                    Prediction probabilties do not sum to 1, and
                    thus does not constitute a probability space.
                    Check that you classifier outputs probabilities
                    (Not log probabilities, or actual class predictions).
                    
  """)


IndexError: index 1 is out of bounds for axis 1 with size 1

In [None]:
# Lime
# ------------------------------------------------------------------------------
print('\nUsing Lime to explain instances...')
import lime
import lime.lime_tabular
import re

# create the lime explainer
explainer = lime.lime_tabular.LimeTabularExplainer(df_train[features].as_matrix(), feature_names=features) # X_train.values, , class_names=(0,1)

def lime_explain_instance(id):

    test_instance_tot = test.loc[test[col_id]==id].head(1)
    test_instance = test_instance_tot[features]
    test_instance = test_instance.clip(-10000000.0, 10000000.0) # convert int to float instead?
    test_instance = test_instance.values[0]

    # prediction function: for classifiers, this should be function that takes a numpy array and outputs probability predictions
    predict_fn_xgb = lambda x: clf.predict_proba(x).astype(float)

    exp = explainer.explain_instance(test_instance, predict_fn_xgb, num_features=200) # test_instance.values
    print('Document id     : %d' % (id))
    print('Probability (=1):', clf.predict_proba([test_instance])[0,1])
    print('True class      : %s' % test_instance_tot[col_target].values[0])

    ll = []
    for i in range(1, len(exp.as_list()), 1):
        id_var = exp.as_map()[1][i][0]
        var = features[id_var]
        value = test_instance[id_var]
        crit = exp.as_list()[i][0]
        w = exp.as_list()[i][1]
        dd = {
            "variable": var,
            "value": value,
            "explanation": w,
            "criteria": crit
        }
        ll.append(dd)

    explainer_df = pd.DataFrame(ll)
    explainer_df = explainer_df.sort_values('explanation', ascending=False)
    explainer_df.head(10)
    explainer_df.tail(10)

    pyplot.bar(range(len(explainer_df)), explainer_df['explanation'].values)
    ind = np.arange(len(explainer_df['variable'].values))    # the x locations for the groups
    pyplot.xticks(ind, explainer_df['variable'].values, rotation='vertical')
    # pyplot.savefig('3_gbm_raw_feature_importance.png', bbox_inches='tight')
    pyplot.show()

    return explainer_df

# check top 15 of largest estimated probabilities
test[['id', 'TARGET', 'predprob']].sort_values('predprob', ascending=False).head(15)

"""
2016030520890380
2014120519399710
2015120012335320
2015060519288510
2015090014583910
2014120013445730
"""
explainer_df = lime_explain_instance(2016030520890380)

explainer_df.head(10)
explainer_df.tail(10)

# Output

In [None]:
df_submission = df_test[['id', 'pred']]

In [None]:
df_submission.head()

In [None]:
submission_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

In [None]:
print submission_time
df_submission.to_csv('../submissions/submission_%s.csv' % submission_time, sep=",", na_rep="", mode="w", index=False, encoding='utf-8')