In [28]:
import pandas as pd
import numpy as np
import eli5
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics

from IPython.display import display

In [29]:
eli5.__version__

'0.6.3'

In [30]:
# Load data with Pandas
data = pd.read_csv('data/train.csv')

In [31]:
# Columns we do not want to use as features in this example
col_to_del = ['Embarked', 'Cabin', 'PassengerId']

# Rebind `data` instead of dropping in place, and ignore columns that are
# already gone, so that re-running this cell does not raise a KeyError.
data = data.drop(col_to_del, axis=1, errors='ignore')

In [32]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05


In [33]:
data.shape

(891, 9)

## Simple model (no pipeline)

In [112]:
def builder_no_text(data):
    """
    Build the feature matrix without the text columns, plus the target.

    The input DataFrame is NOT modified: every step produces a new frame,
    so a defensive ``data.copy()`` at the call site is no longer required
    (passing a copy still works).

    params :
     - data - [DataFrame] Must contain the columns 'Survived', 'Name',
              'Ticket', 'Pclass', 'Sex' and 'Age'. Any other column is
              kept unchanged.
    return :
     - features - [DataFrame] 'Pclass' and 'Sex' replaced by their dummy
                  columns ('split_Pclass_*', 'split_Sex_*'), followed by
                  a 0/1 'child' column (1 when Age <= 10)
     - target - [Series] The 'Survived' column (0/1)
    raise :
     - pandas raises (typically KeyError) if a required column is missing
    """

    # target
    target = data['Survived']

    # Drop the target and the text features (non-mutating drop)
    features = data.drop(['Survived', 'Name', 'Ticket'], axis=1)

    # One-hot encode each categorical column, then remove the raw column.
    # Dummies are appended in this order: Pclass first, then Sex.
    for col in ('Pclass', 'Sex'):
        dummies = pd.get_dummies(features[col], prefix='split_' + col)
        features = features.join(dummies).drop(col, axis=1)

    # Child flag: a missing Age compares as False, so it yields 0
    features['child'] = (features['Age'] <= 10).astype('int64')

    return features, target

In [113]:
X, y = builder_no_text(data.copy())

In [117]:
# We split our X in 2 DataFrame (X_train, X_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2017)

In [118]:
X_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,split_Pclass_1,split_Pclass_2,split_Pclass_3,split_Sex_female,split_Sex_male,child
57,28.5,0,0,7.2292,0,0,1,0,1,0
32,,0,0,7.75,0,0,1,1,0,0
662,47.0,0,0,25.5875,1,0,0,0,1,0
555,62.0,0,0,26.55,1,0,0,0,1,0
196,,0,0,7.75,0,0,1,0,1,0


In [119]:
xgb_model_1 = XGBClassifier()

In [120]:
xgb_model_1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [121]:
pred = xgb_model_1.predict(X_test)

In [122]:
metrics.accuracy_score(y_test, pred)

0.84304932735426008

### Using Eli5 in simple xgb model 

In [184]:
eli5.show_weights(xgb_model_1)

Weight,Feature
0.5095,split_Sex_female
0.2149,split_Pclass_3
0.0592,split_Pclass_1
0.0558,SibSp
0.0468,Fare
0.0419,split_Pclass_2
0.0395,Age
0.0325,Parch


In [130]:
# Explain the prediction for one passenger (row with index label 232).
# Feature names have to be passed explicitly. They are taken from the
# DataFrame columns, which are the names XGBoost stores for a model fitted
# on a DataFrame. This avoids `booster()`, which newer XGBoost releases
# renamed to `get_booster()`.
eli5.show_prediction(xgb_model_1,
                     X_test[X_test.index == 232],
                     feature_names=X_test.columns.tolist(),
                     show_feature_values=True)

Contribution?,Feature,Value
0.123,SibSp,0.0
0.008,Parch,0.0
-0.026,split_Pclass_1,0.0
-0.229,Fare,13.5
-0.308,split_Pclass_2,1.0
-0.481,<BIAS>,1.0
-0.814,Age,59.0
-1.004,split_Sex_female,0.0


If you want to export this explanation as JSON (or another serialisable format):

In [131]:
# Same explanation as a plain dict (e.g. to serialise it as JSON).
# Feature names come from the DataFrame columns rather than from
# `booster()`, which newer XGBoost releases renamed to `get_booster()`.
eli5.format_as_dict(eli5.explain_prediction(xgb_model_1,
                                            X_test[X_test.index == 232],
                                            feature_names=X_test.columns.tolist())
                   )

{'decision_tree': None,
 'description': '\nFeatures with largest coefficients.\n\nFeature weights are calculated by following decision paths in trees\nof an ensemble. Each leaf has an output score, and expected scores can also be\nassigned to parent nodes. Contribution of one feature on the decision path\nis how much expected score changes from parent to child. Weights of all \nfeatures sum to the output score of the estimator.\n\nCaveats:\n1. Feature weights just show if the feature contributed positively or\n   negatively to the final score, and does show how increasing or decreasing\n   the feature value will change the prediction.\n2. In some cases, feature weight can be close to zero for an important feature.\n   For example, in a single tree that computes XOR function, the feature at the\n   top of the tree will have zero weight because expected scores for both\n   branches are equal, so decision at the top feature does not change the\n   expected score. For an ensemble predictin

## Using pipeline 

In [229]:
def builder_with_text(data):
    """
    Build the feature matrix (text columns kept) and the target.

    The input DataFrame is NOT modified: every step produces a new frame,
    so a defensive ``data.copy()`` at the call site is no longer required
    (passing a copy still works).

    params :
     - data - [DataFrame] Must contain the columns 'Survived', 'Pclass',
              'Sex' and 'Age'. Any other column (including the text
              columns) is kept unchanged.
    return :
     - features - [DataFrame] 'Pclass' and 'Sex' replaced by their dummy
                  columns ('split_Pclass_*', 'split_Sex_*'), followed by
                  a 0/1 'child' column (1 when Age <= 10)
     - target - [Series] The 'Survived' column (0/1)
    raise :
     - pandas raises (typically KeyError) if a required column is missing
    """

    # target
    target = data['Survived']

    # Remove only the target; text features are left for the pipeline
    features = data.drop('Survived', axis=1)

    # One-hot encode each categorical column, then remove the raw column.
    # Dummies are appended in this order: Pclass first, then Sex.
    for col in ('Pclass', 'Sex'):
        dummies = pd.get_dummies(features[col], prefix='split_' + col)
        features = features.join(dummies).drop(col, axis=1)

    # Child flag: a missing Age compares as False, so it yields 0
    features['child'] = (features['Age'] <= 10).astype('int64')

    return features, target

In [230]:
X, y = builder_with_text(data.copy())

In [231]:
# We split our X in 2 DataFrame (X_train, X_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2017)

In [232]:
X_train.head()

Unnamed: 0,Name,Age,SibSp,Parch,Ticket,Fare,split_Pclass_1,split_Pclass_2,split_Pclass_3,split_Sex_female,split_Sex_male,child
57,"Novel, Mr. Mansouer",28.5,0,0,2697,7.2292,0,0,1,0,1,0
32,"Glynn, Miss. Mary Agatha",,0,0,335677,7.75,0,0,1,1,0,0
662,"Colley, Mr. Edward Pomeroy",47.0,0,0,5727,25.5875,1,0,0,0,1,0
555,"Wright, Mr. George",62.0,0,0,113807,26.55,1,0,0,0,1,0
196,"Mernagh, Mr. Robert",,0,0,368703,7.75,0,0,1,0,1,0


In [233]:
# Text columns handled by the NLP branches of the pipeline.
# ('Ticket' is a bit overkill here; it is only included for the example.)
col_nlp = ['Name', 'Ticket']

# Passenger name: counts of character 3- and 4-grams built inside word
# boundaries, vocabulary limited to 50 entries.
count_vec_name = CountVectorizer(
    max_features=50,
    ngram_range=(3, 4),
    analyzer='char_wb',
)

# Ticket: counts of character 1- to 3-grams built inside word boundaries,
# vocabulary limited to 10 entries.
count_vec_ticket = CountVectorizer(
    max_features=10,
    ngram_range=(1, 3),
    analyzer='char_wb',
)



class cust_regression_vals(BaseEstimator, TransformerMixin):
    """
    Keep the non-text columns of a DataFrame and return them as an array.

    params :
     - cols_to_drop - [list or None] Text columns to exclude. When None
                      (the default, same behaviour as before) the
                      notebook-level ``col_nlp`` list is used.
    """
    def __init__(self, cols_to_drop=None):
        # Stored unchanged under the parameter's own name, as required by
        # sklearn's get_params / clone machinery.
        self.cols_to_drop = cols_to_drop

    def fit(self, x, y=None):
        # Nothing to learn: the transformer is stateless
        return self

    def transform(self, x):
        # The fallback is resolved at call time, so the global `col_nlp`
        # is only read when no explicit column list was given.
        cols = col_nlp if self.cols_to_drop is None else self.cols_to_drop
        return x.drop(cols, axis=1).values
        

class cust_txt_col(BaseEstimator, TransformerMixin):
    """
    Selector step: extracts the entry stored under ``key`` from the incoming
    data, so that a text vectorizer can be chained right after it.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Nothing to learn: fitting simply returns the transformer itself
        return self

    def transform(self, data_dict):
        selected = data_dict[self.key]
        return selected

In [234]:
xgb_model_2 = XGBClassifier()

In [235]:
## My pipeline: three feature branches merged side by side, then the model
clf = Pipeline(steps=[
    ('union', FeatureUnion([
        # Non-text columns, passed through as an array
        ('cst', cust_regression_vals()),
        # 'Name' column -> character n-gram counts
        ('name', Pipeline(steps=[
            ('Name', cust_txt_col(key='Name')),  # Selector
            ('count_vec_name', count_vec_name),
        ])),
        # 'Ticket' column -> character n-gram counts
        ('ticket', Pipeline(steps=[
            ('Ticket', cust_txt_col(key='Ticket')),  # Selector
            ('count_vec_ticket', count_vec_ticket),
        ])),
    ])),
    ('algo', xgb_model_2),
])

In [236]:
clf.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('cst', cust_regression_vals()), ('name', Pipeline(steps=[('Name', cust_txt_col(key='Name')), ('count_vec_name', CountVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'u...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [237]:
pred = clf.predict(X_test)

In [238]:
metrics.accuracy_score(y_test, pred)

0.82959641255605376

### Using Eli5 with pipeline

In [239]:
## My goal now is to get all my feature names (no get_feature_names() yet),
## in the same column order as the matrix produced by the 'union' step.

# 1) Non-text columns, in the order kept by cust_regression_vals
features = [col for col in X_train.columns.tolist() if col not in col_nlp]

# 2) Vocabulary of the text branches, read from the vectorizers fitted inside
#    the pipeline. Re-fitting them here would be redundant, and the names
#    must describe exactly what the model was trained on.
fitted_branches = dict(clf.named_steps['union'].transformer_list)

features.extend(
    fitted_branches['name'].named_steps['count_vec_name'].get_feature_names())

features.extend(
    fitted_branches['ticket'].named_steps['count_vec_ticket'].get_feature_names())

In [240]:
features[0:5]

['Age', 'SibSp', 'Parch', 'Fare', 'split_Pclass_1']

In [241]:
# xgb Feature names 
clf.named_steps['algo'].booster().feature_names[0:5]

['f0', 'f1', 'f2', 'f3', 'f4']

In [242]:
len(X_train.columns)

12

In [245]:
len(features)
# X_train : 12 features
# We Remove our 2 col_nlp feature --> 10 Features
# Add 50 features with count_vec_name --> 60
# add 10 features with count_vec_ticket --> 70

70

In [246]:
# Error : https://github.com/TeamHG-Memex/eli5/issues/213#issuecomment-308677391
#eli5.show_prediction(clf.named_steps['algo'], 
#                     X_test[X_test.index == 666], 
#                     feature_names=features,
#                     target_names=clf.named_steps['algo'].classes_,
#                     vec=clf.named_steps['union'])

In [247]:
eli5.show_weights(clf.named_steps['algo'], 
                  feature_names=features,
                 top=15)

Weight,Feature
0.3903,mr.
0.0916,split_Pclass_3
0.0622,Fare
0.0446,"e,"
0.0376,split_Sex_female
0.0375,2
0.0331,son
0.0320,4
0.0297,1
0.0296,3


In [248]:
# You have to pass an already vectorized document (so transform data by our pipeline process)

eli5.show_prediction(clf.named_steps['algo'], 
                     clf.named_steps['union'].transform(X_test[X_test.index == 232]),
                     feature_names=features)

Contribution?,Feature
0.01,3
0.008,SibSp
-0.023,1
-0.025,Age
-0.028,0
-0.031,
-0.036,7
-0.072,split_Sex_female
-0.159,Fare
-0.475,<BIAS>


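It is pretty hard to tell where features such as "1" or "0" come from: they are character n-grams produced by the Ticket vectorizer, but nothing in their names says so. The feature names need to be improved.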

JSON output:

In [266]:
eli5.format_as_dict(eli5.explain_prediction(clf.named_steps['algo'],
                                            clf.named_steps['union'].transform(X_test[X_test.index == 232]),
                                            feature_names=features)
                   )

{'decision_tree': None,
 'description': '\nFeatures with largest coefficients.\n\nFeature weights are calculated by following decision paths in trees\nof an ensemble. Each leaf has an output score, and expected scores can also be\nassigned to parent nodes. Contribution of one feature on the decision path\nis how much expected score changes from parent to child. Weights of all \nfeatures sum to the output score of the estimator.\n\nCaveats:\n1. Feature weights just show if the feature contributed positively or\n   negatively to the final score, and does show how increasing or decreasing\n   the feature value will change the prediction.\n2. In some cases, feature weight can be close to zero for an important feature.\n   For example, in a single tree that computes XOR function, the feature at the\n   top of the tree will have zero weight because expected scores for both\n   branches are equal, so decision at the top feature does not change the\n   expected score. For an ensemble predictin

### Improving the readability of the Eli5 output

In [267]:
# Non-text columns, in the order kept by cust_regression_vals
features_2 = [col for col in X_train.columns.tolist() if col not in col_nlp]

# Read the vocabularies from the vectorizers fitted inside the pipeline
# (no re-fit), and prefix every n-gram with the column it comes from.
fitted_branches = dict(clf.named_steps['union'].transformer_list)

name_vocab = fitted_branches['name'].named_steps['count_vec_name'].get_feature_names()
features_2.extend(['name_' + token for token in name_vocab])

ticket_vocab = fitted_branches['ticket'].named_steps['count_vec_ticket'].get_feature_names()
features_2.extend(['ticket_' + token for token in ticket_vocab])

In [268]:
features_2[8:15]

['split_Sex_male',
 'child',
 u'name_ al',
 u'name_ jo',
 u'name_ joh',
 u'name_ ma',
 u'name_ mar']

In [265]:
eli5.show_prediction(clf.named_steps['algo'], 
                     clf.named_steps['union'].transform(X_test[X_test.index == 232]),
                     feature_names=features_2)

Contribution?,Feature
0.01,ticket_3
0.008,SibSp
-0.023,ticket_1
-0.025,Age
-0.028,ticket_0
-0.031,ticket_
-0.036,ticket_7
-0.072,split_Sex_female
-0.159,Fare
-0.475,<BIAS>


This is easier to understand: the `name_` / `ticket_` prefixes show which text column each n-gram feature comes from.