# eli5 text highlighting with a custom vectorizer

This notebook shows how to produce highlighted text explanations with the eli5 library when using a custom vectorizer. This is necessary, for example, when the vectorizer lemmatizes Finnish text with libvoikko.

In [None]:
import re
import eli5
import numpy as np
import pandas as pd
from eli5.base import DocWeightedSpans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder
from sklearn.svm import LinearSVC
from voikko import libvoikko

## VoikkoVectorizer

The following cell implements a custom scikit-learn vectorizer that 1) uses libvoikko for lemmatization and 2) implements the eli5 interface required for text highlighting.

In [None]:
class VoikkoVectorizer(TfidfVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features.

    Based on the scikit-learn's TfidfVectorizer, with two additions:
    tokens can be lemmatized with libvoikko, and the class provides the
    ``get_doc_weighted_spans`` method that eli5 uses to highlight features
    in the text of a document.

    Parameters
    ----------
    voikko : default = None
        An instance of libvoikko.Voikko object. If given, the words
        will be lemmatized using libvoikko.

    stop_words : default = None
        Stored exactly as given. A collection of words is run through the
        same tokenizer (and therefore the same lemmatization) as the
        documents when the effective stop list is built, see
        ``get_stop_words``. ``None`` and string values are handled by
        TfidfVectorizer.

    Other parameters are the same as in TfidfVectorizer (except for
    tokenizer and analyzer which this class overrides).
    """
    def __init__(self, *, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, stop_words=None,
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False, voikko=None):
        self.voikko = voikko

        # stop_words is handed to the base class untouched: scikit-learn
        # expects __init__ to store its arguments without modifying them.
        # The lemmatized stop list is derived lazily in get_stop_words().
        super().__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=self._simple_tokenizer,
            analyzer='word', stop_words=stop_words, token_pattern=None,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf)

    def get_stop_words(self):
        """Build the effective stop word list in the same form as the tokens.

        Returns
        -------
        frozenset or None
            For a collection of stop words: the tokens obtained by running
            the words through ``_simple_tokenizer`` (so they are lemmatized
            and lower-cased like document tokens). For ``None`` or a string
            value the result of ``TfidfVectorizer.get_stop_words()``.
        """
        if self.stop_words is None or isinstance(self.stop_words, str):
            # Joining a string would split it into single characters, so
            # string values are left for the base class to interpret.
            return super().get_stop_words()

        return frozenset(self._simple_tokenizer(' '.join(self.stop_words)))

    def get_doc_weighted_spans(self, doc, feature_weights, feature_fn):
        """This function implements eli5's interface required for highlighted text.

        Adapted from eli5.sklearn.text.

        Parameters
        ----------
        doc : the raw document to explain; it is decoded and preprocessed
            with this vectorizer's own decoder and preprocessor.
        feature_weights : object with ``pos`` and ``neg`` sequences of
            weighted features (see ``_get_feature_weights_dict``).
        feature_fn : optional callable applied to feature names, or None.

        Returns
        -------
        (found_features, DocWeightedSpans)
            ``found_features`` maps the ``(group, idx)`` key of every
            feature found in the document to its weight; the
            ``DocWeightedSpans`` carries the preprocessed text and one
            ``(feature, spans, weight)`` entry per occurrence.
        """
        preprocessed_doc = self.build_preprocessor()(self.decode(doc))
        feature_weights_dict = _get_feature_weights_dict(feature_weights, feature_fn)

        spans = []
        found_features = {}
        for f_spans, feature in self._span_analyzer(preprocessed_doc):
            if feature not in feature_weights_dict:
                continue

            weight, key = feature_weights_dict[feature]
            spans.append((feature, f_spans, weight))
            found_features[key] = weight

        return found_features, DocWeightedSpans(
            document=preprocessed_doc,
            spans=spans,
            preserve_density=self.analyzer.startswith('char'),
        )

    def _span_tokenizer(self, doc):
        """Tokenize ``doc`` and return ``[((start, end), token), ...]``.

        A token is a run of at least two word characters. When a voikko
        instance is set and it returns any analysis for the token, the
        BASEFORM of the first analysis replaces the surface form. Tokens
        are lower-cased; the span always refers to the surface form in
        ``doc``.
        """
        tokens = []
        for m in re.finditer(r'\b\w\w+\b', doc):
            token = m.group()
            if self.voikko is not None:
                analyzed = self.voikko.analyze(token)
                if analyzed:
                    token = analyzed[0].get('BASEFORM', token)

            tokens.append((m.span(), token.lower()))

        return tokens

    def _simple_tokenizer(self, doc):
        """Return only the token strings of ``_span_tokenizer(doc)``."""
        return [token for _, token in self._span_tokenizer(doc)]

    def _span_analyzer(self, doc):
        """Return ``[(spans, feature), ...]`` for an already preprocessed doc."""
        assert self.analyzer == 'word'

        tokens = self._span_tokenizer(doc)
        return self._span_word_ngrams(tokens)

    def _span_word_ngrams(self, tokens):
        """Turn ``(span, token)`` pairs into ``(spans, n-gram)`` pairs.

        Stop words are removed first, then n-grams for every n in
        ``ngram_range`` are produced in increasing order of n. Each n-gram
        is paired with the list of spans of the tokens it is made of.
        """
        # Use the effective (lemmatized) stop list so that highlighting
        # filters exactly the tokens that the analyzer filters.
        stop_words = self.get_stop_words()
        if stop_words is not None:
            tokens = [(s, w) for s, w in tokens if w not in stop_words]

        min_n, max_n = self.ngram_range
        if max_n == 1:
            return [([span], word) for span, word in tokens]

        n_tokens = len(tokens)
        ngrams = []
        for n in range(min_n, min(max_n + 1, n_tokens + 1)):
            for start in range(n_tokens - n + 1):
                window = tokens[start:start + n]
                ngrams.append((
                    [span for span, _ in window],
                    ' '.join(word for _, word in window),
                ))

        return ngrams

def _get_feature_weights_dict(feature_weights,  # type: FeatureWeights
                              feature_fn        # type: Optional[Callable[[str], str]]
                              ):
    # type: (...) -> Dict[str, Tuple[float, Tuple[str, int]]]
    """ Return {feat_name: (weight, (group, idx))} mapping.
    
    Copied from eli5.sklearn.text.
    """
    return {
        # (group, idx) is an unique feature identifier, e.g. ('pos', 2)
        feat_name: (fw.weight, (group, idx))
        for group in ['pos', 'neg']
        for idx, fw in enumerate(getattr(feature_weights, group))
        for feat_name in _get_features(fw.feature, feature_fn)
    }

def _get_features(feature, feature_fn=None):
    """Copied from eli5.sklearn.text."""
    if isinstance(feature, list):
        features = [f['name'] for f in feature]
    else:
        features = [feature]
    if feature_fn:
        features = list(filter(None, map(feature_fn, features)))
    return features

## Loading data

In [None]:
# Class labels that load_documents() filters out of every data split.
# The name suggests these are the classes with few samples -- TODO confirm.
small_classes = [
    'ulkomaankauppa- ja kehitysministeri',
    'puolustusministeri',
    'pääministeri',
    'eurooppa-, kulttuuri- ja urheiluministeri'
]

# Three-letter abbreviation for each class label (including the ones listed
# in small_classes).
# NOTE(review): not referenced by the cells visible in this notebook.
short_names = {
    'perhe- ja peruspalveluministeri': 'per',
    'maatalous- ja ympäristöministeri': 'maa',
    'sisäministeri': 'sis',
    'oikeus- ja työministeri': 'oik',
    'opetus- ja kulttuuriministeri': 'ope',
    'valtiovarainministeri': 'val',
    'liikenne- ja viestintäministeri': 'lii',
    'sosiaali- ja terveysministeri': 'sos',
    'elinkeinoministeri': 'eli',
    'ulkoministeri': 'ulk',
    'kunta- ja uudistusministeri': 'kun',
    'eurooppa-, kulttuuri- ja urheiluministeri': 'eur',
    'pääministeri': 'pää',
    'puolustusministeri': 'puo',
    'ulkomaankauppa- ja kehitysministeri': 'uke',
}

def load_documents(filename):
    """Read one CSV split and drop the rows that belong to ``small_classes``.

    The first row of the file is used as the header and the 'ministry'
    column is renamed to 'class'. The index is reset after filtering;
    because ``reset_index()`` is called without ``drop``, the previous
    index is kept as an 'index' column.
    """
    documents = pd.read_csv(filename, header=0)
    documents = documents.rename(columns={'ministry': 'class'})

    in_small_class = documents['class'].isin(small_classes)
    return documents[~in_small_class].reset_index()

def load_data():
    """Load the train, dev and test splits, in that order, as a 3-tuple.

    Each split is read with ``load_documents`` from a path relative to the
    current working directory.
    """
    split_paths = (
        'data/vkk/train.csv.bz2',
        'data/vkk/dev.csv.bz2',
        'data/vkk/test.csv.bz2',
    )
    return tuple(load_documents(path) for path in split_paths)

In [None]:
train, dev, test = load_data()

# One summary line per fact; printed together so the cell has a single output.
summary_lines = [
    f'Number of classes: {len(train["class"].unique())}',
    f'Number of train samples: {len(train)}',
    f'Number of dev samples: {len(dev)}',
    f'Number of test samples: {len(test)}',
]
print('\n'.join(summary_lines))

## Training a classifier

In [None]:
# libvoikko handle created with the language code 'fi'; it is passed to
# VoikkoVectorizer in the next cell, which calls its analyze() method on
# every token.
voikko = libvoikko.Voikko('fi')

In [None]:
# Stop words to drop from the features. VoikkoVectorizer runs them through
# its own tokenizer, so inflected entries are matched in lemmatized form.
stop_words_fi = [
    'ei', 'että', 'he', 'hän', 'ja', 'joissa', 'joka', 'jos', 'koska', 'kuin',
    'kuka', 'kun', 'me', 'mikä', 'minä', 'myös', 'ne', 'nuo', 'nämä', 'olla',
    'se', 'sinä', 'tai', 'te', 'tuo', 'tämä', 'vai',
]

# Fixed seed so that re-running the notebook fits the same model.
# LinearSVC documents random_state as controlling the data shuffling of
# its dual solver, which is the solver used with the hinge loss.
RANDOM_STATE = 0

# The classifier is trained on integer-encoded labels; `enc` maps them back
# to class names in the cells below.
enc = LabelEncoder()
y_encoded = enc.fit_transform(train['class'])

# Unigrams and bigrams that occur in at least 2 documents and in at most
# 10 % of the documents.
vec = VoikkoVectorizer(voikko=voikko, 
                       ngram_range=(1, 2),
                       min_df=2, max_df=0.1,
                       stop_words=stop_words_fi)

clf = LinearSVC(C=0.1, loss='hinge', intercept_scaling=5.0,
                max_iter=100000, multi_class='ovr',
                random_state=RANDOM_STATE)
scaler = MaxAbsScaler()
pipe = make_pipeline(vec, scaler, clf)
# Trailing semicolon suppresses the repr of the fitted pipeline.
pipe.fit(train['sentence'], y_encoded);

Checking that the vectorizer lemmatizes Finnish words:

In [None]:
text = 'Ajoimme punaisella autolla aamulla.'

# Run the sentence through the vectorizer's own preprocessor and tokenizer.
# This shows the lemmas directly and, unlike a transform/inverse_transform
# round trip, does not raise IndexError for words that are stop words or
# were pruned from the fitted vocabulary by min_df / max_df.
preprocess = vec.build_preprocessor()
tokenize = vec.build_tokenizer()

' '.join(tokenize(preprocess(text)))

Performance on the development set:

In [None]:
# Gold labels are class names; predictions come back as encoded integers
# and are mapped to class names before the two are compared.
y_dev_true = dev['class']
dev_pred_encoded = pipe.predict(dev['sentence'])
y_dev_pred = enc.inverse_transform(dev_pred_encoded)

dev_report = classification_report(y_dev_true, y_dev_pred)
print(dev_report)

## Examining the classifier and predictions

First, show the top features for each class.

In [None]:
# Class names in the order of the classifier's (encoded) classes, used as
# display names by eli5.
target_names = enc.inverse_transform(clf.classes_)

# Top 10 features per class; the last expression is rendered as the cell output.
weights_view = eli5.show_weights(
    clf,
    vec=vec,
    top=10,
    target_names=target_names,
)
weights_view

Next, explain predictions on an example.

In [None]:
# Row of the development set used as the example (position, not index label).
example_idx = 111
doc_test = dev['sentence'].iloc[example_idx]
y_test = dev['class'].iloc[example_idx]

# Explain the example for its true class only.
# NOTE(review): `clf` was fitted on the output of the MaxAbsScaler step of
# `pipe`, but only `vec` and `clf` are passed here, so the scaler is not part
# of this call -- confirm the displayed scores are acceptable without it.
prediction_view = eli5.show_prediction(
    clf,
    doc_test,
    vec=vec,
    target_names=target_names,
    targets=[y_test],
)
prediction_view