## Feature importance analysis with ELI5

This notebook uses ELI5 to inspect feature importance for whole models, as well as for individual predictions.

In [None]:
import os
import sys
from collections import OrderedDict

import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import eli5
import numpy as np
import random

sys.path.append('../../src/utils/')
sys.path.append('../../src/models/')
sys.path.append('../../src/features/')
sys.path.append('../../src/store/')
sys.path.append('../../src/dataproc/')

from persist import load, save
from load_config import *
from SQLConn import SQLConn
from create_data_sample import sample

In [None]:
# Load local paths, database credentials and the production configuration.
# NOTE(review): load_local_paths, load_psql_env and load_config are presumably
# provided by the star import from load_config - confirm.
local_paths_env = load_local_paths('../../src/pipeline/local_paths.yaml')
env = load_psql_env(local_paths_env['pgpass_path'])
prod_config = load_config('../../src/prod/prod_config.yaml', append_static=False)
connection = SQLConn(env)

# pull data from database; the finally clause guarantees that the connection
# is closed even when sampling raises
connection.open()
try:
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths_env['store_features'],
    )
finally:
    connection.close()

In [None]:
# store model and file names
model_names = []
file_names = []

for file in os.listdir(local_paths_env['store_production_models']):
    # the model name is the concatenation of the upper-case characters
    # found in the file name
    mod = ''.join([c for c in file if c.isupper()])
    model_names.append(mod)

    # Remove only a trailing '.pkl' extension. str.rstrip('.pkl') strips any
    # run of the characters '.', 'p', 'k', 'l' from the right, which mangles
    # names such as 'model_lk.pkl' (-> 'model_').
    stem = file[:-len('.pkl')] if file.endswith('.pkl') else file
    file_names.append(stem)

## Saving the top features (up to 1000) for all models

In [None]:
# maps model name -> list of feature names taken from the eli5 explanation
features_groups = {}

# Keyword arguments for successive eli5.explain_weights attempts, in order.
# The empty dict means "do not pass `top`", i.e. use eli5's own default.
top_attempts = ({'top': 1000}, {}, {'top': 10}, {'top': 5})

for i, mod in enumerate(model_names):
    # a model name that already has features stored is not processed again
    if mod in features_groups:
        continue

    print(file_names[i])
    print(mod)

    # load model
    model = load(local_paths_env['store_production_models'], file_names[i])

    # check if it has a vectorizer - rest of the notebook assumes this is the case
    if not hasattr(model, 'vectorizer'):
        print("This model has no vectorizer.")
        continue

    clf = model.models[mod.lower()]

    # not all models have the same number of weights - try a high number
    # and decrease if that fails
    for attempt_no, top_kwargs in enumerate(top_attempts, start=1):
        try:
            weights = eli5.explain_weights(clf, vec=model.vectorizer,
                                           target_names=[0, 1], **top_kwargs)
            break
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the notebook.
            # The failure of the final attempt is propagated.
            if attempt_no == len(top_attempts):
                raise

    features_list = eli5.formatters.as_dataframe.format_as_dataframe(weights)['feature'].tolist()
    features_groups[mod] = features_list

        

        

In [None]:
# One column per model. Wrapping each feature list in a Series lets lists of
# different lengths share a frame (pandas fills the missing rows with NaN).
feature_columns = {
    model_name: pd.Series(feature_names)
    for model_name, feature_names in features_groups.items()
}
features_data = pd.DataFrame(feature_columns)

# persist the table in the current working directory and preview it
features_data.to_csv('features_data.csv')
features_data.head()

## Analyzing individual models and predictions

In [None]:
# index determines which group's model is selected
i = 1

# load model
model = load(local_paths_env['store_production_models'], file_names[i])

# check if it has a vectorizer - rest of the notebook assumes this is the case
if hasattr(model, 'vectorizer'):
    clf = model.models[model_names[i].lower()]
else:
    print("This model has no vectorizer.")

In [None]:
# inspect the classifier's coefficient columns from index 75000 onwards
clf.coef_[:, 75000:]

### Showing weights for one model

In [None]:
# explain the overall weights of the loaded model, limited to 6 features
# NOTE(review): by default a notebook renders only the last expression of a
# cell, so this explanation is probably not displayed - confirm
eli5.explain_weights(
    clf,
    vec=model.vectorizer,
    top=6,
    target_names=[0, 1],
)

# number of non-zero coefficients of the classifier
np.count_nonzero(clf.coef_)

### Showing weights for one paper

In [None]:
# select a paper randomly belonging to the class of the model currently loaded
# (presumably y_test[...] is a boolean column used as a row mask - confirm)
classdata = X_test[y_test[model_names[i].lower()]]

# random.randrange excludes the upper bound. random.randint(0, len(classdata))
# includes it, so it could return len(classdata) and make .iloc raise
# IndexError.
n = random.randrange(len(classdata))
testpaper = classdata[model.tokens_col].iloc[n]

In [None]:
# evaluate the prediction for a paper
print(model_names[i])

# NOTE(review): the other eli5 calls in this notebook pass
# target_names=[0, 1]; [True, False] here may label the classes in the
# opposite order - confirm against clf.classes_
eli5.show_prediction(
    clf,
    testpaper,
    vec=model.vectorizer,
    target_names=[True, False],
)