Score and explain a `damaging` prediction.

Requires that the "editquality" repository is checked out under your home directory (at `~/editquality`).

In [1]:
import os.path

from revscoring import Model


# The serialized model lives inside the "editquality" checkout in the home directory.
model_path = os.path.expanduser("~/editquality/models/enwiki.damaging.gradient_boosting.model")

# Use a context manager so the file handle is closed once the model is loaded.
# error_on_env_check=False is passed so that a mismatch between this environment
# and the one the model was built in does not abort the load (presumably the
# differences are only reported -- see the cell output).
with open(model_path) as model_file:
    sm = Model.load(model_file, error_on_env_check=False)

Differences between the current environment and the environment in which the model was constructed environment were detected:
 - platform 'Linux-4.9.0-6-amd64-x86_64-with-debian-9.4' mismatch with original environment 'Linux-4.16.0-x86_64-with-debian-9.4'
 - release '4.9.0-6-amd64' mismatch with original environment '4.16.0'
 - version '#1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02)' mismatch with original environment '#1 SMP Wed Apr 18 14:02:11 PDT 2018'
 - revscoring_version '2.2.2' mismatch with original environment '2.2.5'


In [2]:
import mwapi
from revscoring.extractors import api

# Build the MediaWiki API session first, then wrap it in a feature extractor.
api_session = mwapi.Session("https://en.wikipedia.org", user_agent="ORES-LIME demo")
extractor = api.Extractor(api_session)

In [3]:
# Load training set
from revscoring.utilities.util import read_observations

# Labeled revisions with cached feature values, taken from the "editquality" checkout.
observations_path = os.path.expanduser("~/editquality/datasets/enwiki.labeled_revisions.w_cache.20k_2015.json")

# Materialize every observation while the file is still open; the context
# manager then closes the handle instead of leaking it for the kernel's lifetime.
with open(observations_path) as observations_file:
    observations = list(read_observations(observations_file))

Pick a revision, extract its features, and score it.

In [4]:
import numpy as np

rev_to_score = 846560713

# Feature names as strings, in the same order as sm.features.
features = list(map(str, sm.features))

# Extract this revision's feature values with the API-backed extractor and
# collect them into a numpy array.
extracted = extractor.extract(rev_to_score, sm.features)
feature_values = np.array(list(extracted))

prediction = sm.score(feature_values)

print("https://en.wikipedia.org/?diff={}".format(rev_to_score))
# Show each feature name next to its extracted value.
print(np.array(list(zip(features, feature_values))))
print(prediction)

https://en.wikipedia.org/?diff=846560713


NameError: name 'features' is not defined

Set up a LIME explainer for this model.

In [None]:
# Transform training data into a numpy matrix.

# One row per observation and one column per feature name; each value is
# looked up in the observation's "cache" mapping by feature name.
train = np.array([
    np.array([observation["cache"][name] for name in features])
    for observation in observations
])

In [None]:
# TODO: Also explain as text, using our own explainer to run variations.

from sklearn.linear_model import LinearRegression
from lime.lime_tabular import LimeTabularExplainer

def score(samples):
    """
    Prediction function handed to LIME.

    :Parameters:
        samples : iterable
            Feature-value vectors, each acceptable to ``sm.score``.

    :Returns:
        A 2-D numpy array with one row per sample.  Column 0 holds the
        probability stored under the ``False`` key and column 1 the
        probability stored under the ``True`` key of the model's
        ``"probability"`` mapping.
    """
    rows = []
    for values in samples:
        # Score each sample exactly once and read both class probabilities
        # from that single result, rather than re-scoring per class.
        probability = sm.score(values)["probability"]
        rows.append(np.array([probability[label] for label in (False, True)]))
    return np.array(rows)

# Column indices (positions in `features`) that LIME should treat as
# categorical rather than continuous.
# NOTE(review): these indices are hard-coded and must stay in sync with the
# model's feature order -- confirm against sm.features if the model changes.
categorical_features = [0, 1, 2, 45, 46, 47, 48, 49, 50, 51, 54]

explainer = LimeTabularExplainer(
    train,
    mode="classification",
    feature_names=features,
    categorical_features=categorical_features,
    # Order matches the columns returned by score(): False first, then True.
    class_names=["not damaging", "damaging"],
    discretize_continuous=True,
    # Fixed seed so that LIME's random sampling, and therefore the resulting
    # explanation, is repeatable across Restart & Run All.
    random_state=0,
)

Create an explanation for this prediction.

In [None]:
# Explain the scored revision: LIME calls `score` on samples around this
# instance and fits the supplied linear regressor to them.
instance_row = np.array(feature_values)
exp = explainer.explain_instance(
    instance_row,
    score,
    model_regressor=LinearRegression(),
    top_labels=2,
    num_features=10,
)

In [None]:
%matplotlib inline
fig = exp.as_pyplot_figure(label=int(prediction["prediction"]))

In [None]:
# Same explanation as a list, for the class the model predicted.
predicted_label = int(prediction["prediction"])
exp.as_list(label=predicted_label)