# Off policy learning

[follows this notebook](https://nbviewer.org/github/david-cortes/contextualbandits/blob/master/example/offpolicy_learning.ipynb)


In [1]:
import pandas as pd, numpy as np, re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.datasets import load_svmlight_file
from pathlib import Path

def parse_data(filename):
    with open(filename, "rb") as f:
        infoline = f.readline()
        infoline = re.sub(r"^b'", "", str(infoline))
        n_features = int(re.sub(r"^\d+\s(\d+)\s\d+.*$", r"\1", infoline))
        features, labels = load_svmlight_file(f, n_features=n_features, multilabel=True)
        
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(labels)
    features = np.array(features.todense())
    features = np.ascontiguousarray(features)
    return features, labels

features, y = parse_data(Path("data") / "Bibtex_data.txt")
print(features.shape) # n x k
print(y.shape)



(7395, 1836)
(7395, 159)


In [3]:
from sklearn.linear_model import LogisticRegression

# the 'explorer' policy will be fit with this small sample of the rows
st_seed = 0
end_seed = 3000

# then it will choose actions for this larger sample
st_exploration = 0
end_exploration = 5000

# the new policy will be evaluated with a separate test set
st_test = 5000
end_test = 7395

# separating the covariates data for each case
Xseed = features[st_seed:end_seed, :]
Xexplore_sample = features[st_exploration:end_exploration, :]
Xtest = features[st_test:end_test, :]
nchoices = y.shape[1]



In [9]:

# now constructing an exploration policy as explained above, with fully-labeled data
explorer = LogisticRegression(solver="lbfgs", max_iter=15000)
explorer.fit(Xseed, np.argmax(y[st_seed:end_seed], axis=1))

In [18]:

# letting the exploration policy choose actions for the new policy input
actions_explore_sample = explorer.predict(Xexplore_sample)
rewards_explore_sample = y[st_exploration:end_exploration, :]\
                        [np.arange(end_exploration - st_exploration), actions_explore_sample]

# extracting the probabilities it estimated
ix_internal_actions = {j:i for i,j in enumerate(explorer.classes_)}
ix_internal_actions = [ix_internal_actions[i] for i in actions_explore_sample]
ix_internal_actions = np.array(ix_internal_actions)
prob_actions_explore = explorer.predict_proba(Xexplore_sample)[np.arange(Xexplore_sample.shape[0]),
                                                               ix_internal_actions]


In [39]:
(y[st_exploration:end_exploration, :][np.arange(end_exploration - st_exploration), actions_explore_sample]).shape

(5000,)

In [46]:
np.arange(end_exploration - st_exploration)

array([   0,    1,    2, ..., 4997, 4998, 4999])

In [34]:
y[st_exploration:end_exploration, :]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [51]:
n_classes = 10
n_obs = 8
x__ = np.random.randint(low=0, high=n_classes, size=(n_obs, 3))
x__

array([[8, 9, 1],
       [1, 7, 0],
       [5, 8, 1],
       [4, 9, 2],
       [1, 7, 8],
       [9, 1, 7],
       [7, 5, 6],
       [4, 6, 3]])

In [72]:
m__ = np.arange(n_obs)
z__ = np.random.randint(low=0, high=n_classes-1, size=n_obs)

In [60]:
z__.shape, m__.shape, x__.shape

((8,), (8,), (8, 3))

In [63]:
x = y[st_exploration:end_exploration, :]
m = np.arange(end_exploration - st_exploration)
z = actions_explore_sample

z.shape, m.shape, x.shape

((5000,), (5000,), (5000, 159))

In [68]:
x[m, z]

array([1, 1, 1, ..., 0, 1, 1])

In [71]:
z__

array([0, 3, 6, 5, 6, 1, 2, 1])

In [70]:
z

array([48, 75, 52, ..., 18, 18, 14])

In [64]:
m

array([   0,    1,    2, ..., 4997, 4998, 4999])

In [69]:
x__[m__, z__]

IndexError: index 3 is out of bounds for axis 1 with size 3

In [35]:
len(np.arange(end_exploration - st_exploration)), len(actions_explore_sample)

(5000, 5000)

In [27]:
actions_explore_sample

array([48, 75, 52, ..., 18, 18, 14])

In [26]:
rewards_explore_sample

array([1, 1, 1, ..., 0, 1, 1])

In [24]:
y[st_exploration:end_exploration, :]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
rewards_explore_sample

array([1, 1, 1, ..., 0, 1, 1])

In [22]:
np.argmax(y[st_seed:end_seed], axis=1)

array([ 48,  75,  52, ...,   4, 114,  22])

In [19]:
actions_explore_sample

array([48, 75, 52, ..., 18, 18, 14])