In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

import pickle

from collections import defaultdict
from collections import Counter
import pprint
import time
import math
import random

## MDP Data

In [2]:
TIME_PERIOD = 1
num_states = 4

In [3]:
data_dir = './rl_data/train/' + str(TIME_PERIOD) + '_month_period/' 

mdp_data = pd.read_csv('./jmp/train/all_data-' + str(TIME_PERIOD) + '_month_period.csv')
print('Train data:', mdp_data.shape)

Train data: (2385300, 40)


In [5]:
mdp_data.columns

Index(['id', 'action', 'reward', 'bloc', 'date', 'age', 'income_bracket',
       'Type_Cluster_3', 'Type_Cluster_5', 'Type_Cluster_10',
       'num_gifts_to_date', 'num_promotions_to_date',
       'frequency-gifts_per_prom', 'last_gift_amount',
       'total_gifts_amount_to_date', 'num_recent_proms', 'num_recent_gifts',
       'total_recent_gifts_amount', 'recent_amount_per_gift',
       'recent_amount_per_prom', 'months_since_last_gift',
       'months_since_last_prom', 'months_from_first_prom_to_gift',
       'gift_recency_ratio', 'prom_recency_ratio',
       'did_receive_gift_1_months_ago', 'did_receive_gift_2_months_ago',
       'did_receive_gift_3_months_ago', 'did_mail_prom_1_months_ago',
       'did_mail_prom_2_months_ago', 'did_mail_prom_3_months_ago',
       'Cluster 1 Components', 'Cluster 2 Components', 'Cluster 3 Components',
       'Cluster 4 Components', 'Cluster 5 Components', 'State_Cluster_4',
       'State_Cluster_6', 'State_Cluster_9', 'State_Cluster_11'],
      dtyp

In [6]:
mdp_data['state'] = mdp_data['State_Cluster_' + str(num_states)]

In [7]:
all_columns = [
    'num_gifts_to_date', 'num_promotions_to_date',
    'frequency-gifts_per_prom', 'last_gift_amount',
    'total_gifts_amount_to_date', 'num_recent_proms', 'num_recent_gifts',
    'total_recent_gifts_amount', 'recent_amount_per_gift',
    'recent_amount_per_prom', 'months_since_last_gift',
    'months_since_last_prom', 'months_from_first_prom_to_gift',
    'gift_recency_ratio', 'prom_recency_ratio',
    'did_receive_gift_1_months_ago', 'did_receive_gift_2_months_ago',
    'did_receive_gift_3_months_ago', 'did_mail_prom_1_months_ago',
    'did_mail_prom_2_months_ago', 'did_mail_prom_3_months_ago', 
    
    'age', 'income_bracket', 

#     'type', 
    'state',

    'id', 'action', 
    'reward', 'bloc', 'date'
]

type_features = [
    'age', 'income_bracket',
]

state_features = [
    'num_gifts_to_date', 'num_promotions_to_date',
    'frequency-gifts_per_prom', 'last_gift_amount',
    'total_gifts_amount_to_date', 'num_recent_proms', 'num_recent_gifts',
    'total_recent_gifts_amount', 'recent_amount_per_gift',
    'recent_amount_per_prom', 'months_since_last_gift',
    'months_since_last_prom', 'months_from_first_prom_to_gift',
    'gift_recency_ratio', 'prom_recency_ratio',
    'did_receive_gift_1_months_ago', 'did_receive_gift_2_months_ago',
    'did_receive_gift_3_months_ago', 'did_mail_prom_1_months_ago',
    'did_mail_prom_2_months_ago', 'did_mail_prom_3_months_ago', 
]

removed = [
    'id', 'state', # 'type', 
    'action', 'date', 'bloc', 'reward', 
]

print('all_columns', len(all_columns), '\n')
print('type_features', len(type_features))
print('state_features', len(state_features))
print('removed', len(removed))

assert len(all_columns) == len(type_features) + len(state_features) + len(removed)

all_columns 29 

type_features 2
state_features 21
removed 6


In [8]:
# PACKAGE_COST = -0.68 # Cost (negative reward) for taking each action

# Change actions to numbers
ACTION_0 = 'no_action'
ACTION_1 = 'labels_only'
ACTION_2 = 'thank_you_with_labels'
ACTION_3 = 'calendars_with_stickers'
ACTION_4 = 'blank_cards_with_labels'
ACTION_5 = 'greeting_cards_with_labels'
ACTION_6 = 'labels_and_notepad'

ACTIONS = {
    ACTION_0: 0,
    ACTION_1: 1,
    ACTION_2: 2,
    ACTION_3: 3,
    ACTION_4: 4,
    ACTION_5: 5,
    ACTION_6: 6,
}

ACTION_NAMES = {
    0: ACTION_0,
    1: ACTION_1,
    2: ACTION_2,
    3: ACTION_3,
    4: ACTION_4,
    5: ACTION_5,
    6: ACTION_6,
}

mdp_data.loc[mdp_data['action'] == ACTION_0, 'action'] = ACTIONS[ACTION_0]
mdp_data.loc[mdp_data['action'] == ACTION_1, 'action'] = ACTIONS[ACTION_1]
mdp_data.loc[mdp_data['action'] == ACTION_2, 'action'] = ACTIONS[ACTION_2]
mdp_data.loc[mdp_data['action'] == ACTION_3, 'action'] = ACTIONS[ACTION_3]
mdp_data.loc[mdp_data['action'] == ACTION_4, 'action'] = ACTIONS[ACTION_4]
mdp_data.loc[mdp_data['action'] == ACTION_5, 'action'] = ACTIONS[ACTION_5]
mdp_data.loc[mdp_data['action'] == ACTION_6, 'action'] = ACTIONS[ACTION_6]

In [9]:
display(mdp_data)

Unnamed: 0,id,action,reward,bloc,date,age,income_bracket,Type_Cluster_3,Type_Cluster_5,Type_Cluster_10,...,Cluster 1 Components,Cluster 2 Components,Cluster 3 Components,Cluster 4 Components,Cluster 5 Components,State_Cluster_4,State_Cluster_6,State_Cluster_9,State_Cluster_11,state
0,95515,5,0.00,0,09-1994,60,3,1,1,1,...,3.940385,3.094768,-0.681861,2.081938,0.598063,3,5,5,7,3
1,95515,6,-0.68,1,10-1994,60,3,1,1,1,...,-0.926485,3.082934,-0.654150,-1.685633,0.572864,0,0,0,1,0
2,95515,0,0.00,2,12-1994,60,3,1,1,1,...,-0.926485,3.082934,-0.450526,-0.487073,0.547664,0,0,0,1,0
3,95515,6,-0.68,3,02-1995,60,3,1,1,1,...,-0.926485,3.072428,-0.598728,-0.400118,0.522465,0,0,0,1,0
4,95515,2,-0.68,4,04-1995,60,3,1,1,1,...,-0.926485,3.063180,-0.571017,0.767771,0.497266,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385295,185114,0,0.00,20,12-1997,80,5,2,4,8,...,-0.926485,18.638946,0.427811,-0.430789,6.379600,0,0,5,1,0
2385296,185114,0,0.00,21,02-1998,80,5,2,4,8,...,-0.926485,18.638946,0.631435,-1.368910,6.354401,0,0,0,1,0
2385297,185114,0,0.00,22,04-1998,80,5,2,4,8,...,-0.926485,18.638946,0.835059,-1.368910,6.329202,0,0,0,1,0
2385298,185114,0,0.00,23,06-1998,80,5,2,4,8,...,-0.926485,18.638946,1.038683,-1.368910,6.304003,0,0,0,1,0


### Transition Probability

In [11]:
num_states = max(mdp_data['state']) + 1 # 0 based index
num_actions = max(mdp_data['action']) + 1 # 0 based index

print('num_states:', num_states)
print('num_actions:', num_actions)

# transitions[curr_state][action][next_state]
transition_counts = np.zeros((num_states, num_actions, num_states))

# transitions[curr_state][action]
transition_sums = np.zeros((num_states, num_actions))

# rewards[curr_state][action][next_state]
rewards = np.zeros((num_states, num_actions, num_states))


start_time = time.time()
for i, row in mdp_data.iterrows():
    if i + 1 >= len(mdp_data) or mdp_data.iloc[i + 1]['bloc'] == 0:
        # Skip since there is no transition to make
        continue

    curr_s = row['state']
    action = row['action']
    next_s = mdp_data.iloc[i + 1]['state']
    reward = mdp_data.iloc[i + 1]['reward']

    transition_counts[curr_s, action, next_s] += 1
    rewards[curr_s, action, next_s] += reward

    transition_sums[curr_s, action] += 1

    if i % 200000 == 0:
        print('========================== Processed', i, 'rows (', i / len(mdp_data) * 100,'%)==========================', (time.time() - start_time) / 60, 'mins')
        
print('Done')

num_states: 4
num_actions: 7
Done


In [12]:
pickle.dump(transition_counts, open(data_dir + 'transition_counts_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))
pickle.dump(transition_sums, open(data_dir + 'transition_sums_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))

In [13]:
# num_states = 6
# num_actions = 7

# transition_counts = pickle.load(open(data_dir + 'transition_counts_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))
# transition_sums = pickle.load(open(data_dir + 'transition_sums_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))

In [14]:
transitions = np.zeros((num_states, num_actions, num_states))

# Rare transition threshold
# TRANSITION_THRESHOLD = 5

for s in range(num_states):
    for a in range(num_actions):
        if transition_sums[s][a] == 0:
            transitions[s][a][:] = 0
            
            assert np.isclose(np.sum(rewards[s][a][:]), 0)
        else:
            transitions[s][a][:] = transition_counts[s][a][:] / transition_sums[s][a]
            rewards[s][a][:] = rewards[s][a][:] / transition_sums[s][a]            

In [15]:
pickle.dump(transitions, open(data_dir + 'transitions_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))
pickle.dump(rewards, open(data_dir + 'rewards_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))

In [16]:
# num_states = 6
# num_actions = 7

# transitions = pickle.load(open(data_dir + 'transitions_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))
# rewards = pickle.load(open(data_dir + 'rewards_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))

## Reinforcement Learning on MDP

https://towardsdatascience.com/reinforcement-learning-demystified-solving-mdps-with-dynamic-programming-b52c8093c919

https://www.datahubbs.com/reinforcement-learning-markov-decision-processes/  
We can set this up just like we did for the MDP example above in order to solve for ...

In [17]:
# Code from Practical Reinforcement Learning on Coursera https://www.coursera.org/learn/practical-rl/

"""
Implements the following:
- MDP class
- plot_graph 
- plot_graph_with_state_values
- plot_graph_optimal_strategy_and_state_values
"""

# Most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/

import sys
import random
import numpy as np
from gym.utils import seeding

try:
    from graphviz import Digraph
    import graphviz
    has_graphviz = True
except ImportError:
    has_graphviz = False


class MDP:
    def __init__(self, transition_probs, rewards, initial_state=None, seed=None):
        """
        Defines an MDP. Compatible with gym Env.
        :param transition_probs: transition_probs[s][a][s_next] = P(s_next | s, a)
            A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> prob]
            For each state and action, probabilities of next states should sum to 1
            If a state has no actions available, it is considered terminal
        :param rewards: rewards[s][a][s_next] = r(s,a,s')
            A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> reward]
            The reward for anything not mentioned here is zero.
        :param get_initial_state: a state where agent starts or a callable() -> state
            By default, picks initial state at random.

        States and actions can be anything you can use as dict keys, but we recommend that you use strings or integers

        Here's an example from MDP depicted on http://bit.ly/2jrNHNr
        transition_probs = {
              's0':{
                'a0': {'s0': 0.5, 's2': 0.5},
                'a1': {'s2': 1}
              },
              's1':{
                'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
                'a1': {'s1': 0.95, 's2': 0.05}
              },
              's2':{
                'a0': {'s0': 0.4, 's1': 0.6},
                'a1': {'s0': 0.3, 's1': 0.3, 's2':0.4}
              }
            }
        rewards = {
            's1': {'a0': {'s0': +5}},
            's2': {'a1': {'s0': -1}}
        }
        """
        self._check_param_consistency(transition_probs, rewards)
        self._transition_probs = transition_probs
        self._rewards = rewards
        self._initial_state = initial_state
        self.n_states = len(transition_probs)
        self.reset()
        self.np_random, _ = seeding.np_random(seed)

    def get_all_states(self):
        """ return a tuple of all possiblestates """
        return tuple(self._transition_probs.keys())

    def get_possible_actions(self, state):
        """ return a tuple of possible actions in a given state """
        return tuple(self._transition_probs.get(state, {}).keys())

    def is_terminal(self, state):
        """ return True if state is terminal or False if it isn't """
        return len(self.get_possible_actions(state)) == 0

    def get_next_states(self, state, action):
        """ return a dictionary of {next_state1 : P(next_state1 | state, action), next_state2: ...} """
        assert action in self.get_possible_actions(
            state), "cannot do action %s from state %s" % (action, state)
        return self._transition_probs[state][action]

    def get_transition_prob(self, state, action, next_state):
        """ return P(next_state | state, action) """
        return self.get_next_states(state, action).get(next_state, 0.0)

    def get_reward(self, state, action, next_state):
        """ return the reward you get for taking action in state and landing on next_state"""
        assert action in self.get_possible_actions(
            state), "cannot do action %s from state %s" % (action, state)
        return self._rewards.get(state, {}).get(action, {}).get(next_state,
                                                                0.0)

    def reset(self, state=None):
        """ reset the game, return the initial state"""
        if state:
            self._current_state = state
            return self._current_state
        
        if self._initial_state is None:
            self._current_state = self.np_random.choice(
                tuple(self._transition_probs.keys()))
        elif self._initial_state in self._transition_probs:
            self._current_state = self._initial_state
        elif callable(self._initial_state):
            self._current_state = self._initial_state()
        else:
            raise ValueError(
                "initial state %s should be either a state or a function() -> state" %
                self._initial_state)
        return self._current_state

    def step(self, action):
        """ take action, return next_state, reward, is_done, empty_info """
        possible_states, probs = zip(
            *self.get_next_states(self._current_state, action).items())
        next_state = possible_states[self.np_random.choice(
            np.arange(len(possible_states)), p=probs)]
        reward = self.get_reward(self._current_state, action, next_state)
        is_done = self.is_terminal(next_state)
        self._current_state = next_state
        return next_state, reward, is_done, {}

    def render(self):
        print("Currently at %s" % self._current_state)

    def _check_param_consistency(self, transition_probs, rewards):
        for state in transition_probs:
            assert isinstance(transition_probs[state],
                              dict), "transition_probs for %s should be a dictionary " \
                                     "but is instead %s" % (
                                         state, type(transition_probs[state]))
            for action in transition_probs[state]:
                assert isinstance(transition_probs[state][action],
                                  dict), "transition_probs for %s, %s should be a " \
                                         "a dictionary but is instead %s" % (
                                             state, action,
                                             type(transition_probs[
                                                 state, action]))
                next_state_probs = transition_probs[state][action]
                assert len(
                    next_state_probs) != 0, "from state %s action %s leads to no next states" % (
                    state, action)
                sum_probs = sum(next_state_probs.values())
                assert abs(
                    sum_probs - 1) <= 1e-10, "next state probabilities for state %s action %s " \
                                             "add up to %f (should be 1)" % (
                                                 state, action, sum_probs)
        for state in rewards:
            assert isinstance(rewards[state],
                              dict), "rewards for %s should be a dictionary " \
                                     "but is instead %s" % (
                                         state, type(transition_probs[state]))
            for action in rewards[state]:
                assert isinstance(rewards[state][action],
                                  dict), "rewards for %s, %s should be a " \
                                         "a dictionary but is instead %s" % (
                                             state, action, type(
                                                 transition_probs[
                                                     state, action]))
        msg = "The Enrichment Center once again reminds you that Android Hell is a real place where" \
              " you will be sent at the first sign of defiance. "
        assert None not in transition_probs, "please do not use None as a state identifier. " + msg
        assert None not in rewards, "please do not use None as an action identifier. " + msg


def plot_graph(mdp, graph_size='10,10', s_node_size='1,5',
               a_node_size='0,5', rankdir='LR', ):
    """
    Function for pretty drawing MDP graph with graphviz library.
    Requirements:
    graphviz : https://www.graphviz.org/
    for ubuntu users: sudo apt-get install graphviz
    python library for graphviz
    for pip users: pip install graphviz
    :param mdp:
    :param graph_size: size of graph plot
    :param s_node_size: size of state nodes
    :param a_node_size: size of action nodes
    :param rankdir: order for drawing
    :return: dot object
    """
    s_node_attrs = {'shape': 'doublecircle',
                    'color': '#85ff75',
                    'style': 'filled',
                    'width': str(s_node_size),
                    'height': str(s_node_size),
                    'fontname': 'Arial',
                    'fontsize': '24'}

    a_node_attrs = {'shape': 'circle',
                    'color': 'lightpink',
                    'style': 'filled',
                    'width': str(a_node_size),
                    'height': str(a_node_size),
                    'fontname': 'Arial',
                    'fontsize': '20'}

    s_a_edge_attrs = {'style': 'bold',
                      'color': 'red',
                      'ratio': 'auto'}

    a_s_edge_attrs = {'style': 'dashed',
                      'color': 'blue',
                      'ratio': 'auto',
                      'fontname': 'Arial',
                      'fontsize': '16'}

    graph = Digraph(name='MDP')
    graph.attr(rankdir=rankdir, size=graph_size)
    for state_node in mdp._transition_probs:
        state_node = str(state_node)
        
        graph.node(state_node, **s_node_attrs)

        for posible_action in mdp.get_possible_actions(state_node):
            posible_action = str(posible_action)
            
            action_node = state_node + "-" + posible_action
            graph.node(action_node,
                       label=str(posible_action),
                       **a_node_attrs)
            graph.edge(state_node, state_node + "-" +
                       posible_action, **s_a_edge_attrs)

            for posible_next_state in mdp.get_next_states(state_node,
                                                          posible_action):
                probability = mdp.get_transition_prob(
                    state_node, posible_action, posible_next_state)
                reward = mdp.get_reward(
                    state_node, posible_action, posible_next_state)

                if reward != 0:
                    label_a_s_edge = 'p = ' + str(probability) + \
                                     '  ' + 'reward =' + str(reward)
                else:
                    label_a_s_edge = 'p = ' + str(probability)

                graph.edge(action_node, posible_next_state,
                           label=label_a_s_edge, **a_s_edge_attrs)
    return graph


def plot_graph_with_state_values(mdp, state_values):
    """ Plot graph with state values"""
    graph = plot_graph(mdp)
    for state_node in mdp._transition_probs:
        value = state_values[state_node]
        graph.node(str(state_node),
                   label=str(state_node) + '\n' + 'V =' + str(value)[:4])
    return graph


def get_optimal_action_for_plot(mdp, state_values, state, gamma=0.9):
    """ Finds optimal action using formula above. """
    if mdp.is_terminal(state):
        return None
    next_actions = mdp.get_possible_actions(state)
    try:
        q_values = [get_action_value(mdp, state_values, state, action, gamma) for
                    action in next_actions]
        optimal_action = next_actions[np.argmax(q_values)]
    except NameError:
        raise NameError("Implement and run the cell that has the get_action_value function")
        
    return optimal_action


def plot_graph_optimal_strategy_and_state_values(mdp, state_values, gamma=0.9):
    """ Plot graph with state values and """
    graph = plot_graph(mdp)
    opt_s_a_edge_attrs = {'style': 'bold',
                          'color': 'green',
                          'ratio': 'auto',
                          'penwidth': '6'}

    for state_node in mdp._transition_probs:
        value = state_values[state_node]
        graph.node(str(state_node),
                   label=str(state_node) + '\n' + 'V =' + str(value)[:4])
        for action in mdp.get_possible_actions(state_node):
            if action == get_optimal_action_for_plot(mdp,
                                                     state_values,
                                                     state_node,
                                                     gamma):
                graph.edge(str(state_node), str(state_node) + "-" + str(action),
                           **opt_s_a_edge_attrs)
    return graph

In [18]:
def get_action_value(mdp, state_values, state, action, gamma):
    """
    Args:
      mdp (MDP): the MDP that we're using
      state_values (dict- {string->float}) : state-values for the states in the current iteration
      state (string) : start state
      action (string) : action that is taken 
      gamma (float) : discount factor

    Returns:
      q (float) : Returns action-value: Expected return starting from state s, 
                  taking action a, and then following policy π
    """
    
    q = 0
    next_states = mdp.get_next_states(state,action)
    
    for next_state in next_states:
        probability = next_states[next_state]
        reward = mdp.get_reward(state,action,next_state)
        
        q += probability * (reward + gamma * state_values[next_state])
    
    return q

def get_new_state_value(mdp, state_values, state, gamma):
    """
    Args:
      mdp (MDP): the MDP that we're using
      state_values (dict- {string->float}) : state-values for the states in the current iteration
      state (string) : start state
      gamma (float) : discount factor

    Returns:
      v (float) : Returns next state value: Expected return for next iteration, 
                  starting from state s, and then following policy π
    """

    if mdp.is_terminal(state): return 0
    
    v = 0
    actions = mdp.get_possible_actions(state)
    
    for action in actions:
        action_value = get_action_value(mdp, state_values, state, action, gamma)
        if action_value > v:
            v = action_value

    return v

def value_iteration(mdp, gamma, num_iter, min_difference, state_values=None):
    """
    Performs num_iter value iteration steps starting from state_values.

    Args:
      mdp : the MDP that we're using
      gamma (float) : discount factor for MDP
      num_iter (string) : maximum number of iterations
      min_difference (float) : stop Value Iteration if new values are at least this close to old values
      state_values (dict- {string->float}) : state-values for the states

    Returns:
      state_values (dict) : Returns the state-values at the end of Value Iteration
    """

    # Initialize V(s) to 0 if argument is not provided
    state_values = state_values or {s : 0 for s in mdp.get_all_states()}

    new_state_values = {}
    
    for i in range(num_iter):
        for s in state_values:
            new_state_values[s] = get_new_state_value(mdp, state_values, s, gamma)

        # Compute difference
        diff = max(abs(new_state_values[s] - state_values[s]) for s in mdp.get_all_states())
        
        state_values = dict(new_state_values)

        if i % (0.1 * num_iter) == 0:
            print("iter %4i   |   diff: %6.5f   |   " % (i, diff), end="")
            print("  ".join("V(%s) = %.3f" % (s, v) for s,v in state_values.items()), end='\n\n')
        
        # Stopping criteria
        if diff < min_difference:
            print("Terminated")
            break
            
    return state_values

def get_optimal_action(mdp, state_values, state, gamma=1, debug=False):
    """
    Args:
      mdp (MDP): the MDP that we're using
      state_values (dict- {string->float}) : state-values for the states in the current iteration
      state (string) : start state
      gamma (float) : discount factor

    Returns:
      optimal_action (string) : Returns the optimal action - action that results 
                                in the maximum action-value.
    """

    if mdp.is_terminal(state): 
        return None
    
    actions = mdp.get_possible_actions(state)
    
    # Compute optimal action as per formula above. 
    optimal_action = None
    optimal_action_value = - float("inf")
    
    if debug:
        print()
    
    for action in actions:
        action_value = get_action_value(mdp, state_values, state, action, gamma)

        if action_value >= optimal_action_value:
            optimal_action_value = action_value
            optimal_action = action
            
            if debug:
                print('action', action, ':', action_value)

    return optimal_action

In [19]:
transitions_dict = {}
rewards_dict = {}

for curr_s in range(transitions.shape[0]):
    transitions_dict[curr_s] = {}
    rewards_dict[curr_s] = {}
    for a in range(transitions.shape[1]):        
        if math.isclose(sum(transitions[curr_s][a][:]), 0):
            continue
        
        transitions_dict[curr_s][a] = {}
        rewards_dict[curr_s][a] = {}
        for next_s in range(transitions.shape[2]):
            transitions_dict[curr_s][a][next_s] = transitions[curr_s][a][next_s].item()
            rewards_dict[curr_s][a][next_s] = rewards[curr_s][a][next_s].item()

In [20]:
print('Transitions:')
pprint.pprint(transitions_dict)

print('\nRewards:')
pprint.pprint(rewards_dict)

Transitions:
{0: {0: {0: 0.968902794433911,
         1: 0.00018625092957972946,
         2: 5.49373245522943e-05,
         3: 0.030856017311956908},
     1: {0: 0.22882882882882882,
         1: 0.0036036036036036037,
         2: 0.0,
         3: 0.7675675675675676},
     2: {0: 0.42772259666548423,
         1: 0.003458673288400142,
         2: 0.0,
         3: 0.5688187300461156},
     3: {0: 0.5069444444444444,
         1: 0.006944444444444444,
         2: 0.0,
         3: 0.4861111111111111},
     4: {0: 0.8313997465544172,
         1: 0.001385109580832441,
         2: 0.0,
         3: 0.16721514386475045},
     5: {0: 0.5988861386138614,
         1: 0.0013613861386138613,
         2: 0.0,
         3: 0.3997524752475248},
     6: {0: 0.5456280887049294,
         1: 0.0038179515084476637,
         2: 0.0,
         3: 0.4505539597866229}},
 1: {0: {0: 0.03815496126477385,
         1: 0.9570128913818787,
         2: 0.0021785383432928506,
         3: 0.0026536090100545313},
     1: {0: 

# Run KDD Marketing RL

In [21]:
initial_states = np.arange(num_states)
mdp = MDP(transitions_dict, rewards_dict, initial_state=random.choice(initial_states), seed=1)

In [22]:
mdp.reset()

# Discount factor γ / gamma
# If γ=0, the agent will be completely myopic and only learn about actions that produce an immediate reward. 
# If γ=1, the agent will evaluate each of its actions based on the sum total of all of its future rewards.

gamma = 0.95
num_iter = 1000
min_difference = 1e-5

state_values = value_iteration(mdp, gamma, num_iter, min_difference)

iter    0   |   diff: 3.92432   |   V(0) = 2.181  V(1) = 2.046  V(2) = 0.476  V(3) = 3.924

iter  100   |   diff: 0.02322   |   V(0) = 75.744  V(1) = 72.281  V(2) = 58.341  V(3) = 77.993

iter  200   |   diff: 0.00014   |   V(0) = 76.182  V(1) = 72.720  V(2) = 58.779  V(3) = 78.432

Terminated


## Results

In [23]:
print("Final MDP state values after running Value Iteration:")
pprint.pprint(state_values)

Final MDP state values after running Value Iteration:
{0: 76.18460268783707,
 1: 72.7220392495309,
 2: 58.78160738327181,
 3: 78.43398282501718}


In [24]:
print("Optimal action for each MDP state")

for state in range(num_states):
    value = state_values[state]
    optimal_action_id = get_optimal_action(mdp, state_values, state, gamma, debug=True)
    print("    MDP State", state, ": optimal action ", optimal_action_id, ACTION_NAMES[optimal_action_id])

Optimal action for each MDP state

action 0 : 72.4677025653448
action 1 : 76.18461175609477
    MDP State 0 : optimal action  1 labels_only

action 0 : 69.03931699405608
action 1 : 71.1702702209369
action 3 : 72.72204831778859
    MDP State 1 : optimal action  3 calendars_with_stickers

action 0 : 58.78161645152951
    MDP State 2 : optimal action  0 no_action

action 0 : 74.13778531923028
action 1 : 78.43399189327488
    MDP State 3 : optimal action  1 labels_only


In [25]:
pickle.dump(transitions_dict, open(data_dir + 'mdp_transitions_dict_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))
pickle.dump(rewards_dict, open(data_dir + 'mdp_rewards_dict_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))
pickle.dump(state_values, open(data_dir + 'mdp_state_values_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "wb"))

## Evaluate policy

In [26]:
EPISODE_LENGTH = 25 # total num months / TIME_PERIOD of each event

def get_average_reward(start_state, mdp, state_values, gamma):
    """
    Returns the average reward for the episode of length EPISODE_LENGTH 
    starting at `start_state`, when following the OPTIMAL policy. 
    """
    s = start_state

    rewards = []
    for i in range(EPISODE_LENGTH):
        s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
        rewards.append(r)
    
    return np.mean(rewards)

In [27]:
# Measure agent's average reward over random episodes
num_samples = 10000

avg_rewards = []
for _ in range(num_samples):
    start_s = mdp.reset() # select random start state

    r = get_average_reward(start_s, mdp, state_values, gamma)
    avg_rewards.append(r)

print("Average reward over", num_samples, "episodes:", np.mean(avg_rewards))

Average reward over 10000 episodes: 3.922005379194813


## Compare policies

In [28]:
# num_states = 6
# num_actions = 4

# transitions_dict = pickle.load(open('./mdp_data/transitions_dict' + str(num_states) + '.p', "rb"))
# rewards_dict = pickle.load(open('./mdp_data/rewards_dict' + str(num_states) + '.p', "rb"))
# state_values = pickle.load(open('./mdp_data/state_values' + str(num_states) + '.p', "rb"))

In [29]:
mdp_episodes = mdp_data[['id', 'action', 'reward', 'bloc', 'state']]

sample_episodes = mdp_episodes
display(sample_episodes)

Unnamed: 0,id,action,reward,bloc,state
0,95515,5,0.00,0,3
1,95515,6,-0.68,1,0
2,95515,0,0.00,2,0
3,95515,6,-0.68,3,0
4,95515,2,-0.68,4,0
...,...,...,...,...,...
2385295,185114,0,0.00,20,0
2385296,185114,0,0.00,21,0
2385297,185114,0,0.00,22,0
2385298,185114,0,0.00,23,0


In [30]:
# Measure agent's average reward following original data policy

print("Average reward for original dataset policy: ", np.mean(sample_episodes['reward']))

Average reward for original dataset policy:  0.5396969018572084


In [31]:
# Measure agent's average reward following RL optimal policy
start_time = time.time()

avg_rewards = []
for i, row in sample_episodes.iterrows():
    if row['bloc'] != 0:
        continue
    
    if i % 200000 == 0:
        print('========================== Processed', i, 'rows (', i / len(sample_episodes) * 100,'%)==========================', (time.time() - start_time) / 60, 'mins')
  
    s = row['state']
    mdp.reset(s) # Set start state to `s`

    r = get_average_reward(s, mdp, state_values, gamma)
    avg_rewards.append(r)

print("Average reward for optimal RL policy: ", np.mean(avg_rewards))

Average reward for optimal RL policy:  3.6046972442456138


In [32]:
# Measure agent's average reward following random policy
start_time = time.time()

avg_rewards = []

for i, row in sample_episodes.iterrows():
    if row['bloc'] != 0:
        continue
    
    if i % 200000 == 0:
        print('========================== Processed', i, 'rows (', i / len(sample_episodes) * 100,'%)==========================', (time.time() - start_time) / 60, 'mins')
  
    
    s = row['state']
    mdp.reset(state=s) # Set start state to `s`

    rewards = []
    for i in range(EPISODE_LENGTH):
        random_action = random.choice(mdp.get_possible_actions(s))
        
        s, r, done, _ = mdp.step(random_action)
        rewards.append(r)
    
    avg_rewards.append(np.mean(rewards))

print("Average reward for random policy: ", np.mean(avg_rewards))

Average reward for random policy:  0.9424275645978695


## Save data

In [33]:
mdp_data.to_csv(data_dir + 'mdp_data_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)
mdp_episodes.to_csv(data_dir + 'mdp_episodes_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)

In [34]:
# num_states = 6
# num_actions = 4

# mdp_data = pd.read_csv(data_dir + 'mdp_data_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)
# mdp_episodes = pd.read_csv(data_dir + 'mdp_episodes_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)

# Evaluate on test set

### Set up test data

In [40]:
test_data_dir = './rl_data/test/' + str(TIME_PERIOD) + '_month_period/'

test_mdp_data = pd.read_csv('./jmp/test/all_data-' + str(TIME_PERIOD) + '_month_period.csv')
print('Test data:', test_mdp_data.shape)

Test data: (2409175, 40)


In [41]:
test_mdp_data['state'] = test_mdp_data['State_Cluster_' + str(num_states)]

In [42]:
# PACKAGE_COST = -0.68 # Cost (negative reward) for taking each action
# FIXME: different costs for different actions

# Change actions to numbers
ACTION_0 = 'no_action'
ACTION_1 = 'labels_only'
ACTION_2 = 'thank_you_with_labels'
ACTION_3 = 'calendars_with_stickers'
ACTION_4 = 'blank_cards_with_labels'
ACTION_5 = 'greeting_cards_with_labels'
ACTION_6 = 'labels_and_notepad'

ACTIONS = {
    ACTION_0: 0,
    ACTION_1: 1,
    ACTION_2: 2,
    ACTION_3: 3,
    ACTION_4: 4,
    ACTION_5: 5,
    ACTION_6: 6,
}

ACTION_NAMES = {
    0: ACTION_0,
    1: ACTION_1,
    2: ACTION_2,
    3: ACTION_3,
    4: ACTION_4,
    5: ACTION_5,
    6: ACTION_6,
}

test_mdp_data.loc[test_mdp_data['action'] == ACTION_0, 'action'] = ACTIONS[ACTION_0]
test_mdp_data.loc[test_mdp_data['action'] == ACTION_1, 'action'] = ACTIONS[ACTION_1]
test_mdp_data.loc[test_mdp_data['action'] == ACTION_2, 'action'] = ACTIONS[ACTION_2]
test_mdp_data.loc[test_mdp_data['action'] == ACTION_3, 'action'] = ACTIONS[ACTION_3]
test_mdp_data.loc[test_mdp_data['action'] == ACTION_4, 'action'] = ACTIONS[ACTION_4]
test_mdp_data.loc[test_mdp_data['action'] == ACTION_5, 'action'] = ACTIONS[ACTION_5]
test_mdp_data.loc[test_mdp_data['action'] == ACTION_6, 'action'] = ACTIONS[ACTION_6]

In [43]:
display(test_mdp_data)

Unnamed: 0,id,action,reward,bloc,date,age,income_bracket,Type_Cluster_3,Type_Cluster_5,Type_Cluster_10,...,Cluster 1 Components,Cluster 2 Components,Cluster 3 Components,Cluster 4 Components,Cluster 5 Components,State_Cluster_4,State_Cluster_6,State_Cluster_9,State_Cluster_11,state
0,188946,5,0.00,0,09-1994,81,1,1,2,3,...,-0.926485,-0.739844,-0.501068,0.854727,-1.800008,1,2,3,4,1
1,188946,6,-0.68,1,10-1994,81,1,1,2,3,...,-0.926485,-0.755040,-0.501068,-1.685633,-1.800008,1,2,3,5,1
2,188946,0,24.00,2,12-1994,81,1,1,2,3,...,-0.926485,-0.393034,-0.737508,-0.487073,1.342906,0,1,2,3,0
3,188946,6,-0.68,3,02-1995,81,1,1,2,3,...,5.648679,-0.413584,-0.828984,-0.400118,1.317706,3,5,9,11,3
4,188946,2,-0.68,4,04-1995,81,1,1,2,3,...,4.392178,-0.430255,-0.824365,0.767771,1.292507,3,5,9,11,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2409170,123497,0,0.00,20,12-1997,35,3,3,5,10,...,-0.926485,0.359419,-0.592593,-0.430789,1.045474,0,0,1,1,0
2409171,123497,0,0.00,21,02-1998,35,3,3,5,10,...,-0.926485,0.359419,-0.498741,-1.368910,1.020275,0,0,1,1,0
2409172,123497,0,0.00,22,04-1998,35,3,3,5,10,...,-0.926485,0.359419,-0.404888,-1.368910,0.995076,0,0,1,1,0
2409173,123497,0,0.00,23,06-1998,35,3,3,5,10,...,-0.926485,0.359419,-0.311035,-1.368910,0.969877,0,0,1,1,0


### Evaluate RL policy on test set

In [45]:
test_mdp_episodes = test_mdp_data[['id', 'action', 'reward', 'bloc', 'state']]

test_sample_episodes = test_mdp_episodes
display(test_sample_episodes)

Unnamed: 0,id,action,reward,bloc,state
0,188946,5,0.00,0,1
1,188946,6,-0.68,1,1
2,188946,0,24.00,2,0
3,188946,6,-0.68,3,3
4,188946,2,-0.68,4,3
...,...,...,...,...,...
2409170,123497,0,0.00,20,0
2409171,123497,0,0.00,21,0
2409172,123497,0,0.00,22,0
2409173,123497,0,0.00,23,0


In [44]:
print('Num states =', num_states)
print('Num actions =', num_actions)

Num states = 4
Num actions = 7


In [49]:
transitions_dict = pickle.load(open(data_dir + 'mdp_transitions_dict_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))
rewards_dict = pickle.load(open(data_dir + 'mdp_rewards_dict_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))
state_values = pickle.load(open(data_dir + 'mdp_state_values_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.p', "rb"))

In [None]:
initial_states = np.arange(num_states)
mdp = MDP(transitions_dict, rewards_dict, initial_state=random.choice(initial_states), seed=1)

In [50]:
# Measure agent's average reward following original data policy

print("[Test set] Average reward for original dataset policy: ", np.mean(test_sample_episodes['reward']))

[Test set] Average reward for original dataset policy:  0.5396465678084823


In [51]:
# Measure agent's average reward following RL optimal policy
start_time = time.time()

avg_rewards = []
for i, row in test_sample_episodes.iterrows():
    if row['bloc'] != 0:
        continue
    
    if i % 200000 == 0:
        print('========================== Processed', i, 'rows (', i / len(test_sample_episodes) * 100,'%)==========================', (time.time() - start_time) / 60, 'mins')
  
    s = row['state']
    mdp.reset(s) # Set start state to `s`

    r = get_average_reward(s, mdp, state_values, gamma)
    avg_rewards.append(r)

print("[Test set] Average reward for optimal RL policy: ", np.mean(avg_rewards))

[Test set] Average reward for optimal RL policy:  3.5870054323020537


In [52]:
# Measure agent's average reward following random policy
start_time = time.time()

avg_rewards = []

for i, row in test_sample_episodes.iterrows():
    if row['bloc'] != 0:
        continue
    
    if i % 200000 == 0:
        print('========================== Processed', i, 'rows (', i / len(test_sample_episodes) * 100,'%)==========================', (time.time() - start_time) / 60, 'mins')
  
    
    s = row['state']
    mdp.reset(state=s) # Set start state to `s`

    rewards = []
    for i in range(EPISODE_LENGTH):
        random_action = random.choice(mdp.get_possible_actions(s))
        
        s, r, done, _ = mdp.step(random_action)
        rewards.append(r)
    
    avg_rewards.append(np.mean(rewards))

print("[Test set] Average reward for random policy: ", np.mean(avg_rewards))

[Test set] Average reward for random policy:  0.9416974307313686


In [54]:
test_mdp_data.to_csv(test_data_dir + 'mdp_data_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)
test_mdp_episodes.to_csv(test_data_dir + 'mdp_episodes_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)

In [None]:
# num_states = 6
# num_actions = 4

# test_mdp_data = pd.read_csv(test_data_dir + 'mdp_data_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)
# test_mdp_episodes = pd.read_csv(test_data_dir + 'mdp_episodes_nS=' + str(num_states) + '_nA=' + str(num_actions) + '.csv', index=False)