In [1]:
import numpy as np
import random

class RLRecommender:
    """Tabular Q-learning recommender with an epsilon-greedy policy.

    Rows of ``q_table`` are states (user profiles) and columns are actions
    (products). All randomness is drawn from the module-level ``random``
    generator, so calling ``random.seed(...)`` beforehand makes a run
    reproducible.
    """

    def __init__(self, n_products, n_states, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        """
        Create a recommender with an all-zero Q-table.

        Args:
            n_products: Number of products (actions); must be at least 1.
            n_states: Number of states (user profiles); must be at least 1.
            learning_rate: Alpha, the step size of the Q-learning update.
            discount_factor: Gamma, the weight given to the next state's best value.
            epsilon: Probability of exploring in ``choose_action``.

        Raises:
            ValueError: If ``n_products`` or ``n_states`` is smaller than 1.
        """
        # Fail early: an empty Q-table would only surface later as an obscure
        # error inside argmax / randint.
        if n_products < 1 or n_states < 1:
            raise ValueError(
                f"n_products and n_states must be >= 1, got n_products={n_products}, n_states={n_states}"
            )

        self.n_products = n_products  # Number of products
        self.n_states = n_states  # Number of states (user profiles, for simplicity)
        self.learning_rate = learning_rate  # Alpha (Q-learning parameter)
        self.discount_factor = discount_factor  # Gamma (Q-learning parameter)
        self.epsilon = epsilon  # Exploration rate (epsilon-greedy strategy)

        # Initialize Q-table: rows represent states, columns represent actions (products)
        self.q_table = np.zeros((n_states, n_products))

    def _greedy_action(self, state):
        """Return the product with the highest Q-value for ``state`` as a plain int."""
        # np.argmax returns the first maximum, so ties resolve to the lowest
        # product index. int() keeps the return type identical to the explore branch.
        return int(np.argmax(self.q_table[state]))

    def choose_action(self, state):
        """
        Choose an action using epsilon-greedy strategy.
        With probability epsilon, choose a random product (explore).
        With probability (1 - epsilon), choose the product with the highest Q-value (exploit).

        Returns:
            The chosen product index as a Python int.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Explore: choose a random product
            return random.randint(0, self.n_products - 1)
        # Exploit: choose the product with the highest Q-value
        return self._greedy_action(state)

    def update_q_table(self, state, action, reward, next_state):
        """
        Update the Q-table using the Q-learning update rule:

            Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        """
        best_next_value = self.q_table[next_state].max()
        td_target = reward + self.discount_factor * best_next_value
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] = self.q_table[state, action] + self.learning_rate * td_error

    def get_recommendation(self, state, explore=True):
        """
        Get the recommended product based on the current state.

        Args:
            state: Index of the current state.
            explore: When True (the default) the epsilon-greedy policy is used,
                which is what training needs. Pass False to always get the
                product with the highest learned Q-value, e.g. when serving a
                final recommendation.

        Returns:
            The recommended product index as a Python int.
        """
        if explore:
            return self.choose_action(state)
        return self._greedy_action(state)

    def simulate_interaction(self, state, action, click_probability=0.7):
        """
        Simulate one user interaction and learn from it.

        A click (reward=1) happens with probability ``click_probability``,
        otherwise the reward is 0. The Q-table is then updated for
        ``(state, action)``.

        Args:
            state: Index of the current state.
            action: Index of the product that was recommended.
            click_probability: Chance of a click; defaults to 0.7.

        Returns:
            The simulated reward, 1 or 0.
        """
        # Random feedback simulation. The click chance is the same for every
        # state and product; it does not depend on `action`.
        reward = 1 if random.random() < click_probability else 0
        # Deterministic transition: cycle to the next state, wrapping to 0 at the end.
        next_state = (state + 1) % self.n_states
        self.update_q_table(state, action, reward, next_state)
        return reward


In [2]:

# 1. Define the environment and initial parameters
SEED = 42  # Fixed seed so Restart & Run All reproduces the same Q-tables
n_products = 5  # Number of products
n_states = 3  # Number of different user states (e.g., different user profiles)
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1

# The recommender and the loop below both draw from the global `random`
# generator, so seeding it once here makes the whole run deterministic.
random.seed(SEED)

# 2. Initialize the RL-based recommender system
recommender = RLRecommender(n_products, n_states, learning_rate, discount_factor, epsilon)

# 3. Simulate a few interactions
for episode in range(1000):  # Simulate 1000 interactions
    state = random.randint(0, n_states - 1)  # Random initial state (user profile)
    action = recommender.get_recommendation(state)  # Epsilon-greedy pick for the current state
    reward = recommender.simulate_interaction(state, action)  # Simulate user feedback (click/no-click)

    if episode % 100 == 0:  # Print the Q-table every 100 episodes
        print(f"Episode {episode}, Q-table:")
        print(recommender.q_table)

# 4. Make final recommendations after learning
final_state = 0  # Assume the user is in state 0 (user profile)
# Read the best product straight from the learned Q-values: get_recommendation()
# is epsilon-greedy and would return a random product with probability epsilon.
recommended_product = int(np.argmax(recommender.q_table[final_state]))
print(f"Final recommendation for user state {final_state}: Product {recommended_product}")


Episode 0, Q-table:
[[0.1 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0. ]]
Episode 100, Q-table:
[[2.21274003 0.25487825 0.         0.26479937 0.38362905]
 [2.0881644  0.17753366 0.         0.         0.        ]
 [2.18431355 0.         0.         0.         0.        ]]
Episode 200, Q-table:
[[3.76788232 0.25487825 0.39784667 0.26479937 0.38362905]
 [3.6337533  0.17753366 0.         0.         0.32372187]
 [3.66231761 0.84645774 0.         0.32409548 0.        ]]
Episode 300, Q-table:
[[4.66473601 0.7169351  0.39784667 0.65363762 0.86058433]
 [4.57185109 0.58938888 0.         0.50359021 1.18605493]
 [4.65188001 1.26060499 0.         0.32409548 0.        ]]
Episode 400, Q-table:
[[5.22442336 1.19518572 0.80800613 0.65363762 0.86058433]
 [5.39722506 1.09811144 0.         0.9097289  1.55844151]
 [5.50230512 1.67877504 0.         0.32409548 0.        ]]
Episode 500, Q-table:
[[5.52923749 1.67910643 0.80800613 0.65363762 0.86058433]
 [5.64346574 1.09811144 1.12826973 0.90972