In [1]:
# Step 1: Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import random

In [2]:
# Step 2: Define the environment class
class EcommercePricingEnv:
    def __init__(self, base_price, max_price, demand_sensitivity):
        self.base_price = base_price
        self.max_price = max_price
        self.demand_sensitivity = demand_sensitivity
        self.current_price = base_price
        self.state = [self.current_price, random.uniform(0.5, 1.5)]  # [price, market_trend_factor]
        self.total_profit = 0

    def reset(self):
        self.current_price = self.base_price
        self.state = [self.current_price, random.uniform(0.5, 1.5)]
        self.total_profit = 0
        return self.state

    def step(self, action):
        # Adjust price based on action (e.g., -1: decrease, 0: no change, 1: increase)
        self.current_price = max(min(self.current_price + action, self.max_price), self.base_price)

        # Simulate demand based on price and market trend factor
        market_trend_factor = self.state[1]
        demand = max(0, market_trend_factor * (self.max_price - self.current_price) * self.demand_sensitivity)

        # Calculate profit
        profit = demand * self.current_price
        self.total_profit += profit

        # Update state with new price and market trend factor
        self.state = [self.current_price, random.uniform(0.5, 1.5)]
        
        # Define a reward as the profit achieved
        reward = profit

        # Return state, reward, and a flag indicating if episode is over
        done = self.total_profit > 100000  # Example: Stop after a certain profit threshold
        return self.state, reward, done

In [3]:
# Step 3: Define the agent class
class RLAgent:
    def __init__(self, action_space):
        self.action_space = action_space
        self.q_table = {}
        self.learning_rate = 0.1
        self.discount_factor = 0.9
        self.exploration_rate = 1.0
        self.exploration_decay = 0.99

    def choose_action(self, state):
        state_tuple = tuple(state)
        if state_tuple not in self.q_table:
            self.q_table[state_tuple] = {a: 0 for a in self.action_space}

        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(self.action_space)
        return max(self.q_table[state_tuple], key=self.q_table[state_tuple].get)

    def update_q_value(self, state, action, reward, next_state):
        state_tuple = tuple(state)
        next_state_tuple = tuple(next_state)

        if state_tuple not in self.q_table:
            self.q_table[state_tuple] = {a: 0 for a in self.action_space}
        if next_state_tuple not in self.q_table:
            self.q_table[next_state_tuple] = {a: 0 for a in self.action_space}

        max_next_q = max(self.q_table[next_state_tuple].values())
        self.q_table[state_tuple][action] += self.learning_rate * (
            reward + self.discount_factor * max_next_q - self.q_table[state_tuple][action]
        )

    def decay_exploration(self):
        self.exploration_rate *= self.exploration_decay

In [4]:
# Step 4: Train the model
env = EcommercePricingEnv(base_price=10, max_price=100, demand_sensitivity=0.05)
agent = RLAgent(action_space=[-1, 0, 1])

episodes = 1000
for episode in range(episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = agent.choose_action(state)
        next_state, reward, done = env.step(action)

        agent.update_q_value(state, action, reward, next_state)
        state = next_state
        total_reward += reward

        if done:
            break

    agent.decay_exploration()
    print(f"Episode {episode + 1}: Total Profit = {env.total_profit:.2f}, Exploration Rate = {agent.exploration_rate:.2f}")


Episode 1: Total Profit = 100005.49, Exploration Rate = 0.99
Episode 2: Total Profit = 100113.79, Exploration Rate = 0.98
Episode 3: Total Profit = 100037.66, Exploration Rate = 0.97
Episode 4: Total Profit = 100038.43, Exploration Rate = 0.96
Episode 5: Total Profit = 100020.17, Exploration Rate = 0.95
Episode 6: Total Profit = 100016.63, Exploration Rate = 0.94
Episode 7: Total Profit = 100014.89, Exploration Rate = 0.93
Episode 8: Total Profit = 100036.76, Exploration Rate = 0.92
Episode 9: Total Profit = 100019.74, Exploration Rate = 0.91
Episode 10: Total Profit = 100056.02, Exploration Rate = 0.90
Episode 11: Total Profit = 100003.06, Exploration Rate = 0.90
Episode 12: Total Profit = 100038.62, Exploration Rate = 0.89
Episode 13: Total Profit = 100024.86, Exploration Rate = 0.88
Episode 14: Total Profit = 100049.97, Exploration Rate = 0.87
Episode 15: Total Profit = 100022.88, Exploration Rate = 0.86
Episode 16: Total Profit = 100038.65, Exploration Rate = 0.85
Episode 17: Total

In [5]:
# Step 5: Evaluate the agent
print("Training complete. Evaluating the agent...")
state = env.reset()

total_reward = 0
while True:
    action = agent.choose_action(state)
    next_state, reward, done = env.step(action)
    state = next_state
    total_reward += reward

    if done:
        break

print(f"Total Profit during evaluation: {env.total_profit:.2f}")


Training complete. Evaluating the agent...
Total Profit during evaluation: 100014.94
