# In [None]:

# Cell 1: Imports
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import spaces
import random
import torch
import torch.nn as nn
import torch.optim as optim
import copy
import pandas as pd

# Cell 2: Ornstein-Uhlenbeck Noise
class OrnsteinUhlenbeckNoise:
    """Temporally correlated exploration noise (discretised Ornstein-Uhlenbeck process).

    Every call advances the internal state by
    ``theta * (mu - state) + sigma * N(0, I)`` and returns the new state.

    Args:
        mu: Long-run mean the process is pulled towards; its shape fixes the
            shape of every sample. Array-likes of any numeric dtype are accepted.
        theta: Strength of the pull towards ``mu``.
        sigma: Scale of the Gaussian perturbation added at each step.
    """

    def __init__(self, mu, theta=0.15, sigma=0.2):
        # Force a float array: with an integer ``mu`` the state would inherit an
        # integer dtype and could not absorb the float increments.
        self.mu = np.asarray(mu, dtype=float)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def __call__(self):
        """Advance the process one step and return a fresh copy of the state."""
        drift = self.theta * (self.mu - self.state)
        diffusion = self.sigma * np.random.randn(*self.mu.shape)
        self.state = self.state + drift + diffusion
        # Hand out a copy so callers can neither observe later steps through
        # an old sample nor corrupt the process by editing the returned array.
        return self.state.copy()

    def reset(self):
        """Restart the process from an all-zero state."""
        self.state = np.zeros_like(self.mu)

# Cell 3: Actor and Critic Networks
class Actor(nn.Module):
    """Policy network mapping one agent's observation to an action.

    Architecture: Linear(obs_dim, 128) -> ReLU -> Linear(128, act_dim) -> Tanh,
    so every output component lies in [-1, 1].

    Args:
        obs_dim: Size of the observation vector fed to the first layer.
        act_dim: Number of action components produced.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(obs_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, act_dim),
            nn.Tanh(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the tanh-squashed action for observation tensor ``x``."""
        action = self.fc(x)
        return action

class Critic(nn.Module):
    """Value network producing one scalar per input row.

    Architecture: Linear(input_dim, 128) -> ReLU -> Linear(128, 1), with no
    output activation.

    Args:
        input_dim: Width of the input vector fed to the first layer.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 128
        self.fc = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=1),
        )

    def forward(self, x):
        """Return the unbounded scalar estimate for input tensor ``x``."""
        q_value = self.fc(x)
        return q_value

# Cell 4: Replay Buffer
class ReplayBuffer:
    """Fixed-capacity experience store with uniform random sampling.

    Items are appended until ``max_size`` is reached; after that each new
    item overwrites the oldest one in place, so a push is O(1) instead of
    shifting the whole list. As a consequence ``buffer`` is not in
    chronological order once the capacity has been exceeded.

    Args:
        max_size: Maximum number of experiences kept.

    Raises:
        ValueError: If ``max_size`` is smaller than 1.
    """

    def __init__(self, max_size=100000):
        if max_size < 1:
            raise ValueError("max_size must be a positive integer")
        self.buffer = []
        self.max_size = max_size
        # Index of the oldest stored item; only meaningful once the buffer is full.
        self._write_pos = 0

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def push(self, experience):
        """Store ``experience``, evicting the oldest item when at capacity."""
        if len(self.buffer) < self.max_size:
            self.buffer.append(experience)
            return
        self.buffer[self._write_pos] = experience
        self._write_pos = (self._write_pos + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences chosen uniformly at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored items
                (raised by ``random.sample``).
        """
        return random.sample(self.buffer, batch_size)

# Cell 5: MADDPG Agent
class MADDPGAgent:
    """Multi-agent DDPG learner with per-agent actors and joint-input critics.

    Each agent owns an actor that maps its own observation to an action and a
    critic that scores the concatenation of every agent's observation and
    action. Target copies of all networks are soft-updated with rate ``tau``.

    Replay transitions are tuples ``(obs_list, action_list, reward_list,
    next_obs_list)`` holding one entry per agent. An optional fifth element,
    a per-agent sequence of done flags, disables bootstrapping for terminal
    transitions; transitions without it are treated as non-terminal.

    Args:
        obs_dim: Size of a single agent's observation vector.
        act_dim: Size of a single agent's action vector.
        num_agents: Number of agents sharing the environment.
    """

    def __init__(self, obs_dim, act_dim, num_agents=2):
        self.num_agents = num_agents
        self.actors = [Actor(obs_dim, act_dim) for _ in range(num_agents)]
        # Critics see all observations followed by all actions.
        self.critics = [Critic(obs_dim * num_agents + act_dim * num_agents) for _ in range(num_agents)]
        self.target_actors = [copy.deepcopy(actor) for actor in self.actors]
        self.target_critics = [copy.deepcopy(critic) for critic in self.critics]

        self.actor_opts = [optim.Adam(actor.parameters(), lr=1e-4) for actor in self.actors]
        self.critic_opts = [optim.Adam(critic.parameters(), lr=1e-3) for critic in self.critics]
        self.replay_buffer = ReplayBuffer()
        self.tau = 0.01
        self.gamma = 0.99
        self.noises = [OrnsteinUhlenbeckNoise(mu=np.zeros(act_dim)) for _ in range(num_agents)]

    def reset_noise(self):
        """Restart every agent's exploration-noise process."""
        for noise in self.noises:
            noise.reset()

    def select_action(self, obs_list):
        """Return one noisy action per agent, each clipped to [-1, 1].

        Args:
            obs_list: One observation per agent, in actor order.

        Returns:
            A list of float32 NumPy arrays, one per agent.
        """
        actions = []
        with torch.no_grad():
            for actor, noise, obs in zip(self.actors, self.noises, obs_list):
                obs_tensor = torch.tensor(obs, dtype=torch.float32)
                action = actor(obs_tensor).numpy() + noise().astype(np.float32)
                actions.append(np.clip(action, -1, 1))
        return actions

    @staticmethod
    def _batch(samples, field, agent_idx):
        """Stack ``sample[field][agent_idx]`` over all samples into a float32 tensor."""
        # Going through one contiguous ndarray avoids the slow element-wise
        # path torch takes when given a Python list of ndarrays.
        stacked = np.asarray([s[field][agent_idx] for s in samples], dtype=np.float32)
        return torch.from_numpy(stacked)

    @staticmethod
    def _done_mask(samples, agent_idx):
        """Return a (batch, 1) tensor of done flags; 0 where a sample has none."""
        flags = [float(s[4][agent_idx]) if len(s) > 4 else 0.0 for s in samples]
        return torch.tensor(flags, dtype=torch.float32).unsqueeze(1)

    def _soft_update(self, target_net, source_net):
        """Move ``target_net`` a fraction ``tau`` of the way towards ``source_net``."""
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def update(self, batch_size=64):
        """Run one critic step, one actor step and one target update per agent.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_buffer.buffer) < batch_size:
            return

        samples = self.replay_buffer.sample(batch_size)
        agent_ids = range(self.num_agents)
        obs_n = [self._batch(samples, 0, i) for i in agent_ids]
        act_n = [self._batch(samples, 1, i) for i in agent_ids]
        rew_n = [self._batch(samples, 2, i).unsqueeze(1) for i in agent_ids]
        next_obs_n = [self._batch(samples, 3, i) for i in agent_ids]
        done_n = [self._done_mask(samples, i) for i in agent_ids]

        all_obs = torch.cat(obs_n, dim=1)
        critic_input = torch.cat([all_obs, torch.cat(act_n, dim=1)], dim=1)

        # Target-side tensors never need gradients.
        with torch.no_grad():
            next_acts = [actor(o) for actor, o in zip(self.target_actors, next_obs_n)]
            target_input = torch.cat(
                [torch.cat(next_obs_n, dim=1), torch.cat(next_acts, dim=1)], dim=1
            )

        mse = nn.MSELoss()
        for i in agent_ids:
            # Critic step: regress Q towards the one-step TD target, without
            # bootstrapping past transitions flagged as done.
            with torch.no_grad():
                target_q = self.target_critics[i](target_input)
                td_target = rew_n[i] + self.gamma * (1.0 - done_n[i]) * target_q
            current_q = self.critics[i](critic_input)
            critic_loss = mse(current_q, td_target)

            self.critic_opts[i].zero_grad()
            critic_loss.backward()
            self.critic_opts[i].step()

            # Actor step: swap in this agent's current policy action and
            # ascend the critic's estimate; other agents' actions stay as sampled.
            all_new_acts = act_n.copy()
            all_new_acts[i] = self.actors[i](obs_n[i])
            new_actions = torch.cat(all_new_acts, dim=1)
            actor_loss = -self.critics[i](torch.cat([all_obs, new_actions], dim=1)).mean()

            # The backward pass also leaves gradients on the critic; they are
            # cleared by the critic optimizer's zero_grad() on the next update.
            self.actor_opts[i].zero_grad()
            actor_loss.backward()
            self.actor_opts[i].step()

            self._soft_update(self.target_critics[i], self.critics[i])
            self._soft_update(self.target_actors[i], self.actors[i])

# Cell 6: Budgeting Environment
class BudgetingEnv(gym.Env):
    """Two-agent daily spending simulation over a fixed number of days.

    Each agent starts an episode with a balance of 1000 and happiness of 50.
    Every step each agent chooses a spending fraction in [0, 1], which is
    scaled to at most 100 currency units per day. Spending lowers the balance
    and raises happiness by a random factor.

    Observations, rewards and done flags are dictionaries keyed by agent name.
    An observation has 7 entries: balance, happiness, four uniformly random
    values, and the current day index.
    """

    def __init__(self):
        super().__init__()
        self.num_agents = 2
        self.max_days = 30
        self.agents = ['budgeter', 'impulse_buyer']
        self.observation_space = spaces.Box(low=0, high=1000, shape=(7,), dtype=np.float32)
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.reset()

    def reset(self):
        """Restore starting balances, happiness, day and history; return first observations."""
        self.balances = {'budgeter': 1000.0, 'impulse_buyer': 1000.0}
        self.happiness = {'budgeter': 50.0, 'impulse_buyer': 50.0}
        self.day = 0
        self.history = {agent: {'balance': [], 'happiness': []} for agent in self.agents}
        return {agent: self._get_obs(agent) for agent in self.agents}

    def step(self, actions):
        """Apply one day of spending for every agent.

        Args:
            actions: Mapping from agent name to a length-1 action; the value is
                clipped to the bounds of ``action_space`` before use.

        Returns:
            ``(obs, rewards, dones, info)`` where the first three are dicts
            keyed by agent name and ``info`` is an empty dict.
        """
        low = float(self.action_space.low[0])
        high = float(self.action_space.high[0])
        rewards = {}

        for agent in self.agents:
            # Policies may emit values outside the declared action range
            # (e.g. a tanh output in [-1, 1]); without clipping a negative
            # action would add money and remove happiness.
            fraction = float(np.clip(actions[agent][0], low, high))
            # Never spend more than is left, so the balance stays inside the
            # declared observation bounds (low=0).
            spend = min(fraction * 100, max(self.balances[agent], 0.0))
            self.balances[agent] -= spend
            self.happiness[agent] += spend * random.uniform(0.05, 0.2)
            self.happiness[agent] = min(100, max(0, self.happiness[agent]))

            penalty = (1000 - self.balances[agent]) * 0.01
            rewards[agent] = (
                self.happiness[agent] - 50 - penalty + 0.1 * (self.balances[agent] / 1000)
            )

            self.history[agent]['balance'].append(self.balances[agent])
            self.history[agent]['happiness'].append(self.happiness[agent])

        # Advance the clock before building observations so the returned
        # state carries the new day rather than the one just completed.
        self.day += 1
        episode_over = self.day >= self.max_days
        obs = {agent: self._get_obs(agent) for agent in self.agents}
        dones = {agent: episode_over for agent in self.agents}
        return obs, rewards, dones, {}

    def _get_obs(self, agent):
        """Build the 7-element float32 observation for ``agent``."""
        return np.array([
            self.balances[agent],
            self.happiness[agent],
            random.uniform(10, 50),
            random.uniform(5, 30),
            random.uniform(10, 40),
            random.uniform(20, 80),
            self.day
        ], dtype=np.float32)

# Cell 7: Training + Visualization
env = BudgetingEnv()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
agent = MADDPGAgent(obs_dim, act_dim)

episode_rewards = []
all_results = []

for episode in range(30):
    obs_dict = env.reset()
    obs_list = [obs_dict[agent_name] for agent_name in env.agents]
    # The exploration noise is stateful; restart it so one episode's drift
    # does not carry over into the next.
    for noise in agent.noises:
        noise.reset()
    total_rewards = [0.0] * env.num_agents

    for step in range(env.max_days):
        actions_list = agent.select_action(obs_list)
        actions_dict = dict(zip(env.agents, actions_list))
        next_obs_dict, reward_dict, done_dict, _ = env.step(actions_dict)
        next_obs_list = [next_obs_dict[agent_name] for agent_name in env.agents]
        reward_list = [reward_dict[agent_name] for agent_name in env.agents]
        done_list = [done_dict[agent_name] for agent_name in env.agents]
        # Done flags ride along as a fifth field; readers that only use the
        # first four fields of a transition are unaffected.
        agent.replay_buffer.push((obs_list, actions_list, reward_list, next_obs_list, done_list))
        agent.update()
        obs_list = next_obs_list
        for i, reward in enumerate(reward_list):
            total_rewards[i] += reward
        # Stop stepping once the environment reports the episode is over.
        if all(done_list):
            break

    episode_rewards.append(total_rewards)
    all_results.append({'Episode': episode+1, 'Budgeter': total_rewards[0], 'Impulse Buyer': total_rewards[1]})

# Graphs
# Per-day curves come from env.history, which reset() clears at the start of
# each episode, so these two figures show the final episode only.
for metric, y_label, title in (
    ('happiness', "Happiness", "Happiness over Days"),
    ('balance', "Balance", "Balance over Days"),
):
    plt.figure(figsize=(10, 5))
    # Use a distinct loop name: `agent` is the trained MADDPGAgent and must
    # not be rebound to a string.
    for agent_name in env.agents:
        series = env.history[agent_name][metric]
        plt.plot(range(1, len(series) + 1), series, label=agent_name)
    plt.xlabel("Day")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()

# Number episodes from 1 so the x-axis matches the 'Episode' column below.
episode_numbers = range(1, len(episode_rewards) + 1)
plt.figure(figsize=(10, 5))
plt.plot(episode_numbers, [r[0] for r in episode_rewards], label="Budgeter")
plt.plot(episode_numbers, [r[1] for r in episode_rewards], label="Impulse Buyer")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Rewards per Episode")
plt.legend()
plt.show()

reward_df = pd.DataFrame(all_results)
print(reward_df)
