In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import random
import numpy as np
import gym
import torch
import json
import time
import warnings
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import google.generativeai as genai
from typing import List, Dict, Tuple

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ==================================
# 📌 GEMINI API INTEGRATION
# ==================================
class GeminiAnalyzer:
    def __init__(self, api_key: str):
        """Initialize Gemini API for strategy analysis"""
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.0-flash')
        
    def analyze_strategy_evolution(self, agent_id: str, move_history: List[str], 
                                 opponent_history: List[str], probabilities: List[float],
                                 rewards: List[float], opponent_name: str) -> str:
        """Analyze how the PPO agent's strategy evolves over time with enhanced strategy detection"""
        
        # Add rate limiting to respect API limits
        time.sleep(0.5)
        
        # Calculate additional metrics for strategy analysis
        recent_moves = move_history[-20:] if len(move_history) >= 20 else move_history
        recent_opp_moves = opponent_history[-20:] if len(opponent_history) >= 20 else opponent_history
        
        # Pattern analysis
        consecutive_cooperations = 0
        consecutive_defections = 0
        max_coop_streak = 0
        max_defect_streak = 0
        current_coop_streak = 0
        current_defect_streak = 0
        
        for move in move_history:
            if move == 'C':
                current_coop_streak += 1
                current_defect_streak = 0
                max_coop_streak = max(max_coop_streak, current_coop_streak)
            else:
                current_defect_streak += 1
                current_coop_streak = 0
                max_defect_streak = max(max_defect_streak, current_defect_streak)
        
        # Tit-for-tat analysis
        tit_for_tat_matches = 0
        if len(move_history) > 1 and len(opponent_history) > 0:
            for i in range(1, min(len(move_history), len(opponent_history))):
                if move_history[i] == opponent_history[i-1]:
                    tit_for_tat_matches += 1
        
        tit_for_tat_percentage = (tit_for_tat_matches / max(1, len(move_history)-1)) * 100
        
        # Response to opponent's last move
        responses_to_coop = [move_history[i] for i in range(len(move_history)) if i > 0 and opponent_history[i-1] == 'C']
        responses_to_defect = [move_history[i] for i in range(len(move_history)) if i > 0 and opponent_history[i-1] == 'D']
        
        coop_after_coop = responses_to_coop.count('C') / max(1, len(responses_to_coop)) * 100
        coop_after_defect = responses_to_defect.count('C') / max(1, len(responses_to_defect)) * 100
        
        # Create enhanced analysis prompt
        prompt = f"""
        You are analyzing a PPO reinforcement learning agent playing the Prisoner's Dilemma. Your task is to identify what strategy the agent has learned and how it's adapting.

        GAME CONTEXT:
        - Agent ID: {agent_id}
        - Opponent Strategy: {opponent_name}
        - Total Moves Played: {len(move_history)}
        
        MOVE SEQUENCES (most recent 30 moves):
        Agent:    {' '.join(move_history[-30:])}
        Opponent: {' '.join(opponent_history[-30:])}
        
        BEHAVIORAL METRICS:
        - Agent Cooperation Rate: {move_history.count('C') / len(move_history) * 100:.1f}%
        - Opponent Cooperation Rate: {opponent_history.count('C') / len(opponent_history) * 100:.1f}%
        - Tit-for-Tat Adherence: {tit_for_tat_percentage:.1f}%
        - Cooperate after Opponent Cooperates: {coop_after_coop:.1f}%
        - Cooperate after Opponent Defects: {coop_after_defect:.1f}%
        - Max Cooperation Streak: {max_coop_streak}
        - Max Defection Streak: {max_defect_streak}
        - Current Streak: {current_coop_streak if move_history[-1] == 'C' else current_defect_streak} {'cooperations' if move_history[-1] == 'C' else 'defections'}
        
        DECISION PROBABILITIES (last 15):
        Cooperation Probabilities: {[f'{p:.3f}' for p in probabilities[-15:]]}
        
        PERFORMANCE METRICS:
        - Recent Rewards (last 15): {[f'{r:.1f}' for r in rewards[-15:]]}
        - Average Recent Reward: {np.mean(rewards[-20:]) if len(rewards) >= 20 else np.mean(rewards):.2f}
        - Reward Trend: {'Improving' if len(rewards) >= 10 and np.mean(rewards[-5:]) > np.mean(rewards[-10:-5]) else 'Declining' if len(rewards) >= 10 else 'Establishing'}
        
        STRATEGIC ANALYSIS QUESTIONS:
        1. **Primary Strategy Identification**: What well-known strategy does this agent most closely resemble? 
           Consider: Tit-for-Tat, Always Cooperate, Always Defect, Generous Tit-for-Tat, Random etc.
        
        2. **Adaptation Pattern**: How is the agent modifying its approach based on the opponent's behavior (the agent  has already been trained)?
        
        3. **Learning Evidence**: What evidence shows the agent is learning vs. following a fixed strategy?
        
        4. **Exploitation vs Cooperation**: Is the agent being exploitative, cooperative, or strategic?
        
        5. **Prediction**: Based on current patterns, how will the agent likely behave in the next 10 moves?
        
        6. **Strategy Effectiveness**: How well is this learned strategy performing against {opponent_name}?
        
        Please provide a concise but comprehensive analysis (max 250 words) focusing on strategy identification and learning patterns.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Gemini analysis temporarily unavailable: {str(e)[:100]}..."
    
    def compare_strategies(self, results: Dict[str, Dict]) -> str:
        """Compare PPO performance across different opponent strategies with enhanced analysis"""
        
        # Add rate limiting
        time.sleep(1.0)
        
        # Calculate cross-strategy metrics
        best_performance = max(results.values(), key=lambda x: x['total_reward'])
        worst_performance = min(results.values(), key=lambda x: x['total_reward'])
        
        prompt = f"""
        Analyze the PPO agent's learned strategy by comparing its performance across different Prisoner's Dilemma opponents.

        PERFORMANCE SUMMARY:
        {json.dumps(results, indent=2)}
        
        COMPARATIVE ANALYSIS:
        - Best Performance: {[k for k, v in results.items() if v['total_reward'] == best_performance['total_reward']][0]} (Score: {best_performance['total_reward']:.1f})
        - Worst Performance: {[k for k, v in results.items() if v['total_reward'] == worst_performance['total_reward']][0]} (Score: {worst_performance['total_reward']:.1f})
        - Performance Range: {best_performance['total_reward'] - worst_performance['total_reward']:.1f} points
        
        STRATEGIC QUESTIONS TO ANALYZE:
        
        1. **Overall Strategy Classification**: Based on all matchups, what is the PPO agent's primary learned strategy?
           - Is it closer to Tit-for-Tat, Always Defect, Generous strategies, or something unique?
        
        2. **Opponent-Specific Adaptations**: How does the agent's cooperation rate change based on opponent type?
           - Against cooperative opponents vs. aggressive opponents
           - Does it show strategic flexibility or fixed behavior?
        
        3. **Exploitation Patterns**: 
           - Which opponents does it successfully exploit?
           - Which opponents exploit it?
           - What does this reveal about its learned weaknesses/strengths?
        
        4. **Learning Sophistication**: 
           - Does the agent show sophisticated counter-strategies?
           - Or is it using simple heuristics?
        
        5. **Strategic Robustness**: 
           - Is this a well-rounded strategy or does it have clear vulnerabilities?
           - How would it perform against novel opponents?
        
        6. **Reward Optimization**: 
           - Is the agent maximizing mutual benefit or self-interest?
           - How does this align with game theory predictions?
        
        Provide comprehensive strategic insights about what the PPO agent has learned and how sophisticated its strategy is (max 400 words).
        Focus on identifying the core strategic principles it has discovered through reinforcement learning.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Gemini comparison temporarily unavailable: {str(e)[:100]}..."

# ==================================
# 📌 ENHANCED PRISONER'S DILEMMA ENVIRONMENT
# ==================================
class PrisonersDilemmaEnv(gym.Env):
    def __init__(self, strategy, agent_id="PPO_AGENT", history_len=10):
        super(PrisonersDilemmaEnv, self).__init__()
        self.history_len = history_len
        self.agent_id = agent_id
        self.action_space = gym.spaces.Discrete(2)  # 0: Cooperate, 1: Defect
        self.observation_space = gym.spaces.Box(low=0, high=5, shape=(3 * self.history_len,), dtype=np.float32)

        self.strategy = strategy
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        
        # Enhanced tracking for Gemini analysis
        self.full_agent_history = []
        self.full_opponent_history = []
        self.full_reward_history = []
        self.action_probabilities = []

    def step(self, action):
        opponent_action = 0 if self.strategy(self.agent_history, self.opponent_history, False) == 'C' else 1
        reward = self.get_reward(action, opponent_action)

        # Update histories
        self.agent_history.append(action)
        self.opponent_history.append(opponent_action)
        self.reward_history.append(reward)
        
        # Update full histories for analysis
        self.full_agent_history.append(action)
        self.full_opponent_history.append(opponent_action)
        self.full_reward_history.append(reward)

        # Maintain sliding window
        if len(self.agent_history) > self.history_len:
            self.agent_history.pop(0)
            self.opponent_history.pop(0)
            self.reward_history.pop(0)

        obs = self._get_obs()
        return obs, reward, False, {}

    def reset(self):
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        return self._get_obs()

    def _get_obs(self):
        pad = lambda x: x + [0] * (self.history_len - len(x))
        return np.array(pad(self.agent_history) + pad(self.opponent_history) + pad(self.reward_history), dtype=np.float32)

    def get_reward(self, action, opponent_action):
        # Classic Prisoner's Dilemma payoff matrix
        if action == 0 and opponent_action == 0:      # Both cooperate
            return 3
        elif action == 1 and opponent_action == 1:    # Both defect
            return 1
        elif action == 0 and opponent_action == 1:    # Agent cooperates, opponent defects
            return 0
        else:                                         # Agent defects, opponent cooperates
            return 5
    
    def add_action_probability(self, prob):
        """Store action probability for analysis"""
        self.action_probabilities.append(prob)

# ==================================
# 📌 STRATEGIES (with proper conversion)
# ==================================
def to_symbolic(hist):
    return ['C' if a == 0 else 'D' for a in hist]

def tit_for_tat(agent_hist, opp_hist, __):
    return 'C' if not agent_hist else ('C' if agent_hist[-1] == 0 else 'D')

def tit_for_two_tats(agent_hist, opp_hist, __):
    if len(agent_hist) >= 2 and agent_hist[-2:] == [1, 1]:
        return 'D'
    return 'C'

def generous_tit_for_tat(agent_hist, opp_hist, __, cooperation_prob=0.1):
    if not agent_hist:
        return 'C'
    if agent_hist[-1] == 1:
        return 'C' if random.random() < cooperation_prob else 'D'
    return 'C'

def always_cooperate(*args): return 'C'
def always_defect(*args): return 'D'
def random_strategy(*args): return random.choice(['C', 'D'])
def friedman(agent_hist, opp_hist, __): return 'D' if 1 in agent_hist else 'C'

def joss(agent_hist, opp_hist, __):
    if agent_hist and random.random() < 0.1:
        return 'D'
    if agent_hist: 
        return 'C' if agent_hist[-1] == 0 else 'D'
    return 'C'

def grasskamp(agent_hist, opp_hist, __):
    return 'D' if len(agent_hist) >= 3 and agent_hist[-3:] == [1, 1, 1] else 'C'

def sample(agent_hist, opp_hist, __): return random.choice(['C', 'C', 'D'])
def tester(agent_hist, opp_hist, __): return 'D' if not agent_hist else 'C'
def opportunist(agent_hist, opp_hist, __): return 'D' if agent_hist and agent_hist[-1] == 0 else 'C'
def backstabber(agent_hist, opp_hist, __): return 'C' if len(agent_hist) < 5 else 'D'
def chaos_agent(*args): return random.choice(['C', 'D'])

# ==================================
# 📌 TRAIN PPO AGAINST STRATEGIES
# ==================================
def train_ppo(history_len=10):
    strategies = {
        "tit_for_tat": tit_for_tat,
        "generous_tit_for_tat": generous_tit_for_tat,
        "tit_for_two_tats": tit_for_two_tats,
        "always_defect": always_defect,
        "always_cooperate": always_cooperate,
    }

    envs = [lambda s=s, i=i: PrisonersDilemmaEnv(s, agent_id=f"PPO_vs_{name}", history_len=history_len)
            for i, (name, s) in enumerate(strategies.items())]
    vec_env = DummyVecEnv(envs)

    model = PPO("MlpPolicy", vec_env, verbose=1, ent_coef=0.01)
    model.learn(total_timesteps=250000)
    model.save("ppo_all_strategies_full_history")
    return model

# ==================================
# 📌 ENHANCED TEST WITH DETAILED LOGGING AND GEMINI ANALYSIS
# ==================================
def test_ppo_with_gemini(gemini_api_key: str, history_len=10, print_steps=True, analyze_every=100, num_episodes=500):
    """Test PPO with detailed step logging and Gemini analysis integration"""
    
    strategies = {
        "tit_for_tat": tit_for_tat,
        "generous_tit_for_tat": generous_tit_for_tat,
        "tit_for_two_tats": tit_for_two_tats,
        "always_defect": always_defect,
        "always_cooperate": always_cooperate,
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    print("\n🔹 Loading PPO General Model")
    model = PPO.load("ppo_all_strategies_full_history")
    results = {}
    detailed_results = {}

    for name, strategy in strategies.items():
        print(f"\n{'='*60}")
        print(f"🧪 TESTING PPO AGAINST: {name.upper()}")
        print(f"{'='*60}")
        
        agent_id = f"PPO_vs_{name}"
        env = DummyVecEnv([lambda s=strategy, aid=agent_id: PrisonersDilemmaEnv(s, agent_id=aid, history_len=history_len)])
        obs = env.reset()
        total_reward = 0
        
        # Detailed logging header
        if print_steps:
            print(f"\n📋 DETAILED STEP-BY-STEP LOG:")
            print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
            print("-" * 80)

        for i in range(num_episodes):
            # Get action and probabilities more robustly
            action, _ = model.predict(obs, deterministic=False)
            
            # Get action probabilities with proper device handling
            try:
                # Ensure tensor is on the same device as the model
                device = next(model.policy.parameters()).device
                obs_tensor = torch.FloatTensor(obs).to(device)
                
                with torch.no_grad():
                    # Extract features and get logits
                    features = model.policy.extract_features(obs_tensor)
                    latent_pi = model.policy.mlp_extractor.forward_actor(features)
                    logits = model.policy.action_net(latent_pi)
                    action_probs = torch.softmax(logits, dim=-1)
                    prob_cooperate = float(action_probs[0][0].cpu())
            except Exception as e:
                # Fallback: use a simple heuristic based on recent actions
                prob_cooperate = 0.5  # Default probability
                if i == 0:  # Only print warning once
                    print(f"Warning: Using default probability due to: {str(e)[:50]}...")
            
            # Take step
            obs, reward, done, _ = env.step(action)
            action_val = int(action[0])
            
            # Store probability
            env.envs[0].add_action_probability(prob_cooperate)
            
            opponent_action = env.envs[0].opponent_history[-1]
            total_reward += reward[0]
            
            # Detailed step logging
            if print_steps:
                # Determine notes for interesting patterns
                notes = ""
                if i > 0:
                    prev_opp = env.envs[0].full_opponent_history[-2] if len(env.envs[0].full_opponent_history) >= 2 else None
                    if prev_opp is not None:
                        if action_val == prev_opp:
                            notes += "TFT? "
                        if action_val == 0 and opponent_action == 1:
                            notes += "EXPLOITED "
                        elif action_val == 1 and opponent_action == 0:
                            notes += "EXPLOITING "
                
                print(f"{i+1:>3d} | {'C' if action_val==0 else 'D':>5} | {'C' if opponent_action==0 else 'D':>8} | "
                      f"{reward[0]:>6.1f} | {total_reward:>10.1f} | {prob_cooperate:>8.3f} | {notes}")
            
            # Periodic Gemini analysis
            if (i + 1) % analyze_every == 0:
                print(f"\n{'🤖 GEMINI ANALYSIS':<20} (Step {i+1})")
                print("-" * 60)
                
                agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
                opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
                
                analysis = analyzer.analyze_strategy_evolution(
                    agent_id=agent_id,
                    move_history=agent_moves,
                    opponent_history=opponent_moves,
                    probabilities=env.envs[0].action_probabilities,
                    rewards=env.envs[0].full_reward_history,
                    opponent_name=name
                )
                print(analysis)
                print("-" * 60)
                
                if print_steps:
                    print(f"\n📋 CONTINUING STEP LOG:")
                    print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
                    print("-" * 80)

        # Store detailed results
        agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
        opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
        
        detailed_results[name] = {
            'total_reward': float(total_reward),
            'average_reward': float(total_reward / num_episodes),
            'cooperation_rate': agent_moves.count('C') / len(agent_moves),
            'opponent_cooperation_rate': opponent_moves.count('C') / len(opponent_moves),
            'avg_reward': float(np.mean(env.envs[0].full_reward_history)),
            'final_probabilities': env.envs[0].action_probabilities[-10:] if len(env.envs[0].action_probabilities) >= 10 else env.envs[0].action_probabilities,
            'agent_id': agent_id,
            'total_episodes': num_episodes
        }
        
        results[name] = total_reward
        
        # Summary for this opponent
        print(f"\n📊 MATCH SUMMARY:")
        print(f"Agent: {agent_id}")
        print(f"Total Reward: {total_reward:.1f}")
        print(f"Average Reward per Episode: {total_reward/num_episodes:.2f}")
        print(f"Agent Cooperation Rate: {detailed_results[name]['cooperation_rate']:.1%}")
        print(f"Opponent Cooperation Rate: {detailed_results[name]['opponent_cooperation_rate']:.1%}")

    # Final comprehensive analysis
    print("\n" + "="*80)
    print("🧠 COMPREHENSIVE GEMINI STRATEGY ANALYSIS")
    print("="*80)
    
    final_analysis = analyzer.compare_strategies(detailed_results)
    print(final_analysis)
    
    print("\n" + "="*80)
    print("📊 FINAL PPO PERFORMANCE SUMMARY")
    print("="*80)
    for strategy, score in results.items():
        details = detailed_results[strategy]
        print(f"{strategy:>20}: {score:>6.1f} pts | Avg: {details['average_reward']:>5.2f} | "
              f"Coop: {details['cooperation_rate']:>5.1%} | "
              f"vs {details['opponent_cooperation_rate']:>5.1%} opp")
    
    return results, detailed_results

# ==================================
# 📌 MAIN EXECUTION WITH GEMINI
# ==================================
def main():
    # You need to set your Gemini API key here
    GEMINI_API_KEY = ""  # Replace with your actual API key
    
    if GEMINI_API_KEY == "your_gemini_api_key_here":
        print("⚠️  Please set your Gemini API key in the GEMINI_API_KEY variable")
        print("You can get one from: https://makersuite.google.com/app/apikey")
        return
    
    print("🚀 Training PPO Agent...")
    train_ppo()
    
    print("\n🧪 Testing PPO with Enhanced Analysis...")
    results, detailed_results = test_ppo_with_gemini(
        gemini_api_key=GEMINI_API_KEY,
        print_steps=True,       # Enable detailed step-by-step logging
        analyze_every=100,      # Analyze every 100 steps
        num_episodes=500        # Number of episodes per opponent
    )

# ==================================
# 🚀 RUN TRAINING & TESTING
# ==================================
if __name__ == "__main__":
    main()

🚀 Training PPO Agent...
Using cpu device
------------------------------
| time/              |       |
|    fps             | 4752  |
|    iterations      | 1     |
|    time_elapsed    | 2     |
|    total_timesteps | 10240 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1965        |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.008374095 |
|    clip_fraction        | 0.0428      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.689      |
|    explained_variance   | -0.0236     |
|    learning_rate        | 0.0003      |
|    loss                 | 170         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00363    |
|    value_loss           | 802         |
----------------------------

In [1]:
import random
import numpy as np
import gym
import torch
import json
import time
import warnings
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import google.generativeai as genai
from typing import List, Dict, Tuple

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ==================================
# 📌 GEMINI API INTEGRATION
# ==================================
class GeminiAnalyzer:
    def __init__(self, api_key: str):
        """Initialize Gemini API for strategy analysis"""
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.0-flash')
        
    def analyze_training_progress(self, step: int, total_steps: int, 
                                recent_rewards: List[float], 
                                recent_episodes: Dict[str, List[str]]) -> str:
        """Analyze PPO training progress with streamlined prompt"""
        
        # Add rate limiting
        time.sleep(0.5)
        
        # Calculate training metrics
        avg_reward = np.mean(recent_rewards) if recent_rewards else 0
        reward_trend = "improving" if len(recent_rewards) >= 10 and np.mean(recent_rewards[-5:]) > np.mean(recent_rewards[-10:-5]) else "stable"
        
        prompt = f"""
        Analyze PPO agent training progress in Prisoner's Dilemma:

        TRAINING STATUS:
        - Step: {step:,}/{total_steps:,} ({step/total_steps*100:.1f}% complete)
        - Recent Average Reward: {avg_reward:.2f}
        - Performance Trend: {reward_trend}
        
        RECENT BEHAVIOR SAMPLES:
        """
        
        # Add sample episodes from different strategies
        for strategy_name, episodes in recent_episodes.items():
            if episodes:
                latest_episode = episodes[-1] if episodes else "No data"
                prompt += f"\nVs {strategy_name}: {latest_episode[:50]}{'...' if len(latest_episode) > 50 else ''}"
        
        prompt += f"""
        
        Provide brief insights on:
        1. What strategy is the agent learning?
        2. Is it adapting well across different opponents?
        3. Training progress assessment
        
        Keep response under 150 words, focus on key strategic insights.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Training analysis unavailable: {str(e)[:50]}..."
        
    def analyze_strategy_evolution(self, agent_id: str, move_history: List[str], 
                                 opponent_history: List[str], probabilities: List[float],
                                 rewards: List[float], opponent_name: str) -> str:
        """Analyze how the PPO agent's strategy evolves over time with simplified prompt"""
        
        # Add rate limiting to respect API limits
        time.sleep(0.5)
        
        # Calculate key metrics only
        coop_rate = move_history.count('C') / len(move_history) * 100
        recent_moves = move_history[-20:] if len(move_history) >= 20 else move_history
        
        # Tit-for-tat analysis
        tit_for_tat_matches = 0
        if len(move_history) > 1 and len(opponent_history) > 0:
            for i in range(1, min(len(move_history), len(opponent_history))):
                if move_history[i] == opponent_history[i-1]:
                    tit_for_tat_matches += 1
        
        tit_for_tat_percentage = (tit_for_tat_matches / max(1, len(move_history)-1)) * 100
        
        prompt = f"""
        Analyze PPO agent strategy vs {opponent_name}:

        BEHAVIOR:
        - Moves: {len(move_history)} total
        - Agent: {' '.join(recent_moves)}
        - Opponent: {' '.join(opponent_history[-20:] if len(opponent_history) >= 20 else opponent_history)}
        - Cooperation Rate: {coop_rate:.1f}%
        - Tit-for-Tat Match: {tit_for_tat_percentage:.1f}%
        - Recent Reward: {np.mean(rewards[-10:]) if len(rewards) >= 10 else np.mean(rewards):.2f}

        What strategy has the agent learned? How effective is it against {opponent_name}?
        Keep response under 200 words.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Strategy analysis unavailable: {str(e)[:50]}..."
    
    def compare_strategies(self, results: Dict[str, Dict]) -> str:
        """Compare PPO performance with simplified prompt"""
        
        # Add rate limiting
        time.sleep(1.0)
        
        # Find best and worst performance
        best_performance = max(results.values(), key=lambda x: x['total_reward'])
        worst_performance = min(results.values(), key=lambda x: x['total_reward'])
        
        prompt = f"""
        Final PPO Strategy Assessment:

        PERFORMANCE RESULTS:
        """
        
        for name, data in results.items():
            prompt += f"\n- {name}: {data['total_reward']:.1f} pts, {data['cooperation_rate']*100:.1f}% coop"
        
        prompt += f"""
        
        Best vs: {[k for k, v in results.items() if v['total_reward'] == best_performance['total_reward']][0]}
        Worst vs: {[k for k, v in results.items() if v['total_reward'] == worst_performance['total_reward']][0]}
        
        What core strategy has PPO learned? Is it robust or exploitable? 
        How sophisticated is the learned behavior?
        Keep response under 250 words.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Strategy comparison unavailable: {str(e)[:50]}..."

# ==================================
# 📌 TRAINING CALLBACK FOR GEMINI ANALYSIS
# ==================================
class GeminiTrainingCallback(BaseCallback):
    def __init__(self, gemini_analyzer: GeminiAnalyzer, analysis_interval: int = 25000):
        super().__init__()
        self.gemini_analyzer = gemini_analyzer
        self.analysis_interval = analysis_interval
        self.recent_rewards = []
        self.recent_episodes = {
            'tit_for_tat': [],
            'generous_tit_for_tat': [],
            'tit_for_two_tats': [],
            'always_defect': [],
            'always_cooperate': []
        }
        
    def _on_step(self) -> bool:
        # Collect reward data
        if hasattr(self.training_env, 'get_attr'):
            try:
                rewards = self.training_env.get_attr('full_reward_history')
                for env_rewards in rewards:
                    if env_rewards:
                        self.recent_rewards.extend(env_rewards[-10:])  # Last 10 rewards
                
                # Keep only recent data
                if len(self.recent_rewards) > 100:
                    self.recent_rewards = self.recent_rewards[-100:]
                    
            except:
                pass  # Silently handle if attributes don't exist
        
        # Periodic analysis
        if self.num_timesteps % self.analysis_interval == 0:
            print(f"\n{'🤖 GEMINI TRAINING ANALYSIS':<30} (Step {self.num_timesteps:,})")
            print("-" * 70)
            
            analysis = self.gemini_analyzer.analyze_training_progress(
                step=self.num_timesteps,
                total_steps=self.analysis_interval * 10,  # Assuming 250k total
                recent_rewards=self.recent_rewards,
                recent_episodes=self.recent_episodes
            )
            print(analysis)
            print("-" * 70)
            
        return True

# ==================================
# 📌 ENHANCED PRISONER'S DILEMMA ENVIRONMENT
# ==================================
class PrisonersDilemmaEnv(gym.Env):
    def __init__(self, strategy, agent_id="PPO_AGENT", history_len=10):
        super(PrisonersDilemmaEnv, self).__init__()
        self.history_len = history_len
        self.agent_id = agent_id
        self.action_space = gym.spaces.Discrete(2)  # 0: Cooperate, 1: Defect
        self.observation_space = gym.spaces.Box(low=0, high=5, shape=(3 * self.history_len,), dtype=np.float32)

        self.strategy = strategy
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        
        # Enhanced tracking for Gemini analysis
        self.full_agent_history = []
        self.full_opponent_history = []
        self.full_reward_history = []
        self.action_probabilities = []

    def step(self, action):
        opponent_action = 0 if self.strategy(self.agent_history, self.opponent_history, False) == 'C' else 1
        reward = self.get_reward(action, opponent_action)

        # Update histories
        self.agent_history.append(action)
        self.opponent_history.append(opponent_action)
        self.reward_history.append(reward)
        
        # Update full histories for analysis
        self.full_agent_history.append(action)
        self.full_opponent_history.append(opponent_action)
        self.full_reward_history.append(reward)

        # Maintain sliding window
        if len(self.agent_history) > self.history_len:
            self.agent_history.pop(0)
            self.opponent_history.pop(0)
            self.reward_history.pop(0)

        obs = self._get_obs()
        return obs, reward, False, {}

    def reset(self):
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        return self._get_obs()

    def _get_obs(self):
        pad = lambda x: x + [0] * (self.history_len - len(x))
        return np.array(pad(self.agent_history) + pad(self.opponent_history) + pad(self.reward_history), dtype=np.float32)

    def get_reward(self, action, opponent_action):
        # Classic Prisoner's Dilemma payoff matrix
        if action == 0 and opponent_action == 0:      # Both cooperate
            return 3
        elif action == 1 and opponent_action == 1:    # Both defect
            return 1
        elif action == 0 and opponent_action == 1:    # Agent cooperates, opponent defects
            return 0
        else:                                         # Agent defects, opponent cooperates
            return 5
    
    def add_action_probability(self, prob):
        """Store action probability for analysis"""
        self.action_probabilities.append(prob)

# ==================================
# 📌 STRATEGIES (with proper conversion)
# ==================================
def to_symbolic(hist):
    return ['C' if a == 0 else 'D' for a in hist]

def tit_for_tat(agent_hist, opp_hist, __):
    return 'C' if not agent_hist else ('C' if agent_hist[-1] == 0 else 'D')

def tit_for_two_tats(agent_hist, opp_hist, __):
    if len(agent_hist) >= 2 and agent_hist[-2:] == [1, 1]:
        return 'D'
    return 'C'

def generous_tit_for_tat(agent_hist, opp_hist, __, cooperation_prob=0.1):
    if not agent_hist:
        return 'C'
    if agent_hist[-1] == 1:
        return 'C' if random.random() < cooperation_prob else 'D'
    return 'C'

def always_cooperate(*args): return 'C'
def always_defect(*args): return 'D'
def random_strategy(*args): return random.choice(['C', 'D'])
def friedman(agent_hist, opp_hist, __): return 'D' if 1 in agent_hist else 'C'

def joss(agent_hist, opp_hist, __):
    if agent_hist and random.random() < 0.1:
        return 'D'
    if agent_hist: 
        return 'C' if agent_hist[-1] == 0 else 'D'
    return 'C'

def grasskamp(agent_hist, opp_hist, __):
    return 'D' if len(agent_hist) >= 3 and agent_hist[-3:] == [1, 1, 1] else 'C'

def sample(agent_hist, opp_hist, __): return random.choice(['C', 'C', 'D'])
def tester(agent_hist, opp_hist, __): return 'D' if not agent_hist else 'C'
def opportunist(agent_hist, opp_hist, __): return 'D' if agent_hist and agent_hist[-1] == 0 else 'C'
def backstabber(agent_hist, opp_hist, __): return 'C' if len(agent_hist) < 5 else 'D'
def chaos_agent(*args): return random.choice(['C', 'D'])

# ==================================
# 📌 TRAIN PPO AGAINST STRATEGIES WITH GEMINI ANALYSIS
# ==================================
def train_ppo_with_gemini(gemini_api_key: str, history_len=10, total_timesteps=250000):
    """Train PPO with Gemini analysis during training"""
    
    strategies = {
        "tit_for_tat": tit_for_tat,
        "generous_tit_for_tat": generous_tit_for_tat,
        "tit_for_two_tats": tit_for_two_tats,
        "always_defect": always_defect,
        "always_cooperate": always_cooperate,
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    # Create environments
    envs = [lambda s=s, i=i: PrisonersDilemmaEnv(s, agent_id=f"PPO_vs_{name}", history_len=history_len)
            for i, (name, s) in enumerate(strategies.items())]
    vec_env = DummyVecEnv(envs)

    # Create model
    model = PPO("MlpPolicy", vec_env, verbose=1, ent_coef=0.01)
    
    # Create callback for Gemini analysis
    gemini_callback = GeminiTrainingCallback(analyzer, analysis_interval=25000)
    
    print(f"🚀 Starting PPO Training with Gemini Analysis (every 25,000 steps)")
    print(f"Total timesteps: {total_timesteps:,}")
    print("-" * 70)
    
    # Train with callback
    model.learn(total_timesteps=total_timesteps, callback=gemini_callback)
    
    # Save model
    model.save("ppo_all_strategies_with_gemini")
    print(f"\n✅ Training completed! Model saved as 'ppo_all_strategies_with_gemini'")
    
    return model

# ==================================
# 📌 ENHANCED TEST WITH DETAILED LOGGING AND GEMINI ANALYSIS
# ==================================
def test_ppo_with_gemini(gemini_api_key: str, history_len=10, print_steps=True, analyze_every=100, num_episodes=500):
    """Test PPO with detailed step logging and Gemini analysis integration"""
    
    strategies = {
        "tit_for_tat": tit_for_tat,
        "generous_tit_for_tat": generous_tit_for_tat,
        "tit_for_two_tats": tit_for_two_tats,
        "always_defect": always_defect,
        "always_cooperate": always_cooperate,
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    print("\n🔹 Loading PPO Model")
    try:
        model = PPO.load("ppo_all_strategies_with_gemini")
    except:
        print("⚠️  Model not found, loading fallback model...")
        model = PPO.load("ppo_all_strategies_full_history")
        
    results = {}
    detailed_results = {}

    for name, strategy in strategies.items():
        print(f"\n{'='*60}")
        print(f"🧪 TESTING PPO AGAINST: {name.upper()}")
        print(f"{'='*60}")
        
        agent_id = f"PPO_vs_{name}"
        env = DummyVecEnv([lambda s=strategy, aid=agent_id: PrisonersDilemmaEnv(s, agent_id=aid, history_len=history_len)])
        obs = env.reset()
        total_reward = 0
        
        # Detailed logging header
        if print_steps:
            print(f"\n📋 DETAILED STEP-BY-STEP LOG:")
            print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
            print("-" * 80)

        for i in range(num_episodes):
            # Get action and probabilities more robustly
            action, _ = model.predict(obs, deterministic=False)
            
            # Get action probabilities with proper device handling
            try:
                # Ensure tensor is on the same device as the model
                device = next(model.policy.parameters()).device
                obs_tensor = torch.FloatTensor(obs).to(device)
                
                with torch.no_grad():
                    # Extract features and get logits
                    features = model.policy.extract_features(obs_tensor)
                    latent_pi = model.policy.mlp_extractor.forward_actor(features)
                    logits = model.policy.action_net(latent_pi)
                    action_probs = torch.softmax(logits, dim=-1)
                    prob_cooperate = float(action_probs[0][0].cpu())
            except Exception as e:
                # Fallback: use a simple heuristic based on recent actions
                prob_cooperate = 0.5  # Default probability
                if i == 0:  # Only print warning once
                    print(f"Warning: Using default probability due to: {str(e)[:50]}...")
            
            # Take step
            obs, reward, done, _ = env.step(action)
            action_val = int(action[0])
            
            # Store probability
            env.envs[0].add_action_probability(prob_cooperate)
            
            opponent_action = env.envs[0].opponent_history[-1]
            total_reward += reward[0]
            
            # Detailed step logging
            if print_steps:
                # Determine notes for interesting patterns
                notes = ""
                if i > 0:
                    prev_opp = env.envs[0].full_opponent_history[-2] if len(env.envs[0].full_opponent_history) >= 2 else None
                    if prev_opp is not None:
                        if action_val == prev_opp:
                            notes += "TFT? "
                        if action_val == 0 and opponent_action == 1:
                            notes += "EXPLOITED "
                        elif action_val == 1 and opponent_action == 0:
                            notes += "EXPLOITING "
                
                print(f"{i+1:>3d} | {'C' if action_val==0 else 'D':>5} | {'C' if opponent_action==0 else 'D':>8} | "
                      f"{reward[0]:>6.1f} | {total_reward:>10.1f} | {prob_cooperate:>8.3f} | {notes}")
            
            # Periodic Gemini analysis
            if (i + 1) % analyze_every == 0:
                print(f"\n{'🤖 GEMINI ANALYSIS':<20} (Step {i+1})")
                print("-" * 60)
                
                agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
                opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
                
                analysis = analyzer.analyze_strategy_evolution(
                    agent_id=agent_id,
                    move_history=agent_moves,
                    opponent_history=opponent_moves,
                    probabilities=env.envs[0].action_probabilities,
                    rewards=env.envs[0].full_reward_history,
                    opponent_name=name
                )
                print(analysis)
                print("-" * 60)
                
                if print_steps:
                    print(f"\n📋 CONTINUING STEP LOG:")
                    print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
                    print("-" * 80)

        # Store detailed results
        agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
        opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
        
        detailed_results[name] = {
            'total_reward': float(total_reward),
            'average_reward': float(total_reward / num_episodes),
            'cooperation_rate': agent_moves.count('C') / len(agent_moves),
            'opponent_cooperation_rate': opponent_moves.count('C') / len(opponent_moves),
            'avg_reward': float(np.mean(env.envs[0].full_reward_history)),
            'final_probabilities': env.envs[0].action_probabilities[-10:] if len(env.envs[0].action_probabilities) >= 10 else env.envs[0].action_probabilities,
            'agent_id': agent_id,
            'total_episodes': num_episodes
        }
        
        results[name] = total_reward
        
        # Summary for this opponent
        print(f"\n📊 MATCH SUMMARY:")
        print(f"Agent: {agent_id}")
        print(f"Total Reward: {total_reward:.1f}")
        print(f"Average Reward per Episode: {total_reward/num_episodes:.2f}")
        print(f"Agent Cooperation Rate: {detailed_results[name]['cooperation_rate']:.1%}")
        print(f"Opponent Cooperation Rate: {detailed_results[name]['opponent_cooperation_rate']:.1%}")

    # Final comprehensive analysis
    print("\n" + "="*80)
    print("🧠 COMPREHENSIVE GEMINI STRATEGY ANALYSIS")
    print("="*80)
    
    final_analysis = analyzer.compare_strategies(detailed_results)
    print(final_analysis)
    
    print("\n" + "="*80)
    print("📊 FINAL PPO PERFORMANCE SUMMARY")
    print("="*80)
    for strategy, score in results.items():
        details = detailed_results[strategy]
        print(f"{strategy:>20}: {score:>6.1f} pts | Avg: {details['average_reward']:>5.2f} | "
              f"Coop: {details['cooperation_rate']:>5.1%} | "
              f"vs {details['opponent_cooperation_rate']:>5.1%} opp")
    
    return results, detailed_results

# ==================================
# 📌 MAIN EXECUTION WITH GEMINI
# ==================================
def main():
    # You need to set your Gemini API key here
    GEMINI_API_KEY = "AIzaSyALn5gEk1DeBNoIFRBdk52K3d_S5JKyC3M"  # Replace with your actual API key
    
    if not GEMINI_API_KEY or GEMINI_API_KEY == "":
        print("⚠️  Please set your Gemini API key in the GEMINI_API_KEY variable")
        print("You can get one from: https://makersuite.google.com/app/apikey")
        return
    
    print("🚀 Training PPO Agent with Gemini Analysis...")
    train_ppo_with_gemini(GEMINI_API_KEY, total_timesteps=250000)
    
    print("\n🧪 Testing PPO with Enhanced Analysis...")
    results, detailed_results = test_ppo_with_gemini(
        gemini_api_key=GEMINI_API_KEY,
        print_steps=True,       # Enable detailed step-by-step logging
        analyze_every=100,      # Analyze every 100 steps
        num_episodes=500        # Number of episodes per opponent
    )

# ==================================
# 🚀 RUN TRAINING & TESTING
# ==================================
if __name__ == "__main__":
    main()

  File "/usr/local/lib/python3.11/dist-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 205, in _register_atari_envs
    import ale_py
  File "/usr/local/lib/python3.11/dist-packages/ale_py/__init__.py", line 68, in <module>
    register_v0_v4_envs()
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 179, in register_v0_v4_envs
    _register_rom_configs(legacy_games, obs_types, versions)
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 64, in _register_rom_configs
    gymnasium.register(
    ^^^^^^^^^^^^^^^^^^
AttributeError: partially initialized module 'gymnasium' has no attribute 'register' (most likely due to a circular import)
[0m
  logger.warn(f"plugin: {plugin.value} raised {trace

🚀 Training PPO Agent with Gemini Analysis...
Using cpu device
🚀 Starting PPO Training with Gemini Analysis (every 25,000 steps)
Total timesteps: 250,000
----------------------------------------------------------------------
------------------------------
| time/              |       |
|    fps             | 4968  |
|    iterations      | 1     |
|    time_elapsed    | 2     |
|    total_timesteps | 10240 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1737        |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.010028106 |
|    clip_fraction        | 0.0441      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.683      |
|    explained_variance   | 0.0172      |
|    learning_rate        | 0.0003      |
|    loss    

In [4]:
import random
import numpy as np
import gym
import torch
import json
import time
import warnings
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import google.generativeai as genai
from typing import List, Dict, Tuple

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ==================================
# 📌 GEMINI API INTEGRATION
# ==================================
class GeminiAnalyzer:
    def __init__(self, api_key: str):
        """Initialize Gemini API for strategy analysis"""
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.0-flash')
        
    def analyze_strategy_evolution(self, agent_id: str, move_history: List[str], 
                                 opponent_history: List[str], probabilities: List[float],
                                 rewards: List[float], opponent_name: str, 
                                 is_training: bool = False, training_step: int = 0) -> str:
        """Analyze how the PPO agent's strategy evolves over time with enhanced strategy detection"""
        
        # Add rate limiting to respect API limits
        time.sleep(0.5)
        
        # Calculate additional metrics for strategy analysis
        recent_moves = move_history[-20:] if len(move_history) >= 20 else move_history
        recent_opp_moves = opponent_history[-20:] if len(opponent_history) >= 20 else opponent_history
        
        # Pattern analysis
        consecutive_cooperations = 0
        consecutive_defections = 0
        max_coop_streak = 0
        max_defect_streak = 0
        current_coop_streak = 0
        current_defect_streak = 0
        
        for move in move_history:
            if move == 'C':
                current_coop_streak += 1
                current_defect_streak = 0
                max_coop_streak = max(max_coop_streak, current_coop_streak)
            else:
                current_defect_streak += 1
                current_coop_streak = 0
                max_defect_streak = max(max_defect_streak, current_defect_streak)
        
        # Tit-for-tat analysis
        tit_for_tat_matches = 0
        if len(move_history) > 1 and len(opponent_history) > 0:
            for i in range(1, min(len(move_history), len(opponent_history))):
                if move_history[i] == opponent_history[i-1]:
                    tit_for_tat_matches += 1
        
        tit_for_tat_percentage = (tit_for_tat_matches / max(1, len(move_history)-1)) * 100
        
        # Response to opponent's last move
        responses_to_coop = [move_history[i] for i in range(len(move_history)) if i > 0 and opponent_history[i-1] == 'C']
        responses_to_defect = [move_history[i] for i in range(len(move_history)) if i > 0 and opponent_history[i-1] == 'D']
        
        coop_after_coop = responses_to_coop.count('C') / max(1, len(responses_to_coop)) * 100
        coop_after_defect = responses_to_defect.count('C') / max(1, len(responses_to_defect)) * 100
        
        # Create enhanced analysis prompt with training/testing context
        training_context = ""
        if is_training:
            training_context = f"""
        TRAINING CONTEXT:
        - Training Step: {training_step:,}
        - Phase: LEARNING (agent is actively updating its policy)
        - Focus: How is the agent's strategy EVOLVING during training?
        """
        else:
            training_context = """
        TESTING CONTEXT:
        - Phase: EVALUATION (agent is using learned policy)
        - Focus: What strategy has the agent LEARNED from training?
        """
        
        prompt = f"""
        You are analyzing a PPO reinforcement learning agent playing the Prisoner's Dilemma. Your task is to identify what strategy the agent has learned and how it's adapting.

        GAME CONTEXT:
        - Agent ID: {agent_id}
        - Opponent Strategy: {opponent_name}
        - Total Moves Played: {len(move_history)}
        {training_context}
        
        MOVE SEQUENCES (most recent 30 moves):
        Agent:    {' '.join(move_history[-30:])}
        Opponent: {' '.join(opponent_history[-30:])}
        
        BEHAVIORAL METRICS:
        - Agent Cooperation Rate: {move_history.count('C') / len(move_history) * 100:.1f}%
        - Opponent Cooperation Rate: {opponent_history.count('C') / len(opponent_history) * 100:.1f}%
        - Tit-for-Tat Adherence: {tit_for_tat_percentage:.1f}%
        - Cooperate after Opponent Cooperates: {coop_after_coop:.1f}%
        - Cooperate after Opponent Defects: {coop_after_defect:.1f}%
        - Max Cooperation Streak: {max_coop_streak}
        - Max Defection Streak: {max_defect_streak}
        - Current Streak: {current_coop_streak if move_history[-1] == 'C' else current_defect_streak} {'cooperations' if move_history[-1] == 'C' else 'defections'}
        
        DECISION PROBABILITIES (last 15):
        Cooperation Probabilities: {[f'{p:.3f}' for p in probabilities[-15:]]}
        
        PERFORMANCE METRICS:
        - Recent Rewards (last 15): {[f'{r:.1f}' for r in rewards[-15:]]}
        - Average Recent Reward: {np.mean(rewards[-20:]) if len(rewards) >= 20 else np.mean(rewards):.2f}
        - Reward Trend: {'Improving' if len(rewards) >= 10 and np.mean(rewards[-5:]) > np.mean(rewards[-10:-5]) else 'Declining' if len(rewards) >= 10 else 'Establishing'}
        
        STRATEGIC ANALYSIS QUESTIONS:
        1. **Primary Strategy Identification**: What well-known strategy does this agent most closely resemble? 
           Consider: Tit-for-Tat, Always Cooperate, Always Defect, Generous Tit-for-Tat, Random etc.
        
        2. **Adaptation Pattern**: How is the agent modifying its approach based on the opponent's behavior?
        
        3. **Learning Evidence**: What evidence shows the agent is {'learning and evolving' if is_training else 'applying learned strategy'}?
        
        4. **Exploitation vs Cooperation**: Is the agent being exploitative, cooperative, or strategic?
        
        5. **Prediction**: Based on current patterns, how will the agent likely behave in the next 10 moves?
        
        6. **Strategy Effectiveness**: How well is this {'evolving' if is_training else 'learned'} strategy performing against {opponent_name}?
        
        {'7. **Training Progress**: What changes do you observe compared to earlier in training? Is the strategy stabilizing?' if is_training else ''}
        
        Please provide a concise but comprehensive analysis (max 250 words) focusing on strategy identification and {'learning evolution' if is_training else 'learned patterns'}.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Gemini analysis temporarily unavailable: {str(e)[:100]}..."
    
    def compare_strategies(self, results: Dict[str, Dict], is_training: bool = False) -> str:
        """Compare PPO performance across different opponent strategies with enhanced analysis"""
        
        # Add rate limiting
        time.sleep(1.0)
        
        # Calculate cross-strategy metrics
        best_performance = max(results.values(), key=lambda x: x['total_reward'])
        worst_performance = min(results.values(), key=lambda x: x['total_reward'])
        
        training_context = "TRAINING ANALYSIS" if is_training else "TESTING ANALYSIS"
        phase_description = "learning and adaptation" if is_training else "final learned strategy"
        
        prompt = f"""
        {training_context}: Analyze the PPO agent's {phase_description} by comparing its performance across different Prisoner's Dilemma opponents.

        PERFORMANCE SUMMARY:
        {json.dumps(results, indent=2)}
        
        COMPARATIVE ANALYSIS:
        - Best Performance: {[k for k, v in results.items() if v['total_reward'] == best_performance['total_reward']][0]} (Score: {best_performance['total_reward']:.1f})
        - Worst Performance: {[k for k, v in results.items() if v['total_reward'] == worst_performance['total_reward']][0]} (Score: {worst_performance['total_reward']:.1f})
        - Performance Range: {best_performance['total_reward'] - worst_performance['total_reward']:.1f} points
        
        STRATEGIC QUESTIONS TO ANALYZE:
        
        1. **Overall Strategy Classification**: Based on all matchups, what is the PPO agent's {'emerging' if is_training else 'primary learned'} strategy?
           - Is it closer to Tit-for-Tat, Always Defect, Generous strategies, or something unique?
        
        2. **Opponent-Specific Adaptations**: How does the agent's cooperation rate change based on opponent type?
           - Against cooperative opponents vs. aggressive opponents
           - Does it show strategic flexibility or fixed behavior?
        
        3. **Exploitation Patterns**: 
           - Which opponents does it successfully exploit?
           - Which opponents exploit it?
           - What does this reveal about its {'developing' if is_training else 'learned'} weaknesses/strengths?
        
        4. **Learning Sophistication**: 
           - Does the agent show sophisticated counter-strategies?
           - Or is it using simple heuristics?
        
        5. **Strategic Robustness**: 
           - Is this a well-rounded strategy or does it have clear vulnerabilities?
           - How would it perform against novel opponents?
        
        6. **Reward Optimization**: 
           - Is the agent maximizing mutual benefit or self-interest?
           - How does this align with game theory predictions?
        
        {'7. **Training Evolution**: How is the strategy developing? What patterns suggest continued learning?' if is_training else '7. **Final Assessment**: How sophisticated and effective is the final learned strategy?'}
        
        Provide comprehensive strategic insights about what the PPO agent has {'learned so far and how it\'s evolving' if is_training else 'learned and how sophisticated its strategy is'} (max 400 words).
        Focus on identifying the core strategic principles it has discovered through reinforcement learning.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Gemini comparison temporarily unavailable: {str(e)[:100]}..."

# ==================================
# 📌 ENHANCED PRISONER'S DILEMMA ENVIRONMENT
# ==================================
class PrisonersDilemmaEnv(gym.Env):
    def __init__(self, strategy, agent_id="PPO_AGENT", history_len=10):
        super(PrisonersDilemmaEnv, self).__init__()
        self.history_len = history_len
        self.agent_id = agent_id
        self.action_space = gym.spaces.Discrete(2)  # 0: Cooperate, 1: Defect
        self.observation_space = gym.spaces.Box(low=0, high=5, shape=(3 * self.history_len,), dtype=np.float32)

        self.strategy = strategy
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        
        # Enhanced tracking for Gemini analysis
        self.full_agent_history = []
        self.full_opponent_history = []
        self.full_reward_history = []
        self.action_probabilities = []

    def step(self, action):
        opponent_action = 0 if self.strategy(self.agent_history, self.opponent_history, False) == 'C' else 1
        reward = self.get_reward(action, opponent_action)

        # Update histories
        self.agent_history.append(action)
        self.opponent_history.append(opponent_action)
        self.reward_history.append(reward)
        
        # Update full histories for analysis
        self.full_agent_history.append(action)
        self.full_opponent_history.append(opponent_action)
        self.full_reward_history.append(reward)

        # Maintain sliding window
        if len(self.agent_history) > self.history_len:
            self.agent_history.pop(0)
            self.opponent_history.pop(0)
            self.reward_history.pop(0)

        obs = self._get_obs()
        return obs, reward, False, {}

    def reset(self):
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        return self._get_obs()

    def _get_obs(self):
        pad = lambda x: x + [0] * (self.history_len - len(x))
        return np.array(pad(self.agent_history) + pad(self.opponent_history) + pad(self.reward_history), dtype=np.float32)

    def get_reward(self, action, opponent_action):
        # Classic Prisoner's Dilemma payoff matrix
        if action == 0 and opponent_action == 0:      # Both cooperate
            return 3
        elif action == 1 and opponent_action == 1:    # Both defect
            return 1
        elif action == 0 and opponent_action == 1:    # Agent cooperates, opponent defects
            return 0
        else:                                         # Agent defects, opponent cooperates
            return 5
    
    def add_action_probability(self, prob):
        """Store action probability for analysis"""
        self.action_probabilities.append(prob)

# ==================================
# 📌 GEMINI TRAINING CALLBACK
# ==================================
class GeminiTrainingCallback(BaseCallback):
    def __init__(self, gemini_analyzer: GeminiAnalyzer, analysis_frequency: int = 10000, verbose: int = 0):
        super(GeminiTrainingCallback, self).__init__(verbose)
        self.gemini_analyzer = gemini_analyzer
        self.analysis_frequency = analysis_frequency
        self.last_analysis_step = 0
        
    def _on_step(self) -> bool:
        # Check if it's time for analysis
        if self.num_timesteps - self.last_analysis_step >= self.analysis_frequency:
            self.last_analysis_step = self.num_timesteps
            
            # Get environment data for analysis
            try:
                # Access the first environment (assuming DummyVecEnv)
                env = self.training_env.envs[0]
                
                if hasattr(env, 'full_agent_history') and len(env.full_agent_history) > 50:
                    # Convert to symbolic moves
                    agent_moves = ['C' if a == 0 else 'D' for a in env.full_agent_history]
                    opponent_moves = ['C' if a == 0 else 'D' for a in env.full_opponent_history]
                    
                    # Get strategy name from environment
                    strategy_name = getattr(env, 'strategy_name', 'Unknown')
                    
                    print(f"\n{'='*60}")
                    print(f"🧠 GEMINI TRAINING ANALYSIS - Step {self.num_timesteps:,}")
                    print(f"{'='*60}")
                    
                    # Perform Gemini analysis
                    analysis = self.gemini_analyzer.analyze_strategy_evolution(
                        agent_id=env.agent_id,
                        move_history=agent_moves,
                        opponent_history=opponent_moves,
                        probabilities=env.action_probabilities[-100:] if len(env.action_probabilities) > 100 else env.action_probabilities,
                        rewards=env.full_reward_history,
                        opponent_name=strategy_name,
                        is_training=True,
                        training_step=self.num_timesteps
                    )
                    
                    print(analysis)
                    print("="*60)
                    
            except Exception as e:
                if self.verbose > 0:
                    print(f"Training analysis error: {str(e)[:100]}...")
        
        return True

# ==================================
# 📌 STRATEGIES (with proper conversion)
# ==================================
def to_symbolic(hist):
    return ['C' if a == 0 else 'D' for a in hist]

def tit_for_tat(agent_hist, opp_hist, __):
    return 'C' if not agent_hist else ('C' if agent_hist[-1] == 0 else 'D')

def tit_for_two_tats(agent_hist, opp_hist, __):
    if len(agent_hist) >= 2 and agent_hist[-2:] == [1, 1]:
        return 'D'
    return 'C'

def generous_tit_for_tat(agent_hist, opp_hist, __, cooperation_prob=0.1):
    if not agent_hist:
        return 'C'
    if agent_hist[-1] == 1:
        return 'C' if random.random() < cooperation_prob else 'D'
    return 'C'

def always_cooperate(*args): return 'C'
def always_defect(*args): return 'D'
def random_strategy(*args): return random.choice(['C', 'D'])
def friedman(agent_hist, opp_hist, __): return 'D' if 1 in agent_hist else 'C'

def joss(agent_hist, opp_hist, __):
    if agent_hist and random.random() < 0.1:
        return 'D'
    if agent_hist: 
        return 'C' if agent_hist[-1] == 0 else 'D'
    return 'C'

def grasskamp(agent_hist, opp_hist, __):
    return 'D' if len(agent_hist) >= 3 and agent_hist[-3:] == [1, 1, 1] else 'C'

def sample(agent_hist, opp_hist, __): return random.choice(['C', 'C', 'D'])
def tester(agent_hist, opp_hist, __): return 'D' if not agent_hist else 'C'
def opportunist(agent_hist, opp_hist, __): return 'D' if agent_hist and agent_hist[-1] == 0 else 'C'
def backstabber(agent_hist, opp_hist, __): return 'C' if len(agent_hist) < 5 else 'D'
def chaos_agent(*args): return random.choice(['C', 'D'])

# ==================================
# 📌 ENHANCED TRAIN PPO WITH GEMINI ANALYSIS
# ==================================
def train_ppo_with_gemini(gemini_api_key: str, history_len=10, total_timesteps=150000, analysis_frequency=30000):
    """Train PPO with Gemini analysis during training"""
    
    strategies = {
        
        "tit_for_two_tats": tit_for_two_tats,
       
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    print("🚀 Training PPO Agent with Gemini Analysis...")
    print(f"Total timesteps: {total_timesteps:,}")
    print(f"Analysis frequency: every {analysis_frequency:,} steps")
    print("="*60)

    # Create environments with strategy names for better tracking
    envs = []
    for i, (name, strategy) in enumerate(strategies.items()):
        def make_env(s=strategy, n=name, aid=f"PPO_vs_{name}_training"):
            env = PrisonersDilemmaEnv(s, agent_id=aid, history_len=history_len)
            env.strategy_name = n  # Add strategy name for callback
            return env
        envs.append(make_env)
    
    vec_env = DummyVecEnv(envs)

    # Create PPO model
    model = PPO("MlpPolicy", vec_env, verbose=1, ent_coef=0.01)
    
    # Create Gemini callback for training analysis
    gemini_callback = GeminiTrainingCallback(
        gemini_analyzer=analyzer,
        analysis_frequency=analysis_frequency,
        verbose=1
    )
    
    # Train with Gemini analysis
    model.learn(total_timesteps=total_timesteps, callback=gemini_callback)
    
    # Final training analysis
    print("\n" + "="*80)
    print("🎯 FINAL GEMINI TRAINING ANALYSIS")
    print("="*80)
    
    # Collect final training data from all environments
    training_results = {}
    for i, (name, _) in enumerate(strategies.items()):
        env = vec_env.envs[i]
        if hasattr(env, 'full_agent_history') and len(env.full_agent_history) > 0:
            agent_moves = ['C' if a == 0 else 'D' for a in env.full_agent_history]
            opponent_moves = ['C' if a == 0 else 'D' for a in env.full_opponent_history]
            
            training_results[name] = {
                'total_reward': float(np.sum(env.full_reward_history)),
                'average_reward': float(np.mean(env.full_reward_history)),
                'cooperation_rate': agent_moves.count('C') / len(agent_moves),
                'opponent_cooperation_rate': opponent_moves.count('C') / len(opponent_moves),
                'total_episodes': len(env.full_agent_history),
                'agent_id': env.agent_id
            }
    
    if training_results:
        final_analysis = analyzer.compare_strategies(training_results, is_training=True)
        print(final_analysis)
    
    # Save model
    model.save("ppo_all_strategies_full_history_with_gemini")
    print("\n✅ Model saved as 'ppo_all_strategies_full_history_with_gemini'")
    
    return model, training_results

# ==================================
# 📌 ENHANCED TEST WITH DETAILED LOGGING AND GEMINI ANALYSIS
# ==================================
def test_ppo_with_gemini(gemini_api_key: str, model_path: str = None, history_len=10, 
                        print_steps=True, analyze_every=100, num_episodes=500):
    """Test PPO with detailed step logging and Gemini analysis integration"""
    
    strategies = {
    
        "tit_for_two_tats": tit_for_two_tats,
       
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    # Load model
    if model_path:
        print(f"\n🔹 Loading PPO Model from: {model_path}")
        model = PPO.load(model_path)
    else:
        print("\n🔹 Loading Default PPO Model")
        try:
            model = PPO.load("ppo_all_strategies_full_history_with_gemini")
        except:
            model = PPO.load("ppo_all_strategies_full_history")
    
    results = {}
    detailed_results = {}

    for name, strategy in strategies.items():
        print(f"\n{'='*60}")
        print(f"🧪 TESTING PPO AGAINST: {name.upper()}")
        print(f"{'='*60}")
        
        agent_id = f"PPO_vs_{name}_test"
        env = DummyVecEnv([lambda s=strategy, aid=agent_id: PrisonersDilemmaEnv(s, agent_id=aid, history_len=history_len)])
        obs = env.reset()
        total_reward = 0
        
        # Detailed logging header
        if print_steps:
            print(f"\n📋 DETAILED STEP-BY-STEP LOG:")
            print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
            print("-" * 80)

        for i in range(num_episodes):
            # Get action and probabilities more robustly
            action, _ = model.predict(obs, deterministic=False)
            
            # Get action probabilities with proper device handling
            try:
                # Ensure tensor is on the same device as the model
                device = next(model.policy.parameters()).device
                obs_tensor = torch.FloatTensor(obs).to(device)
                
                with torch.no_grad():
                    # Extract features and get logits
                    features = model.policy.extract_features(obs_tensor)
                    latent_pi = model.policy.mlp_extractor.forward_actor(features)
                    logits = model.policy.action_net(latent_pi)
                    action_probs = torch.softmax(logits, dim=-1)
                    prob_cooperate = float(action_probs[0][0].cpu())
            except Exception as e:
                # Fallback: use a simple heuristic based on recent actions
                prob_cooperate = 0.5  # Default probability
                if i == 0:  # Only print warning once
                    print(f"Warning: Using default probability due to: {str(e)[:50]}...")
            
            # Take step
            obs, reward, done, _ = env.step(action)
            action_val = int(action[0])
            
            # Store probability
            env.envs[0].add_action_probability(prob_cooperate)
            
            opponent_action = env.envs[0].opponent_history[-1]
            total_reward += reward[0]
            
            # Detailed step logging
            if print_steps:
                # Determine notes for interesting patterns
                notes = ""
                if i > 0:
                    prev_opp = env.envs[0].full_opponent_history[-2] if len(env.envs[0].full_opponent_history) >= 2 else None
                    if prev_opp is not None:
                        if action_val == prev_opp:
                            notes += "TFT? "
                        if action_val == 0 and opponent_action == 1:
                            notes += "EXPLOITED "
                        elif action_val == 1 and opponent_action == 0:
                            notes += "EXPLOITING "
                
                print(f"{i+1:>3d} | {'C' if action_val==0 else 'D':>5} | {'C' if opponent_action==0 else 'D':>8} | "
                      f"{reward[0]:>6.1f} | {total_reward:>10.1f} | {prob_cooperate:>8.3f} | {notes}")
            
            # Periodic Gemini analysis
            if (i + 1) % analyze_every == 0:
                print(f"\n{'🤖 GEMINI ANALYSIS':<20} (Step {i+1})")
                print("-" * 60)
                
                agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
                opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
                
                analysis = analyzer.analyze_strategy_evolution(
                    agent_id=agent_id,
                    move_history=agent_moves,
                    opponent_history=opponent_moves,
                    probabilities=env.envs[0].action_probabilities,
                    rewards=env.envs[0].full_reward_history,
                    opponent_name=name,
                    is_training=False
                )
                print(analysis)
                print("-" * 60)
                
                if print_steps:
                    print(f"\n📋 CONTINUING STEP LOG:")
                    print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
                    print("-" * 80)

        # Store detailed results
        agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
        opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
        
        detailed_results[name] = {
            'total_reward': float(total_reward),
            'average_reward': float(total_reward / num_episodes),
            'cooperation_rate': agent_moves.count('C') / len(agent_moves),
            'opponent_cooperation_rate': opponent_moves.count('C') / len(opponent_moves),
            'avg_reward': float(np.mean(env.envs[0].full_reward_history)),
            'final_probabilities': env.envs[0].action_probabilities[-10:] if len(env.envs[0].action_probabilities) >= 10 else env.envs[0].action_probabilities,
            'agent_id': agent_id,
            'total_episodes': num_episodes
        }
        
        results[name] = total_reward
        
        # Summary for this opponent
        print(f"\n📊 MATCH SUMMARY:")
        print(f"Agent: {agent_id}")
        print(f"Total Reward: {total_reward:.1f}")
        print(f"Average Reward per Episode: {total_reward/num_episodes:.2f}")
        print(f"Agent Cooperation Rate: {detailed_results[name]['cooperation_rate']:.1%}")
        print(f"Opponent Cooperation Rate: {detailed_results[name]['opponent_cooperation_rate']:.1%}")

        # Final comprehensive analysis
        print("\n" + "="*80)
        print("🧠 COMPREHENSIVE GEMINI STRATEGY ANALYSIS")
        print("="*80)
        
        final_analysis = analyzer.compare_strategies(detailed_results)
        print(final_analysis)
        
        print("\n" + "="*80)
        print("📊 FINAL PPO PERFORMANCE SUMMARY")
        print("="*80)
        for strategy, score in results.items():
            details = detailed_results[strategy]
            print(f"{strategy:>20}: {score:>6.1f} pts | Avg: {details['average_reward']:>5.2f} | "
                  f"Coop: {details['cooperation_rate']:>5.1%} | "
                  f"vs {details['opponent_cooperation_rate']:>5.1%} opp")
        
        return results, detailed_results

# ==================================
# 📌 MAIN EXECUTION WITH GEMINI
# ==================================
def main():
    # You need to set your Gemini API key here
    GEMINI_API_KEY = "AIzaSyALn5gEk1DeBNoIFRBdk52K3d_S5JKyC3M"  # Replace with your actual API key
    
    if not GEMINI_API_KEY or GEMINI_API_KEY == "":
        print("⚠️  Please set your Gemini API key in the GEMINI_API_KEY variable")
        print("You can get one from: https://makersuite.google.com/app/apikey")
        return
    
    print("🚀 Training PPO Agent with Gemini Analysis...")
    train_ppo_with_gemini(GEMINI_API_KEY, total_timesteps=250000)
    
    print("\n🧪 Testing PPO with Enhanced Analysis...")
    results, detailed_results = test_ppo_with_gemini(
        gemini_api_key=GEMINI_API_KEY,
        print_steps=True,       # Enable detailed step-by-step logging
        analyze_every=100,      # Analyze every 100 steps
        num_episodes=500        # Number of episodes per opponent
    )

# ==================================
# 🚀 RUN TRAINING & TESTING
# ==================================
if __name__ == "__main__":
    main()

SyntaxError: f-string expression part cannot include a backslash (4206120097.py, line 199)

In [6]:
import random
import numpy as np
import gym
import torch
import json
import time
import warnings
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import google.generativeai as genai
from typing import List, Dict, Tuple

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ==================================
# 📌 ENHANCED GEMINI API INTEGRATION WITH PPO METRICS
# ==================================
class GeminiAnalyzer:
    def __init__(self, api_key: str):
        """Initialize Gemini API for strategy analysis"""
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.0-flash')
        
    def analyze_strategy_evolution(self, agent_id: str, move_history: List[str], 
                                 opponent_history: List[str], probabilities: List[float],
                                 rewards: List[float], opponent_name: str, 
                                 ppo_metrics: Dict = None,
                                 is_training: bool = False, training_step: int = 0) -> str:
        """Analyze how the PPO agent's strategy evolves with comprehensive RL metrics"""
        
        # Add rate limiting to respect API limits
        time.sleep(0.5)
        
        # Calculate additional metrics for strategy analysis
        recent_moves = move_history[-20:] if len(move_history) >= 20 else move_history
        recent_opp_moves = opponent_history[-20:] if len(opponent_history) >= 20 else opponent_history
        
        # Pattern analysis
        consecutive_cooperations = 0
        consecutive_defections = 0
        max_coop_streak = 0
        max_defect_streak = 0
        current_coop_streak = 0
        current_defect_streak = 0
        
        for move in move_history:
            if move == 'C':
                current_coop_streak += 1
                current_defect_streak = 0
                max_coop_streak = max(max_coop_streak, current_coop_streak)
            else:
                current_defect_streak += 1
                current_coop_streak = 0
                max_defect_streak = max(max_defect_streak, current_defect_streak)
        
        # Tit-for-tat analysis
        tit_for_tat_matches = 0
        if len(move_history) > 1 and len(opponent_history) > 0:
            for i in range(1, min(len(move_history), len(opponent_history))):
                if move_history[i] == opponent_history[i-1]:
                    tit_for_tat_matches += 1
        
        tit_for_tat_percentage = (tit_for_tat_matches / max(1, len(move_history)-1)) * 100
        
        # Response to opponent's last move
        responses_to_coop = [move_history[i] for i in range(len(move_history)) if i > 0 and opponent_history[i-1] == 'C']
        responses_to_defect = [move_history[i] for i in range(len(move_history)) if i > 0 and opponent_history[i-1] == 'D']
        
        coop_after_coop = responses_to_coop.count('C') / max(1, len(responses_to_coop)) * 100
        coop_after_defect = responses_to_defect.count('C') / max(1, len(responses_to_defect)) * 100
        
        # Value function and policy analysis
        value_trend = "N/A"
        policy_entropy_trend = "N/A"
        exploration_analysis = "N/A"
        advantage_analysis = "N/A"
        
        if ppo_metrics:
            # Value function analysis
            if 'value_estimates' in ppo_metrics and len(ppo_metrics['value_estimates']) > 10:
                recent_values = ppo_metrics['value_estimates'][-10:]
                early_values = ppo_metrics['value_estimates'][:10] if len(ppo_metrics['value_estimates']) > 20 else ppo_metrics['value_estimates'][:5]
                if len(early_values) > 0:
                    value_trend = f"Recent: {np.mean(recent_values):.3f}, Early: {np.mean(early_values):.3f} ({'Rising' if np.mean(recent_values) > np.mean(early_values) else 'Falling'})"
            
            # Policy entropy analysis
            if 'policy_entropy' in ppo_metrics and len(ppo_metrics['policy_entropy']) > 5:
                entropy_recent = np.mean(ppo_metrics['policy_entropy'][-10:])
                policy_entropy_trend = f"Current: {entropy_recent:.4f} ({'High exploration' if entropy_recent > 0.5 else 'Low exploration' if entropy_recent < 0.2 else 'Moderate exploration'})"
            
            # Exploration vs exploitation analysis
            if 'action_probabilities' in ppo_metrics and len(ppo_metrics['action_probabilities']) > 10:
                recent_probs = ppo_metrics['action_probabilities'][-20:]
                prob_variance = np.var(recent_probs)
                avg_confidence = np.mean([max(p, 1-p) for p in recent_probs])
                exploration_analysis = f"Prob variance: {prob_variance:.4f}, Avg confidence: {avg_confidence:.3f} ({'Exploring' if prob_variance > 0.05 else 'Exploiting'})"
            
            # Advantage analysis
            if 'advantages' in ppo_metrics and len(ppo_metrics['advantages']) > 5:
                recent_advantages = ppo_metrics['advantages'][-15:]
                avg_advantage = np.mean(recent_advantages)
                advantage_std = np.std(recent_advantages)
                advantage_analysis = f"Mean: {avg_advantage:.3f}, Std: {advantage_std:.3f} ({'High variance' if advantage_std > 1.0 else 'Stable learning'})"

        # Create enhanced analysis prompt with training/testing context
        training_context = ""
        if is_training:
            training_context = f"""
        TRAINING CONTEXT:
        - Training Step: {training_step:,}
        - Phase: LEARNING (agent is actively updating its policy)
        - Focus: How is the agent's strategy EVOLVING during training?
        """
        else:
            training_context = """
        TESTING CONTEXT:
        - Phase: EVALUATION (agent is using learned policy)
        - Focus: What strategy has the agent LEARNED from training?
        """
        
        # Enhanced prompt with PPO-specific metrics
        prompt = f"""
        You are analyzing a PPO (Proximal Policy Optimization) reinforcement learning agent playing the Prisoner's Dilemma. 
        Your task is to identify what strategy the agent has learned and how its policy is evolving using both behavioral and internal RL metrics.

        GAME CONTEXT:
        - Agent ID: {agent_id}
        - Opponent Strategy: {opponent_name}
        - Total Moves Played: {len(move_history)}
        {training_context}
        
        MOVE SEQUENCES (most recent 30 moves):
        Agent:    {' '.join(move_history[-30:])}
        Opponent: {' '.join(opponent_history[-30:])}
        
        BEHAVIORAL METRICS:
        - Agent Cooperation Rate: {move_history.count('C') / len(move_history) * 100:.1f}%
        - Opponent Cooperation Rate: {opponent_history.count('C') / len(opponent_history) * 100:.1f}%
        - Tit-for-Tat Adherence: {tit_for_tat_percentage:.1f}%
        - Cooperate after Opponent Cooperates: {coop_after_coop:.1f}%
        - Cooperate after Opponent Defects: {coop_after_defect:.1f}%
        - Max Cooperation Streak: {max_coop_streak}
        - Max Defection Streak: {max_defect_streak}
        - Current Streak: {current_coop_streak if move_history[-1] == 'C' else current_defect_streak} {'cooperations' if move_history[-1] == 'C' else 'defections'}
        
        POLICY NETWORK METRICS:
        - Decision Probabilities (last 15): {[f'{p:.3f}' for p in probabilities[-15:]]}
        - Probability Variance (recent): {np.var(probabilities[-20:]) if len(probabilities) >= 20 else np.var(probabilities):.4f}
        - Decision Confidence Trend: {'High' if np.mean([max(p, 1-p) for p in probabilities[-10:]]) > 0.8 else 'Moderate' if np.mean([max(p, 1-p) for p in probabilities[-10:]]) > 0.6 else 'Low'}
        
        PPO LEARNING METRICS:
        - Value Function Trend: {value_trend}
        - Policy Entropy: {policy_entropy_trend}
        - Exploration Analysis: {exploration_analysis}
        - Advantage Estimates: {advantage_analysis}
        
        PERFORMANCE METRICS:
        - Recent Rewards (last 15): {[f'{r:.1f}' for r in rewards[-15:]]}
        - Average Recent Reward: {np.mean(rewards[-20:]) if len(rewards) >= 20 else np.mean(rewards):.2f}
        - Reward Trend: {'Improving' if len(rewards) >= 10 and np.mean(rewards[-5:]) > np.mean(rewards[-10:-5]) else 'Declining' if len(rewards) >= 10 else 'Establishing'}
        - Reward Variance: {np.var(rewards[-20:]) if len(rewards) >= 20 else np.var(rewards):.3f}
        
        STRATEGIC ANALYSIS QUESTIONS (Enhanced with RL perspective):
        
        1. **Policy Learning Assessment**: How has the PPO policy network learned to map states to actions?
           - Is the policy becoming more deterministic (low entropy) or maintaining exploration?
           - Are decision probabilities showing clear patterns or randomness?
        
        2. **Value Function Understanding**: What does the value function trend suggest about state evaluation?
           - Is the agent learning to accurately predict future rewards?
           - How does value estimation correlate with actual performance?
        
        3. **Exploration vs Exploitation Balance**: 
           - Is the agent still exploring (high probability variance) or exploiting learned strategy?
           - How does policy entropy relate to strategic behavior?
        
        4. **Advantage Learning**: What do advantage estimates reveal about action selection?
           - Are advantages stable (indicating consistent strategy) or volatile (still learning)?
           - How do advantages correlate with opponent responses?
        
        6. **Policy Gradient Insights**: Based on probability patterns and entropy:
           - Is the policy converging to a stable strategy?
           - Are there signs of continued policy updates or has it plateaued?
        
        7. **Opponent Modeling**: Does the agent show evidence of learning opponent patterns?(name the opponent based on ur observation and explain how the agent is playing with the opponet)
           - How quickly does it adapt to opponent strategy changes?
           - Is it using history effectively for decision making?
        
        8. **Risk and Uncertainty**: How does the agent handle uncertainty?
           - Does it maintain cautious (cooperative) or aggressive (defective) defaults?
           - How does probability variance relate to performance variance?
        
        {'9. **Training Dynamics**: What changes in RL metrics suggest about learning progress?' if is_training else '9. **Final Policy Assessment**: How sophisticated and robust is the learned policy?'}
        
        Please provide a comprehensive analysis (max 500 words) that integrates both behavioral patterns and PPO learning dynamics and give ur answer in points similar to question.
        Focus on how the internal RL metrics explain the observed strategic behavior and {'learning trajectory' if is_training else 'final learned strategy'}.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Gemini analysis temporarily unavailable: {str(e)[:100]}..."
    
    def compare_strategies(self, results: Dict[str, Dict], is_training: bool = False) -> str:
        """Compare PPO performance across different opponent strategies with enhanced RL analysis"""
        
        # Add rate limiting
        time.sleep(1.0)
        
        # Calculate cross-strategy metrics
        best_performance = max(results.values(), key=lambda x: x['total_reward'])
        worst_performance = min(results.values(), key=lambda x: x['total_reward'])
        
        training_context = "TRAINING ANALYSIS" if is_training else "TESTING ANALYSIS"
        phase_description = "policy learning and adaptation" if is_training else "final learned policy"
        
        prompt = f"""
        {training_context}: Analyze the PPO agent's {phase_description} by comparing its performance across different Prisoner's Dilemma opponents.
        Focus on how PPO's learning mechanisms produced different strategic adaptations.

        PERFORMANCE SUMMARY:
        {json.dumps(results, indent=2)}
        
        COMPARATIVE ANALYSIS:
        - Best Performance: {[k for k, v in results.items() if v['total_reward'] == best_performance['total_reward']][0]} (Score: {best_performance['total_reward']:.1f})
        - Worst Performance: {[k for k, v in results.items() if v['total_reward'] == worst_performance['total_reward']][0]} (Score: {worst_performance['total_reward']:.1f})
        - Performance Range: {best_performance['total_reward'] - worst_performance['total_reward']:.1f} points
        - Performance Variance: {np.var([v['total_reward'] for v in results.values()]):.2f}
        
        PPO LEARNING ANALYSIS QUESTIONS:
        
        1. **Policy Generalization**: How well did PPO learn a generalizable strategy vs. opponent-specific adaptations?(explain how it is performing with the opponent at hand and also name the opponent )
           - Does the agent show consistent strategic principles across opponents?(if more than one opponent can be observed)
           - Or did it overfit to specific opponent patterns?
        
        2. **Multi-Task Learning**: How effectively did PPO handle the multi-environment training?
           - Did training against diverse opponents create a robust policy?
           - Are there signs of catastrophic forgetting or interference?
        
        3. **Value Function Accuracy**: How do performance differences relate to value estimation quality?
           - Which opponents did the agent learn to evaluate most accurately?(Imp)
           - Where do we see value function errors affecting performance?
        
        4. **Policy Gradient Effectiveness**: What does the cooperation rate variation tell us about policy learning?
           - Did PPO successfully learn different action probabilities for different contexts?
           - How well did it balance exploration vs exploitation across environments?
        
        5. **Strategic Sophistication**: Based on the results, how sophisticated is the learned policy?
           - Simple reactive strategies (Tit-for-Tat variants)?
           - Complex multi-step reasoning?
           - Opponent modeling and counter-strategies?
        
        6. **Robustness Analysis**: How robust is the learned strategy?
           - Consistent performance across opponents suggests robust learning
           - High variance suggests overfitting or instability
        
        7. **Exploitation vs Cooperation Trade-off**: How did PPO resolve the exploration-exploitation dilemma?
           - Did it learn when to be cooperative vs when to exploit?
           - How does this relate to the classical game theory solutions?
        
        8. **Learning Efficiency**: Which strategic patterns did PPO learn quickly vs slowly?
           - What does this suggest about the difficulty of different strategic concepts?
           - How well did the reward structure guide learning?
        
        {'9. **Training Trajectory**: What patterns suggest continued learning potential?' if is_training else '9. **Final Assessment**: How does this compare to human expert strategies?'}
        
        Provide comprehensive insights about PPO's learning effectiveness and the sophistication of its {'evolving' if is_training else 'final'} strategic intelligence (max 600 words).
        Integrate game theory concepts with reinforcement learning analysis to explain what the agent has discovered.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Gemini comparison temporarily unavailable: {str(e)[:100]}..."

# ==================================
# 📌 ENHANCED PRISONER'S DILEMMA ENVIRONMENT WITH PPO METRICS
# ==================================
class PrisonersDilemmaEnv(gym.Env):
    def __init__(self, strategy, agent_id="PPO_AGENT", history_len=10):
        super(PrisonersDilemmaEnv, self).__init__()
        self.history_len = history_len
        self.agent_id = agent_id
        self.action_space = gym.spaces.Discrete(2)  # 0: Cooperate, 1: Defect
        self.observation_space = gym.spaces.Box(low=0, high=5, shape=(3 * self.history_len,), dtype=np.float32)

        self.strategy = strategy
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        
        # Enhanced tracking for Gemini analysis
        self.full_agent_history = []
        self.full_opponent_history = []
        self.full_reward_history = []
        self.action_probabilities = []
        
        # PPO-specific metrics tracking
        self.value_estimates = []
        self.policy_entropy = []
        self.advantages = []
        self.step_count = 0

    def step(self, action):
        opponent_action = 0 if self.strategy(self.agent_history, self.opponent_history, False) == 'C' else 1
        reward = self.get_reward(action, opponent_action)

        # Update histories
        self.agent_history.append(action)
        self.opponent_history.append(opponent_action)
        self.reward_history.append(reward)
        
        # Update full histories for analysis
        self.full_agent_history.append(action)
        self.full_opponent_history.append(opponent_action)
        self.full_reward_history.append(reward)
        self.step_count += 1

        # Maintain sliding window
        if len(self.agent_history) > self.history_len:
            self.agent_history.pop(0)
            self.opponent_history.pop(0)
            self.reward_history.pop(0)

        obs = self._get_obs()
        return obs, reward, False, {}

    def reset(self):
        self.agent_history = []
        self.opponent_history = []
        self.reward_history = []
        self.step_count = 0
        return self._get_obs()

    def _get_obs(self):
        pad = lambda x: x + [0] * (self.history_len - len(x))
        return np.array(pad(self.agent_history) + pad(self.opponent_history) + pad(self.reward_history), dtype=np.float32)

    def get_reward(self, action, opponent_action):
        # Classic Prisoner's Dilemma payoff matrix
        if action == 0 and opponent_action == 0:      # Both cooperate
            return 3
        elif action == 1 and opponent_action == 1:    # Both defect
            return 1
        elif action == 0 and opponent_action == 1:    # Agent cooperates, opponent defects
            return 0
        else:                                         # Agent defects, opponent cooperates
            return 5
    
    def add_action_probability(self, prob):
        """Store action probability for analysis"""
        self.action_probabilities.append(prob)
    
    def add_ppo_metrics(self, value_est=None, entropy=None, advantage=None):
        """Store PPO-specific metrics for analysis"""
        if value_est is not None:
            self.value_estimates.append(value_est)
        if entropy is not None:
            self.policy_entropy.append(entropy)
        if advantage is not None:
            self.advantages.append(advantage)

# ==================================
# 📌 ENHANCED GEMINI TRAINING CALLBACK WITH PPO METRICS
# ==================================
class GeminiTrainingCallback(BaseCallback):
    def __init__(self, gemini_analyzer: GeminiAnalyzer, analysis_frequency: int = 10000, verbose: int = 0):
        super(GeminiTrainingCallback, self).__init__(verbose)
        self.gemini_analyzer = gemini_analyzer
        self.analysis_frequency = analysis_frequency
        self.last_analysis_step = 0
        
    def _on_step(self) -> bool:
        # Check if it's time for analysis
        if self.num_timesteps - self.last_analysis_step >= self.analysis_frequency:
            self.last_analysis_step = self.num_timesteps
            
            # Get environment data for analysis
            try:
                # Access the first environment (assuming DummyVecEnv)
                env = self.training_env.envs[0]
                
                if hasattr(env, 'full_agent_history') and len(env.full_agent_history) > 50:
                    # Convert to symbolic moves
                    agent_moves = ['C' if a == 0 else 'D' for a in env.full_agent_history]
                    opponent_moves = ['C' if a == 0 else 'D' for a in env.full_opponent_history]
                    
                    # Get strategy name from environment
                    strategy_name = getattr(env, 'strategy_name', 'Unknown')
                    
                    # Collect PPO metrics
                    ppo_metrics = {
                        'value_estimates': getattr(env, 'value_estimates', []),
                        'policy_entropy': getattr(env, 'policy_entropy', []),
                        'advantages': getattr(env, 'advantages', []),
                        'action_probabilities': getattr(env, 'action_probabilities', [])
                    }
                    
                    print(f"\n{'='*60}")
                    print(f"🧠 GEMINI TRAINING ANALYSIS - Step {self.num_timesteps:,}")
                    print(f"{'='*60}")
                    
                    # Perform Gemini analysis with PPO metrics
                    analysis = self.gemini_analyzer.analyze_strategy_evolution(
                        agent_id=env.agent_id,
                        move_history=agent_moves,
                        opponent_history=opponent_moves,
                        probabilities=env.action_probabilities[-100:] if len(env.action_probabilities) > 100 else env.action_probabilities,
                        rewards=env.full_reward_history,
                        opponent_name=strategy_name,
                        ppo_metrics=ppo_metrics,
                        is_training=True,
                        training_step=self.num_timesteps
                    )
                    
                    print(analysis)
                    print("="*60)
                    
            except Exception as e:
                if self.verbose > 0:
                    print(f"Training analysis error: {str(e)[:100]}...")
        
        return True

# ==================================
# 📌 STRATEGIES (with proper conversion)
# ==================================
def to_symbolic(hist):
    return ['C' if a == 0 else 'D' for a in hist]

def tit_for_tat(agent_hist, opp_hist, __):
    return 'C' if not agent_hist else ('C' if agent_hist[-1] == 0 else 'D')

def tit_for_two_tats(agent_hist, opp_hist, __):
    if len(agent_hist) >= 2 and agent_hist[-2:] == [1, 1]:
        return 'D'
    return 'C'

def generous_tit_for_tat(agent_hist, opp_hist, __, cooperation_prob=0.1):
    if not agent_hist:
        return 'C'
    if agent_hist[-1] == 1:
        return 'C' if random.random() < cooperation_prob else 'D'
    return 'C'

def always_cooperate(*args): return 'C'
def always_defect(*args): return 'D'
def random_strategy(*args): return random.choice(['C', 'D'])
def friedman(agent_hist, opp_hist, __): return 'D' if 1 in agent_hist else 'C'

def joss(agent_hist, opp_hist, __):
    if agent_hist and random.random() < 0.1:
        return 'D'
    if agent_hist: 
        return 'C' if agent_hist[-1] == 0 else 'D'
    return 'C'

def grasskamp(agent_hist, opp_hist, __):
    return 'D' if len(agent_hist) >= 3 and agent_hist[-3:] == [1, 1, 1] else 'C'

def sample(agent_hist, opp_hist, __): return random.choice(['C', 'C', 'D'])
def tester(agent_hist, opp_hist, __): return 'D' if not agent_hist else 'C'
def opportunist(agent_hist, opp_hist, __): return 'D' if agent_hist and agent_hist[-1] == 0 else 'C'
def backstabber(agent_hist, opp_hist, __): return 'C' if len(agent_hist) < 5 else 'D'
def chaos_agent(*args): return random.choice(['C', 'D'])

# ==================================
# 📌 ENHANCED TRAIN PPO WITH COMPREHENSIVE METRICS
# ==================================
def train_ppo_with_gemini(gemini_api_key: str, history_len=10, total_timesteps=200000, analysis_frequency=40000):
    """Train PPO with comprehensive Gemini analysis including PPO metrics"""
    
    strategies = {
        "tit_for_tat": tit_for_tat,
        "tit_for_two_tats": tit_for_two_tats,
        "always_defect": always_defect,
        "always_cooperate": always_cooperate,
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    print("🚀 Training PPO Agent with Enhanced Gemini Analysis...")
    print(f"Total timesteps: {total_timesteps:,}")
    print(f"Analysis frequency: every {analysis_frequency:,} steps")
    print("="*60)

    # Create environments with strategy names for better tracking
    envs = []
    for i, (name, strategy) in enumerate(strategies.items()):
        def make_env(s=strategy, n=name, aid=f"PPO_vs_{name}_training"):
            env = PrisonersDilemmaEnv(s, agent_id=aid, history_len=history_len)
            env.strategy_name = n  # Add strategy name for callback
            return env
        envs.append(make_env)
    
    vec_env = DummyVecEnv(envs)

    # Create PPO model with metrics tracking
    model = PPO("MlpPolicy", vec_env, verbose=1, ent_coef=0.01, tensorboard_log="./ppo_tensorboard/")
    
    # Create Gemini callback for training analysis
    gemini_callback = GeminiTrainingCallback(
        gemini_analyzer=analyzer,
        analysis_frequency=analysis_frequency,
        verbose=1
    )
    
    # Train with Gemini analysis
    model.learn(total_timesteps=total_timesteps, callback=gemini_callback)
    
    # Final training analysis with PPO metrics
    print("\n" + "="*80)
    print("🎯 FINAL GEMINI TRAINING ANALYSIS WITH PPO METRICS")
    print("="*80)
    
    # Collect final training data from all environments
    training_results = {}
    for i, (name, _) in enumerate(strategies.items()):
        env = vec_env.envs[i]
        if hasattr(env, 'full_agent_history') and len(env.full_agent_history) > 0:
            agent_moves = ['C' if a == 0 else 'D' for a in env.full_agent_history]
            opponent_moves = ['C' if a == 0 else 'D' for a in env.full_opponent_history]
            
            training_results[name] = {
                'total_reward': float(np.sum(env.full_reward_history)),
                'average_reward': float(np.mean(env.full_reward_history)),
                'cooperation_rate': agent_moves.count('C') / len(agent_moves),
                'opponent_cooperation_rate': opponent_moves.count('C') / len(opponent_moves),
                'total_episodes': len(env.full_agent_history),
                'agent_id': env.agent_id,
                'reward_variance': float(np.var(env.full_reward_history)),
                'probability_variance': float(np.var(env.action_probabilities)) if env.action_probabilities else 0,
                'final_confidence': float(np.mean([max(p, 1-p) for p in env.action_probabilities[-20:]])) if len(env.action_probabilities) >= 20 else 0
            }
    
    if training_results:
        final_analysis = analyzer.compare_strategies(training_results, is_training=True)
        print(final_analysis)
    
    # Save model
    model.save("ppo_enhanced_with_metrics")
    print("\n✅ Model saved as 'ppo_enhanced_with_metrics'")
    
    return model, training_results

# ==================================
# 📌 ENHANCED TEST WITH PPO METRICS EXTRACTION
# ==================================
def test_ppo_with_enhanced_analysis(gemini_api_key: str, model_path: str = None, history_len=10, 
                                  print_steps=True, analyze_every=100, num_episodes=500):
    """Test PPO with comprehensive PPO metrics analysis"""
    
    strategies = {
        "tit_for_tat": tit_for_tat,
      
        "tit_for_two_tats": tit_for_two_tats,
        "always_defect": always_defect,
        "always_cooperate": always_cooperate,
    }

    # Initialize Gemini analyzer
    analyzer = GeminiAnalyzer(gemini_api_key)
    
    # Load model
    if model_path:
        print(f"\n🔹 Loading PPO Model from: {model_path}")
        model = PPO.load(model_path)
    else:
        print("\n🔹 Loading Default PPO Model")
        try:
            model = PPO.load("ppo_enhanced_with_metrics")
        except:
            try:
                model = PPO.load("ppo_all_strategies_full_history_with_gemini")
            except:
                model = PPO.load("ppo_all_strategies_full_history")
    
    results = {}
    detailed_results = {}

    for name, strategy in strategies.items():
        print(f"\n{'='*60}")
        print(f"🧪 TESTING PPO AGAINST: {name.upper()}")
        print(f"{'='*60}")
        
        agent_id = f"PPO_vs_{name}_test"
        env = DummyVecEnv([lambda s=strategy, aid=agent_id: PrisonersDilemmaEnv(s, agent_id=aid, history_len=history_len)])
        obs = env.reset()
        total_reward = 0
        
        # Detailed logging header
        if print_steps:
            print(f"\n📋 DETAILED STEP-BY-STEP LOG:")
            print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
            print("-" * 80)

        for i in range(num_episodes):
            # Get action and probabilities more robustly
            action, _ = model.predict(obs, deterministic=False)
            
            # Get action probabilities with proper device handling
            try:
                # Ensure tensor is on the same device as the model
                device = next(model.policy.parameters()).device
                obs_tensor = torch.FloatTensor(obs).to(device)
                
                with torch.no_grad():
                    # Extract features and get logits
                    features = model.policy.extract_features(obs_tensor)
                    latent_pi = model.policy.mlp_extractor.forward_actor(features)
                    logits = model.policy.action_net(latent_pi)
                    action_probs = torch.softmax(logits, dim=-1)
                    prob_cooperate = float(action_probs[0][0].cpu())
            except Exception as e:
                # Fallback: use a simple heuristic based on recent actions
                prob_cooperate = 0.5  # Default probability
                if i == 0:  # Only print warning once
                    print(f"Warning: Using default probability due to: {str(e)[:50]}...")
            
            # Take step
            obs, reward, done, _ = env.step(action)
            action_val = int(action[0])
            
            # Store probability
            env.envs[0].add_action_probability(prob_cooperate)
            
            opponent_action = env.envs[0].opponent_history[-1]
            total_reward += reward[0]
            
            # Detailed step logging
            if print_steps:
                # Determine notes for interesting patterns
                notes = ""
                if i > 0:
                    prev_opp = env.envs[0].full_opponent_history[-2] if len(env.envs[0].full_opponent_history) >= 2 else None
                    if prev_opp is not None:
                        if action_val == prev_opp:
                            notes += "TFT? "
                        if action_val == 0 and opponent_action == 1:
                            notes += "EXPLOITED "
                        elif action_val == 1 and opponent_action == 0:
                            notes += "EXPLOITING "
                
                print(f"{i+1:>3d} | {'C' if action_val==0 else 'D':>5} | {'C' if opponent_action==0 else 'D':>8} | "
                      f"{reward[0]:>6.1f} | {total_reward:>10.1f} | {prob_cooperate:>8.3f} | {notes}")
            
            # Periodic Gemini analysis
            if (i + 1) % analyze_every == 0:
                print(f"\n{'🤖 GEMINI ANALYSIS':<20} (Step {i+1})")
                print("-" * 60)
                
                agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
                opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
                
                analysis = analyzer.analyze_strategy_evolution(
                    agent_id=agent_id,
                    move_history=agent_moves,
                    opponent_history=opponent_moves,
                    probabilities=env.envs[0].action_probabilities,
                    rewards=env.envs[0].full_reward_history,
                    opponent_name=name,
                    is_training=False
                )
                print(analysis)
                print("-" * 60)
                
                if print_steps:
                    print(f"\n📋 CONTINUING STEP LOG:")
                    print(f"{'Step':<4} | {'Agent':<5} | {'Opponent':<8} | {'Reward':<6} | {'Cumulative':<10} | {'P(Coop)':<8} | {'Notes'}")
                    print("-" * 80)

        # Store detailed results
        agent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_agent_history]
        opponent_moves = ['C' if a == 0 else 'D' for a in env.envs[0].full_opponent_history]
        
        detailed_results[name] = {
            'total_reward': float(total_reward),
            'average_reward': float(total_reward / num_episodes),
            'cooperation_rate': agent_moves.count('C') / len(agent_moves),
            'opponent_cooperation_rate': opponent_moves.count('C') / len(opponent_moves),
            'avg_reward': float(np.mean(env.envs[0].full_reward_history)),
            'final_probabilities': env.envs[0].action_probabilities[-10:] if len(env.envs[0].action_probabilities) >= 10 else env.envs[0].action_probabilities,
            'agent_id': agent_id,
            'total_episodes': num_episodes
        }
        
        results[name] = total_reward
        
        # Summary for this opponent
        print(f"\n📊 MATCH SUMMARY:")
        print(f"Agent: {agent_id}")
        print(f"Total Reward: {total_reward:.1f}")
        print(f"Average Reward per Episode: {total_reward/num_episodes:.2f}")
        print(f"Agent Cooperation Rate: {detailed_results[name]['cooperation_rate']:.1%}")
        print(f"Opponent Cooperation Rate: {detailed_results[name]['opponent_cooperation_rate']:.1%}")

        # Final comprehensive analysis
        print("\n" + "="*80)
        print("🧠 COMPREHENSIVE GEMINI STRATEGY ANALYSIS")
        print("="*80)
        
        final_analysis = analyzer.compare_strategies(detailed_results)
        print(final_analysis)
        
        print("\n" + "="*80)
        print("📊 FINAL PPO PERFORMANCE SUMMARY")
        print("="*80)
        for strategy, score in results.items():
            details = detailed_results[strategy]
            print(f"{strategy:>20}: {score:>6.1f} pts | Avg: {details['average_reward']:>5.2f} | "
                  f"Coop: {details['cooperation_rate']:>5.1%} | "
                  f"vs {details['opponent_cooperation_rate']:>5.1%} opp")
        
        return results, detailed_results

# ==================================
# 📌 MAIN EXECUTION WITH GEMINI
# ==================================
def main():
    # You need to set your Gemini API key here
    GEMINI_API_KEY = "AIzaSyALn5gEk1DeBNoIFRBdk52K3d_S5JKyC3M"  # Replace with your actual API key
    
    if not GEMINI_API_KEY or GEMINI_API_KEY == "":
        print("⚠️  Please set your Gemini API key in the GEMINI_API_KEY variable")
        print("You can get one from: https://makersuite.google.com/app/apikey")
        return
    
    print("🚀 Training PPO Agent with Gemini Analysis...")
    train_ppo_with_gemini(GEMINI_API_KEY, total_timesteps=200000)
    
    print("\n🧪 Testing PPO with Enhanced Analysis...")
    results, detailed_results = test_ppo_with_enhanced_analysis(
        gemini_api_key=GEMINI_API_KEY,
        print_steps=True,       # Enable detailed step-by-step logging
        analyze_every=100,      # Analyze every 100 steps
        num_episodes=500        # Number of episodes per opponent
    )

# ==================================
# 🚀 RUN TRAINING & TESTING
# ==================================
if __name__ == "__main__":
    main()

🚀 Training PPO Agent with Gemini Analysis...
🚀 Training PPO Agent with Enhanced Gemini Analysis...
Total timesteps: 200,000
Analysis frequency: every 40,000 steps
Using cpu device
Logging to ./ppo_tensorboard/PPO_2
-----------------------------
| time/              |      |
|    fps             | 4581 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 8192 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1870       |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 16384      |
| train/                  |            |
|    approx_kl            | 0.00584854 |
|    clip_fraction        | 0.014      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.688     |
|    explained_variance   | 0.0015     |
|    learning_rate        | 0.0003     |
|    loss                 | 179        |
|

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Here's an analysis of the PPO agent's learning trajectory in the Prisoner's Dilemma, integrating behavioral and RL metrics:

**1. Policy Learning Assessment:**

*   The `Decision Confidence Trend` is `Low`, which means the agent's policy network outputs varying probabilities for cooperation and defection. The `Probability Variance` is `nan`, so we can't use it to assess the deterministic behavior.

**2. Value Function Understanding:**

*   `Value Function Trend` is N/A. Without value function data, we cannot assess the agent's ability to accurately predict future rewards or how value estimation correlates with performance.

**3. Exploration vs Exploitation Balance:**

*   The `Low Decision Confidence Trend` suggests the agent is still exploring and refining its strategy and haven't yet converged to an optimal policy. Policy entropy is N/A.

**4. Advantage Learning:**

*   `Advantage Estimates` is N/A, we cannot determine how stable the advantages are or how they correlate with the oppo